From d56e0d07cc5ee8e334fd1ad403eef0b1a771384f Mon Sep 17 00:00:00 2001
From: Romaric Jodin <89833130+rjodinchr@users.noreply.github.com>
Date: Fri, 1 Dec 2023 08:34:44 +0100
Subject: [PATCH 01/72] clang/OpenCL: set sqrt fp accuracy on call to Z4sqrt
 (#66651)

This reverts the previous implementation to avoid adding inline
functions to the OpenCL headers.
That implementation broke the clspv flow (google/clspv#1231), while
https://reviews.llvm.org/D156743 mentioned that just decorating the call
node with `!fpmath` was enough.
This PR implements that idea.
The test has been updated to match this implementation.
---
 clang/lib/CodeGen/CGCall.cpp            |   4 +
 clang/lib/Headers/opencl-c-base.h       |  58 -----------
 clang/lib/Headers/opencl-c.h            |  26 +++++
 clang/lib/Sema/OpenCLBuiltins.td        |   5 +-
 clang/test/CodeGenOpenCL/sqrt-fpmath.cl | 124 ++++++++++--------------
 5 files changed, 82 insertions(+), 135 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 4c2577126e48b3..a24aeea7ae32bf 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5608,6 +5608,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                               BundleList);
     EmitBlock(Cont);
   }
+  if (CI->getCalledFunction() && CI->getCalledFunction()->hasName() &&
+      CI->getCalledFunction()->getName().startswith("_Z4sqrt")) {
+    SetSqrtFPAccuracy(CI);
+  }
   if (callOrInvoke)
     *callOrInvoke = CI;
 
diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h
index d56e5ceae652ad..2494f6213fc569 100644
--- a/clang/lib/Headers/opencl-c-base.h
+++ b/clang/lib/Headers/opencl-c-base.h
@@ -819,64 +819,6 @@ int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)))
 
 #endif // cl_intel_device_side_avc_motion_estimation
 
-/**
- * Compute square root.
- *
- * Provide inline implementations using the builtin so that we get appropriate
- * !fpmath based on -cl-fp32-correctly-rounded-divide-sqrt, attached to
- * llvm.sqrt. The implementation should still provide an external definition.
- */
-#define __ovld __attribute__((overloadable))
-#define __cnfn __attribute__((const))
-
-inline float __ovld __cnfn sqrt(float __x) {
-  return __builtin_elementwise_sqrt(__x);
-}
-
-inline float2 __ovld __cnfn sqrt(float2 __x) {
-  return __builtin_elementwise_sqrt(__x);
-}
-
-inline float3 __ovld __cnfn sqrt(float3 __x) {
-  return __builtin_elementwise_sqrt(__x);
-}
-
-inline float4 __ovld __cnfn sqrt(float4 __x) {
-  return __builtin_elementwise_sqrt(__x);
-}
-
-inline float8 __ovld __cnfn sqrt(float8 __x) {
-  return __builtin_elementwise_sqrt(__x);
-}
-
-inline float16 __ovld __cnfn sqrt(float16 __x) {
-  return __builtin_elementwise_sqrt(__x);
-}
-
-// We only really want to define the float variants here. However
-// -fdeclare-opencl-builtins will not work if some overloads are already
-// provided in the base header, so provide all overloads here.
-
-#ifdef cl_khr_fp64
-double __ovld __cnfn sqrt(double);
-double2 __ovld __cnfn sqrt(double2);
-double3 __ovld __cnfn sqrt(double3);
-double4 __ovld __cnfn sqrt(double4);
-double8 __ovld __cnfn sqrt(double8);
-double16 __ovld __cnfn sqrt(double16);
-#endif //cl_khr_fp64
-#ifdef cl_khr_fp16
-half __ovld __cnfn sqrt(half);
-half2 __ovld __cnfn sqrt(half2);
-half3 __ovld __cnfn sqrt(half3);
-half4 __ovld __cnfn sqrt(half4);
-half8 __ovld __cnfn sqrt(half8);
-half16 __ovld __cnfn sqrt(half16);
-#endif //cl_khr_fp16
-
-#undef __cnfn
-#undef __ovld
-
 // Disable any extensions we may have enabled previously.
 #pragma OPENCL EXTENSION all : disable
 
diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h
index 1efbbf8f8ee6a0..288bb18bc654eb 100644
--- a/clang/lib/Headers/opencl-c.h
+++ b/clang/lib/Headers/opencl-c.h
@@ -8496,6 +8496,32 @@ half8 __ovld __cnfn sinpi(half8);
 half16 __ovld __cnfn sinpi(half16);
 #endif //cl_khr_fp16
 
+/**
+ * Compute square root.
+ */
+float __ovld __cnfn sqrt(float);
+float2 __ovld __cnfn sqrt(float2);
+float3 __ovld __cnfn sqrt(float3);
+float4 __ovld __cnfn sqrt(float4);
+float8 __ovld __cnfn sqrt(float8);
+float16 __ovld __cnfn sqrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sqrt(double);
+double2 __ovld __cnfn sqrt(double2);
+double3 __ovld __cnfn sqrt(double3);
+double4 __ovld __cnfn sqrt(double4);
+double8 __ovld __cnfn sqrt(double8);
+double16 __ovld __cnfn sqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sqrt(half);
+half2 __ovld __cnfn sqrt(half2);
+half3 __ovld __cnfn sqrt(half3);
+half4 __ovld __cnfn sqrt(half4);
+half8 __ovld __cnfn sqrt(half8);
+half16 __ovld __cnfn sqrt(half16);
+#endif //cl_khr_fp16
+
 /**
  * Compute tangent.
  */
diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td
index 9db450281912d2..0cceba090bd8f2 100644
--- a/clang/lib/Sema/OpenCLBuiltins.td
+++ b/clang/lib/Sema/OpenCLBuiltins.td
@@ -563,15 +563,12 @@ foreach name = ["acos", "acosh", "acospi",
                 "log", "log2", "log10", "log1p", "logb",
                 "rint", "round", "rsqrt",
                 "sin", "sinh", "sinpi",
+                "sqrt",
                 "tan", "tanh", "tanpi",
                 "tgamma", "trunc",
                 "lgamma"] in {
     def : Builtin<name, [FGenTypeN, FGenTypeN], Attr.Const>;
 }
-
-// sqrt is handled in opencl-c-base.h to handle
-// -cl-fp32-correctly-rounded-divide-sqrt.
-
 foreach name = ["nan"] in {
   def : Builtin<name, [GenTypeFloatVecAndScalar, GenTypeUIntVecAndScalar], Attr.Const>;
   def : Builtin<name, [GenTypeDoubleVecAndScalar, GenTypeULongVecAndScalar], Attr.Const>;
diff --git a/clang/test/CodeGenOpenCL/sqrt-fpmath.cl b/clang/test/CodeGenOpenCL/sqrt-fpmath.cl
index df30085cba2e7d..7afde7f91bdfeb 100644
--- a/clang/test/CodeGenOpenCL/sqrt-fpmath.cl
+++ b/clang/test/CodeGenOpenCL/sqrt-fpmath.cl
@@ -3,11 +3,15 @@
 // depending on -cl-fp32-correctly-rounded-divide-sqrt
 
 // Test with -fdeclare-opencl-builtins
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -S -emit-llvm -o %t.ll %s
+// RUN: FileCheck -check-prefixes=CHECK,DEFAULT %s < %t.ll
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -S -emit-llvm -o %t.ll %s
+// RUN: FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED %s < %t.ll
 
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT-UNSAFE %s
-// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED-UNSAFE %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-unsafe-math-optimizations -S -emit-llvm -o %t.ll %s
+// RUN: FileCheck -check-prefixes=CHECK,DEFAULT-UNSAFE %s < %t.ll
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -S -emit-llvm -o %t.ll %s
+// RUN: FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED-UNSAFE %s < %t.ll
 
 // Test without -fdeclare-opencl-builtins
 // RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
@@ -19,183 +23,157 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 // CHECK-LABEL: define {{.*}} float @call_sqrt_f32(
-// CHECK: call {{.*}} float @_Z4sqrtf(float noundef %{{.+}}) #{{[0-9]+$}}
+// DEFAULT: call float @_Z4sqrtf(float noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH:\![0-9]+]]{{$}}
+// CORRECTLYROUNDED: call float @_Z4sqrtf(float noundef %{{.+}}) #{{[0-9]+}}{{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn float @_Z4sqrtf(float noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH:\![0-9]+]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn float @_Z4sqrtf(float noundef %{{.+}}) #{{[0-9]+}}{{$}}
 float call_sqrt_f32(float x) {
   return sqrt(x);
 }
 
-// CHECK-LABEL: define available_externally float @_Z4sqrtf(float noundef %__x)
-// DEFAULT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
-
-// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn float @llvm.sqrt.f32(float %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn float @llvm.sqrt.f32(float %{{.+}}){{$}}
-
 // CHECK-LABEL: define {{.*}} <2 x float> @call_sqrt_v2f32(
-// CHECK: call {{.*}} <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %{{.*}}) #{{[0-9]+$}}
+// DEFAULT: call <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED: call <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
 float2 call_sqrt_v2f32(float2 x) {
   return sqrt(x);
 }
 
-// CHECK-LABEL: define available_externally <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %__x)
-// DEFAULT: call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED: call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}){{$}}
-
-// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}){{$}}
 
 // CHECK-LABEL: define {{.*}} <3 x float> @call_sqrt_v3f32(
-// CHECK: call {{.*}} <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %{{.*}}) #{{[0-9]+$}}
+// DEFAULT: call <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED: call <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
 float3 call_sqrt_v3f32(float3 x) {
   return sqrt(x);
 }
 
-// CHECK-LABEL: define available_externally <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %__x)
-// DEFAULT: call <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED: call <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}){{$}}
-
-// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}){{$}}
 
 
 // CHECK-LABEL: define {{.*}} <4 x float> @call_sqrt_v4f32(
-// CHECK: call {{.*}} <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %{{.*}}) #{{[0-9]+$}}
+// DEFAULT: call <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED: call <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
 float4 call_sqrt_v4f32(float4 x) {
   return sqrt(x);
 }
 
-// CHECK-LABEL: define available_externally <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %__x)
-// DEFAULT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}){{$}}
-
-// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}){{$}}
 
 // CHECK-LABEL: define {{.*}} <8 x float> @call_sqrt_v8f32(
-// CHECK: call {{.*}} <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %{{.*}}) #{{[0-9]+$}}
+// DEFAULT: call <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED: call <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
 float8 call_sqrt_v8f32(float8 x) {
   return sqrt(x);
 }
 
-// CHECK-LABEL: define available_externally <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %__x)
-// DEFAULT: call <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED: call <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}){{$}}
-
-// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}){{$}}
 
 
 // CHECK-LABEL: define {{.*}} <16 x float> @call_sqrt_v16f32(
-// CHECK: call {{.*}} <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %{{.*}}) #{{[0-9]+$}}
+// DEFAULT: call <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED: call <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %{{.+}}) #{{[0-9]+}}, !fpmath [[FPMATH]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %{{.+}}) #{{[0-9]+}}{{$}}
 float16 call_sqrt_v16f32(float16 x) {
   return sqrt(x);
 }
 
-// CHECK-LABEL: define available_externally <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %__x)
-// DEFAULT: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}){{$}}
-
-// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
-// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}){{$}}
 
 
 // Not for f64
 // CHECK-LABEL: define {{.*}} double @call_sqrt_f64(
-// CHECK: call {{.*}} double @_Z4sqrtd(double noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} double @_Z4sqrtd(double noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 double call_sqrt_f64(double x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // Not for f64
 // CHECK-LABEL: define {{.*}} <2 x double> @call_sqrt_v2f64(
-// CHECK: call {{.*}} <2 x double> @_Z4sqrtDv2_d(<2 x double> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <2 x double> @_Z4sqrtDv2_d(<2 x double> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 double2 call_sqrt_v2f64(double2 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <3 x double> @call_sqrt_v3f64(
-// CHECK: call {{.*}} <3 x double> @_Z4sqrtDv3_d(<3 x double> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <3 x double> @_Z4sqrtDv3_d(<3 x double> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 double3 call_sqrt_v3f64(double3 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <4 x double> @call_sqrt_v4f64(
-// CHECK: call {{.*}} <4 x double> @_Z4sqrtDv4_d(<4 x double> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <4 x double> @_Z4sqrtDv4_d(<4 x double> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 double4 call_sqrt_v4f64(double4 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <8 x double> @call_sqrt_v8f64(
-// CHECK: call {{.*}} <8 x double> @_Z4sqrtDv8_d(<8 x double> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <8 x double> @_Z4sqrtDv8_d(<8 x double> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 double8 call_sqrt_v8f64(double8 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <16 x double> @call_sqrt_v16f64(
-// CHECK: call {{.*}} <16 x double> @_Z4sqrtDv16_d(<16 x double> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <16 x double> @_Z4sqrtDv16_d(<16 x double> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 double16 call_sqrt_v16f64(double16 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // Not for f16
 // CHECK-LABEL: define {{.*}} half @call_sqrt_f16(
-// CHECK: call {{.*}} half @_Z4sqrtDh(half noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} half @_Z4sqrtDh(half noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 half call_sqrt_f16(half x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <2 x half> @call_sqrt_v2f16(
-// CHECK: call {{.*}} <2 x half> @_Z4sqrtDv2_Dh(<2 x half> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <2 x half> @_Z4sqrtDv2_Dh(<2 x half> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 half2 call_sqrt_v2f16(half2 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <3 x half> @call_sqrt_v3f16(
-// CHECK: call {{.*}} <3 x half> @_Z4sqrtDv3_Dh(<3 x half> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <3 x half> @_Z4sqrtDv3_Dh(<3 x half> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 half3 call_sqrt_v3f16(half3 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <4 x half> @call_sqrt_v4f16(
-// CHECK: call {{.*}} <4 x half> @_Z4sqrtDv4_Dh(<4 x half> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <4 x half> @_Z4sqrtDv4_Dh(<4 x half> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 half4 call_sqrt_v4f16(half4 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <8 x half> @call_sqrt_v8f16(
-// CHECK: call {{.*}} <8 x half> @_Z4sqrtDv8_Dh(<8 x half> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <8 x half> @_Z4sqrtDv8_Dh(<8 x half> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 half8 call_sqrt_v8f16(half8 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
 
 // CHECK-LABEL: define {{.*}} <16 x half> @call_sqrt_v16f16(
-// CHECK: call {{.*}} <16 x half> @_Z4sqrtDv16_Dh(<16 x half> noundef %{{.+}}) #{{[0-9]+$}}
+// CHECK: call {{.*}} <16 x half> @_Z4sqrtDv16_Dh(<16 x half> noundef %{{.+}}) #{{[0-9]+$}}{{$}}
 half16 call_sqrt_v16f16(half16 x) {
   return sqrt(x);
 }
 
-// CHECK-NOT: define
-
-// DEFAULT: [[$FPMATH]] = !{float 3.000000e+00}
+// DEFAULT: [[FPMATH]] = !{float 3.000000e+00}

From 5a9354832695d878e86f90010d2b043a9551b072 Mon Sep 17 00:00:00 2001
From: paperchalice <lgamma@163.com>
Date: Fri, 1 Dec 2023 15:43:48 +0800
Subject: [PATCH 02/72] [CodeGen][NFC] Sort and format MachinePassRegistry.def
 (#74044)

Same as #73762.
---
 .../llvm/CodeGen/MachinePassRegistry.def      | 188 ++++++++++--------
 1 file changed, 105 insertions(+), 83 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
index 47dd8f2cc46483..fc2d07fd6616fc 100644
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -29,29 +29,30 @@ MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass, ())
 #define FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR)
 #endif
 FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis, (PIC))
-FUNCTION_ANALYSIS("targetir", TargetIRAnalysis, (std::move(TM.getTargetIRAnalysis())))
+FUNCTION_ANALYSIS("targetir", TargetIRAnalysis,
+                  (std::move(TM.getTargetIRAnalysis())))
 #undef FUNCTION_ANALYSIS
 
 #ifndef FUNCTION_PASS
 #define FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR)
 #endif
 FUNCTION_PASS("callbrprepare", CallBrPreparePass, ())
-FUNCTION_PASS("safe-stack", SafeStackPass, (TM))
-FUNCTION_PASS("mergeicmps", MergeICmpsPass, ())
-FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass, ())
-FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass, ())
 FUNCTION_PASS("consthoist", ConstantHoistingPass, ())
-FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib, ())
-FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass, ())
 FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass, (false))
-FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true))
 FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass, ())
 FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass, ())
 FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ())
 FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ())
+FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass, ())
 FUNCTION_PASS("lowerinvoke", LowerInvokePass, ())
+FUNCTION_PASS("mergeicmps", MergeICmpsPass, ())
+FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass, ())
+FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true))
+FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib, ())
+FUNCTION_PASS("safe-stack", SafeStackPass, (TM))
 FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass, ())
 FUNCTION_PASS("tlshoist", TLSVariableHoistPass, ())
+FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass, ())
 FUNCTION_PASS("verify", VerifierPass, ())
 #undef FUNCTION_PASS
 
@@ -69,7 +70,8 @@ LOOP_PASS("loop-reduce", LoopStrengthReducePass, ())
 #ifndef MACHINE_FUNCTION_ANALYSIS
 #define MACHINE_FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR)
 #endif
-MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis, (PIC))
+MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis,
+                          (PIC))
 // LiveVariables currently requires pure SSA form.
 // FIXME: Once TwoAddressInstruction pass no longer uses kill flags,
 // LiveVariables can be removed completely, and LiveIntervals can be directly
@@ -80,18 +82,24 @@ MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis, (
 // MACHINE_FUNCTION_ANALYSIS("live-stacks", LiveStacksPass())
 // MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("edge-bundles", EdgeBundlesAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("lazy-machine-bfi", LazyMachineBlockFrequencyInfoAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("lazy-machine-bfi",
+// LazyMachineBlockFrequencyInfoAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("machine-bfi", MachineBlockFrequencyInfoAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("machine-loops", MachineLoopInfoAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-dom-frontier", MachineDominanceFrontierAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("machine-dom-frontier",
+// MachineDominanceFrontierAnalysis())
 // MACHINE_FUNCTION_ANALYSIS("machine-dom-tree", MachineDominatorTreeAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-ore", MachineOptimizationRemarkEmitterPassAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree", MachinePostDominatorTreeAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-region-info", MachineRegionInfoPassAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("reaching-def", ReachingDefAnalysisAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("live-reg-matrix", LiveRegMatrixAnalysis())
-// MACHINE_FUNCTION_ANALYSIS("gc-analysis", GCMachineCodeAnalysisPass())
+// MACHINE_FUNCTION_ANALYSIS("machine-ore",
+// MachineOptimizationRemarkEmitterPassAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree",
+// MachinePostDominatorTreeAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("machine-region-info",
+// MachineRegionInfoPassAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics",
+// MachineTraceMetricsAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("reaching-def", ReachingDefAnalysisAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("live-reg-matrix", LiveRegMatrixAnalysis())
+// MACHINE_FUNCTION_ANALYSIS("gc-analysis", GCMachineCodeAnalysisPass())
 #undef MACHINE_FUNCTION_ANALYSIS
 
 #ifndef MACHINE_FUNCTION_PASS
@@ -108,22 +116,22 @@ MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis, (
 #ifndef DUMMY_FUNCTION_PASS
 #define DUMMY_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR)
 #endif
+DUMMY_FUNCTION_PASS("atomic-expand", AtomicExpandPass, ())
+DUMMY_FUNCTION_PASS("cfguard-check", CFGuardCheckPass, ())
+DUMMY_FUNCTION_PASS("cfguard-dispatch", CFGuardDispatchPass, ())
+DUMMY_FUNCTION_PASS("codegenprepare", CodeGenPreparePass, ())
+DUMMY_FUNCTION_PASS("dwarfehprepare", DwarfEHPass, ())
 DUMMY_FUNCTION_PASS("expandmemcmp", ExpandMemCmpPass, ())
+DUMMY_FUNCTION_PASS("gc-info-printer", GCInfoPrinterPass, ())
 DUMMY_FUNCTION_PASS("gc-lowering", GCLoweringPass, ())
+DUMMY_FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass, ())
+DUMMY_FUNCTION_PASS("interleaved-access", InterleavedAccessPass, ())
+DUMMY_FUNCTION_PASS("select-optimize", SelectOptimizePass, ())
 DUMMY_FUNCTION_PASS("shadow-stack-gc-lowering", ShadowStackGCLoweringPass, ())
 DUMMY_FUNCTION_PASS("sjljehprepare", SjLjEHPreparePass, ())
-DUMMY_FUNCTION_PASS("dwarfehprepare", DwarfEHPass, ())
-DUMMY_FUNCTION_PASS("winehprepare", WinEHPass, ())
-DUMMY_FUNCTION_PASS("wasmehprepare", WasmEHPass, ())
-DUMMY_FUNCTION_PASS("codegenprepare", CodeGenPreparePass, ())
 DUMMY_FUNCTION_PASS("stack-protector", StackProtectorPass, ())
-DUMMY_FUNCTION_PASS("atomic-expand", AtomicExpandPass, ())
-DUMMY_FUNCTION_PASS("interleaved-access", InterleavedAccessPass, ())
-DUMMY_FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass, ())
-DUMMY_FUNCTION_PASS("cfguard-dispatch", CFGuardDispatchPass, ())
-DUMMY_FUNCTION_PASS("cfguard-check", CFGuardCheckPass, ())
-DUMMY_FUNCTION_PASS("gc-info-printer", GCInfoPrinterPass, ())
-DUMMY_FUNCTION_PASS("select-optimize", SelectOptimizePass, ())
+DUMMY_FUNCTION_PASS("wasmehprepare", WasmEHPass, ())
+DUMMY_FUNCTION_PASS("winehprepare", WinEHPass, ())
 #undef DUMMY_FUNCTION_PASS
 
 #ifndef DUMMY_MODULE_PASS
@@ -141,71 +149,85 @@ DUMMY_MACHINE_MODULE_PASS("machine-outliner", MachineOutlinerPass, ())
 #ifndef DUMMY_MACHINE_FUNCTION_PASS
 #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR)
 #endif
-DUMMY_MACHINE_FUNCTION_PASS("mir-printer", PrintMIRPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("shrink-wrap", ShrinkWrapPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("prologepilog", PrologEpilogInserterPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("postrapseudos", ExpandPostRAPseudosPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("block-placement", MachineBlockPlacementPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("block-placement-stats",
+                            MachineBlockPlacementStatsPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("branch-folder", BranchFolderPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("break-false-deps", BreakFalseDepsPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("cfguard-longjmp", CFGuardLongjmpPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("dead-mi-elimination",
+                            DeadMachineInstructionElimPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("dot-machine-cfg", MachineCFGPrinter, ())
+DUMMY_MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass, ())
 DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass,
+                            ())
 DUMMY_MACHINE_FUNCTION_PASS("funclet-layout", FuncletLayoutPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("removeredundantdebugvalues", RemoveRedundantDebugValuesPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("dot-machine-cfg", MachineCFGPrinter, ())
+DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("legalizer", LegalizerPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("livedebugvalues", LiveDebugValuesPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass, ())
-DUMMY_MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("liveintervals", LiveIntervalsPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("lrshrink", LiveRangeShrinkPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("machine-combiner", MachineCombinerPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass,
+                            ())
+DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata, ())
+DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("machine-sink", MachineSinkingPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("postra-machine-sink", PostRAMachineSinkingPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("machine-uniformity",
+                            MachineUniformityInfoWrapperPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("mir-printer", PrintMIRPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("peephole-opt", PeepholeOptimizerPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("virtregrewriter", VirtRegRewriterPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("stack-slot-coloring", StackSlotColoringPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("twoaddressinstruction", TwoAddressInstructionPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("postra-machine-sink", PostRAMachineSinkingPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("postrapseudos", ExpandPostRAPseudosPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("print-machine-cycles", MachineCycleInfoPrinterPass,
+                            ())
+DUMMY_MACHINE_FUNCTION_PASS("print-machine-uniformity",
+                            MachineUniformityInfoPrinterPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("processimpdefs", ProcessImplicitDefsPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("liveintervals", LiveIntervalsPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("simple-register-coalescing", RegisterCoalescerPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("rename-independent-subregs", RenameIndependentSubregsPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("branch-folder", BranchFolderPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("tailduplication", TailDuplicatePass, ())
-DUMMY_MACHINE_FUNCTION_PASS("block-placement", MachineBlockPlacementPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("block-placement-stats", MachineBlockPlacementStatsPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-combiner", MachineCombinerPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("lrshrink", LiveRangeShrinkPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("break-false-deps", BreakFalseDepsPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("cfguard-longjmp", CFGuardLongjmpPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("prologepilog", PrologEpilogInserterPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("ra-basic", RABasicPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("ra-fast", RAFastPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("ra-greedy", RAGreedyPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("ra-pbqp", RAPBQPPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("legalizer", LegalizerPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass,
+                            ())
+DUMMY_MACHINE_FUNCTION_PASS("reg-usage-propagation",
+                            RegUsageInfoPropagationPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass, ())
 DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machineverifier", MachineVerifierPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("print-machine-cycles", MachineCycleInfoPrinterPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-uniformity", MachineUniformityInfoWrapperPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("print-machine-uniformity", MachineUniformityInfoPrinterPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("removeredundantdebugvalues",
+                            RemoveRedundantDebugValuesPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("rename-independent-subregs",
+                            RenameIndependentSubregsPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass,
+                            ())
+DUMMY_MACHINE_FUNCTION_PASS("shrink-wrap", ShrinkWrapPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("simple-register-coalescing", RegisterCoalescerPass,
+                            ())
+DUMMY_MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("stack-slot-coloring", StackSlotColoringPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("tailduplication", TailDuplicatePass, ())
+DUMMY_MACHINE_FUNCTION_PASS("twoaddressinstruction", TwoAddressInstructionPass,
+                            ())
+DUMMY_MACHINE_FUNCTION_PASS("virtregrewriter", VirtRegRewriterPass, ())
+DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ())
 #undef DUMMY_MACHINE_FUNCTION_PASS

From 520c3b82db7199c1dcd24520f3c0ac573c191791 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 30 Nov 2023 23:45:12 -0800
Subject: [PATCH 03/72] [llvm] Stop including llvm/ADT/StringSet.h (NFC)

Identified with clangd.
---
 llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h  | 1 -
 llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp | 1 -
 llvm/lib/Transforms/Utils/MoveAutoInit.cpp                       | 1 -
 llvm/tools/llvm-dwarfutil/Error.h                                | 1 -
 llvm/tools/llvm-exegesis/lib/BenchmarkResult.h                   | 1 -
 llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp                       | 1 -
 llvm/tools/llvm-objcopy/ObjcopyOptions.cpp                       | 1 -
 7 files changed, 7 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index 0c79aecdd2457f..f5dce01c34e7aa 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -15,7 +15,6 @@
 #ifndef LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
 #define LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
 
-#include "llvm/ADT/StringSet.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/PassManager.h"
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
index cdc1158ce1c4ca..c6ffd9f7c2e3c5 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
diff --git a/llvm/lib/Transforms/Utils/MoveAutoInit.cpp b/llvm/lib/Transforms/Utils/MoveAutoInit.cpp
index 6f5f34461beaf7..a977ad87b79f51 100644
--- a/llvm/lib/Transforms/Utils/MoveAutoInit.cpp
+++ b/llvm/lib/Transforms/Utils/MoveAutoInit.cpp
@@ -14,7 +14,6 @@
 #include "llvm/Transforms/Utils/MoveAutoInit.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/ValueTracking.h"
diff --git a/llvm/tools/llvm-dwarfutil/Error.h b/llvm/tools/llvm-dwarfutil/Error.h
index b92c50ca5a452a..fff5978a9d1ad4 100644
--- a/llvm/tools/llvm-dwarfutil/Error.h
+++ b/llvm/tools/llvm-dwarfutil/Error.h
@@ -11,7 +11,6 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/Format.h"
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
index 8a7faa0176e324..38111519a2c898 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -18,7 +18,6 @@
 #include "LlvmState.h"
 #include "RegisterValue.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/YAMLTraits.h"
diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
index 46ec4bdc28709f..094dca22f77b03 100644
--- a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
+++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/Object/Archive.h"
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index d33adb0b6a3e47..57129025394437 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -10,7 +10,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/ObjCopy/CommonConfig.h"
 #include "llvm/ObjCopy/ConfigManager.h"

From bc265bd663233c4bfa222f1cc93ec472075a53ff Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 30 Nov 2023 23:52:26 -0800
Subject: [PATCH 04/72] [llvm-reduce] Stop including llvm/ADT/SetVector.h (NFC)

Identified with clangd.
---
 llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
index 05127ec7b9c808..fdac4a3bf708e2 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
@@ -16,7 +16,6 @@
 #include "Delta.h"
 #include "Utils.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <iterator>
 #include <vector>

From 604c29e9340c4a18eab1e53dd5cc4c05d46db2f7 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin@gmail.com>
Date: Fri, 1 Dec 2023 09:10:29 +0100
Subject: [PATCH 05/72] [AMDGPU] NFC. Add test for debug info on CFG annotation
 instructions. (#73959)

---
 .../CodeGen/AMDGPU/si-annotate-dbg-info.ll    | 163 ++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll

diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
new file mode 100644
index 00000000000000..703eeb5df86e50
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
+
+define amdgpu_ps i32 @if_else(i32 %0) !dbg !5 {
+; OPT-LABEL: define amdgpu_ps i32 @if_else(
+; OPT-SAME: i32 [[TMP0:%.*]]) !dbg [[DBG5:![0-9]+]] {
+; OPT-NEXT:    [[C:%.*]] = icmp ne i32 [[TMP0]], 0, !dbg [[DBG13:![0-9]+]]
+; OPT-NEXT:    tail call void @llvm.dbg.value(metadata i1 [[C]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG13]]
+; OPT-NEXT:    [[TMP2:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[C]])
+; OPT-NEXT:    [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP2]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP2]], 1
+; OPT-NEXT:    br i1 [[TMP3]], label [[FALSE:%.*]], label [[FLOW:%.*]], !dbg [[DBG14:![0-9]+]]
+; OPT:       Flow:
+; OPT-NEXT:    [[TMP5:%.*]] = phi i32 [ 33, [[FALSE]] ], [ undef, [[TMP1:%.*]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP4]])
+; OPT-NEXT:    [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0
+; OPT-NEXT:    [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1
+; OPT-NEXT:    br i1 [[TMP7]], label [[TRUE:%.*]], label [[EXIT:%.*]], !dbg [[DBG14]]
+; OPT:       true:
+; OPT-NEXT:    br label [[EXIT]], !dbg [[DBG15:![0-9]+]]
+; OPT:       false:
+; OPT-NEXT:    br label [[FLOW]], !dbg [[DBG16:![0-9]+]]
+; OPT:       exit:
+; OPT-NEXT:    [[RET:%.*]] = phi i32 [ [[TMP5]], [[FLOW]] ], [ 42, [[TRUE]] ], !dbg [[DBG17:![0-9]+]]
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
+; OPT-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[RET]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17]]
+; OPT-NEXT:    ret i32 [[RET]], !dbg [[DBG18:![0-9]+]]
+;
+  %c = icmp eq i32 %0, 0, !dbg !13
+  tail call void @llvm.dbg.value(metadata i1 %c, metadata !9, metadata !DIExpression()), !dbg !13
+  br i1 %c, label %true, label %false, !dbg !14
+
+true:                                             ; preds = %1
+  br label %exit, !dbg !15
+
+false:                                            ; preds = %1
+  br label %exit, !dbg !16
+
+exit:                                             ; preds = %false, %true
+  %ret = phi i32 [ 42, %true ], [ 33, %false ], !dbg !17
+  tail call void @llvm.dbg.value(metadata i32 %ret, metadata !11, metadata !DIExpression()), !dbg !17
+  ret i32 %ret, !dbg !18
+}
+
+define amdgpu_ps void @loop_if_break(i32 %n) !dbg !19 {
+; OPT-LABEL: define amdgpu_ps void @loop_if_break(
+; OPT-SAME: i32 [[N:%.*]]) !dbg [[DBG19:![0-9]+]] {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    br label [[LOOP:%.*]], !dbg [[DBG24:![0-9]+]]
+; OPT:       loop:
+; OPT-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP5:%.*]], [[FLOW:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; OPT-NEXT:    [[I:%.*]] = phi i32 [ [[N]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FLOW]] ], !dbg [[DBG25:![0-9]+]]
+; OPT-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[I]], metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25]]
+; OPT-NEXT:    [[C:%.*]] = icmp ugt i32 [[I]], 0, !dbg [[DBG26:![0-9]+]]
+; OPT-NEXT:    tail call void @llvm.dbg.value(metadata i1 [[C]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26]]
+; OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[C]])
+; OPT-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; OPT-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; OPT-NEXT:    br i1 [[TMP1]], label [[LOOP_BODY:%.*]], label [[FLOW]], !dbg [[DBG27:![0-9]+]]
+; OPT:       loop_body:
+; OPT-NEXT:    [[I_NEXT:%.*]] = sub i32 [[I]], 1, !dbg [[DBG28:![0-9]+]]
+; OPT-NEXT:    tail call void @llvm.dbg.value(metadata i32 [[I_NEXT]], metadata [[META23:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28]]
+; OPT-NEXT:    br label [[FLOW]], !dbg [[DBG29:![0-9]+]]
+; OPT:       Flow:
+; OPT-NEXT:    [[TMP3]] = phi i32 [ [[I_NEXT]], [[LOOP_BODY]] ], [ undef, [[LOOP]] ]
+; OPT-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[LOOP_BODY]] ], [ true, [[LOOP]] ]
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; OPT-NEXT:    [[TMP5]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]])
+; OPT-NEXT:    [[TMP6:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP5]])
+; OPT-NEXT:    br i1 [[TMP6]], label [[EXIT:%.*]], label [[LOOP]], !dbg [[DBG27]]
+; OPT:       exit:
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]])
+; OPT-NEXT:    ret void, !dbg [[DBG30:![0-9]+]]
+;
+entry:
+  br label %loop, !dbg !24
+
+loop:                                             ; preds = %loop_body, %entry
+  %i = phi i32 [ %n, %entry ], [ %i.next, %loop_body ], !dbg !25
+  tail call void @llvm.dbg.value(metadata i32 %i, metadata !21, metadata !DIExpression()), !dbg !25
+  %c = icmp ugt i32 %i, 0, !dbg !26
+  tail call void @llvm.dbg.value(metadata i1 %c, metadata !22, metadata !DIExpression()), !dbg !26
+  br i1 %c, label %loop_body, label %exit, !dbg !27
+
+loop_body:                                        ; preds = %loop
+  %i.next = sub i32 %i, 1, !dbg !28
+  tail call void @llvm.dbg.value(metadata i32 %i.next, metadata !23, metadata !DIExpression()), !dbg !28
+  br label %loop, !dbg !29
+
+exit:                                             ; preds = %loop
+  ret void, !dbg !30
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "../../../test/CodeGen/AMDGPU/si-annotate-dbg-info.ll", directory: "/")
+!2 = !{i32 13}
+!3 = !{i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "if_else", linkageName: "if_else", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9, !11}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
+!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 5, type: !12)
+!12 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!13 = !DILocation(line: 1, column: 1, scope: !5)
+!14 = !DILocation(line: 2, column: 1, scope: !5)
+!15 = !DILocation(line: 3, column: 1, scope: !5)
+!16 = !DILocation(line: 4, column: 1, scope: !5)
+!17 = !DILocation(line: 5, column: 1, scope: !5)
+!18 = !DILocation(line: 6, column: 1, scope: !5)
+!19 = distinct !DISubprogram(name: "loop_if_break", linkageName: "loop_if_break", scope: null, file: !1, line: 7, type: !6, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !20)
+!20 = !{!21, !22, !23}
+!21 = !DILocalVariable(name: "3", scope: !19, file: !1, line: 8, type: !12)
+!22 = !DILocalVariable(name: "4", scope: !19, file: !1, line: 9, type: !10)
+!23 = !DILocalVariable(name: "5", scope: !19, file: !1, line: 11, type: !12)
+!24 = !DILocation(line: 7, column: 1, scope: !19)
+!25 = !DILocation(line: 8, column: 1, scope: !19)
+!26 = !DILocation(line: 9, column: 1, scope: !19)
+!27 = !DILocation(line: 10, column: 1, scope: !19)
+!28 = !DILocation(line: 11, column: 1, scope: !19)
+!29 = !DILocation(line: 12, column: 1, scope: !19)
+!30 = !DILocation(line: 13, column: 1, scope: !19)
+;.
+; OPT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+; OPT: [[META1]] = !DIFile(filename: "../../../test/CodeGen/AMDGPU/si-annotate-dbg-info.ll", directory: {{.*}})
+; OPT: [[DBG5]] = distinct !DISubprogram(name: "if_else", linkageName: "if_else", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]])
+; OPT: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]])
+; OPT: [[META7]] = !{}
+; OPT: [[META8]] = !{[[META9]], [[META11]]}
+; OPT: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 1, type: [[META10:![0-9]+]])
+; OPT: [[META10]] = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
+; OPT: [[META11]] = !DILocalVariable(name: "2", scope: [[DBG5]], file: [[META1]], line: 5, type: [[META12:![0-9]+]])
+; OPT: [[META12]] = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+; OPT: [[DBG13]] = !DILocation(line: 1, column: 1, scope: [[DBG5]])
+; OPT: [[DBG14]] = !DILocation(line: 2, column: 1, scope: [[DBG5]])
+; OPT: [[DBG15]] = !DILocation(line: 3, column: 1, scope: [[DBG5]])
+; OPT: [[DBG16]] = !DILocation(line: 4, column: 1, scope: [[DBG5]])
+; OPT: [[DBG17]] = !DILocation(line: 5, column: 1, scope: [[DBG5]])
+; OPT: [[DBG18]] = !DILocation(line: 6, column: 1, scope: [[DBG5]])
+; OPT: [[DBG19]] = distinct !DISubprogram(name: "loop_if_break", linkageName: "loop_if_break", scope: null, file: [[META1]], line: 7, type: [[META6]], scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META20:![0-9]+]])
+; OPT: [[META20]] = !{[[META21]], [[META22]], [[META23]]}
+; OPT: [[META21]] = !DILocalVariable(name: "3", scope: [[DBG19]], file: [[META1]], line: 8, type: [[META12]])
+; OPT: [[META22]] = !DILocalVariable(name: "4", scope: [[DBG19]], file: [[META1]], line: 9, type: [[META10]])
+; OPT: [[META23]] = !DILocalVariable(name: "5", scope: [[DBG19]], file: [[META1]], line: 11, type: [[META12]])
+; OPT: [[DBG24]] = !DILocation(line: 7, column: 1, scope: [[DBG19]])
+; OPT: [[DBG25]] = !DILocation(line: 8, column: 1, scope: [[DBG19]])
+; OPT: [[DBG26]] = !DILocation(line: 9, column: 1, scope: [[DBG19]])
+; OPT: [[DBG27]] = !DILocation(line: 10, column: 1, scope: [[DBG19]])
+; OPT: [[DBG28]] = !DILocation(line: 11, column: 1, scope: [[DBG19]])
+; OPT: [[DBG29]] = !DILocation(line: 12, column: 1, scope: [[DBG19]])
+; OPT: [[DBG30]] = !DILocation(line: 13, column: 1, scope: [[DBG19]])
+;.

From 0e163e75d44cfa024092cda5099bd41af2218215 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan@intel.com>
Date: Fri, 1 Dec 2023 16:18:33 +0800
Subject: [PATCH 06/72] [X86][MC] Not emit {evex} for VEX-promoted instructions
 with GPR operands (#74039)

To align with
1. GNU binutils's behavior for APX instructions
2. LLVM's behavior for EVEX instructions with VEX variant
---
 llvm/lib/Target/X86/X86InstrAVX512.td      |  7 ++---
 llvm/test/MC/Disassembler/X86/apx/kmov.txt | 35 ++++++++++++----------
 llvm/test/MC/X86/apx/kmov-att.s            | 28 +++++++++--------
 llvm/test/MC/X86/apx/kmov-intel.s          | 28 +++++++++--------
 4 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0514f0d1950670..77b359e84fbd2d 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2855,8 +2855,8 @@ defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp>, E
 multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
                           string OpcodeStr, RegisterClass KRC, ValueType vvt,
                           X86MemOperand x86memop, string Suffix = ""> {
-  let explicitOpPrefix = !if(!eq(Suffix, ""), NoExplicitOpPrefix, ExplicitEVEX) in {
-  let isMoveReg = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
+  let isMoveReg = 1, hasSideEffects = 0, SchedRW = [WriteMove],
+      explicitOpPrefix = !if(!eq(Suffix, ""), NoExplicitOpPrefix, ExplicitEVEX) in
   def kk#Suffix : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                   Sched<[WriteMove]>;
@@ -2868,13 +2868,12 @@ multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store KRC:$src, addr:$dst)]>,
                   Sched<[WriteStore]>;
-  }
 }
 
 multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
                                string OpcodeStr, RegisterClass KRC,
                                RegisterClass GRC, string Suffix = ""> {
-  let hasSideEffects = 0, explicitOpPrefix = !if(!eq(Suffix, ""), NoExplicitOpPrefix, ExplicitEVEX) in {
+  let hasSideEffects = 0 in {
     def kr#Suffix : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
                       !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                     Sched<[WriteMove]>;
diff --git a/llvm/test/MC/Disassembler/X86/apx/kmov.txt b/llvm/test/MC/Disassembler/X86/apx/kmov.txt
index d089ef192230a5..5d947ff39f2314 100644
--- a/llvm/test/MC/Disassembler/X86/apx/kmov.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/kmov.txt
@@ -1,6 +1,25 @@
 # RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
 # RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
 
+# ATT:   {evex} kmovb	%k1, %k2
+# INTEL: {evex} kmovb	k2, k1
+0x62,0xf1,0x7d,0x08,0x90,0xd1
+
+# ATT:   {evex} kmovw	%k1, %k2
+# INTEL: {evex} kmovw	k2, k1
+0x62,0xf1,0x7c,0x08,0x90,0xd1
+
+# ATT:   {evex} kmovd	%k1, %k2
+# INTEL: {evex} kmovd	k2, k1
+0x62,0xf1,0xfd,0x08,0x90,0xd1
+
+# ATT:   {evex} kmovq	%k1, %k2
+# INTEL: {evex} kmovq	k2, k1
+0x62,0xf1,0xfc,0x08,0x90,0xd1
+
+# ATT-NOT: {evex}
+# INTEL-NOT: {evex}
+
 # ATT:   kmovb	%r16d, %k1
 # INTEL: kmovb	k1, r16d
 0x62,0xf9,0x7d,0x08,0x92,0xc8
@@ -64,19 +83,3 @@
 # ATT:   kmovq	%k1, (%r16,%r17)
 # INTEL: kmovq	qword ptr [r16 + r17], k1
 0x62,0xf9,0xf8,0x08,0x91,0x0c,0x08
-
-# ATT:   {evex} kmovb	%k1, %k2
-# INTEL: {evex} kmovb	k2, k1
-0x62,0xf1,0x7d,0x08,0x90,0xd1
-
-# ATT:   {evex} kmovw	%k1, %k2
-# INTEL: {evex} kmovw	k2, k1
-0x62,0xf1,0x7c,0x08,0x90,0xd1
-
-# ATT:   {evex} kmovd	%k1, %k2
-# INTEL: {evex} kmovd	k2, k1
-0x62,0xf1,0xfd,0x08,0x90,0xd1
-
-# ATT:   {evex} kmovq	%k1, %k2
-# INTEL: {evex} kmovq	k2, k1
-0x62,0xf1,0xfc,0x08,0x90,0xd1
diff --git a/llvm/test/MC/X86/apx/kmov-att.s b/llvm/test/MC/X86/apx/kmov-att.s
index be5042cf0a30c8..949ef65be98d4c 100644
--- a/llvm/test/MC/X86/apx/kmov-att.s
+++ b/llvm/test/MC/X86/apx/kmov-att.s
@@ -3,6 +3,21 @@
 
 # ERROR-COUNT-20: error:
 # ERROR-NOT: error:
+# CHECK: {evex}	kmovb	%k1, %k2
+# CHECK: encoding: [0x62,0xf1,0x7d,0x08,0x90,0xd1]
+         {evex}	kmovb	%k1, %k2
+# CHECK: {evex}	kmovw	%k1, %k2
+# CHECK: encoding: [0x62,0xf1,0x7c,0x08,0x90,0xd1]
+         {evex}	kmovw	%k1, %k2
+# CHECK: {evex}	kmovd	%k1, %k2
+# CHECK: encoding: [0x62,0xf1,0xfd,0x08,0x90,0xd1]
+         {evex}	kmovd	%k1, %k2
+# CHECK: {evex}	kmovq	%k1, %k2
+# CHECK: encoding: [0x62,0xf1,0xfc,0x08,0x90,0xd1]
+         {evex}	kmovq	%k1, %k2
+
+# CHECK-NOT: {evex}
+
 # CHECK: kmovb	%r16d, %k1
 # CHECK: encoding: [0x62,0xf9,0x7d,0x08,0x92,0xc8]
          kmovb	%r16d, %k1
@@ -54,16 +69,3 @@
 # CHECK: kmovq	%k1, (%r16,%r17)
 # CHECK: encoding: [0x62,0xf9,0xf8,0x08,0x91,0x0c,0x08]
          kmovq	%k1, (%r16,%r17)
-
-# CHECK: {evex}	kmovb	%k1, %k2
-# CHECK: encoding: [0x62,0xf1,0x7d,0x08,0x90,0xd1]
-         {evex}	kmovb	%k1, %k2
-# CHECK: {evex}	kmovw	%k1, %k2
-# CHECK: encoding: [0x62,0xf1,0x7c,0x08,0x90,0xd1]
-         {evex}	kmovw	%k1, %k2
-# CHECK: {evex}	kmovd	%k1, %k2
-# CHECK: encoding: [0x62,0xf1,0xfd,0x08,0x90,0xd1]
-         {evex}	kmovd	%k1, %k2
-# CHECK: {evex}	kmovq	%k1, %k2
-# CHECK: encoding: [0x62,0xf1,0xfc,0x08,0x90,0xd1]
-         {evex}	kmovq	%k1, %k2
diff --git a/llvm/test/MC/X86/apx/kmov-intel.s b/llvm/test/MC/X86/apx/kmov-intel.s
index 8ceb29d32dba6c..0cdbd310062eba 100644
--- a/llvm/test/MC/X86/apx/kmov-intel.s
+++ b/llvm/test/MC/X86/apx/kmov-intel.s
@@ -1,5 +1,20 @@
 # RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
 
+# CHECK: {evex}	kmovb	k2, k1
+# CHECK: encoding: [0x62,0xf1,0x7d,0x08,0x90,0xd1]
+         {evex}	kmovb	k2, k1
+# CHECK: {evex}	kmovw	k2, k1
+# CHECK: encoding: [0x62,0xf1,0x7c,0x08,0x90,0xd1]
+         {evex}	kmovw	k2, k1
+# CHECK: {evex}	kmovd	k2, k1
+# CHECK: encoding: [0x62,0xf1,0xfd,0x08,0x90,0xd1]
+         {evex}	kmovd	k2, k1
+# CHECK: {evex}	kmovq	k2, k1
+# CHECK: encoding: [0x62,0xf1,0xfc,0x08,0x90,0xd1]
+         {evex}	kmovq	k2, k1
+
+# CHECK-NOT: {evex}
+
 # CHECK: kmovb	k1, r16d
 # CHECK: encoding: [0x62,0xf9,0x7d,0x08,0x92,0xc8]
          kmovb	k1, r16d
@@ -51,16 +66,3 @@
 # CHECK: kmovq	qword ptr [r16 + r17], k1
 # CHECK: encoding: [0x62,0xf9,0xf8,0x08,0x91,0x0c,0x08]
          kmovq	qword ptr [r16 + r17], k1
-
-# CHECK: {evex}	kmovb	k2, k1
-# CHECK: encoding: [0x62,0xf1,0x7d,0x08,0x90,0xd1]
-         {evex}	kmovb	k2, k1
-# CHECK: {evex}	kmovw	k2, k1
-# CHECK: encoding: [0x62,0xf1,0x7c,0x08,0x90,0xd1]
-         {evex}	kmovw	k2, k1
-# CHECK: {evex}	kmovd	k2, k1
-# CHECK: encoding: [0x62,0xf1,0xfd,0x08,0x90,0xd1]
-         {evex}	kmovd	k2, k1
-# CHECK: {evex}	kmovq	k2, k1
-# CHECK: encoding: [0x62,0xf1,0xfc,0x08,0x90,0xd1]
-         {evex}	kmovq	k2, k1

From ab3fdbdfbe7edc62049c602d87be91c3ad3f5e3b Mon Sep 17 00:00:00 2001
From: Allen <zhongyunde@huawei.com>
Date: Fri, 1 Dec 2023 16:20:38 +0800
Subject: [PATCH 07/72] [ValueTracking] Support srem/urem for
 isKnownNonNullFromDominatingCondition (#74021)

Similar to div, a rem instruction also proves that its second operand is
non-zero, because a zero divisor would be UB.

Fix https://github.com/llvm/llvm-project/issues/71782
---
 llvm/lib/Analysis/ValueTracking.cpp           |  3 +-
 .../ValueTracking/select-known-non-zero.ll    | 56 ++++++++++++++++++-
 .../Transforms/InstCombine/zext-or-icmp.ll    |  4 +-
 3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 250ce739ea5147..ef8fa5826deb94 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2186,7 +2186,8 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
         return true;
     }
 
-    if (match(U, m_IDiv(m_Value(), m_Specific(V))) &&
+    if ((match(U, m_IDiv(m_Value(), m_Specific(V))) ||
+         match(U, m_IRem(m_Value(), m_Specific(V)))) &&
         isValidAssumeForContext(cast<Instruction>(U), CtxI, DT))
       return true;
 
diff --git a/llvm/test/Analysis/ValueTracking/select-known-non-zero.ll b/llvm/test/Analysis/ValueTracking/select-known-non-zero.ll
index 1dc88412041d34..53ed4485c94f08 100644
--- a/llvm/test/Analysis/ValueTracking/select-known-non-zero.ll
+++ b/llvm/test/Analysis/ValueTracking/select-known-non-zero.ll
@@ -2,6 +2,8 @@
 ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
 
 declare void @llvm.assume(i1)
+declare void @use(i64)
+declare void @use4(i4)
 
 define i1 @select_v_ne_fail(i8 %v, i8 %C, i8 %y) {
 ; CHECK-LABEL: @select_v_ne_fail(
@@ -446,4 +448,56 @@ define i64 @incorrect_safe_div_call_2(i64 %n, i64 %d) {
   ret i64 %3
 }
 
-declare void @use(i64)
+; https://alive2.llvm.org/ce/z/Si_B7b
+define i4 @icmp_urem(i4 %n, i4 %d) {
+; CHECK-LABEL: @icmp_urem(
+; CHECK-NEXT:    [[TMP1:%.*]] = urem i4 [[N:%.*]], [[D:%.*]]
+; CHECK-NEXT:    ret i4 [[TMP1]]
+;
+  %1 = icmp eq i4 %d, 0
+  %2 = urem i4 %n, %d
+  %3 = select i1 %1, i4 -1, i4 %2
+  ret i4 %3
+}
+
+define i4 @icmp_urem_clobber_by_call(i4 %n, i4 %d) {
+; CHECK-LABEL: @icmp_urem_clobber_by_call(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i4 [[D:%.*]], 0
+; CHECK-NEXT:    tail call void @use4(i4 [[D]])
+; CHECK-NEXT:    [[TMP2:%.*]] = urem i4 [[N:%.*]], [[D]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i4 -1, i4 [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP3]]
+;
+  %1 = icmp eq i4 %d, 0
+  tail call void @use4(i4 %d)
+  %2 = urem i4 %n, %d
+  %3 = select i1 %1, i4 -1, i4 %2
+  ret i4 %3
+}
+
+; https://alive2.llvm.org/ce/z/Fn3Wac
+define i4 @icmp_srem(i4 %n, i4 %d) {
+; CHECK-LABEL: @icmp_srem(
+; CHECK-NEXT:    [[TMP1:%.*]] = srem i4 [[N:%.*]], [[D:%.*]]
+; CHECK-NEXT:    ret i4 [[TMP1]]
+;
+  %1 = icmp eq i4 %d, 0
+  %2 = srem i4 %n, %d
+  %3 = select i1 %1, i4 -1, i4 %2
+  ret i4 %3
+}
+
+define i4 @icmp_srem_clobber_by_call(i4 %n, i4 %d) {
+; CHECK-LABEL: @icmp_srem_clobber_by_call(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i4 [[D:%.*]], 0
+; CHECK-NEXT:    tail call void @use4(i4 [[D]])
+; CHECK-NEXT:    [[TMP2:%.*]] = srem i4 [[N:%.*]], [[D]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i4 -1, i4 [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP3]]
+;
+  %1 = icmp eq i4 %d, 0
+  tail call void @use4(i4 %d)
+  %2 = srem i4 %n, %d
+  %3 = select i1 %1, i4 -1, i4 %2
+  ret i4 %3
+}
diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index bc0e4bdce29b59..9ec3ddc80c57f7 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -231,9 +231,7 @@ define i1 @PR51762(ptr %i, i32 %t0, i16 %t1, ptr %p, ptr %d, ptr %f, i32 %p2, i1
 ; CHECK-NEXT:    [[INSERT_INSERT41:%.*]] = or i64 [[INSERT_SHIFT52]], [[INSERT_EXT39]]
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[S1]], [[INSERT_INSERT41]]
 ; CHECK-NEXT:    [[NE:%.*]] = icmp ne i64 [[REM]], 0
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INSERT_INSERT41]], 0
-; CHECK-NEXT:    [[SPEC_SELECT57:%.*]] = or i1 [[NE]], [[CMP]]
-; CHECK-NEXT:    [[LOR_EXT:%.*]] = zext i1 [[SPEC_SELECT57]] to i32
+; CHECK-NEXT:    [[LOR_EXT:%.*]] = zext i1 [[NE]] to i32
 ; CHECK-NEXT:    [[T2:%.*]] = load i32, ptr [[D:%.*]], align 4
 ; CHECK-NEXT:    [[CONV15:%.*]] = sext i16 [[T1]] to i32
 ; CHECK-NEXT:    [[CMP16:%.*]] = icmp sge i32 [[T2]], [[CONV15]]

From d48d1edcf3ed0c1352b3c2864feb873f01d6f9da Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <Ramkumar.Ramachandra@imgtec.com>
Date: Fri, 1 Dec 2023 08:22:18 +0000
Subject: [PATCH 08/72] PowerPC/aix-cc-abi: regenerate test using UTC (NFC)
 (#73963)

Split out the parts of aix-cc-abi.ll that need to be regenerated by
utils/update_mir_test_checks.py into aix-cc-abi-mir.ll, and regenerate
that new file using the script. Regenerate aix-cc-abi.ll using
utils/update_llc_test_checks.py.
---
 llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll | 2068 +++++++++++
 llvm/test/CodeGen/PowerPC/aix-cc-abi.ll     | 3635 +++++++++----------
 2 files changed, 3811 insertions(+), 1892 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll

diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
new file mode 100644
index 00000000000000..ccc36530c7957b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
@@ -0,0 +1,2068 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=32BIT %s
+
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=64BIT %s
+
+define void @call_test_chars() {
+  ; 32BIT-LABEL: name: call_test_chars
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 97
+  ; 32BIT-NEXT:   $r4 = LI 97
+  ; 32BIT-NEXT:   $r5 = LI 97
+  ; 32BIT-NEXT:   $r6 = LI 97
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_chars>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_chars
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 97
+  ; 64BIT-NEXT:   $x4 = LI8 97
+  ; 64BIT-NEXT:   $x5 = LI8 97
+  ; 64BIT-NEXT:   $x6 = LI8 97
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_chars>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  call i8 @test_chars(i8 signext 97, i8 signext 97, i8 signext 97, i8 signext 97)
+  ret void
+}
+
+define signext i8 @test_chars(i8 signext %c1, i8 signext %c2, i8 signext %c3, i8 signext %c4) {
+  ; 32BIT-LABEL: name: test_chars
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r4
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r5
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r6
+  ; 32BIT-NEXT:   renamable $r3 = EXTSB killed renamable $r3
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_chars
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 renamable $r3, renamable $r4, implicit killed $x4, implicit killed $x3
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r6, implicit killed $x6, implicit-def $x3
+  ; 64BIT-NEXT:   renamable $x3 = EXTSB8 killed renamable $x3
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %conv = sext i8 %c1 to i32
+  %conv1 = sext i8 %c2 to i32
+  %add = add nsw i32 %conv, %conv1
+  %conv2 = sext i8 %c3 to i32
+  %add3 = add nsw i32 %add, %conv2
+  %conv4 = sext i8 %c4 to i32
+  %add5 = add nsw i32 %add3, %conv4
+  %conv6 = trunc i32 %add5 to i8
+  ret i8 %conv6
+}
+
+define void @call_test_chars_mix() {
+  ; 32BIT-LABEL: name: call_test_chars_mix
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 97
+  ; 32BIT-NEXT:   $r4 = LI 225
+  ; 32BIT-NEXT:   $r5 = LI 97
+  ; 32BIT-NEXT:   $r6 = LI -31
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_chars_mix>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_chars_mix
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 97
+  ; 64BIT-NEXT:   $x4 = LI8 225
+  ; 64BIT-NEXT:   $x5 = LI8 97
+  ; 64BIT-NEXT:   $x6 = LI8 -31
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_chars_mix>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  call i8 @test_chars_mix(i8 signext 97, i8 zeroext -31, i8 zeroext 97, i8 signext -31)
+  ret void
+}
+
+define signext i8 @test_chars_mix(i8 signext %c1, i8 zeroext %c2, i8 zeroext %c3, i8 signext %c4) {
+  ; 32BIT-LABEL: name: test_chars_mix
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r4
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r5
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r6
+  ; 32BIT-NEXT:   renamable $r3 = EXTSB killed renamable $r3
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_chars_mix
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 renamable $r3, renamable $r4, implicit killed $x4, implicit killed $x3
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r6, implicit killed $x6, implicit-def $x3
+  ; 64BIT-NEXT:   renamable $x3 = EXTSB8 killed renamable $x3
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %conv = sext i8 %c1 to i32
+  %conv1 = zext i8 %c2 to i32
+  %add = add nsw i32 %conv, %conv1
+  %conv2 = zext i8 %c3 to i32
+  %add3 = add nsw i32 %add, %conv2
+  %conv4 = sext i8 %c4 to i32
+  %add5 = add nsw i32 %add3, %conv4
+  %conv6 = trunc i32 %add5 to i8
+  ret i8 %conv6
+}
+
+@global_i1 = global i8 0, align 1
+
+define  void @test_i1(i1 %b)  {
+  ; 32BIT-LABEL: name: test_i1
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @global_i1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r3, 0, 31, 31
+  ; 32BIT-NEXT:   STB killed renamable $r3, 0, killed renamable $r4 :: (store (s8) into @global_i1)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: test_i1
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @global_i1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $r3 = RLWINM renamable $r3, 0, 31, 31, implicit killed $x3
+  ; 64BIT-NEXT:   STB killed renamable $r3, 0, killed renamable $x4 :: (store (s8) into @global_i1)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+  entry:
+   %frombool = zext i1 %b to i8
+   store i8 %frombool, ptr @global_i1, align 1
+   ret void
+}
+
+define void @call_test_i1() {
+  ; 32BIT-LABEL: name: call_test_i1
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_i1>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_i1
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_i1>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  call void @test_i1(i1 1)
+  ret void
+}
+
+define void @test_i1zext(i1 zeroext %b) {
+  ; 32BIT-LABEL: name: test_i1zext
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @global_i1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STB killed renamable $r3, 0, killed renamable $r4 :: (store (s8) into @global_i1)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: test_i1zext
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @global_i1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STB8 killed renamable $x3, 0, killed renamable $x4 :: (store (s8) into @global_i1)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+  entry:
+    %frombool = zext i1 %b to i8
+    store i8 %frombool, ptr @global_i1, align 1
+    ret void
+  }
+
+define i32 @test_ints(i32 signext %a, i32 zeroext %b, i32 zeroext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h) {
+  ; 32BIT-LABEL: name: test_ints
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r4
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r5
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r6
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r7
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r8
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r9
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r10
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_ints
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 renamable $r3, renamable $r4, implicit killed $x4, implicit killed $x3
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r6, implicit killed $x6
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r7, implicit killed $x7
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r8, implicit killed $x8
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r9, implicit killed $x9
+  ; 64BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, renamable $r10, implicit killed $x10, implicit-def $x3
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+    %add = add i32 %a, %b
+    %add1 = add i32 %add, %c
+    %add2 = add i32 %add1, %d
+    %add3 = add i32 %add2, %e
+    %add4 = add i32 %add3, %f
+    %add5 = add i32 %add4, %g
+    %add6 = add i32 %add5, %h
+    ret i32 %add6
+}
+
+define void @call_test_ints() {
+  ; 32BIT-LABEL: name: call_test_ints
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 1
+  ; 32BIT-NEXT:   $r5 = LIS 32768
+  ; 32BIT-NEXT:   $r6 = LIS 32768
+  ; 32BIT-NEXT:   $r7 = LI 1
+  ; 32BIT-NEXT:   $r8 = LI 1
+  ; 32BIT-NEXT:   $r9 = LI 1
+  ; 32BIT-NEXT:   $r10 = LI 1
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_ints>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_ints
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LI8 1
+  ; 64BIT-NEXT:   renamable $x5 = RLDIC killed renamable $x3, 31, 32
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 1
+  ; 64BIT-NEXT:   $x6 = LIS8 32768
+  ; 64BIT-NEXT:   $x7 = LI8 1
+  ; 64BIT-NEXT:   $x8 = LI8 1
+  ; 64BIT-NEXT:   $x9 = LI8 1
+  ; 64BIT-NEXT:   $x10 = LI8 1
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_ints>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  call i32 @test_ints(i32 signext 1, i32 zeroext 1, i32 zeroext 2147483648, i32 signext -2147483648, i32 signext 1, i32 signext 1, i32 signext 1, i32 signext 1)
+  ret void
+}
+
+define void @call_test_i64() {
+  ; 32BIT-LABEL: name: call_test_i64
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 0
+  ; 32BIT-NEXT:   $r4 = LI 1
+  ; 32BIT-NEXT:   $r5 = LI 0
+  ; 32BIT-NEXT:   $r6 = LI 2
+  ; 32BIT-NEXT:   $r7 = LI 0
+  ; 32BIT-NEXT:   $r8 = LI 3
+  ; 32BIT-NEXT:   $r9 = LI 0
+  ; 32BIT-NEXT:   $r10 = LI 4
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_i64>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3, implicit-def dead $r4
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_i64
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_i64>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  call i64 @test_i64(i64 1, i64 2, i64 3, i64 4)
+  ret void
+}
+
+define i64 @test_i64(i64 %a, i64 %b, i64 %c, i64 %d) {
+  ; 32BIT-LABEL: name: test_i64
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r4 = ADDC killed renamable $r4, killed renamable $r6, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDE killed renamable $r3, killed renamable $r5, implicit-def dead $carry, implicit killed $carry
+  ; 32BIT-NEXT:   renamable $r4 = ADDC killed renamable $r4, killed renamable $r8, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDE killed renamable $r3, killed renamable $r7, implicit-def dead $carry, implicit killed $carry
+  ; 32BIT-NEXT:   renamable $r4 = ADDC killed renamable $r4, killed renamable $r10, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDE killed renamable $r3, killed renamable $r9, implicit-def dead $carry, implicit killed $carry
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4
+  ;
+  ; 64BIT-LABEL: name: test_i64
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x4
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x5
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x6
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %add = add nsw i64 %a, %b
+  %add1 = add nsw i64 %add, %c
+  %add2 = add nsw i64 %add1, %d
+  ret i64 %add2
+}
+
+define void @call_test_int_ptr() {
+  ; 32BIT-LABEL: name: call_test_int_ptr
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LI 0
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %stack.0.b :: (store (s32) into %ir.b)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = ADDI %stack.0.b, 0
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_int_ptr>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_int_ptr
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LI8 0
+  ; 64BIT-NEXT:   STW8 killed renamable $x3, 0, %stack.0.b :: (store (s32) into %ir.b)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = ADDI8 %stack.0.b, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_int_ptr>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %b = alloca i32, align 4
+  store i32 0, ptr %b, align 4
+  call void @test_int_ptr(ptr %b)
+  ret void
+}
+
+define void @test_int_ptr(ptr %a) {
+  ; 32BIT-LABEL: name: test_int_ptr
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %stack.0.a.addr :: (store (s32) into %ir.a.addr, align 8)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: test_int_ptr
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %stack.0.a.addr :: (store (s64) into %ir.a.addr)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %a.addr = alloca ptr, align 8
+  store ptr %a, ptr %a.addr, align 8
+  ret void
+}
+
+define i32 @caller(i32 %i)  {
+  ; 32BIT-LABEL: name: caller
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW renamable $r3, 0, %stack.0.i.addr :: (store (s32) into %ir.i.addr)
+  ; 32BIT-NEXT:   renamable $r3 = CNTLZW killed renamable $r3
+  ; 32BIT-NEXT:   renamable $r3 = NOR killed renamable $r3, renamable $r3
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r3, 27, 31, 31
+  ; 32BIT-NEXT:   STB renamable $r3, 0, %stack.1.b :: (store (s8) into %ir.b)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .call_test_bool[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1, implicit-def $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: caller
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STW renamable $r3, 0, %stack.0.i.addr :: (store (s32) into %ir.i.addr)
+  ; 64BIT-NEXT:   renamable $r3 = CNTLZW renamable $r3, implicit killed $x3
+  ; 64BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r3, 27, 5, 31
+  ; 64BIT-NEXT:   renamable $r3 = XORI killed renamable $r3, 1, implicit-def $x3
+  ; 64BIT-NEXT:   STB renamable $r3, 0, %stack.1.b :: (store (s8) into %ir.b)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .call_test_bool[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %i.addr = alloca i32, align 4
+  %b = alloca i8, align 1
+  store i32 %i, ptr %i.addr, align 4
+  %0 = load i32, ptr %i.addr, align 4
+  %cmp = icmp ne i32 %0, 0
+  %frombool = zext i1 %cmp to i8
+  store i8 %frombool, ptr %b, align 1
+  %1 = load i8, ptr %b, align 1
+  %tobool = trunc i8 %1 to i1
+  %call = call i32 @call_test_bool(i1 zeroext %tobool)
+  ret i32 %call
+}
+
+declare i32 @call_test_bool(i1 zeroext)
+
+@f1 = global float 0.000000e+00, align 4
+@d1 = global double 0.000000e+00, align 8
+
+define void @call_test_floats() {
+  ; 32BIT-LABEL: name: call_test_floats
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f1)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f3 = COPY renamable $f1
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_floats>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $r2, implicit-def $r1, implicit-def dead $f1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_floats
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f1)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f3 = COPY renamable $f1
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_floats>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $x2, implicit-def $r1, implicit-def dead $f1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f1, align 4
+  call float @test_floats(float %0, float %0, float %0)
+  ret void
+}
+
+define float @test_floats(float %f1, float %f2, float %f3) {
+  ; 32BIT-LABEL: name: test_floats
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $f1, $f2, $f3
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADDS killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADDS killed renamable $f0, killed renamable $f3, implicit $rm
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $f1
+  ;
+  ; 64BIT-LABEL: name: test_floats
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $f1, $f2, $f3
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADDS killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADDS killed renamable $f0, killed renamable $f3, implicit $rm
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+entry:
+  %add = fadd float %f1, %f2
+  %add1 = fadd float %add, %f3
+  ret float %add1
+}
+
+define void @call_test_fpr_max() { ; Caller passing one loaded double as all 13 arguments: expected in $f1-$f13 plus STFD stores to the stack (9 for 32-bit, 5 for 64-bit).
+  ; 32BIT-LABEL: name: call_test_fpr_max
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d1)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   STFD renamable $f1, 120, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 112, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 104, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 96, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 88, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 80, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 72, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 64, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f3 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f4 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f5 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f6 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f7 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f8 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f9 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f10 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f11 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f12 = COPY renamable $f1
+  ; 32BIT-NEXT:   $f13 = COPY renamable $f1
+  ; 32BIT-NEXT:   STFD renamable $f1, 56, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_fpr_max>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_fpr_max
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @d1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load (s64) from @d1)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 152, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   STFD renamable $f1, 144, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STFD renamable $f1, 136, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STFD renamable $f1, 128, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STFD renamable $f1, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f3 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f4 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f5 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f6 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f7 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f8 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f9 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f10 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f11 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f12 = COPY renamable $f1
+  ; 64BIT-NEXT:   $f13 = COPY renamable $f1
+  ; 64BIT-NEXT:   STFD renamable $f1, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_fpr_max>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $x2, implicit-def $r1, implicit-def dead $f1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 152, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load double, ptr @d1, align 8
+  call double @test_fpr_max(double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0)
+  ret void
+}
+
+define double @test_fpr_max(double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13) { ; Callee: sums its 13 double parameters, which the checks expect live-in in $f1-$f13; result returned in $f1.
+  ; 32BIT-LABEL: name: test_fpr_max
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $f1
+  ;
+  ; 64BIT-LABEL: name: test_fpr_max
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+entry:
+  %add = fadd double %d1, %d2
+  %add1 = fadd double %add, %d3
+  %add2 = fadd double %add1, %d4
+  %add3 = fadd double %add2, %d5
+  %add4 = fadd double %add3, %d6
+  %add5 = fadd double %add4, %d7
+  %add6 = fadd double %add5, %d8
+  %add7 = fadd double %add6, %d9
+  %add8 = fadd double %add7, %d10
+  %add9 = fadd double %add8, %d11
+  %add10 = fadd double %add9, %d12
+  %add11 = fadd double %add10, %d13
+  ret double %add11
+}
+
+define void @call_test_mix() { ; Caller mixing float, i32, double and i8 arguments: FP values expected in $f1/$f2, integers in $r4/$r7 (32-bit) or $x4/$x6 (64-bit).
+  ; 32BIT-LABEL: name: call_test_mix
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @d1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f1)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r4 :: (dereferenceable load (s64) from @d1)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r4 = LI 1
+  ; 32BIT-NEXT:   $r7 = LI 97
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_mix>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $r4, implicit $f2, implicit killed $r7, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_mix
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @d1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f1)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x4 :: (dereferenceable load (s64) from @d1)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x4 = LI8 1
+  ; 64BIT-NEXT:   $x6 = LI8 97
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_mix>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $x4, implicit $f2, implicit killed $x6, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f1, align 4
+  %1 = load double, ptr @d1, align 8
+  call i32 @test_mix(float %0, i32 1, double %1, i8 signext 97)
+  ret void
+}
+
+define i32 @test_mix(float %f, i32 signext %i, double %d, i8 signext %c) { ; Callee: folds the mixed float/int/double/char parameters into one i32; live-ins are $f1, $f2 and $r4/$r7 (32-bit) or $x4/$x6 (64-bit).
+  ; 32BIT-LABEL: name: test_mix
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $f1, $f2, $r4, $r7
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r3 = LIS 17200
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %stack.1 :: (store (s32) into %stack.1, align 8)
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r7, 0, 24, 31
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r4, killed renamable $r3
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = XORIS killed renamable $r3, 32768
+  ; 32BIT-NEXT:   STW killed renamable $r3, 4, %stack.1 :: (store (s32) into %stack.1 + 4)
+  ; 32BIT-NEXT:   renamable $f0 = LFS 0, killed renamable $r4 :: (load (s32) from constant-pool)
+  ; 32BIT-NEXT:   renamable $f3 = LFD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FRSP killed renamable $f1, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FSUB killed renamable $f3, killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FRSP killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADDS killed renamable $f0, killed renamable $f1, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_mix
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $f1, $f2, $x4, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $r3 = RLWINM renamable $r6, 0, 24, 31, implicit killed $x6
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r3, implicit killed $x4
+  ; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r3
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 64BIT-NEXT:   renamable $f0 = LFD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FRSP killed renamable $f1, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FCFID killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FRSP killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADDS killed renamable $f0, killed renamable $f1, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %conv = fpext float %f to double
+  %add = fadd double %conv, %d
+  %conv1 = fptrunc double %add to float
+  %conv2 = zext i8 %c to i32
+  %add3 = add nsw i32 %i, %conv2
+  %conv4 = sitofp i32 %add3 to float
+  %add5 = fadd float %conv4, %conv1
+  %conv6 = fptosi float %add5 to i32
+  ret i32 %conv6
+}
+
+define i64 @callee_mixed_ints(i32 %a, i8 signext %b, i32 %c, i16 signext %d, i64 %e) { ; Callee: adds i32/i8/i32/i16/i64 parameters into an i64; the i64 parameter is live-in as the $r7/$r8 pair (32-bit) or $x7 (64-bit).
+  ; 32BIT-LABEL: name: callee_mixed_ints
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 0, 24, 31
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6
+  ; 32BIT-NEXT:   renamable $r5 = SRAWI renamable $r3, 31, implicit-def dead $carry
+  ; 32BIT-NEXT:   renamable $r4 = ADDC killed renamable $r3, killed renamable $r8, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDE killed renamable $r5, killed renamable $r7, implicit-def dead $carry, implicit killed $carry
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4
+  ;
+  ; 64BIT-LABEL: name: callee_mixed_ints
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6, $x7
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $r4 = RLWINM renamable $r4, 0, 24, 31, implicit killed $x4
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r3, killed renamable $r4, implicit killed $x3
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r6, implicit killed $x6
+  ; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r3
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x7
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %conv = zext i8 %b to i32
+  %add = add nsw i32 %a, %conv
+  %add1 = add nsw i32 %add, %c
+  %conv2 = sext i16 %d to i32
+  %add3 = add nsw i32 %add1, %conv2
+  %conv4 = sext i32 %add3 to i64
+  %add5 = add nsw i64 %conv4, %e
+  ret i64 %add5
+  }
+
+define void @call_test_vararg() { ; Variadic call with two doubles: each is expected in an FPR ($f1, $f2) and also copied through a stack slot into GPRs.
+  ; 32BIT-LABEL: name: call_test_vararg
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f1)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %stack.1 :: (load (s32) from %stack.1, align 8)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d1)
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 4, %stack.1 :: (load (s32) from %stack.1 + 4)
+  ; 32BIT-NEXT:   STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 42
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r5, implicit $f2, implicit $r6, implicit $r7, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_vararg
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f1)
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @d1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x3 :: (dereferenceable load (s64) from @d1)
+  ; 64BIT-NEXT:   renamable $x4 = LD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 64BIT-NEXT:   STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x5 = LD 0, %stack.0 :: (load (s64) from %stack.0)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 42
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit $f2, implicit $x5, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f1, align 4
+  %conv = fpext float %0 to double
+  %1 = load double, ptr @d1, align 8
+  call void (i32, ...) @test_vararg(i32 42, double %conv, double %1)
+  ret void
+}
+
+declare void @test_vararg(i32, ...) ; Variadic callee (declaration only) targeted by the call_test_vararg* callers.
+
+define void @call_test_vararg2() { ; Variadic call with an i32 between the two doubles: the i32 is expected in $r6 (32-bit) or $x5 (64-bit), so the second double's GPR copy moves to $r7/$r8 or $x6.
+  ; 32BIT-LABEL: name: call_test_vararg2
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f1)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %stack.1 :: (load (s32) from %stack.1, align 8)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d1)
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 4, %stack.1 :: (load (s32) from %stack.1 + 4)
+  ; 32BIT-NEXT:   STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
+  ; 32BIT-NEXT:   renamable $r8 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 42
+  ; 32BIT-NEXT:   $r6 = LI 42
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r5, implicit killed $r6, implicit $f2, implicit $r7, implicit $r8, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_vararg2
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f1)
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @d1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x3 :: (dereferenceable load (s64) from @d1)
+  ; 64BIT-NEXT:   renamable $x4 = LD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 64BIT-NEXT:   STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x6 = LD 0, %stack.0 :: (load (s64) from %stack.0)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 42
+  ; 64BIT-NEXT:   $x5 = LI8 42
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit killed $x5, implicit $f2, implicit $x6, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f1, align 4
+  %conv = fpext float %0 to double
+  %1 = load double, ptr @d1, align 8
+  call void (i32, ...) @test_vararg(i32 42, double %conv, i32 42, double %1)
+  ret void
+}
+
+define void @call_test_vararg3() { ; Variadic call with an i64 between the two doubles: 32-bit expects it split across $r6/$r7 (second double copied to $r8/$r9); 64-bit expects it in $x5.
+  ; 32BIT-LABEL: name: call_test_vararg3
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f1)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %stack.1 :: (load (s32) from %stack.1, align 8)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d1)
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 4, %stack.1 :: (load (s32) from %stack.1 + 4)
+  ; 32BIT-NEXT:   STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r8 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
+  ; 32BIT-NEXT:   renamable $r9 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 42
+  ; 32BIT-NEXT:   $r6 = LI 0
+  ; 32BIT-NEXT:   $r7 = LI 42
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r5, implicit killed $r6, implicit killed $r7, implicit $f2, implicit $r8, implicit $r9, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_vararg3
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f1)
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @d1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x3 :: (dereferenceable load (s64) from @d1)
+  ; 64BIT-NEXT:   renamable $x4 = LD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 64BIT-NEXT:   STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x6 = LD 0, %stack.0 :: (load (s64) from %stack.0)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 42
+  ; 64BIT-NEXT:   $x5 = LI8 42
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit killed $x5, implicit $f2, implicit $x6, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f1, align 4
+  %conv = fpext float %0 to double
+  %1 = load double, ptr @d1, align 8
+  call void (i32, ...) @test_vararg(i32 42, double %conv, i64 42, double %1)
+  ret void
+}
+
+define void @call_test_vararg4() { ; Variadic call passing a float without extending it to double: expected in $f1 and, via a 4-byte stack slot (STFS then LWZ/LWZ8), in $r4 or $x4.
+  ; 32BIT-LABEL: name: call_test_vararg4
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f1)
+  ; 32BIT-NEXT:   STFS renamable $f1, 0, %stack.0 :: (store (s32) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %stack.0 :: (load (s32) from %stack.0)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 42
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_vararg4
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f1)
+  ; 64BIT-NEXT:   STFS renamable $f1, 0, %stack.0 :: (store (s32) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x4 = LWZ8 0, %stack.0 :: (load (s32) from %stack.0)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 42
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f1, align 4
+  call void (i32, ...) @test_vararg(i32 42, float %0)
+  ret void
+}
+
+@c = common global i8 0, align 1 ; Zero-initialized globals (one per scalar type) loaded as call arguments by the call_test_stackarg_* functions.
+@si = common global i16 0, align 2
+@i = common global i32 0, align 4
+@lli = common global i64 0, align 8
+@f = common global float 0.000000e+00, align 4
+@d = common global double 0.000000e+00, align 8
+
+; Basic saving of integral type arguments to the parameter save area: the five arguments after the eight passed in GPRs are stored to the stack.
+define void @call_test_stackarg_int() {
+  ; 32BIT-LABEL: name: call_test_stackarg_int
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @c, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @si, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc @i, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r11 = LBZ 0, killed renamable $r3 :: (dereferenceable load (s8) from @c)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @lli, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LHA 0, killed renamable $r4 :: (dereferenceable load (s16) from @si)
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 0, killed renamable $r5 :: (dereferenceable load (s32) from @i)
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 0, renamable $r3 :: (dereferenceable load (s32) from @lli, align 8)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 4, killed renamable $r3 :: (dereferenceable load (s32) from @lli + 4, basealign 8)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 80, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   STW renamable $r5, 76, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 72, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r6, 68, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r5, 64, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r4, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   $r10 = LI 8
+  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_int[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 80, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_stackarg_int
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @c, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @si, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x5 = LDtoc @i, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x6 = LDtoc @lli, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x11 = LBZ8 0, killed renamable $x3 :: (dereferenceable load (s8) from @c)
+  ; 64BIT-NEXT:   renamable $x12 = LHA8 0, killed renamable $x4 :: (dereferenceable load (s16) from @si)
+  ; 64BIT-NEXT:   renamable $x0 = LWZ8 0, killed renamable $x5 :: (dereferenceable load (s32) from @i)
+  ; 64BIT-NEXT:   renamable $x31 = LD 0, killed renamable $x6 :: (dereferenceable load (s64) from @lli)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 152, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   $x10 = LI8 8
+  ; 64BIT-NEXT:   STD killed renamable $x31, 136, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD renamable $x0, 144, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x0, 128, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x12, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x11, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_stackarg_int[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 152, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load i8, ptr @c, align 1
+  %1 = load i16, ptr @si, align 2
+  %2 = load i32, ptr @i, align 4
+  %3 = load i64, ptr @lli, align 8
+  %4 = load i32, ptr @i, align 4
+  call void @test_stackarg_int(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i8 zeroext %0, i16 signext %1, i32 %2, i64 %3, i32 %4)
+  ret void
+}
+
+declare void @test_stackarg_int(i32, i32, i32, i32, i32, i32, i32, i32, i8 zeroext, i16 signext, i32, i64, i32) ; Callee declaration: eight i32 parameters followed by i8, i16, i32, i64 and i32.
+
+; Basic saving of floating point type arguments to the parameter save area.
+; The float and double arguments are passed both in FPRs ($f1, $f2) and in the parameter save area.
+define void @call_test_stackarg_float() {
+  ; 32BIT-LABEL: name: call_test_stackarg_float
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @d, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r4 :: (dereferenceable load (s64) from @d)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 68, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   STFD renamable $f2, 60, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   $r10 = LI 8
+  ; 32BIT-NEXT:   STFS renamable $f1, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 68, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_stackarg_float
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @d, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x4 :: (dereferenceable load (s64) from @d)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   STFD renamable $f2, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   $x10 = LI8 8
+  ; 64BIT-NEXT:   STFS renamable $f1, 112, $x1 :: (store (s32))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_stackarg_float[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $f1, implicit $f2, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f, align 4
+  %1 = load double, ptr @d, align 8
+  call void @test_stackarg_float(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, float %0, double %1)
+  ret void
+}
+
+declare void @test_stackarg_float(i32, i32, i32, i32, i32, i32, i32, i32, float, double) ; Callee declaration: eight i32 parameters followed by a float and a double.
+
+define void @call_test_stackarg_float2() { ; Passes six i32 constants and one double loaded from @d to a variadic callee.
+  ; 32BIT-LABEL: name: call_test_stackarg_float2
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d)
+  ; 32BIT-NEXT:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r9 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
+  ; 32BIT-NEXT:   renamable $r10 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float2[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit $f1, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ; 64-bit expectations: the double is in $f1 and also copied through %stack.0 into $x9; the 32-bit ones above split the copy across $r9 and $r10.
+  ; 64BIT-LABEL: name: call_test_stackarg_float2
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @d, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load (s64) from @d)
+  ; 64BIT-NEXT:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x9 = LD 0, %stack.0 :: (load (s64) from %stack.0)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_stackarg_float2[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit $f1, implicit $x9, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load double, ptr @d, align 8
+  call void (i32, i32, i32, i32, i32, i32, ...) @test_stackarg_float2(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, double %0)
+  ret void
+}
+
+declare void @test_stackarg_float2(i32, i32, i32, i32, i32, i32, ...) ; six fixed i32 params, then varargs
+
+; On PPC32 a double arg is also stored to the stack when only one GPR is available: its first word goes in $r10 and the full value is written at offset 52.
+define void @call_test_stackarg_float3() { ; Passes seven i32 constants, then a double from @d and a float from @f, to a variadic callee.
+  ; 32BIT-LABEL: name: call_test_stackarg_float3
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r10 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
+  ; 32BIT-NEXT:   renamable $f2 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   STFS renamable $f2, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   STFD renamable $f1, 52, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float3[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ; 64-bit expectations: the whole double is copied into $x10, so only the trailing float ($f2) is stored to the stack, at offset 112.
+  ; 64BIT-LABEL: name: call_test_stackarg_float3
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @d, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load (s64) from @d)
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x10 = LD 0, %stack.0 :: (load (s64) from %stack.0)
+  ; 64BIT-NEXT:   renamable $f2 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   STFS renamable $f2, 112, $x1 :: (store (s32))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_stackarg_float3[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $f1, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load double, ptr @d, align 8
+  %1 = load float, ptr @f, align 4
+  call void (i32, i32, i32, i32, i32, i32, i32, ...) @test_stackarg_float3(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, double %0, float %1)
+  ret void
+}
+
+declare void @test_stackarg_float3(i32, i32, i32, i32, i32, i32, i32, ...) ; seven fixed i32 params, then varargs
+
+define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i64 %ll9, i16 signext %s10, i8 zeroext %c11, i32 %ui12, i32 %si13, i64 %ll14, i8 zeroext %uc15, i32 %i16) { ; Callee side: sums sixteen integer args of mixed width and signedness into one i64.
+  ; 32BIT-LABEL: name: test_ints_stack
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0)
+  ; 32BIT-NEXT:   renamable $r12 = LWZ 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4)
+  ; 32BIT-NEXT:   renamable $r0 = LWZ 0, %fixed-stack.1 :: (load (s32) from %fixed-stack.1, align 8)
+  ; 32BIT-NEXT:   renamable $r31 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16)
+  ; 32BIT-NEXT:   renamable $r30 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16)
+  ; 32BIT-NEXT:   renamable $r29 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5, align 8)
+  ; 32BIT-NEXT:   renamable $r28 = LWZ 0, %fixed-stack.6 :: (load (s32) from %fixed-stack.6)
+  ; 32BIT-NEXT:   renamable $r27 = LWZ 0, %fixed-stack.7 :: (load (s32) from %fixed-stack.7, align 16)
+  ; 32BIT-NEXT:   renamable $r26 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r25 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8)
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6
+  ; 32BIT-NEXT:   renamable $r5 = SRAWI renamable $r11, 31, implicit-def dead $carry
+  ; 32BIT-NEXT:   renamable $r4 = SRAWI renamable $r12, 31, implicit-def dead $carry
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r7
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r8
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10
+  ; 32BIT-NEXT:   renamable $r6 = SRAWI renamable $r3, 31, implicit-def dead $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDC killed renamable $r3, killed renamable $r26, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r6 = ADDE killed renamable $r6, killed renamable $r25, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   renamable $r7 = SRAWI renamable $r27, 31, implicit-def dead $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDC killed renamable $r3, killed renamable $r27, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r6 = ADDE killed renamable $r6, killed renamable $r7, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDC killed renamable $r3, killed renamable $r28, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r6 = ADDZE killed renamable $r6, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDC killed renamable $r3, killed renamable $r29, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r6 = ADDZE killed renamable $r6, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDC killed renamable $r3, killed renamable $r12, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r4 = ADDE killed renamable $r6, killed renamable $r4, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDC killed renamable $r3, killed renamable $r31, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r4 = ADDE killed renamable $r4, killed renamable $r30, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDC killed renamable $r3, killed renamable $r0, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r6 = ADDZE killed renamable $r4, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   renamable $r4 = ADDC killed renamable $r3, killed renamable $r11, implicit-def $carry
+  ; 32BIT-NEXT:   renamable $r3 = ADDE killed renamable $r6, killed renamable $r5, implicit-def dead $carry, implicit $carry
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4
+  ; 64-bit expectations: the first eight args arrive in $x3-$x10 and the other eight are loaded from fixed stack slots; the sum is returned in $x3.
+  ; 64BIT-LABEL: name: test_ints_stack
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $r11 = LWZ 0, %fixed-stack.1, implicit-def $x11 :: (load (s32) from %fixed-stack.1)
+  ; 64BIT-NEXT:   renamable $x12 = LWZ8 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4)
+  ; 64BIT-NEXT:   renamable $x0 = LWA 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0)
+  ; 64BIT-NEXT:   renamable $x31 = LD 0, %fixed-stack.2 :: (load (s64) from %fixed-stack.2)
+  ; 64BIT-NEXT:   renamable $x30 = LWA 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3)
+  ; 64BIT-NEXT:   renamable $r29 = LWZ 0, %fixed-stack.5, implicit-def $x29 :: (load (s32) from %fixed-stack.5)
+  ; 64BIT-NEXT:   renamable $x28 = LWA 0, %fixed-stack.6 :: (load (s32) from %fixed-stack.6)
+  ; 64BIT-NEXT:   renamable $x27 = LD 0, %fixed-stack.7 :: (load (s64) from %fixed-stack.7, align 16)
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r3, renamable $r4, implicit killed $x4, implicit killed $x3
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r6, implicit killed $x6
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r7, implicit killed $x7
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r8, implicit killed $x8
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r9, implicit killed $x9
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r10, implicit killed $x10
+  ; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r3
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x27
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x28
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x29
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x12
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x30
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x31
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x11
+  ; 64BIT-NEXT:   renamable $x3 = nsw ADD8 killed renamable $x3, killed renamable $x0
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %add = add nsw i32 %i1, %i2
+  %add1 = add nsw i32 %add, %i3
+  %add2 = add nsw i32 %add1, %i4
+  %add3 = add nsw i32 %add2, %i5
+  %add4 = add nsw i32 %add3, %i6
+  %add5 = add nsw i32 %add4, %i7
+  %add6 = add nsw i32 %add5, %i8
+  %conv = sext i32 %add6 to i64 ; widen the i32 sum of the first eight args before the i64 adds
+  %add7 = add nsw i64 %conv, %ll9
+  %conv8 = sext i16 %s10 to i64
+  %add9 = add nsw i64 %add7, %conv8
+  %conv10 = zext i8 %c11 to i64
+  %add11 = add nsw i64 %add9, %conv10
+  %conv12 = zext i32 %ui12 to i64
+  %add13 = add nsw i64 %add11, %conv12
+  %conv14 = sext i32 %si13 to i64
+  %add15 = add nsw i64 %add13, %conv14
+  %add16 = add nsw i64 %add15, %ll14
+  %conv17 = zext i8 %uc15 to i64
+  %add18 = add nsw i64 %add16, %conv17
+  %conv19 = sext i32 %i16 to i64
+  %add20 = add nsw i64 %add18, %conv19
+  ret i64 %add20
+}
+
+@ll1 = common global i64 0, align 8 ; loaded by @caller_ints_stack, passed as %ll9
+@si1 = common global i16 0, align 2 ; loaded by @caller_ints_stack, passed as %s10 (signext)
+@ch = common global i8 0, align 1 ; loaded by @caller_ints_stack, passed as %c11 (zeroext)
+@ui = common global i32 0, align 4 ; loaded by @caller_ints_stack, passed as %ui12
+@sint = common global i32 0, align 4 ; loaded by @caller_ints_stack, passed as %si13
+@ll2 = common global i64 0, align 8 ; loaded by @caller_ints_stack, passed as %ll14
+@uc1 = common global i8 0, align 1 ; loaded by @caller_ints_stack, passed as %uc15 (zeroext)
+@i1 = common global i32 0, align 4 ; loaded by @caller_ints_stack, passed as %i16
+
+define void @caller_ints_stack() { ; Caller side: passes i32 constants 1-8, then eight values loaded from globals, to @test_ints_stack.
+  ; 32BIT-LABEL: name: caller_ints_stack
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @ll1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @si1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r11 = LWZ 0, renamable $r3 :: (dereferenceable load (s32) from @ll1, align 8)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc @ch, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 4, killed renamable $r3 :: (dereferenceable load (s32) from @ll1 + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc @ui, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LHA 0, killed renamable $r4 :: (dereferenceable load (s16) from @si1)
+  ; 32BIT-NEXT:   renamable $r7 = LWZtoc @sint, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r5 = LBZ 0, killed renamable $r5 :: (dereferenceable load (s8) from @ch)
+  ; 32BIT-NEXT:   renamable $r8 = LWZtoc @ll2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 0, killed renamable $r6 :: (dereferenceable load (s32) from @ui)
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 0, killed renamable $r7 :: (dereferenceable load (s32) from @sint)
+  ; 32BIT-NEXT:   renamable $r9 = LWZtoc @uc1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r10 = LWZ 0, renamable $r8 :: (dereferenceable load (s32) from @ll2, align 8)
+  ; 32BIT-NEXT:   renamable $r12 = LWZtoc @i1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r8 = LWZ 4, killed renamable $r8 :: (dereferenceable load (s32) from @ll2 + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r9 = LBZ 0, killed renamable $r9 :: (dereferenceable load (s8) from @uc1)
+  ; 32BIT-NEXT:   renamable $r12 = LWZ 0, killed renamable $r12 :: (dereferenceable load (s32) from @i1)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 96, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   STW killed renamable $r12, 92, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r9, 88, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r8, 84, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r10, 80, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r7, 76, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r6, 72, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r5, 68, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r4, 64, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   $r10 = LI 8
+  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_ints_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3, implicit-def dead $r4
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 96, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ; 64-bit expectations: each stack arg gets one STD in an 8-byte slot (offsets 112-168); the 32-bit ones above use 4-byte STW slots (56-92), two per i64.
+  ; 64BIT-LABEL: name: caller_ints_stack
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @si1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @ch, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x5 = LDtoc @ui, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x11 = LHA8 0, killed renamable $x3 :: (dereferenceable load (s16) from @si1)
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @sint, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x12 = LBZ8 0, killed renamable $x4 :: (dereferenceable load (s8) from @ch)
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @uc1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x0 = LWZ8 0, killed renamable $x5 :: (dereferenceable load (s32) from @ui)
+  ; 64BIT-NEXT:   renamable $x5 = LDtoc @ll1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x6 = LDtoc @ll2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x7 = LDtoc @i1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x31 = LWZ8 0, killed renamable $x3 :: (dereferenceable load (s32) from @sint)
+  ; 64BIT-NEXT:   renamable $x30 = LBZ8 0, killed renamable $x4 :: (dereferenceable load (s8) from @uc1)
+  ; 64BIT-NEXT:   renamable $x29 = LD 0, killed renamable $x5 :: (dereferenceable load (s64) from @ll1)
+  ; 64BIT-NEXT:   renamable $x28 = LD 0, killed renamable $x6 :: (dereferenceable load (s64) from @ll2)
+  ; 64BIT-NEXT:   renamable $x27 = LWZ8 0, killed renamable $x7 :: (dereferenceable load (s32) from @i1)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   $x10 = LI8 8
+  ; 64BIT-NEXT:   STD killed renamable $x27, 168, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x30, 160, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x28, 152, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x31, 144, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x0, 136, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x12, 128, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x11, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x29, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_ints_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load i64, ptr @ll1, align 8
+  %1 = load i16, ptr @si1, align 2
+  %2 = load i8, ptr @ch, align 1
+  %3 = load i32, ptr @ui, align 4
+  %4 = load i32, ptr @sint, align 4
+  %5 = load i64, ptr @ll2, align 8
+  %6 = load i8, ptr @uc1, align 1
+  %7 = load i32, ptr @i1, align 4
+  %call = call i64 @test_ints_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i64 %0, i16 signext %1, i8 zeroext %2, i32 %3, i32 %4, i64 %5, i8 zeroext %6, i32 %7)
+  ret void ; the i64 result in %call is not used
+}
+
+@globali1 = global i8 0, align 1 ; byte that @test_i1_stack stores its i1 argument into
+
+define void @test_i1_stack(i32 %a, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 zeroext %b) { ; Callee side: stores the ninth arg, an i1 zeroext, into @globali1 as an i8.
+  ; 32BIT-LABEL: name: test_i1_stack
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 0, %fixed-stack.0 :: (load (s8) from %fixed-stack.0)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @globali1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STB killed renamable $r3, 0, killed renamable $r4 :: (store (s8) into @globali1)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ; 64-bit expectations: as on 32-bit, the i1 is read as a single byte (LBZ) from %fixed-stack.0 and written back with STB.
+  ; 64BIT-LABEL: name: test_i1_stack
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $r3 = LBZ 0, %fixed-stack.0 :: (load (s8) from %fixed-stack.0)
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @globali1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STB killed renamable $r3, 0, killed renamable $x4 :: (store (s8) into @globali1)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+  entry:
+    %frombool = zext i1 %b to i8
+    store i8 %frombool, ptr @globali1, align 1
+    ret void
+}
+
+define void @call_test_i1_stack() { ; Caller side: passes i32 constants 1-8 and then i1 true as the ninth arg of @test_i1_stack.
+  ; 32BIT-LABEL: name: call_test_i1_stack
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r11 = LI 1
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   $r10 = LI 8
+  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_i1_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ; 64-bit expectations: i1 true is materialized as LI8 1 and stored as a doubleword (STD) at offset 112; the 32-bit ones above store a word (STW) at offset 56.
+  ; 64BIT-LABEL: name: call_test_i1_stack
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x11 = LI8 1
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   $x10 = LI8 8
+  ; 64BIT-NEXT:   STD killed renamable $x11, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_i1_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+  entry:
+    call void @test_i1_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i1 true)
+    ret void
+}
+
+define double @test_fpr_stack(double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %s10, double %l11, double %d12, double %d13, float %f14, double %d15, float %f16) { ; Callee side: adds up sixteen floating-point args (floats extended to double) and returns the double sum.
+  ; 32BIT-LABEL: name: test_fpr_stack
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $f0 = LFD 0, %fixed-stack.1 :: (load (s64) from %fixed-stack.1)
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 32BIT-NEXT:   renamable $f2 = LFS 0, %fixed-stack.2 :: (load (s32) from %fixed-stack.2, align 16)
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f5, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f6, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f7, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f8, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f9, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f10, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f11, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f12, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, renamable $f13, implicit $rm
+  ; 32BIT-NEXT:   renamable $f3 = LFS 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0)
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f13, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $f1
+  ; 64-bit expectations: the first thirteen args arrive in $f1-$f13; the last three (float, double, float) are loaded from fixed stack slots with LFS/LFD.
+  ; 64BIT-LABEL: name: test_fpr_stack
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $f0 = LFD 0, %fixed-stack.1 :: (load (s64) from %fixed-stack.1, align 16)
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 64BIT-NEXT:   renamable $f2 = LFS 0, %fixed-stack.2 :: (load (s32) from %fixed-stack.2, align 8)
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f5, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f6, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f7, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f8, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f9, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f10, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f11, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f12, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, renamable $f13, implicit $rm
+  ; 64BIT-NEXT:   renamable $f3 = LFS 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0, align 8)
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f13, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+  entry:
+    %add = fadd double %d1, %d2
+    %add1 = fadd double %add, %d3
+    %add2 = fadd double %add1, %d4
+    %add3 = fadd double %add2, %d5
+    %add4 = fadd double %add3, %d6
+    %add5 = fadd double %add4, %d7
+    %add6 = fadd double %add5, %d8
+    %add7 = fadd double %add6, %d9
+    %add8 = fadd double %add7, %s10
+    %add9 = fadd double %add8, %l11
+    %add10 = fadd double %add9, %d12
+    %add11 = fadd double %add10, %d13
+    %add12 = fadd double %add11, %d13 ; NOTE(review): %d13 is added a second time here (the expectations above encode both adds) -- confirm intended
+    %conv = fpext float %f14 to double
+    %add13 = fadd double %add12, %conv
+    %add14 = fadd double %add13, %d15
+    %conv15 = fpext float %f16 to double
+    %add16 = fadd double %add14, %conv15
+    ret double %add16
+  }
+
+@f14 = common global float 0.000000e+00, align 4 ; loaded by @caller_fpr_stack
+@d15 = common global double 0.000000e+00, align 8 ; loaded by @caller_fpr_stack
+@f16 = common global float 0.000000e+00, align 4 ; loaded by @caller_fpr_stack
+
+define void @caller_fpr_stack() {
+  ; 32BIT-LABEL: name: caller_fpr_stack
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d15, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @f14, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f0 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d15)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc @f16, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r4 :: (load (s32) from @f14)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, killed renamable $r5 :: (load (s32) from @f16)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 144, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r5 = LI 0
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16352
+  ; 32BIT-NEXT:   STW killed renamable $r5, 60, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r5 = LIS 13107
+  ; 32BIT-NEXT:   STW killed renamable $r6, 56, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16355
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 13107
+  ; 32BIT-NEXT:   STW killed renamable $r5, 68, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r5 = LIS 26214
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 13107
+  ; 32BIT-NEXT:   STW killed renamable $r6, 64, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16358
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 26214
+  ; 32BIT-NEXT:   STW killed renamable $r5, 76, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r5 = LIS 39321
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 26214
+  ; 32BIT-NEXT:   STW killed renamable $r6, 72, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16361
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r6, 80, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 52428
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 52429
+  ; 32BIT-NEXT:   STW killed renamable $r6, 92, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16364
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 39322
+  ; 32BIT-NEXT:   STW renamable $r5, 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 52428
+  ; 32BIT-NEXT:   STW killed renamable $r6, 88, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16313
+  ; 32BIT-NEXT:   STW killed renamable $r5, 100, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r5 = LIS 49807
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r6, 96, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16316
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 23593
+  ; 32BIT-NEXT:   STW killed renamable $r5, 108, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r5 = LIS 60293
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 10485
+  ; 32BIT-NEXT:   STW killed renamable $r6, 104, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16318
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 7864
+  ; 32BIT-NEXT:   STW killed renamable $r5, 116, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r5 = LIS 2621
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 47185
+  ; 32BIT-NEXT:   STW killed renamable $r6, 112, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16320
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 28836
+  ; 32BIT-NEXT:   STW killed renamable $r5, 124, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 41943
+  ; 32BIT-NEXT:   STW killed renamable $r6, 120, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.4, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f6 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.5, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f7 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f8 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.7, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f9 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.8, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.9, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f11 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.10, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f12 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.11, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f13 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $f5 = LFS 0, killed renamable $r6 :: (load (s32) from constant-pool)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 140, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STFD killed renamable $f0, 132, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   $f10 = COPY renamable $f1
+  ; 32BIT-NEXT:   STW killed renamable $r3, 128, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_fpr_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 144, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: caller_fpr_stack
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f14, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @d15, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x5 = LDtoc @f16, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $x3 :: (load (s32) from @f14)
+  ; 64BIT-NEXT:   renamable $x4 = LD 0, killed renamable $x4 :: (load (s64) from @d15)
+  ; 64BIT-NEXT:   renamable $r5 = LWZ 0, killed renamable $x5 :: (load (s32) from @f16)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x6 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STW killed renamable $r5, 168, $x1 :: (store (s32))
+  ; 64BIT-NEXT:   renamable $x5 = LDtocCPT %const.1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x7 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x6 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x6 = LDtocCPT %const.3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $x5 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x5 = LDtocCPT %const.4, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $x7 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x7 = LDtocCPT %const.5, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f6 = LFD 0, killed renamable $x6 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x6 = LDtocCPT %const.6, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f7 = LFD 0, killed renamable $x5 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   STD killed renamable $x4, 160, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   renamable $x4 = LDtocCPT %const.7, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f8 = LFD 0, killed renamable $x7 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x5 = LIS8 16320
+  ; 64BIT-NEXT:   renamable $x7 = LDtocCPT %const.8, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f9 = LFD 0, killed renamable $x6 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x6 = LIS8 16318
+  ; 64BIT-NEXT:   renamable $x8 = LDtocCPT %const.9, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $x4 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x4 = LIS8 16316
+  ; 64BIT-NEXT:   renamable $f11 = LFD 0, killed renamable $x7 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x7 = LIS8 16313
+  ; 64BIT-NEXT:   renamable $f12 = LFD 0, killed renamable $x8 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x8 = LDtocCPT %const.10, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x5 = ORI8 killed renamable $x5, 41943
+  ; 64BIT-NEXT:   renamable $x6 = ORI8 killed renamable $x6, 47185
+  ; 64BIT-NEXT:   renamable $x4 = ORI8 killed renamable $x4, 10485
+  ; 64BIT-NEXT:   renamable $x7 = ORI8 killed renamable $x7, 39321
+  ; 64BIT-NEXT:   renamable $f13 = LFD 0, killed renamable $x8 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x8 = LDtocCPT %const.11, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x5 = RLDIC killed renamable $x5, 32, 2
+  ; 64BIT-NEXT:   renamable $x6 = RLDIC killed renamable $x6, 32, 2
+  ; 64BIT-NEXT:   renamable $x4 = RLDIC killed renamable $x4, 32, 2
+  ; 64BIT-NEXT:   renamable $x7 = RLDIC killed renamable $x7, 32, 2
+  ; 64BIT-NEXT:   renamable $x5 = ORIS8 killed renamable $x5, 2621
+  ; 64BIT-NEXT:   renamable $x6 = ORIS8 killed renamable $x6, 60293
+  ; 64BIT-NEXT:   renamable $x4 = ORIS8 killed renamable $x4, 49807
+  ; 64BIT-NEXT:   renamable $x7 = ORIS8 killed renamable $x7, 39321
+  ; 64BIT-NEXT:   renamable $x5 = ORI8 killed renamable $x5, 28836
+  ; 64BIT-NEXT:   renamable $x6 = ORI8 killed renamable $x6, 7864
+  ; 64BIT-NEXT:   renamable $x4 = ORI8 killed renamable $x4, 23593
+  ; 64BIT-NEXT:   renamable $f5 = LFS 0, killed renamable $x8 :: (load (s32) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x8 = LIS8 4091
+  ; 64BIT-NEXT:   renamable $x8 = ORI8 killed renamable $x8, 13107
+  ; 64BIT-NEXT:   renamable $x7 = ORI8 killed renamable $x7, 39322
+  ; 64BIT-NEXT:   renamable $x8 = RLDIC killed renamable $x8, 34, 2
+  ; 64BIT-NEXT:   renamable $x8 = ORIS8 killed renamable $x8, 52428
+  ; 64BIT-NEXT:   renamable $x8 = ORI8 killed renamable $x8, 52429
+  ; 64BIT-NEXT:   $f10 = COPY renamable $f1
+  ; 64BIT-NEXT:   STW killed renamable $r3, 152, $x1 :: (store (s32))
+  ; 64BIT-NEXT:   STD killed renamable $x5, 144, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x6, 136, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x4, 128, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x7, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x8, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_fpr_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $x2, implicit-def $r1, implicit-def dead $f1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f14, align 4
+  %1 = load double, ptr @d15, align 8
+  %2 = load float, ptr @f16, align 4
+  %call = call double @test_fpr_stack(double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, double 5.000000e-01, double 6.000000e-01, double 0x3FE6666666666666, double 8.000000e-01, double 9.000000e-01, double 1.000000e-01, double 1.100000e-01, double 1.200000e-01, double 1.300000e-01, float %0, double %1, float %2)
+  ret void
+}
+
+define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroext %c1, i16 signext %s1, i64 %ll1, i32 %i1, i32 %i2, i32 %i3) { ; Callee mixing FP and integer parameters: returns (i32)((d1+d2+d3+d4) + (double)(i32)(c1+s1+ll1+i1+i2+i3)).
+  ; 32BIT-LABEL: name: mix_callee
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $f1, $f2, $f3, $f4
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5)
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 0, %fixed-stack.6 :: (load (s32) from %fixed-stack.6, align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 0, %fixed-stack.2 :: (load (s32) from %fixed-stack.2, align 8)
+  ; 32BIT-NEXT:   renamable $r7 = LIS 17200
+  ; 32BIT-NEXT:   STW killed renamable $r7, 0, %stack.1 :: (store (s32) into %stack.1, align 8)
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 0, %fixed-stack.1 :: (load (s32) from %fixed-stack.1)
+  ; 32BIT-NEXT:   renamable $r4 = nsw ADD4 killed renamable $r5, killed renamable $r4
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r4, killed renamable $r3
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0, align 16)
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r6
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r7
+  ; 32BIT-NEXT:   renamable $f0 = LFS 0, killed renamable $r5 :: (load (s32) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = ADD4 killed renamable $r3, killed renamable $r4
+  ; 32BIT-NEXT:   renamable $r3 = XORIS killed renamable $r3, 32768
+  ; 32BIT-NEXT:   STW killed renamable $r3, 4, %stack.1 :: (store (s32) into %stack.1 + 4)
+  ; 32BIT-NEXT:   renamable $f5 = LFD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm
+  ; 32BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FSUB killed renamable $f5, killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: mix_callee
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $f1, $f2, $f3, $f4, $x7, $x8, $x9, $x10
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 0, %fixed-stack.1 :: (load (s32) from %fixed-stack.1)
+  ; 64BIT-NEXT:   renamable $r4 = nsw ADD4 renamable $r7, renamable $r8, implicit killed $x8, implicit killed $x7, implicit-def $x4
+  ; 64BIT-NEXT:   renamable $x5 = LWZ8 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0)
+  ; 64BIT-NEXT:   renamable $x4 = ADD8 killed renamable $x4, killed renamable $x9
+  ; 64BIT-NEXT:   renamable $x4 = ADD8 killed renamable $x4, killed renamable $x10
+  ; 64BIT-NEXT:   renamable $x3 = ADD8 killed renamable $x4, killed renamable $x3
+  ; 64BIT-NEXT:   renamable $x3 = ADD8 killed renamable $x3, killed renamable $x5
+  ; 64BIT-NEXT:   renamable $x3 = EXTSW killed renamable $x3
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 64BIT-NEXT:   renamable $f0 = LFD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FCFID killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+  entry: ; Per the checks above: the 32-bit run reads all integer operands from %fixed-stack slots (no GPR live-ins); the 64-bit run has $x7-$x10 live-in and reads two values from %fixed-stack.
+    %add = fadd double %d1, %d2
+    %add1 = fadd double %add, %d3
+    %add2 = fadd double %add1, %d4
+    %conv = zext i8 %c1 to i32 ; zero-extended, matching the zeroext attribute on %c1
+    %conv3 = sext i16 %s1 to i32 ; sign-extended, matching the signext attribute on %s1
+    %add4 = add nsw i32 %conv, %conv3
+    %conv5 = sext i32 %add4 to i64
+    %add6 = add nsw i64 %conv5, %ll1
+    %conv7 = sext i32 %i1 to i64
+    %add8 = add nsw i64 %add6, %conv7
+    %conv9 = sext i32 %i2 to i64
+    %add10 = add nsw i64 %add8, %conv9
+    %conv11 = sext i32 %i3 to i64
+    %add12 = add nsw i64 %add10, %conv11
+    %conv13 = trunc i64 %add12 to i32 ; only the low 32 bits of the integer sum are kept
+    %conv14 = sitofp i32 %conv13 to double
+    %add15 = fadd double %conv14, %add2 ; combine the integer sum with the FP sum of %d1..%d4
+    %conv16 = fptosi double %add15 to i32
+    ret i32 %conv16
+  }
+
+define void @caller_mix() { ; Calls @mix_callee with constant arguments only and discards the returned i32.
+  ; 32BIT-LABEL: name: caller_mix
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 84, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LI 60
+  ; 32BIT-NEXT:   STW killed renamable $r3, 80, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LI 50
+  ; 32BIT-NEXT:   STW killed renamable $r3, 76, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LI 40
+  ; 32BIT-NEXT:   STW killed renamable $r3, 72, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LI 0
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 64, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LI 2
+  ; 32BIT-NEXT:   STW killed renamable $r3, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LI 1
+  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LIS 457
+  ; 32BIT-NEXT:   renamable $r3 = ORI killed renamable $r3, 50048
+  ; 32BIT-NEXT:   STW killed renamable $r3, 68, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .mix_callee>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 84, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: caller_mix
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LDtocCPT %const.1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x5 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $x3 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x3 = LDtocCPT %const.3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x4 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x4 = LI8 60
+  ; 64BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $x5 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x5 = LI8 50
+  ; 64BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $x3 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x3 = LIS8 457
+  ; 64BIT-NEXT:   renamable $x9 = ORI8 killed renamable $x3, 50048
+  ; 64BIT-NEXT:   $x7 = LI8 1
+  ; 64BIT-NEXT:   $x8 = LI8 2
+  ; 64BIT-NEXT:   $x10 = LI8 40
+  ; 64BIT-NEXT:   STD killed renamable $x4, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x5, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .mix_callee>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit killed $x7, implicit killed $x8, implicit $x9, implicit killed $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+  entry: ; Per the checks above: the 32-bit run stores all six integer arguments at offsets 56-80 from $r1; the 64-bit run places 1, 2, 30000000 and 40 in $x7-$x10 and stores 50 and 60 at offsets 112 and 120 from $x1.
+%call = call i32 @mix_callee(double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, i8 zeroext 1, i16 signext 2, i64 30000000, i32 40, i32 50, i32 60) ; four doubles first, then six integer arguments of mixed width
+    ret void
+  }
+
+  define i32 @mix_floats(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13, double %d14) { ; Eight i32 parameters followed by fourteen doubles: returns (i32)((double)(i1+...+i8) + d1 + ... + d14).
+  ; 32BIT-LABEL: name: mix_floats
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r11 = LIS 17200
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4
+  ; 32BIT-NEXT:   STW killed renamable $r11, 0, %stack.1 :: (store (s32) into %stack.1, align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r7
+  ; 32BIT-NEXT:   renamable $f0 = LFS 0, killed renamable $r4 :: (load (s32) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r8
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10
+  ; 32BIT-NEXT:   renamable $r3 = XORIS killed renamable $r3, 32768
+  ; 32BIT-NEXT:   STW killed renamable $r3, 4, %stack.1 :: (store (s32) into %stack.1 + 4)
+  ; 32BIT-NEXT:   renamable $f31 = LFD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 32BIT-NEXT:   renamable $f30 = LFD 0, %fixed-stack.0 :: (load (s64) from %fixed-stack.0, align 16)
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FSUB killed renamable $f31, killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f2, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f30, implicit $rm
+  ; 32BIT-NEXT:   renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm
+  ; 32BIT-NEXT:   STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: mix_floats
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64) from %fixed-stack.0)
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r3, renamable $r4, implicit killed $x4, implicit killed $x3
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r6, implicit killed $x6
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r7, implicit killed $x7
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r8, implicit killed $x8
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r9, implicit killed $x9
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r10, implicit killed $x10
+  ; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r3
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 64BIT-NEXT:   renamable $f31 = LFD 0, %stack.1 :: (load (s64) from %stack.1)
+  ; 64BIT-NEXT:   renamable $f31 = nofpexcept FCFID killed renamable $f31, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f31, killed renamable $f1, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f5, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f6, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f7, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f8, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f9, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f10, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f11, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f12, implicit $rm
+  ; 64BIT-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f13, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   renamable $f0 = nofpexcept FCTIWZ killed renamable $f0, implicit $rm
+  ; 64BIT-NEXT:   STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 4, %stack.0 :: (load (s32) from %stack.0 + 4, basealign 8)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+  entry: ; Per the checks above: both runs have $f1-$f13 plus eight GPRs live-in and load exactly one double from %fixed-stack.0, which is the last addend (matching %d14).
+    %add = add nsw i32 %i1, %i2
+    %add1 = add nsw i32 %add, %i3
+    %add2 = add nsw i32 %add1, %i4
+    %add3 = add nsw i32 %add2, %i5
+    %add4 = add nsw i32 %add3, %i6
+    %add5 = add nsw i32 %add4, %i7
+    %add6 = add nsw i32 %add5, %i8
+    %conv = sitofp i32 %add6 to double ; the integer sum is converted once, before the FP accumulation starts
+    %add7 = fadd double %conv, %d1
+    %add8 = fadd double %add7, %d2
+    %add9 = fadd double %add8, %d3
+    %add10 = fadd double %add9, %d4
+    %add11 = fadd double %add10, %d5
+    %add12 = fadd double %add11, %d6
+    %add13 = fadd double %add12, %d7
+    %add14 = fadd double %add13, %d8
+    %add15 = fadd double %add14, %d9
+    %add16 = fadd double %add15, %d10
+    %add17 = fadd double %add16, %d11
+    %add18 = fadd double %add17, %d12
+    %add19 = fadd double %add18, %d13
+    %add20 = fadd double %add19, %d14
+    %conv21 = fptosi double %add20 to i32 ; the accumulated double is converted back to i32 for the return value
+    ret i32 %conv21
+  }
+
+  define void @mix_floats_caller() {
+  ; 32BIT-LABEL: name: mix_floats_caller
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 168, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LI 0
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16352
+  ; 32BIT-NEXT:   renamable $r5 = LIS 16368
+  ; 32BIT-NEXT:   renamable $r6 = LIS 39321
+  ; 32BIT-NEXT:   renamable $r7 = LIS 16313
+  ; 32BIT-NEXT:   renamable $r8 = LIS 16329
+  ; 32BIT-NEXT:   renamable $r9 = LIS 13107
+  ; 32BIT-NEXT:   renamable $r10 = LIS 16339
+  ; 32BIT-NEXT:   STW renamable $r3, 92, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r11 = LIS 16345
+  ; 32BIT-NEXT:   STW killed renamable $r4, 88, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16355
+  ; 32BIT-NEXT:   STW killed renamable $r3, 132, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r3 = LIS 26214
+  ; 32BIT-NEXT:   STW killed renamable $r5, 128, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r6, 39322
+  ; 32BIT-NEXT:   STW renamable $r5, 60, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r7, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r6, 56, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = LIS 16358
+  ; 32BIT-NEXT:   STW renamable $r5, 68, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r7 = ORI killed renamable $r8, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r7, 64, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r7 = ORI killed renamable $r9, 13107
+  ; 32BIT-NEXT:   STW renamable $r7, 76, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r8 = ORI killed renamable $r10, 13107
+  ; 32BIT-NEXT:   STW killed renamable $r8, 72, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r8 = LIS 16361
+  ; 32BIT-NEXT:   STW renamable $r5, 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r9 = ORI killed renamable $r11, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r9, 80, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r9 = LIS 52428
+  ; 32BIT-NEXT:   STW renamable $r7, 100, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 13107
+  ; 32BIT-NEXT:   STW killed renamable $r4, 96, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r3 = ORI killed renamable $r3, 26214
+  ; 32BIT-NEXT:   STW renamable $r3, 108, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r6, 26214
+  ; 32BIT-NEXT:   STW killed renamable $r4, 104, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16364
+  ; 32BIT-NEXT:   STW renamable $r5, 116, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r8, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r6, 112, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r9, 52429
+  ; 32BIT-NEXT:   STW renamable $r6, 124, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 52428
+  ; 32BIT-NEXT:   STW killed renamable $r4, 120, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16369
+  ; 32BIT-NEXT:   STW killed renamable $r5, 140, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r4, 136, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16371
+  ; 32BIT-NEXT:   STW killed renamable $r7, 148, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 13107
+  ; 32BIT-NEXT:   STW killed renamable $r4, 144, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16372
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 156, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 52428
+  ; 32BIT-NEXT:   STW killed renamable $r4, 152, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 164, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.4, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.5, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f6 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.7, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f7 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.8, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f8 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.9, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f9 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.10, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f11 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.11, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f12 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.12, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f13 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16374
+  ; 32BIT-NEXT:   renamable $f5 = LFS 0, killed renamable $r3 :: (load (s32) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r11 = ORI killed renamable $r4, 26214
+  ; 32BIT-NEXT:   renamable $f10 = LFS 0, killed renamable $r5 :: (load (s32) from constant-pool)
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   $r10 = LI 8
+  ; 32BIT-NEXT:   STW killed renamable $r11, 160, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .mix_floats>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 168, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: mix_floats_caller
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 224, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LI8 1023
+  ; 64BIT-NEXT:   renamable $x4 = LI8 511
+  ; 64BIT-NEXT:   renamable $x5 = LIS8 16374
+  ; 64BIT-NEXT:   renamable $x6 = LIS8 16371
+  ; 64BIT-NEXT:   renamable $x7 = LIS8 16358
+  ; 64BIT-NEXT:   renamable $x8 = LIS8 16355
+  ; 64BIT-NEXT:   renamable $x9 = LIS8 16339
+  ; 64BIT-NEXT:   renamable $x10 = LIS8 4093
+  ; 64BIT-NEXT:   renamable $x11 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x29 = LIS8 16369
+  ; 64BIT-NEXT:   renamable $x28 = LIS8 4091
+  ; 64BIT-NEXT:   renamable $x12 = LDtocCPT %const.1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x27 = LIS8 16361
+  ; 64BIT-NEXT:   renamable $x31 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $x11 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x26 = LIS8 16345
+  ; 64BIT-NEXT:   renamable $x11 = LDtocCPT %const.3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x12 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x25 = LIS8 16329
+  ; 64BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $x31 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x24 = LIS8 16313
+  ; 64BIT-NEXT:   renamable $x23 = LDtocCPT %const.4, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x22 = LDtocCPT %const.5, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x21 = LDtocCPT %const.6, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x20 = LDtocCPT %const.7, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x19 = LDtocCPT %const.8, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x18 = LDtocCPT %const.9, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x17 = LDtocCPT %const.10, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $x11 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x16 = LDtocCPT %const.11, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x11 = ORI8 killed renamable $x5, 26214
+  ; 64BIT-NEXT:   renamable $x12 = ORI8 killed renamable $x6, 13107
+  ; 64BIT-NEXT:   renamable $x0 = ORI8 killed renamable $x7, 26214
+  ; 64BIT-NEXT:   renamable $x31 = ORI8 killed renamable $x8, 13107
+  ; 64BIT-NEXT:   renamable $x30 = ORI8 killed renamable $x9, 13107
+  ; 64BIT-NEXT:   renamable $x5 = ORI8 killed renamable $x10, 13107
+  ; 64BIT-NEXT:   renamable $x6 = ORI8 killed renamable $x29, 39321
+  ; 64BIT-NEXT:   renamable $x7 = ORI8 killed renamable $x28, 13107
+  ; 64BIT-NEXT:   renamable $x8 = ORI8 killed renamable $x27, 39321
+  ; 64BIT-NEXT:   renamable $x9 = ORI8 killed renamable $x26, 39321
+  ; 64BIT-NEXT:   renamable $x10 = ORI8 killed renamable $x25, 39321
+  ; 64BIT-NEXT:   renamable $x27 = ORI8 killed renamable $x24, 39321
+  ; 64BIT-NEXT:   renamable $f6 = LFD 0, killed renamable $x23 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x26 = LDtocCPT %const.12, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x29 = RLDIC killed renamable $x3, 52, 2
+  ; 64BIT-NEXT:   renamable $x28 = RLDIC killed renamable $x4, 53, 2
+  ; 64BIT-NEXT:   renamable $x11 = RLDIMI killed renamable $x11, renamable $x11, 32, 0
+  ; 64BIT-NEXT:   renamable $x12 = RLDIMI killed renamable $x12, renamable $x12, 32, 0
+  ; 64BIT-NEXT:   renamable $x0 = RLDIMI killed renamable $x0, renamable $x0, 32, 0
+  ; 64BIT-NEXT:   renamable $x31 = RLDIMI killed renamable $x31, renamable $x31, 32, 0
+  ; 64BIT-NEXT:   renamable $x30 = RLDIMI killed renamable $x30, renamable $x30, 32, 0
+  ; 64BIT-NEXT:   renamable $x3 = RLDIC killed renamable $x5, 34, 2
+  ; 64BIT-NEXT:   renamable $x4 = RLDIC killed renamable $x6, 32, 2
+  ; 64BIT-NEXT:   renamable $x5 = RLDIC killed renamable $x7, 34, 2
+  ; 64BIT-NEXT:   renamable $x6 = RLDIC killed renamable $x8, 32, 2
+  ; 64BIT-NEXT:   renamable $x7 = RLDIC killed renamable $x9, 32, 2
+  ; 64BIT-NEXT:   renamable $x8 = RLDIC killed renamable $x10, 32, 2
+  ; 64BIT-NEXT:   renamable $x9 = RLDIC killed renamable $x27, 32, 2
+  ; 64BIT-NEXT:   renamable $x11 = RLWIMI8 killed renamable $x11, renamable $x11, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x12 = RLWIMI8 killed renamable $x12, renamable $x12, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x0 = RLWIMI8 killed renamable $x0, renamable $x0, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x31 = RLWIMI8 killed renamable $x31, renamable $x31, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x30 = RLWIMI8 killed renamable $x30, renamable $x30, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x3 = ORIS8 killed renamable $x3, 52428
+  ; 64BIT-NEXT:   renamable $x4 = ORIS8 killed renamable $x4, 39321
+  ; 64BIT-NEXT:   renamable $x5 = ORIS8 killed renamable $x5, 52428
+  ; 64BIT-NEXT:   renamable $x6 = ORIS8 killed renamable $x6, 39321
+  ; 64BIT-NEXT:   renamable $x7 = ORIS8 killed renamable $x7, 39321
+  ; 64BIT-NEXT:   renamable $x8 = ORIS8 killed renamable $x8, 39321
+  ; 64BIT-NEXT:   renamable $x9 = ORIS8 killed renamable $x9, 39321
+  ; 64BIT-NEXT:   renamable $f7 = LFD 0, killed renamable $x22 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x27 = ORI8 killed renamable $x3, 52429
+  ; 64BIT-NEXT:   renamable $f8 = LFD 0, killed renamable $x21 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x25 = ORI8 killed renamable $x4, 39322
+  ; 64BIT-NEXT:   renamable $f9 = LFD 0, killed renamable $x20 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x24 = ORI8 killed renamable $x5, 52429
+  ; 64BIT-NEXT:   renamable $f11 = LFD 0, killed renamable $x19 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x23 = ORI8 killed renamable $x6, 39322
+  ; 64BIT-NEXT:   renamable $f12 = LFD 0, killed renamable $x18 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x22 = ORI8 killed renamable $x7, 39322
+  ; 64BIT-NEXT:   renamable $f13 = LFD 0, killed renamable $x17 :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x21 = ORI8 killed renamable $x8, 39322
+  ; 64BIT-NEXT:   renamable $f5 = LFS 0, killed renamable $x16 :: (load (s32) from constant-pool)
+  ; 64BIT-NEXT:   renamable $x20 = ORI8 killed renamable $x9, 39322
+  ; 64BIT-NEXT:   renamable $f10 = LFS 0, killed renamable $x26 :: (load (s32) from constant-pool)
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   $x10 = LI8 8
+  ; 64BIT-NEXT:   STD killed renamable $x29, 184, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x28, 144, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x11, 216, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x12, 200, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x0, 160, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x31, 152, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x30, 128, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x27, 208, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x25, 192, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x24, 176, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x23, 168, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x22, 136, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x21, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x20, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .mix_floats>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit $f10, implicit $f11, implicit $f12, implicit $f13, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 224, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+  entry:
+    %call = call i32 @mix_floats(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, double 5.000000e-01, double 6.000000e-01, double 0x3FE6666666666666, double 8.000000e-01, double 9.000000e-01, double 1.000000e+00, double 1.100000e+00, double 1.200000e+00, double 1.300000e+00, double 1.400000e+00)
+    ret void
+  }
+
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
index 02fe9943f39c41..78d60f06c06786 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
@@ -1,42 +1,57 @@
-; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \
-; RUN: FileCheck --check-prefixes=CHECK,32BIT %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \
 ; RUN:  -mtriple powerpc-ibm-aix-xcoff < %s | \
 ; RUN: FileCheck --check-prefixes=CHECKASM,ASM32PWR4 %s
 
-; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \
-; RUN: FileCheck --check-prefixes=CHECK,64BIT %s
-
 ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \
 ; RUN:  -mtriple powerpc64-ibm-aix-xcoff < %s | \
 ; RUN: FileCheck --check-prefixes=CHECKASM,ASM64PWR4 %s
 
 define void @call_test_chars() {
+; ASM32PWR4-LABEL: call_test_chars:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    li 3, 97
+; ASM32PWR4-NEXT:    li 4, 97
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    li 5, 97
+; ASM32PWR4-NEXT:    li 6, 97
+; ASM32PWR4-NEXT:    bl .test_chars
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_chars:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -112(1)
+; ASM64PWR4-NEXT:    li 3, 97
+; ASM64PWR4-NEXT:    li 4, 97
+; ASM64PWR4-NEXT:    std 0, 128(1)
+; ASM64PWR4-NEXT:    li 5, 97
+; ASM64PWR4-NEXT:    li 6, 97
+; ASM64PWR4-NEXT:    bl .test_chars
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 112
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   call i8 @test_chars(i8 signext 97, i8 signext 97, i8 signext 97, i8 signext 97)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_chars
-
-; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT: $r3 = LI 97
-; 32BIT: $r4 = LI 97
-; 32BIT: $r5 = LI 97
-; 32BIT: $r6 = LI 97
-; 32BIT: BL_NOP <mcsymbol .test_chars>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1
-; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT: $x3 = LI8 97
-; 64BIT: $x4 = LI8 97
-; 64BIT: $x5 = LI8 97
-; 64BIT: $x6 = LI8 97
-; 64BIT: BL8_NOP <mcsymbol .test_chars>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1
-; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 define signext i8 @test_chars(i8 signext %c1, i8 signext %c2, i8 signext %c3, i8 signext %c4) {
+; CHECKASM-LABEL: test_chars:
+; CHECKASM:       # %bb.0: # %entry
+; CHECKASM-NEXT:    add 3, 3, 4
+; CHECKASM-NEXT:    add 3, 3, 5
+; CHECKASM-NEXT:    add 3, 3, 6
+; CHECKASM-NEXT:    extsb 3, 3
+; CHECKASM-NEXT:    blr
 entry:
   %conv = sext i8 %c1 to i32
   %conv1 = sext i8 %c2 to i32
@@ -49,51 +64,51 @@ entry:
   ret i8 %conv6
 }
 
-; CHECK-LABEL: name: test_chars
-
-; 32BIT:       liveins:
-; 32BIT-NEXT:  - { reg: '$r3', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r4', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r5', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r6', virtual-reg: '' }
-; 32BIT:       body:
-; 32BIT-NEXT:    bb.0.entry:
-; 32BIT-NEXT:      liveins: $r3, $r4, $r5, $r6
-
-; 64BIT:       liveins:
-; 64BIT-NEXT:  - { reg: '$x3', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x4', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x5', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x6', virtual-reg: '' }
-; 64BIT:       body:
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:      liveins: $x3, $x4, $x5, $x6
-
 define void @call_test_chars_mix() {
+; ASM32PWR4-LABEL: call_test_chars_mix:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    li 3, 97
+; ASM32PWR4-NEXT:    li 4, 225
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    li 5, 97
+; ASM32PWR4-NEXT:    li 6, -31
+; ASM32PWR4-NEXT:    bl .test_chars_mix
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_chars_mix:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -112(1)
+; ASM64PWR4-NEXT:    li 3, 97
+; ASM64PWR4-NEXT:    li 4, 225
+; ASM64PWR4-NEXT:    std 0, 128(1)
+; ASM64PWR4-NEXT:    li 5, 97
+; ASM64PWR4-NEXT:    li 6, -31
+; ASM64PWR4-NEXT:    bl .test_chars_mix
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 112
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   call i8 @test_chars_mix(i8 signext 97, i8 zeroext -31, i8 zeroext 97, i8 signext -31)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_chars_mix
-
-; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT: $r3 = LI 97
-; 32BIT: $r4 = LI 225
-; 32BIT: $r5 = LI 97
-; 32BIT: $r6 = LI -31
-; 32BIT: BL_NOP <mcsymbol .test_chars_mix>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1
-; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT: $x3 = LI8 97
-; 64BIT: $x4 = LI8 225
-; 64BIT: $x5 = LI8 97
-; 64BIT: $x6 = LI8 -31
-; 64BIT: BL8_NOP <mcsymbol .test_chars_mix>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1
-; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 define signext i8 @test_chars_mix(i8 signext %c1, i8 zeroext %c2, i8 zeroext %c3, i8 signext %c4) {
+; CHECKASM-LABEL: test_chars_mix:
+; CHECKASM:       # %bb.0: # %entry
+; CHECKASM-NEXT:    add 3, 3, 4
+; CHECKASM-NEXT:    add 3, 3, 5
+; CHECKASM-NEXT:    add 3, 3, 6
+; CHECKASM-NEXT:    extsb 3, 3
+; CHECKASM-NEXT:    blr
 entry:
   %conv = sext i8 %c1 to i32
   %conv1 = zext i8 %c2 to i32
@@ -106,92 +121,88 @@ entry:
   ret i8 %conv6
 }
 
-; CHECK-LABEL: name: test_chars_mix
-
-; 32BIT:       liveins:
-; 32BIT-NEXT:  - { reg: '$r3', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r4', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r5', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r6', virtual-reg: '' }
-; 32BIT:       body:
-; 32BIT-NEXT:    bb.0.entry:
-; 32BIT-NEXT:      liveins: $r3, $r4, $r5, $r6
-
-; 64BIT:       liveins:
-; 64BIT-NEXT:  - { reg: '$x3', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x4', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x5', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x6', virtual-reg: '' }
-; 64BIT:       body:
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:      liveins: $x3, $x4, $x5, $x6
-
 @global_i1 = global i8 0, align 1
 
 define  void @test_i1(i1 %b)  {
+; ASM32PWR4-LABEL: test_i1:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    lwz 4, L..C0(2) # @global_i1
+; ASM32PWR4-NEXT:    clrlwi 3, 3, 31
+; ASM32PWR4-NEXT:    stb 3, 0(4)
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_i1:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    ld 4, L..C0(2) # @global_i1
+; ASM64PWR4-NEXT:    clrlwi 3, 3, 31
+; ASM64PWR4-NEXT:    stb 3, 0(4)
+; ASM64PWR4-NEXT:    blr
   entry:
    %frombool = zext i1 %b to i8
    store i8 %frombool, ptr @global_i1, align 1
    ret void
 }
 
-; 32BIT:       liveins:
-; 32BIT-NEXT:  - { reg: '$r3', virtual-reg: '' }
-; 32BIT:       body:             |
-; 32BIT-NEXT:    bb.0.entry:
-; 32BIT-NEXT:      liveins: $r3
-; 32BIT:           renamable $r3 = RLWINM killed renamable $r3, 0, 31, 31
-; 32BIT-NEXT:      STB killed renamable $r3, 0, killed renamable $r4 :: (store (s8) into @global_i1)
-
-; 64BIT:       liveins:
-; 64BIT-NEXT:  - { reg: '$x3', virtual-reg: '' }
-; 64BIT:       body:             |
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:      liveins: $x3
-; 64BIT:           renamable $r[[REG1:[0-9]+]] = RLWINM renamable $r[[REG1]], 0, 31, 31, implicit killed $x3
-; 64BIT-NEXT:      STB killed renamable $r[[REG1]], 0, killed renamable $x4 :: (store (s8) into @global_i1)
-
 define void @call_test_i1() {
+; ASM32PWR4-LABEL: call_test_i1:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    bl .test_i1
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_i1:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -112(1)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    std 0, 128(1)
+; ASM64PWR4-NEXT:    bl .test_i1
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 112
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   call void @test_i1(i1 1)
   ret void
 }
-; CHECK-LABEL: name: call_test_i1
-
-; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT: $r3 = LI 1
-; 32BIT: BL_NOP <mcsymbol .test_i1>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit $r2, implicit-def $r1
-; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT: $x3 = LI8 1
-; 64BIT: BL8_NOP <mcsymbol .test_i1>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit $x2, implicit-def $r1
-; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
 
 define void @test_i1zext(i1 zeroext %b) {
+; ASM32PWR4-LABEL: test_i1zext:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    lwz 4, L..C0(2) # @global_i1
+; ASM32PWR4-NEXT:    stb 3, 0(4)
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_i1zext:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    ld 4, L..C0(2) # @global_i1
+; ASM64PWR4-NEXT:    stb 3, 0(4)
+; ASM64PWR4-NEXT:    blr
   entry:
     %frombool = zext i1 %b to i8
     store i8 %frombool, ptr @global_i1, align 1
     ret void
   }
 
-; 32BIT:       liveins:
-; 32BIT-NEXT:  - { reg: '$r3', virtual-reg: '' }
-; 32BIT:       body:             |
-; 32BIT-NEXT:    bb.0.entry:
-; 32BIT-NEXT:      liveins: $r3
-; CHECK-NOT:       RLWINM
-; 32BIT:           STB killed renamable $r3, 0, killed renamable $r4 :: (store (s8) into @global_i1)
-
-; 64BIT:       liveins:
-; 64BIT-NEXT:  - { reg: '$x3', virtual-reg: '' }
-; 64BIT:       body:             |
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:      liveins: $x3
-; CHECK-NOT:       RLWINM
-; 64BIT:           STB8 killed renamable $x3, 0, killed renamable $x4 :: (store (s8) into @global_i1)
-
 define i32 @test_ints(i32 signext %a, i32 zeroext %b, i32 zeroext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h) {
+; CHECKASM-LABEL: test_ints:
+; CHECKASM:       # %bb.0: # %entry
+; CHECKASM-NEXT:    add 3, 3, 4
+; CHECKASM-NEXT:    add 3, 3, 5
+; CHECKASM-NEXT:    add 3, 3, 6
+; CHECKASM-NEXT:    add 3, 3, 7
+; CHECKASM-NEXT:    add 3, 3, 8
+; CHECKASM-NEXT:    add 3, 3, 9
+; CHECKASM-NEXT:    add 3, 3, 10
+; CHECKASM-NEXT:    blr
 entry:
     %add = add i32 %a, %b
     %add1 = add i32 %add, %c
@@ -203,84 +214,109 @@ entry:
     ret i32 %add6
 }
 
-; CHECK-LABEL: name: test_ints
-
-; 32BIT:       liveins:
-; 32BIT-NEXT:  - { reg: '$r3', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r4', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r5', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r6', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r7', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r8', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r9', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r10', virtual-reg: '' }
-; 32BIT:       body:             |
-; 32BIT-NEXT:    bb.0.entry:
-; 32BIT-NEXT:      liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-
-; 64BIT:       liveins:
-; 64BIT-NEXT:  - { reg: '$x3', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x4', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x5', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x6', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x7', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x8', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x9', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x10', virtual-reg: '' }
-; 64BIT:       body:             |
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:      liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
-
 define void @call_test_ints() {
+; ASM32PWR4-LABEL: call_test_ints:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    li 4, 1
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    lis 5, -32768
+; ASM32PWR4-NEXT:    lis 6, -32768
+; ASM32PWR4-NEXT:    li 7, 1
+; ASM32PWR4-NEXT:    li 8, 1
+; ASM32PWR4-NEXT:    li 9, 1
+; ASM32PWR4-NEXT:    li 10, 1
+; ASM32PWR4-NEXT:    bl .test_ints
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_ints:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -112(1)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    li 4, 1
+; ASM64PWR4-NEXT:    std 0, 128(1)
+; ASM64PWR4-NEXT:    rldic 5, 3, 31, 32
+; ASM64PWR4-NEXT:    lis 6, -32768
+; ASM64PWR4-NEXT:    li 7, 1
+; ASM64PWR4-NEXT:    li 8, 1
+; ASM64PWR4-NEXT:    li 9, 1
+; ASM64PWR4-NEXT:    li 10, 1
+; ASM64PWR4-NEXT:    bl .test_ints
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 112
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   call i32 @test_ints(i32 signext 1, i32 zeroext 1, i32 zeroext 2147483648, i32 signext -2147483648, i32 signext 1, i32 signext 1, i32 signext 1, i32 signext 1)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_ints
-
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT: renamable $x3 = LI8 1
-; 64BIT: renamable $x5 = RLDIC killed renamable $x3, 31, 32
-; 64BIT: $x3 = LI8 1
-; 64BIT: $x4 = LI8 1
-; 64BIT: $x6 = LIS8 32768
-; 64BIT: $x7 = LI8 1
-; 64BIT: $x8 = LI8 1
-; 64BIT: $x9 = LI8 1
-; 64BIT: $x10 = LI8 1
-; 64BIT:  BL8_NOP <mcsymbol .test_ints>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
-; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 define void @call_test_i64() {
+; ASM32PWR4-LABEL: call_test_i64:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    li 3, 0
+; ASM32PWR4-NEXT:    li 4, 1
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    li 5, 0
+; ASM32PWR4-NEXT:    li 6, 2
+; ASM32PWR4-NEXT:    li 7, 0
+; ASM32PWR4-NEXT:    li 8, 3
+; ASM32PWR4-NEXT:    li 9, 0
+; ASM32PWR4-NEXT:    li 10, 4
+; ASM32PWR4-NEXT:    bl .test_i64
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_i64:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -112(1)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    std 0, 128(1)
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    bl .test_i64
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 112
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   call i64 @test_i64(i64 1, i64 2, i64 3, i64 4)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_i64
-
-; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT: $r3 = LI 0
-; 32BIT: $r4 = LI 1
-; 32BIT: $r5 = LI 0
-; 32BIT: $r6 = LI 2
-; 32BIT: $r7 = LI 0
-; 32BIT: $r8 = LI 3
-; 32BIT: $r9 = LI 0
-; 32BIT: $r10 = LI 4
-; 32BIT: BL_NOP <mcsymbol .test_i64>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $r2, implicit-def $r1
-; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT: $x3 = LI8 1
-; 64BIT: $x4 = LI8 2
-; 64BIT: $x5 = LI8 3
-; 64BIT: $x6 = LI8 4
-; 64BIT: BL8_NOP <mcsymbol .test_i64>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1
-; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 define i64 @test_i64(i64 %a, i64 %b, i64 %c, i64 %d) {
+; ASM32PWR4-LABEL: test_i64:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    addc 4, 4, 6
+; ASM32PWR4-NEXT:    adde 3, 3, 5
+; ASM32PWR4-NEXT:    addc 4, 4, 8
+; ASM32PWR4-NEXT:    adde 3, 3, 7
+; ASM32PWR4-NEXT:    addc 4, 4, 10
+; ASM32PWR4-NEXT:    adde 3, 3, 9
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_i64:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    add 3, 3, 4
+; ASM64PWR4-NEXT:    add 3, 3, 5
+; ASM64PWR4-NEXT:    add 3, 3, 6
+; ASM64PWR4-NEXT:    blr
 entry:
   %add = add nsw i64 %a, %b
   %add1 = add nsw i64 %add, %c
@@ -288,31 +324,36 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: name: test_i64
-
-; 32BIT:       liveins:
-; 32BIT-NEXT:  - { reg: '$r3', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r4', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r5', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r6', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r7', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r8', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r9', virtual-reg: '' }
-; 32BIT-NEXT:  - { reg: '$r10', virtual-reg: '' }
-; 32BIT:       body:             |
-; 32BIT-NEXT:    bb.0.entry:
-; 32BIT-NEXT:      liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-
-; 64BIT:       liveins:
-; 64BIT-NEXT:  - { reg: '$x3', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x4', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x5', virtual-reg: '' }
-; 64BIT-NEXT:  - { reg: '$x6', virtual-reg: '' }
-; 64BIT:       body:             |
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:      liveins: $x3, $x4, $x5, $x6
-
 define void @call_test_int_ptr() {
+; ASM32PWR4-LABEL: call_test_int_ptr:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    li 3, 0
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    stw 3, 60(1)
+; ASM32PWR4-NEXT:    addi 3, 1, 60
+; ASM32PWR4-NEXT:    bl .test_int_ptr
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_int_ptr:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    li 3, 0
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    stw 3, 124(1)
+; ASM64PWR4-NEXT:    addi 3, 1, 124
+; ASM64PWR4-NEXT:    bl .test_int_ptr
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %b = alloca i32, align 4
   store i32 0, ptr %b, align 4
@@ -320,43 +361,56 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: name: call_test_int_ptr
-
-; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT: renamable $r3 = ADDI %stack.0.b, 0
-; 32BIT: BL_NOP <mcsymbol .test_int_ptr>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1
-; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT: renamable $x3 = ADDI8 %stack.0.b, 0
-; 64BIT: BL8_NOP <mcsymbol .test_int_ptr>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 define void @test_int_ptr(ptr %a) {
+; ASM32PWR4-LABEL: test_int_ptr:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    stw 3, -8(1)
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_int_ptr:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    std 3, -8(1)
+; ASM64PWR4-NEXT:    blr
 entry:
   %a.addr = alloca ptr, align 8
   store ptr %a, ptr %a.addr, align 8
   ret void
 }
 
-; CHECK-LABEL: name: test_int_ptr
-
-; 32BIT:       liveins:
-; 32BIT-NEXT:  - { reg: '$r3', virtual-reg: '' }
-; 32BIT:       body:             |
-; 32BIT-NEXT:    bb.0.entry:
-; 32BIT-NEXT:      liveins: $r3
-; 32BIT:           STW killed renamable $r3, 0, %stack.0.a.addr :: (store (s32) into %ir.a.addr, align 8)
-
-; 64BIT:       liveins:
-; 64BIT-NEXT:  - { reg: '$x3', virtual-reg: '' }
-; 64BIT:       body:             |
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:      liveins: $x3
-; 64BIT:           STD killed renamable $x3, 0, %stack.0.a.addr :: (store (s64) into %ir.a.addr)
-
-
 define i32 @caller(i32 %i)  {
+; ASM32PWR4-LABEL: caller:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    stw 3, 60(1)
+; ASM32PWR4-NEXT:    cntlzw 3, 3
+; ASM32PWR4-NEXT:    not 3, 3
+; ASM32PWR4-NEXT:    rlwinm 3, 3, 27, 31, 31
+; ASM32PWR4-NEXT:    stb 3, 59(1)
+; ASM32PWR4-NEXT:    bl .call_test_bool[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: caller:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    stw 3, 124(1)
+; ASM64PWR4-NEXT:    cntlzw 3, 3
+; ASM64PWR4-NEXT:    srwi 3, 3, 5
+; ASM64PWR4-NEXT:    xori 3, 3, 1
+; ASM64PWR4-NEXT:    stb 3, 123(1)
+; ASM64PWR4-NEXT:    bl .call_test_bool[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %i.addr = alloca i32, align 4
   %b = alloca i8, align 1
@@ -373,187 +427,147 @@ entry:
 
 declare i32 @call_test_bool(i1 zeroext)
 
-; CHECK-LABEL: name:            caller
-
-; 32BIT:        liveins:
-; 32BIT-NEXT:   - { reg: '$r3', virtual-reg: '' }
-; 32BIT:        body:             |
-; 32BIT-NEXT:   bb.0.entry:
-; 32BIT:         liveins: $r3
-; 32BIT:          ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT:          BL_NOP <mcsymbol .call_test_bool[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1, implicit-def $r3
-; 32BIT:          ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT:        liveins:
-; 64BIT-NEXT:   - { reg: '$x3', virtual-reg: '' }
-; 64BIT:        body:             |
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:     liveins: $x3
-; 64BIT:          ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT:          BL8_NOP <mcsymbol .call_test_bool[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def $x3
-; 64BIT:          ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 @f1 = global float 0.000000e+00, align 4
 @d1 = global double 0.000000e+00, align 8
 
 define void @call_test_floats() {
+; ASM32PWR4-LABEL: call_test_floats:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C1(2) # @f1
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    lfs 1, 0(3)
+; ASM32PWR4-NEXT:    fmr 2, 1
+; ASM32PWR4-NEXT:    fmr 3, 1
+; ASM32PWR4-NEXT:    bl .test_floats
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_floats:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -112(1)
+; ASM64PWR4-NEXT:    ld 3, L..C1(2) # @f1
+; ASM64PWR4-NEXT:    std 0, 128(1)
+; ASM64PWR4-NEXT:    lfs 1, 0(3)
+; ASM64PWR4-NEXT:    fmr 2, 1
+; ASM64PWR4-NEXT:    fmr 3, 1
+; ASM64PWR4-NEXT:    bl .test_floats
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 112
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f1, align 4
   call float @test_floats(float %0, float %0, float %0)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_floats{{.*}}
-
-; 32BIT:      renamable $r3 = LWZtoc @f1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f1)
-; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT: $f2 = COPY renamable $f1
-; 32BIT-NEXT: $f3 = COPY renamable $f1
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_floats>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT:      renamable $x3 = LDtoc @f1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f1)
-; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT: $f2 = COPY renamable $f1
-; 64BIT-NEXT: $f3 = COPY renamable $f1
-; 64BIT-NEXT: BL8_NOP <mcsymbol .test_floats>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 define float @test_floats(float %f1, float %f2, float %f3) {
+; CHECKASM-LABEL: test_floats:
+; CHECKASM:       # %bb.0: # %entry
+; CHECKASM-NEXT:    fadds 0, 1, 2
+; CHECKASM-NEXT:    fadds 1, 0, 3
+; CHECKASM-NEXT:    blr
 entry:
   %add = fadd float %f1, %f2
   %add1 = fadd float %add, %f3
   ret float %add1
 }
 
-; CHECK-LABEL: name: test_floats{{.*}}
-
-; CHECK:      liveins:
-; CHECK-NEXT: - { reg: '$f1', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f2', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f3', virtual-reg: '' }
-; CHECK:      body:             |
-; CHECK-NEXT:   bb.0.entry:
-; CHECK-NEXT:     liveins: $f1, $f2, $f3
-
 define void @call_test_fpr_max() {
+; ASM32PWR4-LABEL: call_test_fpr_max:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -128(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C2(2) # @d1
+; ASM32PWR4-NEXT:    stw 0, 136(1)
+; ASM32PWR4-NEXT:    lfd 1, 0(3)
+; ASM32PWR4-NEXT:    fmr 2, 1
+; ASM32PWR4-NEXT:    fmr 3, 1
+; ASM32PWR4-NEXT:    stfd 1, 120(1)
+; ASM32PWR4-NEXT:    stfd 1, 112(1)
+; ASM32PWR4-NEXT:    fmr 4, 1
+; ASM32PWR4-NEXT:    fmr 5, 1
+; ASM32PWR4-NEXT:    stfd 1, 104(1)
+; ASM32PWR4-NEXT:    fmr 6, 1
+; ASM32PWR4-NEXT:    fmr 7, 1
+; ASM32PWR4-NEXT:    stfd 1, 96(1)
+; ASM32PWR4-NEXT:    stfd 1, 88(1)
+; ASM32PWR4-NEXT:    fmr 8, 1
+; ASM32PWR4-NEXT:    fmr 9, 1
+; ASM32PWR4-NEXT:    stfd 1, 80(1)
+; ASM32PWR4-NEXT:    fmr 10, 1
+; ASM32PWR4-NEXT:    fmr 11, 1
+; ASM32PWR4-NEXT:    stfd 1, 72(1)
+; ASM32PWR4-NEXT:    stfd 1, 64(1)
+; ASM32PWR4-NEXT:    fmr 12, 1
+; ASM32PWR4-NEXT:    fmr 13, 1
+; ASM32PWR4-NEXT:    stfd 1, 56(1)
+; ASM32PWR4-NEXT:    bl .test_fpr_max
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 128
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_fpr_max:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -160(1)
+; ASM64PWR4-NEXT:    ld 3, L..C2(2) # @d1
+; ASM64PWR4-NEXT:    std 0, 176(1)
+; ASM64PWR4-NEXT:    lfd 1, 0(3)
+; ASM64PWR4-NEXT:    fmr 2, 1
+; ASM64PWR4-NEXT:    fmr 3, 1
+; ASM64PWR4-NEXT:    stfd 1, 144(1)
+; ASM64PWR4-NEXT:    stfd 1, 136(1)
+; ASM64PWR4-NEXT:    fmr 4, 1
+; ASM64PWR4-NEXT:    fmr 5, 1
+; ASM64PWR4-NEXT:    stfd 1, 128(1)
+; ASM64PWR4-NEXT:    fmr 6, 1
+; ASM64PWR4-NEXT:    fmr 7, 1
+; ASM64PWR4-NEXT:    stfd 1, 120(1)
+; ASM64PWR4-NEXT:    stfd 1, 112(1)
+; ASM64PWR4-NEXT:    fmr 8, 1
+; ASM64PWR4-NEXT:    fmr 9, 1
+; ASM64PWR4-NEXT:    fmr 10, 1
+; ASM64PWR4-NEXT:    fmr 11, 1
+; ASM64PWR4-NEXT:    fmr 12, 1
+; ASM64PWR4-NEXT:    fmr 13, 1
+; ASM64PWR4-NEXT:    bl .test_fpr_max
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 160
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load double, ptr @d1, align 8
   call double @test_fpr_max(double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_fpr_max{{.*}}
-
-; 32BIT:      renamable $r[[REG:[0-9]+]] = LWZtoc @d1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r[[REG]] :: (dereferenceable load (s64) from @d1)
-; 32BIT-NEXT: ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:  STFD renamable $f1, 56, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 64, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 72, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 80, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 88, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 96, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 104, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 112, $r1 :: (store (s64))
-; 32BIT-DAG:  STFD renamable $f1, 120, $r1 :: (store (s64))
-; 32BIT-DAG:  $f2 = COPY renamable $f1
-; 32BIT-DAG:  $f3 = COPY renamable $f1
-; 32BIT-DAG:  $f4 = COPY renamable $f1
-; 32BIT-DAG:  $f5 = COPY renamable $f1
-; 32BIT-DAG:  $f6 = COPY renamable $f1
-; 32BIT-DAG:  $f7 = COPY renamable $f1
-; 32BIT-DAG:  $f8 = COPY renamable $f1
-; 32BIT-DAG:  $f9 = COPY renamable $f1
-; 32BIT-DAG:  $f10 = COPY renamable $f1
-; 32BIT-DAG:  $f11 = COPY renamable $f1
-; 32BIT-DAG:  $f12 = COPY renamable $f1
-; 32BIT-DAG:  $f13 = COPY renamable $f1
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_fpr_max>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1
-; 32BIT-NEXT: ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_fpr_max:
-
-; ASM32PWR4:       stwu 1, -128(1)
-; ASM32PWR4-NEXT:  lwz [[REG:[0-9]+]], L..C2(2)
-; ASM32PWR4-NEXT:  stw 0, 136(1)
-; ASM32PWR4-NEXT:  lfd 1, 0([[REG]])
-; ASM32PWR4-DAG:   stfd 1, 56(1)
-; ASM32PWR4-DAG:   stfd 1, 64(1)
-; ASM32PWR4-DAG:   stfd 1, 72(1)
-; ASM32PWR4-DAG:   stfd 1, 80(1)
-; ASM32PWR4-DAG:   stfd 1, 88(1)
-; ASM32PWR4-DAG:   stfd 1, 96(1)
-; ASM32PWR4-DAG:   stfd 1, 104(1)
-; ASM32PWR4-DAG:   stfd 1, 112(1)
-; ASM32PWR4-DAG:   stfd 1, 120(1)
-; ASM32PWR4-DAG:   fmr 2, 1
-; ASM32PWR4-DAG:   fmr 3, 1
-; ASM32PWR4-DAG:   fmr 4, 1
-; ASM32PWR4-DAG:   fmr 5, 1
-; ASM32PWR4-DAG:   fmr 6, 1
-; ASM32PWR4-DAG:   fmr 7, 1
-; ASM32PWR4-DAG:   fmr 8, 1
-; ASM32PWR4-DAG:   fmr 9, 1
-; ASM32PWR4-DAG:   fmr 10, 1
-; ASM32PWR4-DAG:   fmr 11, 1
-; ASM32PWR4-DAG:   fmr 12, 1
-; ASM32PWR4-DAG:   fmr 13, 1
-; ASM32PWR4-NEXT:  bl .test_fpr_max
-; ASM32PWR4-NEXT:  nop
-; ASM32PWR4-NEXT:  addi 1, 1, 128
-
-; 64BIT:      renamable $x[[REGD1ADDR:[0-9]+]] = LDtoc @d1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $f1 = LFD 0, killed renamable $x[[REGD1ADDR:[0-9]+]] :: (dereferenceable load (s64) from @d1)
-; 64BIT-NEXT: ADJCALLSTACKDOWN 152, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:  STFD renamable $f1, 112, $x1 :: (store (s64))
-; 64BIT-DAG:  STFD renamable $f1, 120, $x1 :: (store (s64))
-; 64BIT-DAG:  STFD renamable $f1, 128, $x1 :: (store (s64))
-; 64BIT-DAG:  STFD renamable $f1, 136, $x1 :: (store (s64))
-; 64BIT-DAG:  STFD renamable $f1, 144, $x1 :: (store (s64))
-; 64BIT-DAG:  $f2 = COPY renamable $f1
-; 64BIT-DAG:  $f3 = COPY renamable $f1
-; 64BIT-DAG:  $f4 = COPY renamable $f1
-; 64BIT-DAG:  $f5 = COPY renamable $f1
-; 64BIT-DAG:  $f6 = COPY renamable $f1
-; 64BIT-DAG:  $f7 = COPY renamable $f1
-; 64BIT-DAG:  $f8 = COPY renamable $f1
-; 64BIT-DAG:  $f9 = COPY renamable $f1
-; 64BIT-DAG:  $f10 = COPY renamable $f1
-; 64BIT-DAG:  $f11 = COPY renamable $f1
-; 64BIT-DAG:  $f12 = COPY renamable $f1
-; 64BIT-DAG:  $f13 = COPY renamable $f1
-; 64BIT-NEXT: BL8_NOP <mcsymbol .test_fpr_max>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 152, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64PWR4:       stdu 1, -160(1)
-; ASM64PWR4-NEXT:  ld [[REG:[0-9]+]], L..C2(2)
-; ASM64PWR4-NEXT:  std 0, 176(1)
-; ASM64PWR4-NEXT:  lfd 1, 0([[REG]])
-; ASM64PWR4-DAG:   stfd 1, 112(1)
-; ASM64PWR4-DAG:   stfd 1, 120(1)
-; ASM64PWR4-DAG:   stfd 1, 128(1)
-; ASM64PWR4-DAG:   stfd 1, 136(1)
-; ASM64PWR4-DAG:   stfd 1, 144(1)
-; ASM64PWR4-DAG:   fmr 2, 1
-; ASM64PWR4-DAG:   fmr 3, 1
-; ASM64PWR4-DAG:   fmr 4, 1
-; ASM64PWR4-DAG:   fmr 5, 1
-; ASM64PWR4-DAG:   fmr 6, 1
-; ASM64PWR4-DAG:   fmr 7, 1
-; ASM64PWR4-DAG:   fmr 8, 1
-; ASM64PWR4-DAG:   fmr 9, 1
-; ASM64PWR4-DAG:   fmr 10, 1
-; ASM64PWR4-DAG:   fmr 11, 1
-; ASM64PWR4-DAG:   fmr 12, 1
-; ASM64PWR4-DAG:   fmr 13, 1
-; ASM64PWR4-NEXT:  bl .test_fpr_max
-; ASM64PWR4-NEXT:  nop
-; ASM64PWR4-NEXT:  addi 1, 1, 160
-
 define double @test_fpr_max(double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13) {
+; CHECKASM-LABEL: test_fpr_max:
+; CHECKASM:       # %bb.0: # %entry
+; CHECKASM-NEXT:    fadd 0, 1, 2
+; CHECKASM-NEXT:    fadd 0, 0, 3
+; CHECKASM-NEXT:    fadd 0, 0, 4
+; CHECKASM-NEXT:    fadd 0, 0, 5
+; CHECKASM-NEXT:    fadd 0, 0, 6
+; CHECKASM-NEXT:    fadd 0, 0, 7
+; CHECKASM-NEXT:    fadd 0, 0, 8
+; CHECKASM-NEXT:    fadd 0, 0, 9
+; CHECKASM-NEXT:    fadd 0, 0, 10
+; CHECKASM-NEXT:    fadd 0, 0, 11
+; CHECKASM-NEXT:    fadd 0, 0, 12
+; CHECKASM-NEXT:    fadd 1, 0, 13
+; CHECKASM-NEXT:    blr
 entry:
   %add = fadd double %d1, %d2
   %add1 = fadd double %add, %d3
@@ -570,27 +584,42 @@ entry:
   ret double %add11
 }
 
-; CHECK-LABEL: name: test_fpr_max{{.*}}
-
-; CHECK:      liveins:
-; CHECK-NEXT: - { reg: '$f1', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f2', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f3', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f4', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f5', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f6', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f7', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f8', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f9', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f10', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f11', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f12', virtual-reg: '' }
-; CHECK-NEXT: - { reg: '$f13', virtual-reg: '' }
-; CHECK:      body:             |
-; CHECK-NEXT:   bb.0.entry:
-; CHECK-NEXT:     liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
-
 define void @call_test_mix() {
+; ASM32PWR4-LABEL: call_test_mix:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C1(2) # @f1
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    li 4, 1
+; ASM32PWR4-NEXT:    li 7, 97
+; ASM32PWR4-NEXT:    lfs 1, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C2(2) # @d1
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    bl .test_mix
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_mix:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -112(1)
+; ASM64PWR4-NEXT:    ld 3, L..C1(2) # @f1
+; ASM64PWR4-NEXT:    std 0, 128(1)
+; ASM64PWR4-NEXT:    li 4, 1
+; ASM64PWR4-NEXT:    li 6, 97
+; ASM64PWR4-NEXT:    lfs 1, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C2(2) # @d1
+; ASM64PWR4-NEXT:    lfd 2, 0(3)
+; ASM64PWR4-NEXT:    bl .test_mix
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 112
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f1, align 4
   %1 = load double, ptr @d1, align 8
@@ -598,29 +627,46 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: name: call_test_mix{{.*}}
-
-; 32BIT:      renamable $r[[REG1:[0-9]+]] = LWZtoc @f1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $r[[REG2:[0-9]+]] = LWZtoc @d1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r[[REG1]] :: (dereferenceable load (s32) from @f1)
-; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r[[REG2]] :: (dereferenceable load (s64) from @d1)
-; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT: $r4 = LI 1
-; 32BIT-NEXT: $r7 = LI 97
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_mix>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $r4, implicit $f2, implicit killed $r7, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT:      renamable $x[[REG1:[0-9]+]] = LDtoc @f1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $x[[REG2:[0-9]+]] = LDtoc @d1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x[[REG1]] :: (dereferenceable load (s32) from @f1)
-; 64BIT-NEXT: renamable $f2 = LFD 0, killed renamable $x[[REG2]] :: (dereferenceable load (s64) from @d1)
-; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT: $x4 = LI8 1
-; 64BIT-NEXT: $x6 = LI8 97
-; 64BIT-NEXT: BL8_NOP <mcsymbol .test_mix>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $x4, implicit $f2, implicit killed $x6, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
 define i32 @test_mix(float %f, i32 signext %i, double %d, i8 signext %c) {
+; ASM32PWR4-LABEL: test_mix:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    lis 3, 17200
+; ASM32PWR4-NEXT:    fadd 1, 1, 2
+; ASM32PWR4-NEXT:    stw 3, -16(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C3(2) # %const.0
+; ASM32PWR4-NEXT:    frsp 1, 1
+; ASM32PWR4-NEXT:    lfs 0, 0(3)
+; ASM32PWR4-NEXT:    clrlwi 3, 7, 24
+; ASM32PWR4-NEXT:    add 3, 4, 3
+; ASM32PWR4-NEXT:    xoris 3, 3, 32768
+; ASM32PWR4-NEXT:    stw 3, -12(1)
+; ASM32PWR4-NEXT:    addi 3, 1, -4
+; ASM32PWR4-NEXT:    lfd 2, -16(1)
+; ASM32PWR4-NEXT:    fsub 0, 2, 0
+; ASM32PWR4-NEXT:    frsp 0, 0
+; ASM32PWR4-NEXT:    fadds 0, 0, 1
+; ASM32PWR4-NEXT:    fctiwz 0, 0
+; ASM32PWR4-NEXT:    stfiwx 0, 0, 3
+; ASM32PWR4-NEXT:    lwz 3, -4(1)
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_mix:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    clrlwi 5, 6, 24
+; ASM64PWR4-NEXT:    fadd 0, 1, 2
+; ASM64PWR4-NEXT:    addi 3, 1, -4
+; ASM64PWR4-NEXT:    frsp 0, 0
+; ASM64PWR4-NEXT:    add 4, 4, 5
+; ASM64PWR4-NEXT:    extsw 4, 4
+; ASM64PWR4-NEXT:    std 4, -16(1)
+; ASM64PWR4-NEXT:    lfd 1, -16(1)
+; ASM64PWR4-NEXT:    fcfid 1, 1
+; ASM64PWR4-NEXT:    frsp 1, 1
+; ASM64PWR4-NEXT:    fadds 0, 1, 0
+; ASM64PWR4-NEXT:    fctiwz 0, 0
+; ASM64PWR4-NEXT:    stfiwx 0, 0, 3
+; ASM64PWR4-NEXT:    lwz 3, -4(1)
+; ASM64PWR4-NEXT:    blr
 entry:
   %conv = fpext float %f to double
   %add = fadd double %conv, %d
@@ -633,28 +679,27 @@ entry:
   ret i32 %conv6
 }
 
-; CHECK-LABEL: name: test_mix{{.*}}
-
-; 32BIT:      liveins:
-; 32BIT-NEXT: - { reg: '$f1', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$f2', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$r7', virtual-reg: '' }
-; 32BIT:      body:             |
-; 32BIT-NEXT:   bb.0.entry:
-; 32BIT-NEXT:     liveins: $f1, $f2, $r4, $r7
-
-; 64BIT:      liveins:
-; 64BIT-NEXT: - { reg: '$f1', virtual-reg: '' }
-; 64BIT-NEXT: - { reg: '$x4', virtual-reg: '' }
-; 64BIT-NEXT: - { reg: '$f2', virtual-reg: '' }
-; 64BIT-NEXT: - { reg: '$x6', virtual-reg: '' }
-; 64BIT:      body:             |
-; 64BIT-NEXT:   bb.0.entry:
-; 64BIT-NEXT:     liveins: $f1, $f2, $x4, $x6
-
-
 define i64 @callee_mixed_ints(i32 %a, i8 signext %b, i32 %c, i16 signext %d, i64 %e) {
+; ASM32PWR4-LABEL: callee_mixed_ints:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    clrlwi 4, 4, 24
+; ASM32PWR4-NEXT:    add 3, 3, 4
+; ASM32PWR4-NEXT:    add 3, 3, 5
+; ASM32PWR4-NEXT:    add 3, 3, 6
+; ASM32PWR4-NEXT:    srawi 5, 3, 31
+; ASM32PWR4-NEXT:    addc 4, 3, 8
+; ASM32PWR4-NEXT:    adde 3, 5, 7
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: callee_mixed_ints:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    clrlwi 4, 4, 24
+; ASM64PWR4-NEXT:    add 3, 3, 4
+; ASM64PWR4-NEXT:    add 3, 3, 5
+; ASM64PWR4-NEXT:    add 3, 3, 6
+; ASM64PWR4-NEXT:    extsw 3, 3
+; ASM64PWR4-NEXT:    add 3, 3, 7
+; ASM64PWR4-NEXT:    blr
 entry:
   %conv = zext i8 %b to i32
   %add = add nsw i32 %a, %conv
@@ -666,30 +711,50 @@ entry:
   ret i64 %add5
   }
 
-; CHECK-LABEL: name:  callee_mixed_ints
-
-; 32BIT:      liveins:
-; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$r5', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$r6', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$r7', virtual-reg: '' }
-; 32BIT-NEXT: - { reg: '$r8', virtual-reg: '' }
-; 32BIT:      body:             |
-; 32BIT-NEXT:  bb.0.entry:
-; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8
-
-; 64BIT:        liveins:
-; 64BIT-NEXT:   - { reg: '$x3', virtual-reg: '' }
-; 64BIT-NEXT:   - { reg: '$x4', virtual-reg: '' }
-; 64BIT-NEXT:   - { reg: '$x5', virtual-reg: '' }
-; 64BIT-NEXT:   - { reg: '$x6', virtual-reg: '' }
-; 64BIT-NEXT:   - { reg: '$x7', virtual-reg: '' }
-; 64BIT:        body:             |
-; 64BIT-NEXT:    bb.0.entry:
-; 64BIT-NEXT:     liveins: $x3, $x4, $x5, $x6, $x7
-
 define void @call_test_vararg() {
+; ASM32PWR4-LABEL: call_test_vararg:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -80(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C1(2) # @f1
+; ASM32PWR4-NEXT:    stw 0, 88(1)
+; ASM32PWR4-NEXT:    lfs 1, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C2(2) # @d1
+; ASM32PWR4-NEXT:    stfd 1, 64(1)
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    li 3, 42
+; ASM32PWR4-NEXT:    stfd 2, 72(1)
+; ASM32PWR4-NEXT:    lwz 4, 64(1)
+; ASM32PWR4-NEXT:    lwz 5, 68(1)
+; ASM32PWR4-NEXT:    lwz 6, 72(1)
+; ASM32PWR4-NEXT:    lwz 7, 76(1)
+; ASM32PWR4-NEXT:    bl .test_vararg[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 80
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_vararg:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C1(2) # @f1
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    lfs 1, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C2(2) # @d1
+; ASM64PWR4-NEXT:    stfd 1, 112(1)
+; ASM64PWR4-NEXT:    lfd 2, 0(3)
+; ASM64PWR4-NEXT:    li 3, 42
+; ASM64PWR4-NEXT:    stfd 2, 120(1)
+; ASM64PWR4-NEXT:    ld 4, 112(1)
+; ASM64PWR4-NEXT:    ld 5, 120(1)
+; ASM64PWR4-NEXT:    bl .test_vararg[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f1, align 4
   %conv = fpext float %0 to double
@@ -700,69 +765,52 @@ entry:
 
 declare void @test_vararg(i32, ...)
 
-; CHECK-LABEL:     name: call_test_vararg
-
-; 32BIT:      renamable $r[[REG:[0-9]+]] = LWZtoc @f1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r[[REG]] :: (dereferenceable load (s32) from @f1)
-; 32BIT-NEXT: renamable $r[[REG:[0-9]+]] = LWZtoc @d1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: STFD renamable $f1, 0, %stack.[[SLOT1:[0-9]+]] :: (store (s64) into %stack.[[SLOT1]])
-; 32BIT-NEXT: renamable $r4 = LWZ 0, %stack.[[SLOT1]] :: (load (s32) from %stack.[[SLOT1]], align 8)
-; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r[[REG]] :: (dereferenceable load (s64) from @d1)
-; 32BIT-NEXT: renamable $r5 = LWZ 4, %stack.[[SLOT1]] :: (load (s32) from %stack.[[SLOT1]] + 4)
-; 32BIT-NEXT: STFD renamable $f2, 0, %stack.[[SLOT2:[0-9]+]] :: (store (s64) into %stack.[[SLOT2]])
-; 32BIT-NEXT: renamable $r6 = LWZ 0, %stack.[[SLOT2]] :: (load (s32) from %stack.[[SLOT2]], align 8)
-; 32BIT-NEXT: renamable $r7 = LWZ 4, %stack.[[SLOT2]] :: (load (s32) from %stack.[[SLOT2]] + 4)
-; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT: $r3 = LI 42
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r5, implicit $f2, implicit $r6, implicit $r7, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_vararg:
-
-; ASM32PWR4:      stwu 1, -80(1)
-; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], L..C1(2)
-; ASM32PWR4-NEXT: stw 0, 88(1)
-; ASM32PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], L..C2(2)
-; ASM32PWR4-NEXT: stfd 1, 64(1)
-; ASM32PWR4-NEXT: lfd 2, 0([[REG]])
-; ASM32PWR4-NEXT: li 3, 42
-; ASM32PWR4-NEXT: stfd 2, 72(1)
-; ASM32PWR4-DAG:  lwz 4, 64(1)
-; ASM32PWR4-DAG:  lwz 5, 68(1)
-; ASM32PWR4-DAG:  lwz 6, 72(1)
-; ASM32PWR4-DAG:  lwz 7, 76(1)
-; ASM32PWR4-NEXT: bl .test_vararg[PR]
-; ASM32PWR4-NEXT: nop
-
-; 64BIT:      renamable $x[[REG:[0-9]+]] = LDtoc @f1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x[[REG]] :: (dereferenceable load (s32) from @f1)
-; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @d1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: STFD renamable $f1, 0, %stack.[[SLOT1:[0-9]+]] :: (store (s64) into %stack.[[SLOT1]])
-; 64BIT-NEXT: renamable $f2 = LFD 0, killed renamable $x[[REG]] :: (dereferenceable load (s64) from @d1)
-; 64BIT-NEXT: renamable $x4 = LD 0, %stack.[[SLOT1]] :: (load (s64) from %stack.[[SLOT1]])
-; 64BIT-NEXT: STFD renamable $f2, 0, %stack.[[SLOT2:[0-9]+]] :: (store (s64) into %stack.[[SLOT2]])
-; 64BIT-NEXT: renamable $x5 = LD 0, %stack.[[SLOT2]] :: (load (s64) from %stack.[[SLOT2]])
-; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT: $x3 = LI8 42
-; 64BIT-NEXT: BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit $f2, implicit $x5, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64PWR4:      stdu 1, -128(1)
-; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], L..C1(2)
-; ASM64PWR4-NEXT: std 0, 144(1)
-; ASM64PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], L..C2(2)
-; ASM64PWR4-NEXT: stfd 1, 112(1)
-; ASM64PWR4-NEXT: lfd 2, 0([[REG]])
-; ASM64PWR4-NEXT: li 3, 42
-; ASM64PWR4-NEXT: stfd 2, 120(1)
-; ASM64PWR4-NEXT: ld 4, 112(1)
-; ASM64PWR4-NEXT: ld 5, 120(1)
-; ASM64PWR4-NEXT: bl .test_vararg[PR]
-; ASM64PWR4-NEXT: nop
-
 define void @call_test_vararg2() {
+; ASM32PWR4-LABEL: call_test_vararg2:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -80(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C1(2) # @f1
+; ASM32PWR4-NEXT:    stw 0, 88(1)
+; ASM32PWR4-NEXT:    li 6, 42
+; ASM32PWR4-NEXT:    lfs 1, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C2(2) # @d1
+; ASM32PWR4-NEXT:    stfd 1, 64(1)
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    li 3, 42
+; ASM32PWR4-NEXT:    stfd 2, 72(1)
+; ASM32PWR4-NEXT:    lwz 4, 64(1)
+; ASM32PWR4-NEXT:    lwz 5, 68(1)
+; ASM32PWR4-NEXT:    lwz 7, 72(1)
+; ASM32PWR4-NEXT:    lwz 8, 76(1)
+; ASM32PWR4-NEXT:    bl .test_vararg[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 80
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_vararg2:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C1(2) # @f1
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    li 5, 42
+; ASM64PWR4-NEXT:    lfs 1, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C2(2) # @d1
+; ASM64PWR4-NEXT:    stfd 1, 112(1)
+; ASM64PWR4-NEXT:    lfd 2, 0(3)
+; ASM64PWR4-NEXT:    li 3, 42
+; ASM64PWR4-NEXT:    stfd 2, 120(1)
+; ASM64PWR4-NEXT:    ld 4, 112(1)
+; ASM64PWR4-NEXT:    ld 6, 120(1)
+; ASM64PWR4-NEXT:    bl .test_vararg[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f1, align 4
   %conv = fpext float %0 to double
@@ -771,71 +819,53 @@ entry:
   ret void
 }
 
-; CHECK-LABEL:     name: call_test_vararg2
-
-; 32BIT:      renamable $r[[REG:[0-9]+]] = LWZtoc @f1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r[[REG]] :: (dereferenceable load (s32) from @f1)
-; 32BIT-NEXT: renamable $r[[REG:[0-9]+]] = LWZtoc @d1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: STFD renamable $f1, 0, %stack.[[SLOT1:[0-9]+]] :: (store (s64) into %stack.[[SLOT1]])
-; 32BIT-NEXT: renamable $r4 = LWZ 0, %stack.[[SLOT1]] :: (load (s32) from %stack.[[SLOT1]], align 8)
-; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r[[REG]] :: (dereferenceable load (s64) from @d1)
-; 32BIT-NEXT: renamable $r5 = LWZ 4, %stack.[[SLOT1]] :: (load (s32) from %stack.[[SLOT1]] + 4)
-; 32BIT-NEXT: STFD renamable $f2, 0, %stack.[[SLOT2:[0-9]+]] :: (store (s64) into %stack.[[SLOT2]])
-; 32BIT-NEXT: renamable $r7 = LWZ 0, %stack.[[SLOT2]] :: (load (s32) from %stack.[[SLOT2]], align 8)
-; 32BIT-NEXT: renamable $r8 = LWZ 4, %stack.[[SLOT2]] :: (load (s32) from %stack.[[SLOT2]] + 4)
-; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT: $r3 = LI 42
-; 32BIT-NEXT: $r6 = LI 42
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r5, implicit killed $r6, implicit $f2, implicit $r7, implicit $r8, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; ASM32PWR4:      stwu 1, -80(1)
-; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], L..C1(2)
-; ASM32PWR4-NEXT: stw 0, 88(1)
-; ASM32PWR4-NEXT: li 6, 42
-; ASM32PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], L..C2(2)
-; ASM32PWR4-NEXT: stfd 1, 64(1)
-; ASM32PWR4-NEXT: lfd 2, 0([[REG]])
-; ASM32PWR4-NEXT: li 3, 42
-; ASM32PWR4-NEXT: stfd 2, 72(1)
-; ASM32PWR4-DAG: lwz 4, 64(1)
-; ASM32PWR4-DAG: lwz 5, 68(1)
-; ASM32PWR4-DAG: lwz 7, 72(1)
-; ASM32PWR4-DAG: lwz 8, 76(1)
-; ASM32PWR4-NEXT: bl .test_vararg[PR]
-; ASM32PWR4-NEXT: nop
-
-; 64BIT:      renamable $x[[REG:[0-9]+]] = LDtoc @f1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x[[REG]] :: (dereferenceable load (s32) from @f1)
-; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @d1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: STFD renamable $f1, 0, %stack.[[SLOT1:[0-9]+]] :: (store (s64) into %stack.[[SLOT1]])
-; 64BIT-NEXT: renamable $f2 = LFD 0, killed renamable $x[[REG]] :: (dereferenceable load (s64) from @d1)
-; 64BIT-NEXT: renamable $x4 = LD 0, %stack.[[SLOT1]] :: (load (s64) from %stack.[[SLOT1]])
-; 64BIT-NEXT: STFD renamable $f2, 0, %stack.[[SLOT2:[0-9]+]] :: (store (s64) into %stack.[[SLOT2]])
-; 64BIT-NEXT: renamable $x6 = LD 0, %stack.[[SLOT2]] :: (load (s64) from %stack.[[SLOT2]])
-; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT: $x3 = LI8 42
-; 64BIT-NEXT: $x5 = LI8 42
-; 64BIT-NEXT: BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit killed $x5, implicit $f2, implicit $x6, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64PWR4:      stdu 1, -128(1)
-; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], L..C1(2)
-; ASM64PWR4-NEXT: std 0, 144(1)
-; ASM64PWR4-NEXT: li 5, 42
-; ASM64PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], L..C2(2)
-; ASM64PWR4-NEXT: stfd 1, 112(1)
-; ASM64PWR4-NEXT: lfd 2, 0([[REG]])
-; ASM64PWR4-NEXT: li 3, 42
-; ASM64PWR4-NEXT: stfd 2, 120(1)
-; ASM64PWR4-NEXT: ld 4, 112(1)
-; ASM64PWR4-NEXT: ld 6, 120(1)
-; ASM64PWR4-NEXT: bl .test_vararg[PR]
-; ASM64PWR4-NEXT: nop
-
 define void @call_test_vararg3() {
+; ASM32PWR4-LABEL: call_test_vararg3:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -80(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C1(2) # @f1
+; ASM32PWR4-NEXT:    stw 0, 88(1)
+; ASM32PWR4-NEXT:    li 6, 0
+; ASM32PWR4-NEXT:    li 7, 42
+; ASM32PWR4-NEXT:    lfs 1, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C2(2) # @d1
+; ASM32PWR4-NEXT:    stfd 1, 64(1)
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    li 3, 42
+; ASM32PWR4-NEXT:    stfd 2, 72(1)
+; ASM32PWR4-NEXT:    lwz 4, 64(1)
+; ASM32PWR4-NEXT:    lwz 5, 68(1)
+; ASM32PWR4-NEXT:    lwz 8, 72(1)
+; ASM32PWR4-NEXT:    lwz 9, 76(1)
+; ASM32PWR4-NEXT:    bl .test_vararg[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 80
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_vararg3:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C1(2) # @f1
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    li 5, 42
+; ASM64PWR4-NEXT:    lfs 1, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C2(2) # @d1
+; ASM64PWR4-NEXT:    stfd 1, 112(1)
+; ASM64PWR4-NEXT:    lfd 2, 0(3)
+; ASM64PWR4-NEXT:    li 3, 42
+; ASM64PWR4-NEXT:    stfd 2, 120(1)
+; ASM64PWR4-NEXT:    ld 4, 112(1)
+; ASM64PWR4-NEXT:    ld 6, 120(1)
+; ASM64PWR4-NEXT:    bl .test_vararg[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f1, align 4
   %conv = fpext float %0 to double
@@ -844,118 +874,46 @@ entry:
   ret void
 }
 
-; CHECK-LABEL:     name: call_test_vararg3
-
-; 32BIT:      renamable $r[[REG:[0-9]+]] = LWZtoc @f1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r[[REG]] :: (dereferenceable load (s32) from @f1)
-; 32BIT-NEXT: renamable $r[[REG:[0-9]+]] = LWZtoc @d1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: STFD renamable $f1, 0, %stack.[[SLOT1:[0-9]+]] :: (store (s64) into %stack.[[SLOT1]])
-; 32BIT-NEXT: renamable $r4 = LWZ 0, %stack.[[SLOT1]] :: (load (s32) from %stack.[[SLOT1]], align 8)
-; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r[[REG]] :: (dereferenceable load (s64) from @d1)
-; 32BIT-NEXT: renamable $r5 = LWZ 4, %stack.[[SLOT1]] :: (load (s32) from %stack.[[SLOT1]] + 4)
-; 32BIT-NEXT: STFD renamable $f2, 0, %stack.[[SLOT2:[0-9]+]] :: (store (s64) into %stack.[[SLOT2]])
-; 32BIT-NEXT: renamable $r8 = LWZ 0, %stack.[[SLOT2]] :: (load (s32) from %stack.[[SLOT2]], align 8)
-; 32BIT-NEXT: renamable $r9 = LWZ 4, %stack.[[SLOT2]] :: (load (s32) from %stack.[[SLOT2]] + 4)
-; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT: $r3 = LI 42
-; 32BIT-NEXT: $r6 = LI 0
-; 32BIT-NEXT: $r7 = LI 42
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r5, implicit killed $r6, implicit killed $r7, implicit $f2, implicit $r8, implicit $r9, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; ASM32PWR4:      stwu 1, -80(1)
-; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], L..C1(2)
-; ASM32PWR4-DAG:  li 6, 0
-; ASM32PWR4-DAG:  li 7, 42
-; ASM32PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], L..C2(2)
-; ASM32PWR4-NEXT: stfd 1, 64(1)
-; ASM32PWR4-NEXT: lfd 2, 0([[REG]])
-; ASM32PWR4-NEXT: li 3, 42
-; ASM32PWR4-NEXT: stfd 2, 72(1)
-; ASM32PWR4-DAG:  lwz 4, 64(1)
-; ASM32PWR4-DAG:  lwz 5, 68(1)
-; ASM32PWR4-DAG:  lwz 8, 72(1)
-; ASM32PWR4-DAG:  lwz 9, 76(1)
-; ASM32PWR4-NEXT: bl .test_vararg[PR]
-; ASM32PWR4-NEXT: nop
-
-; 64BIT:      renamable $x[[REG:[0-9]+]] = LDtoc @f1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x[[REG]] :: (dereferenceable load (s32) from @f1)
-; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @d1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: STFD renamable $f1, 0, %stack.[[SLOT1:[0-9]+]] :: (store (s64) into %stack.[[SLOT1]])
-; 64BIT-NEXT: renamable $f2 = LFD 0, killed renamable $x[[REG]] :: (dereferenceable load (s64) from @d1)
-; 64BIT-NEXT: renamable $x4 = LD 0, %stack.[[SLOT1]] :: (load (s64) from %stack.[[SLOT1]])
-; 64BIT-NEXT: STFD renamable $f2, 0, %stack.[[SLOT2:[0-9]+]] :: (store (s64) into %stack.[[SLOT2]])
-; 64BIT-NEXT: renamable $x6 = LD 0, %stack.[[SLOT2]] :: (load (s64) from %stack.[[SLOT2]])
-; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT: $x3 = LI8 42
-; 64BIT-NEXT: $x5 = LI8 42
-; 64BIT-NEXT: BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit killed $x5, implicit $f2, implicit $x6, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64PWR4:      stdu 1, -128(1)
-; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], L..C1(2)
-; ASM64PWR4-NEXT: std 0, 144(1)
-; ASM64PWR4-NEXT: li 5, 42
-; ASM64PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], L..C2(2)
-; ASM64PWR4-NEXT: stfd 1, 112(1)
-; ASM64PWR4-NEXT: lfd 2, 0([[REG]])
-; ASM64PWR4-NEXT: li 3, 42
-; ASM64PWR4-NEXT: stfd 2, 120(1)
-; ASM64PWR4-DAG:  ld 4, 112(1)
-; ASM64PWR4-DAG:  ld 6, 120(1)
-; ASM64PWR4-NEXT: bl .test_vararg[PR]
-; ASM64PWR4-NEXT: nop
-
 define void @call_test_vararg4() {
+; ASM32PWR4-LABEL: call_test_vararg4:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C1(2) # @f1
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    lfs 1, 0(3)
+; ASM32PWR4-NEXT:    li 3, 42
+; ASM32PWR4-NEXT:    stfs 1, 60(1)
+; ASM32PWR4-NEXT:    lwz 4, 60(1)
+; ASM32PWR4-NEXT:    bl .test_vararg[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_vararg4:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C1(2) # @f1
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    lfs 1, 0(3)
+; ASM64PWR4-NEXT:    li 3, 42
+; ASM64PWR4-NEXT:    stfs 1, 124(1)
+; ASM64PWR4-NEXT:    lwz 4, 124(1)
+; ASM64PWR4-NEXT:    bl .test_vararg[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f1, align 4
   call void (i32, ...) @test_vararg(i32 42, float %0)
   ret void
 }
 
-; CHECK-LABEL:     name: call_test_vararg4
-
-; 32BIT:      renamable $r[[REG:[0-9]+]] = LWZtoc @f1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r[[REG]] :: (dereferenceable load (s32) from @f1)
-; 32BIT-NEXT: STFS renamable $f1, 0, %stack.[[SLOT:[0-9]+]] :: (store (s32) into %stack.[[SLOT]])
-; 32BIT-NEXT: renamable $r4 = LWZ 0, %stack.[[SLOT]] :: (load (s32) from %stack.[[SLOT]])
-; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT: $r3 = LI 42
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_vararg[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; ASM32PWR4:      stwu 1, -64(1)
-; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], L..C1(2)
-; ASM32PWR4-NEXT: stw 0, 72(1)
-; ASM32PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM32PWR4-NEXT: li 3, 42
-; ASM32PWR4-NEXT: stfs 1, 60(1)
-; ASM32PWR4-NEXT: lwz 4, 60(1)
-; ASM32PWR4-NEXT: bl .test_vararg[PR]
-; ASM32PWR4-NEXT: nop
-
-; 64BIT:      renamable $x[[REG:[0-9]+]] = LDtoc @f1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x[[REG]] :: (dereferenceable load (s32) from @f1)
-; 64BIT-NEXT: STFS renamable $f1, 0, %stack.[[SLOT:[0-9]+]] :: (store (s32) into %stack.[[SLOT]])
-; 64BIT-NEXT: renamable $x4 = LWZ8 0, %stack.[[SLOT]] :: (load (s32) from %stack.[[SLOT]])
-; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT: $x3 = LI8 42
-; 64BIT-NEXT: BL8_NOP <mcsymbol .test_vararg[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x4, implicit $x2, implicit-def $r1
-; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64PWR4:      stdu 1, -128(1)
-; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], L..C1(2)
-; ASM64PWR4-NEXT: std 0, 144(1)
-; ASM64PWR4-NEXT: lfs 1, 0([[REG]])
-; ASM64PWR4-NEXT: li 3, 42
-; ASM64PWR4-NEXT: stfs 1, 124(1)
-; ASM64PWR4-NEXT: lwz 4, 124(1)
-; ASM64PWR4-NEXT: bl .test_vararg[PR]
-; ASM64PWR4-NEXT: nop
-
 @c = common global i8 0, align 1
 @si = common global i16 0, align 2
 @i = common global i32 0, align 4
@@ -965,6 +923,73 @@ entry:
 
 ; Basic saving of integral type arguments to the parameter save area.
 define void @call_test_stackarg_int() {
+; ASM32PWR4-LABEL: call_test_stackarg_int:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -80(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C4(2) # @si
+; ASM32PWR4-NEXT:    stw 0, 88(1)
+; ASM32PWR4-NEXT:    lwz 4, L..C5(2) # @i
+; ASM32PWR4-NEXT:    li 6, 4
+; ASM32PWR4-NEXT:    li 8, 6
+; ASM32PWR4-NEXT:    li 9, 7
+; ASM32PWR4-NEXT:    li 10, 8
+; ASM32PWR4-NEXT:    lha 7, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C6(2) # @c
+; ASM32PWR4-NEXT:    lbz 11, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C7(2) # @lli
+; ASM32PWR4-NEXT:    lwz 5, 0(4)
+; ASM32PWR4-NEXT:    lwz 4, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, 4(3)
+; ASM32PWR4-NEXT:    stw 5, 76(1)
+; ASM32PWR4-NEXT:    stw 3, 72(1)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    stw 4, 68(1)
+; ASM32PWR4-NEXT:    li 4, 2
+; ASM32PWR4-NEXT:    stw 5, 64(1)
+; ASM32PWR4-NEXT:    li 5, 3
+; ASM32PWR4-NEXT:    stw 7, 60(1)
+; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    stw 11, 56(1)
+; ASM32PWR4-NEXT:    bl .test_stackarg_int[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 80
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_stackarg_int:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -160(1)
+; ASM64PWR4-NEXT:    ld 3, L..C3(2) # @si
+; ASM64PWR4-NEXT:    std 0, 176(1)
+; ASM64PWR4-NEXT:    ld 4, L..C4(2) # @i
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    li 8, 6
+; ASM64PWR4-NEXT:    li 9, 7
+; ASM64PWR4-NEXT:    li 10, 8
+; ASM64PWR4-NEXT:    lha 7, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C5(2) # @c
+; ASM64PWR4-NEXT:    lbz 11, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C6(2) # @lli
+; ASM64PWR4-NEXT:    lwz 5, 0(4)
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    ld 3, 0(3)
+; ASM64PWR4-NEXT:    std 5, 144(1)
+; ASM64PWR4-NEXT:    std 3, 136(1)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    std 5, 128(1)
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    std 7, 120(1)
+; ASM64PWR4-NEXT:    li 7, 5
+; ASM64PWR4-NEXT:    std 11, 112(1)
+; ASM64PWR4-NEXT:    bl .test_stackarg_int[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 160
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load i8, ptr @c, align 1
   %1 = load i16, ptr @si, align 2
@@ -977,121 +1002,60 @@ entry:
 
 declare void @test_stackarg_int(i32, i32, i32, i32, i32, i32, i32, i32, i8 zeroext, i16 signext, i32, i64, i32)
 
-; CHECK-LABEL:     name: call_test_stackarg_int{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT-DAG:  ADJCALLSTACKDOWN 80, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:  $r3 = LI 1
-; 32BIT-DAG:  $r4 = LI 2
-; 32BIT-DAG:  $r5 = LI 3
-; 32BIT-DAG:  $r6 = LI 4
-; 32BIT-DAG:  $r7 = LI 5
-; 32BIT-DAG:  $r8 = LI 6
-; 32BIT-DAG:  $r9 = LI 7
-; 32BIT-DAG:  $r10 = LI 8
-; 32BIT-DAG:  renamable $r[[REGCADDR:[0-9]+]] = LWZtoc @c, $r2 :: (load (s32) from got)
-; 32BIT-DAG:  renamable $r[[REGC:[0-9]+]] = LBZ 0, killed renamable $r[[REGCADDR]] :: (dereferenceable load (s8) from @c)
-; 32BIT-DAG:  STW killed renamable $r[[REGC]], 56, $r1 :: (store (s32))
-; 32BIT-DAG:  renamable $r[[REGSIADDR:[0-9]+]] = LWZtoc @si, $r2 :: (load (s32) from got)
-; 32BIT-DAG:  renamable $r[[REGSI:[0-9]+]] = LHA 0, killed renamable $r[[REGSIADDR]] :: (dereferenceable load (s16) from @si)
-; 32BIT-DAG:  STW killed renamable $r[[REGSI]], 60, $r1 :: (store (s32))
-; 32BIT-DAG:  renamable $r[[REGIADDR:[0-9]+]] = LWZtoc @i, $r2 :: (load (s32) from got)
-; 32BIT-DAG:  renamable $r[[REGI:[0-9]+]] = LWZ 0, killed renamable $r[[REGIADDR]] :: (dereferenceable load (s32) from @i)
-; 32BIT-DAG:  STW killed renamable $r[[REGI]], 64, $r1 :: (store (s32))
-; 32BIT-DAG:  renamable $r[[REGLLIADDR:[0-9]+]] = LWZtoc @lli, $r2 :: (load (s32) from got)
-; 32BIT-DAG:  renamable $r[[REGLLI1:[0-9]+]] = LWZ 0, renamable $r[[REGLLIADDR]] :: (dereferenceable load (s32) from @lli, align 8)
-; 32BIT-DAG:  STW killed renamable $r[[REGLLI1]], 68, $r1 :: (store (s32))
-; 32BIT-DAG:  renamable $r[[REGLLI2:[0-9]+]] = LWZ 4, killed renamable $r[[REGLLIADDR]] :: (dereferenceable load (s32) from @lli + 4, basealign 8)
-; 32BIT-DAG:  STW killed renamable $r[[REGLLI2]], 72, $r1 :: (store (s32))
-; 32BIT-DAG:  STW renamable $r[[REGI]], 76, $r1 :: (store (s32))
-; 32BIT-NEXT: BL_NOP <mcsymbol .test_stackarg_int[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT: ADJCALLSTACKUP 80, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_stackarg_int:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32PWR4:       stwu 1, -80(1)
-; ASM32PWR4-DAG:   li 3, 1
-; ASM32PWR4-DAG:   li 4, 2
-; ASM32PWR4-DAG:   li 5, 3
-; ASM32PWR4-DAG:   li 6, 4
-; ASM32PWR4-DAG:   li 7, 5
-; ASM32PWR4-DAG:   li 8, 6
-; ASM32PWR4-DAG:   li 9, 7
-; ASM32PWR4-DAG:   li 10, 8
-; ASM32PWR4-DAG:   lwz [[REGCADDR:[0-9]+]], L..C6(2)
-; ASM32PWR4-DAG:   lbz [[REGC:[0-9]+]], 0([[REGCADDR]])
-; ASM32PWR4-DAG:   stw [[REGC]], 56(1)
-; ASM32PWR4-DAG:   lwz [[REGSIADDR:[0-9]+]], L..C4(2)
-; ASM32PWR4-DAG:   lha [[REGSI:[0-9]+]], 0([[REGSIADDR]])
-; ASM32PWR4-DAG:   stw [[REGSI]], 60(1)
-; ASM32PWR4-DAG:   lwz [[REGIADDR:[0-9]+]], L..C5(2)
-; ASM32PWR4-DAG:   lwz [[REGI:[0-9]+]], 0([[REGIADDR]])
-; ASM32PWR4-DAG:   stw [[REGI]], 64(1)
-; ASM32PWR4-DAG:   lwz [[REGLLIADDR:[0-9]+]], L..C7(2)
-; ASM32PWR4-DAG:   lwz [[REGLLI1:[0-9]+]], 0([[REGLLIADDR]])
-; ASM32PWR4-DAG:   stw [[REGLLI1]], 68(1)
-; ASM32PWR4-DAG:   lwz [[REGLLI2:[0-9]+]], 4([[REGLLIADDR]])
-; ASM32PWR4-DAG:   stw [[REGLLI2]], 72(1)
-; ASM32PWR4-DAG:   stw [[REGI]], 76(1)
-; ASM32PWR4-NEXT:  bl .test_stackarg_int[PR]
-; ASM32PWR4-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT-DAG:   ADJCALLSTACKDOWN 152, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   $x9 = LI8 7
-; 64BIT-DAG:   $x10 = LI8 8
-; 64BIT-DAG:   renamable $x[[REGCADDR:[0-9]+]] = LDtoc @c, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGC:[0-9]+]] = LBZ8 0, killed renamable $x[[REGCADDR]] :: (dereferenceable load (s8) from @c)
-; 64BIT-DAG:   STD killed renamable $x[[REGC]], 112, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGSIADDR:[0-9]+]] = LDtoc @si, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGSI:[0-9]+]] = LHA8 0, killed renamable $x[[REGSIADDR]] :: (dereferenceable load (s16) from @si)
-; 64BIT-DAG:   STD killed renamable $x[[REGSI]], 120, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGIADDR:[0-9]+]] = LDtoc @i, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGI:[0-9]+]] = LWZ8 0, killed renamable $x[[REGIADDR]] :: (dereferenceable load (s32) from @i)
-; 64BIT-DAG:   STD killed renamable $x[[REGI]], 128, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGLLIADDR:[0-9]+]] = LDtoc @lli, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGLLI:[0-9]+]] = LD 0, killed renamable $x[[REGLLIADDR]] :: (dereferenceable load (s64) from @lli)
-; 64BIT-DAG:   STD killed renamable $x[[REGLLI]], 136, $x1 :: (store (s64))
-; 64BIT-DAG:   STD renamable $x[[REGI]], 144, $x1 :: (store (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_stackarg_int[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 152, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64PWR4-DAG:   stdu 1, -160(1)
-; ASM64PWR4-DAG:   li 3, 1
-; ASM64PWR4-DAG:   li 4, 2
-; ASM64PWR4-DAG:   li 5, 3
-; ASM64PWR4-DAG:   li 6, 4
-; ASM64PWR4-DAG:   li 7, 5
-; ASM64PWR4-DAG:   li 8, 6
-; ASM64PWR4-DAG:   li 9, 7
-; ASM64PWR4-DAG:   li 10, 8
-; ASM64PWR4-DAG:   ld [[REGCADDR:[0-9]+]], L..C5(2)
-; ASM64PWR4-DAG:   lbz [[REGC:[0-9]+]], 0([[REGCADDR]])
-; ASM64PWR4-DAG:   std [[REGC]], 112(1)
-; ASM64PWR4-DAG:   ld [[REGSIADDR:[0-9]+]], L..C3(2)
-; ASM64PWR4-DAG:   lha [[REGSI:[0-9]+]], 0([[REGSIADDR]])
-; ASM64PWR4-DAG:   std [[REGSI]], 120(1)
-; ASM64PWR4-DAG:   ld [[REGIADDR:[0-9]+]], L..C4(2)
-; ASM64PWR4-DAG:   lwz [[REGI:[0-9]+]], 0([[REGIADDR]])
-; ASM64PWR4-DAG:   std [[REGI]], 128(1)
-; ASM64PWR4-DAG:   ld [[REGLLIADDR:[0-9]+]], L..C6(2)
-; ASM64PWR4-DAG:   ld [[REGLLI:[0-9]+]], 0([[REGLLIADDR]])
-; ASM64PWR4-DAG:   std [[REGLLI]], 136(1)
-; ASM64PWR4-DAG:   std [[REGI]], 144(1)
-; ASM64PWR4-NEXT:  bl .test_stackarg_int[PR]
-; ASM64PWR4-NEXT:  nop
-
 ; Basic saving of floating point type arguments to the parameter save area.
 ; The float and double arguments will pass in both fpr as well as parameter save area.
 define void @call_test_stackarg_float() {
+; ASM32PWR4-LABEL: call_test_stackarg_float:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -80(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C8(2) # @f
+; ASM32PWR4-NEXT:    stw 0, 88(1)
+; ASM32PWR4-NEXT:    li 4, 2
+; ASM32PWR4-NEXT:    li 5, 3
+; ASM32PWR4-NEXT:    li 6, 4
+; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    lfs 1, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C9(2) # @d
+; ASM32PWR4-NEXT:    li 8, 6
+; ASM32PWR4-NEXT:    li 9, 7
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    li 10, 8
+; ASM32PWR4-NEXT:    stfd 2, 60(1)
+; ASM32PWR4-NEXT:    stfs 1, 56(1)
+; ASM32PWR4-NEXT:    bl .test_stackarg_float[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 80
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_stackarg_float:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C7(2) # @f
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    li 7, 5
+; ASM64PWR4-NEXT:    lfs 1, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C8(2) # @d
+; ASM64PWR4-NEXT:    li 8, 6
+; ASM64PWR4-NEXT:    li 9, 7
+; ASM64PWR4-NEXT:    lfd 2, 0(3)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    li 10, 8
+; ASM64PWR4-NEXT:    stfd 2, 120(1)
+; ASM64PWR4-NEXT:    stfs 1, 112(1)
+; ASM64PWR4-NEXT:    bl .test_stackarg_float[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f, align 4
   %1 = load double, ptr @d, align 8
@@ -1101,89 +1065,51 @@ entry:
 
 declare void @test_stackarg_float(i32, i32, i32, i32, i32, i32, i32, i32, float, double)
 
-; CHECK-LABEL:     name:            call_test_stackarg_float
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT-DAG:   ADJCALLSTACKDOWN 68, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   $r9 = LI 7
-; 32BIT-DAG:   $r10 = LI 8
-; 32BIT-DAG:   renamable $r[[REGF:[0-9]+]] = LWZtoc @f, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f1 = LFS 0, killed renamable $r[[REGF]] :: (dereferenceable load (s32) from @f)
-; 32BIT-DAG:   renamable $r[[REGD:[0-9]+]] = LWZtoc @d, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f2 = LFD 0, killed renamable $r[[REGD]] :: (dereferenceable load (s64) from @d)
-; 32BIT-DAG:   STFS renamable $f1, 56, $r1 :: (store (s32))
-; 32BIT-DAG:   STFD renamable $f2, 60, $r1 :: (store (s64))
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_stackarg_float[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 68, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_stackarg_float:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32PWR4:      stwu 1, -80(1)
-; ASM32PWR4-DAG:  li 3, 1
-; ASM32PWR4-DAG:  li 4, 2
-; ASM32PWR4-DAG:  li 5, 3
-; ASM32PWR4-DAG:  li 6, 4
-; ASM32PWR4-DAG:  li 7, 5
-; ASM32PWR4-DAG:  li 8, 6
-; ASM32PWR4-DAG:  li 9, 7
-; ASM32PWR4-DAG:  li 10, 8
-; ASM32PWR4-DAG:  lwz [[REGF:[0-9]+]], L..C8(2)
-; ASM32PWR4-DAG:  lfs 1, 0([[REGF]])
-; ASM32PWR4-DAG:  lwz [[REGD:[0-9]+]], L..C9(2)
-; ASM32PWR4-DAG:  lfd 2, 0([[REGD:[0-9]+]])
-; ASM32PWR4-DAG:  stfs 1, 56(1)
-; ASM32PWR4-DAG:  stfd 2, 60(1)
-; ASM32PWR4-NEXT: bl .test_stackarg_float[PR]
-; ASM32PWR4-NEXT: nop
-; ASM32PWR4-NEXT: addi 1, 1, 80
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT-DAG:   ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   $x9 = LI8 7
-; 64BIT-DAG:   $x10 = LI8 8
-; 64BIT-DAG:   renamable $x[[REGF:[0-9]+]] = LDtoc @f, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $f1 = LFS 0, killed renamable $x[[REGF]] :: (dereferenceable load (s32) from @f)
-; 64BIT-DAG:   renamable $x[[REGD:[0-9]+]] = LDtoc @d, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $f2 = LFD 0, killed renamable $x[[REGD]] :: (dereferenceable load (s64) from @d)
-; 64BIT-DAG:   STFS renamable $f1, 112, $x1 :: (store (s32))
-; 64BIT-DAG:   STFD renamable $f2, 120, $x1 :: (store (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_stackarg_float[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $f1, implicit $f2, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64PWR4:      stdu 1, -128(1)
-; ASM64PWR4-DAG:  li 3, 1
-; ASM64PWR4-DAG:  li 4, 2
-; ASM64PWR4-DAG:  li 5, 3
-; ASM64PWR4-DAG:  li 6, 4
-; ASM64PWR4-DAG:  li 7, 5
-; ASM64PWR4-DAG:  li 8, 6
-; ASM64PWR4-DAG:  li 9, 7
-; ASM64PWR4-DAG:  li 10, 8
-; ASM64PWR4-DAG:  ld [[REGF:[0-9]+]], L..C7(2)
-; ASM64PWR4-DAG:  lfs 1, 0([[REGF]])
-; ASM64PWR4-DAG:  ld [[REGD:[0-9]+]], L..C8(2)
-; ASM64PWR4-DAG:  lfd 2, 0([[REGD]])
-; ASM64PWR4-DAG:  stfs 1, 112(1)
-; ASM64PWR4-DAG:  stfd 2, 120(1)
-; ASM64PWR4-NEXT: bl .test_stackarg_float[PR]
-; ASM64PWR4-NEXT: nop
-; ASM64PWR4-NEXT: addi 1, 1, 128
-
 define void @call_test_stackarg_float2() {
+; ASM32PWR4-LABEL: call_test_stackarg_float2:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C9(2) # @d
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    li 4, 2
+; ASM32PWR4-NEXT:    li 5, 3
+; ASM32PWR4-NEXT:    li 6, 4
+; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    lfd 1, 0(3)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    li 8, 6
+; ASM32PWR4-NEXT:    stfd 1, 56(1)
+; ASM32PWR4-NEXT:    lwz 9, 56(1)
+; ASM32PWR4-NEXT:    lwz 10, 60(1)
+; ASM32PWR4-NEXT:    bl .test_stackarg_float2[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_stackarg_float2:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C8(2) # @d
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    li 7, 5
+; ASM64PWR4-NEXT:    lfd 1, 0(3)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    li 8, 6
+; ASM64PWR4-NEXT:    stfd 1, 120(1)
+; ASM64PWR4-NEXT:    ld 9, 120(1)
+; ASM64PWR4-NEXT:    bl .test_stackarg_float2[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load double, ptr @d, align 8
   call void (i32, i32, i32, i32, i32, i32, ...) @test_stackarg_float2(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, double %0)
@@ -1192,76 +1118,60 @@ entry:
 
 declare void @test_stackarg_float2(i32, i32, i32, i32, i32, i32, ...)
 
-; CHECK-LABEL:     name: call_test_stackarg_float2{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT-DAG:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   renamable $r[[REG:[0-9]+]] = LWZtoc @d, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f1 = LFD 0, killed renamable $r[[REG]] :: (dereferenceable load (s64) from @d)
-; 32BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
-; 32BIT-DAG:   renamable $r9 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
-; 32BIT-DAG:   renamable $r10 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4)
-; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float2[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit $f1, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_stackarg_float2:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32PWR4:     stwu 1, -64(1)
-; ASM32PWR4-DAG: li 3, 1
-; ASM32PWR4-DAG: li 4, 2
-; ASM32PWR4-DAG: li 5, 3
-; ASM32PWR4-DAG: li 6, 4
-; ASM32PWR4-DAG: li 7, 5
-; ASM32PWR4-DAG: li 8, 6
-; ASM32PWR4-DAG: lwz [[REG:[0-9]+]], L..C9(2)
-; ASM32PWR4-DAG: lfd 1, 0([[REG]])
-; ASM32PWR4-DAG: stfd 1, 56(1)
-; ASM32PWR4-DAG: lwz 9, 56(1)
-; ASM32PWR4-DAG: lwz 10, 60(1)
-; ASM32PWR4-NEXT: bl .test_stackarg_float2[PR]
-; ASM32PWR4-NEXT: nop
-; ASM32PWR4-NEXT: addi 1, 1, 64
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT-DAG:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   renamable $x[[REG:[0-9]+]] = LDtoc @d, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $f1 = LFD 0, killed renamable $x[[REG]] :: (dereferenceable load (s64) from @d)
-; 64BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
-; 64BIT-DAG:   renamable $x9 = LD 0, %stack.0 :: (load (s64) from %stack.0)
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_stackarg_float2[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit $f1, implicit $x9, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64PWR4:     stdu 1, -128(1)
-; ASM64PWR4-DAG: li 3, 1
-; ASM64PWR4-DAG: li 4, 2
-; ASM64PWR4-DAG: li 5, 3
-; ASM64PWR4-DAG: li 6, 4
-; ASM64PWR4-DAG: li 7, 5
-; ASM64PWR4-DAG: li 8, 6
-; ASM64PWR4-DAG: ld [[REG:[0-9]+]], L..C8(2)
-; ASM64PWR4-DAG: lfd 1, 0([[REG]])
-; ASM64PWR4-DAG: stfd 1, 120(1)
-; ASM64PWR4-DAG: ld 9, 120(1)
-; ASM64PWR4-NEXT: bl .test_stackarg_float2[PR]
-; ASM64PWR4-NEXT: nop
-; ASM64PWR4-NEXT: addi 1, 1, 128
-
 ; A double arg will pass on the stack in PPC32 if there is only one available GPR.
 define void @call_test_stackarg_float3() {
+; ASM32PWR4-LABEL: call_test_stackarg_float3:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -80(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C9(2) # @d
+; ASM32PWR4-NEXT:    stw 0, 88(1)
+; ASM32PWR4-NEXT:    li 4, 2
+; ASM32PWR4-NEXT:    li 5, 3
+; ASM32PWR4-NEXT:    li 6, 4
+; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    lfd 1, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C8(2) # @f
+; ASM32PWR4-NEXT:    li 8, 6
+; ASM32PWR4-NEXT:    li 9, 7
+; ASM32PWR4-NEXT:    stfd 1, 72(1)
+; ASM32PWR4-NEXT:    lwz 10, 72(1)
+; ASM32PWR4-NEXT:    lfs 2, 0(3)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    stfs 2, 60(1)
+; ASM32PWR4-NEXT:    stfd 1, 52(1)
+; ASM32PWR4-NEXT:    bl .test_stackarg_float3[PR]
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 80
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_stackarg_float3:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C8(2) # @d
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    li 7, 5
+; ASM64PWR4-NEXT:    lfd 1, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C7(2) # @f
+; ASM64PWR4-NEXT:    li 8, 6
+; ASM64PWR4-NEXT:    li 9, 7
+; ASM64PWR4-NEXT:    stfd 1, 120(1)
+; ASM64PWR4-NEXT:    ld 10, 120(1)
+; ASM64PWR4-NEXT:    lfs 2, 0(3)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    stfs 2, 112(1)
+; ASM64PWR4-NEXT:    bl .test_stackarg_float3[PR]
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load double, ptr @d, align 8
   %1 = load float, ptr @f, align 4
@@ -1271,94 +1181,79 @@ entry:
 
 declare void @test_stackarg_float3(i32, i32, i32, i32, i32, i32, i32, ...)
 
-; CHECK-LABEL:     name: call_test_stackarg_float3{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; In 32-bit the double arg is written to memory because it cannot be fully stored in the last 32-bit GPR.
-; 32BIT-DAG:   ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   $r9 = LI 7
-; 32BIT-DAG:   renamable $r[[REGD:[0-9]+]] = LWZtoc @d, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f1 = LFD 0, killed renamable $r[[REGD]] :: (dereferenceable load (s64) from @d)
-; 32BIT-DAG:   renamable $r[[REGF:[0-9]+]] = LWZtoc @f, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f2 = LFS 0, killed renamable $r[[REGF]] :: (dereferenceable load (s32) from @f)
-; 32BIT-DAG:   STFD renamable $f1, 52, $r1 :: (store (s64))
-; 32BIT-DAG:   STFS renamable $f2, 60, $r1 :: (store (s32))
-; 32BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
-; 32BIT-DAG:   renamable $r10 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_stackarg_float3[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_stackarg_float3:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32PWR4:       stwu 1, -80(1)
-; ASM32PWR4-DAG:   li 3, 1
-; ASM32PWR4-DAG:   li 4, 2
-; ASM32PWR4-DAG:   li 5, 3
-; ASM32PWR4-DAG:   li 6, 4
-; ASM32PWR4-DAG:   li 7, 5
-; ASM32PWR4-DAG:   li 8, 6
-; ASM32PWR4-DAG:   li 9, 7
-; ASM32PWR4-DAG:   lwz [[REGD:[0-9]+]], L..C9(2)
-; ASM32PWR4-DAG:   lfd 1, 0([[REGD]])
-; ASM32PWR4-DAG:   lwz [[REGF:[0-9]+]], L..C8(2)
-; ASM32PWR4-DAG:   lfs 2, 0([[REGF]])
-; ASM32PWR4-DAG:   stfd 1, 52(1)
-; ASM32PWR4-DAG:   stfs 2, 60(1)
-; ASM32PWR4-DAG:   stfd 1, 72(1)
-; ASM32PWR4-DAG:   lwz 10, 72(1)
-; ASM32PWR4-NEXT:  bl .test_stackarg_float3[PR]
-; ASM32PWR4-NEXT:  nop
-; ASM32PWR4-NEXT:  addi 1, 1, 80
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; In 64-bit the double arg is not written to memory because it is fully stored in the last 64-bit GPR.
-; 64BIT-DAG:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   $x9 = LI8 7
-; 64BIT-DAG:   renamable $x[[REGD:[0-9]+]] = LDtoc @d, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $f1 = LFD 0, killed renamable $x[[REGD]] :: (dereferenceable load (s64) from @d)
-; 64BIT-DAG:   renamable $x[[REGF:[0-9]+]] = LDtoc @f, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $f2 = LFS 0, killed renamable $x[[REGF]] :: (dereferenceable load (s32) from @f)
-; 64BIT-DAG:   STFS renamable $f2, 112, $x1 :: (store (s32))
-; 64BIT-DAG:   STFD renamable $f1, 0, %stack.0 :: (store (s64) into %stack.0)
-; 64BIT-DAG:   renamable $x10 = LD 0, %stack.0 :: (load (s64) from %stack.0)
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_stackarg_float3[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $f1, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1
-
-; 64BIT-NEXT: ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64PWR4:       stdu 1, -128(1)
-; ASM64PWR4-DAG:   li 3, 1
-; ASM64PWR4-DAG:   li 4, 2
-; ASM64PWR4-DAG:   li 5, 3
-; ASM64PWR4-DAG:   li 6, 4
-; ASM64PWR4-DAG:   li 7, 5
-; ASM64PWR4-DAG:   li 8, 6
-; ASM64PWR4-DAG:   li 9, 7
-; ASM64PWR4-DAG:   ld [[REGD:[0-9]+]], L..C8(2)
-; ASM64PWR4-DAG:   lfd 1, 0([[REGD]])
-; ASM64PWR4-DAG:   ld [[REGF:[0-9]+]], L..C7(2)
-; ASM64PWR4-DAG:   lfs 2, 0([[REGF]])
-; ASM64PWR4-DAG:   stfs 2, 112(1)
-; ASM64PWR4-DAG:   stfd 1, 120(1)
-; ASM64PWR4-DAG:   ld 10, 120(1)
-; ASM64PWR4-NEXT:  bl .test_stackarg_float3[PR]
-; ASM64PWR4-NEXT:  nop
-; ASM64PWR4-NEXT:  addi 1, 1, 128
-
 define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i64 %ll9, i16 signext %s10, i8 zeroext %c11, i32 %ui12, i32 %si13, i64 %ll14, i8 zeroext %uc15, i32 %i16) {
+; ASM32PWR4-LABEL: test_ints_stack:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    add 3, 3, 4
+; ASM32PWR4-NEXT:    lwz 11, 92(1)
+; ASM32PWR4-NEXT:    add 3, 3, 5
+; ASM32PWR4-NEXT:    add 3, 3, 6
+; ASM32PWR4-NEXT:    add 3, 3, 7
+; ASM32PWR4-NEXT:    lwz 12, 76(1)
+; ASM32PWR4-NEXT:    add 3, 3, 8
+; ASM32PWR4-NEXT:    add 3, 3, 9
+; ASM32PWR4-NEXT:    lwz 6, 60(1)
+; ASM32PWR4-NEXT:    add 3, 3, 10
+; ASM32PWR4-NEXT:    srawi 5, 11, 31
+; ASM32PWR4-NEXT:    srawi 8, 3, 31
+; ASM32PWR4-NEXT:    lwz 4, 64(1)
+; ASM32PWR4-NEXT:    lwz 7, 56(1)
+; ASM32PWR4-NEXT:    stw 31, -4(1) # 4-byte Folded Spill
+; ASM32PWR4-NEXT:    srawi 31, 12, 31
+; ASM32PWR4-NEXT:    addc 3, 3, 6
+; ASM32PWR4-NEXT:    adde 7, 8, 7
+; ASM32PWR4-NEXT:    lwz 6, 68(1)
+; ASM32PWR4-NEXT:    srawi 8, 4, 31
+; ASM32PWR4-NEXT:    addc 3, 3, 4
+; ASM32PWR4-NEXT:    adde 7, 7, 8
+; ASM32PWR4-NEXT:    lwz 4, 72(1)
+; ASM32PWR4-NEXT:    addc 3, 3, 6
+; ASM32PWR4-NEXT:    addze 6, 7
+; ASM32PWR4-NEXT:    addc 3, 3, 4
+; ASM32PWR4-NEXT:    lwz 0, 84(1)
+; ASM32PWR4-NEXT:    addze 4, 6
+; ASM32PWR4-NEXT:    addc 3, 3, 12
+; ASM32PWR4-NEXT:    lwz 7, 80(1)
+; ASM32PWR4-NEXT:    adde 4, 4, 31
+; ASM32PWR4-NEXT:    addc 3, 3, 0
+; ASM32PWR4-NEXT:    lwz 6, 88(1)
+; ASM32PWR4-NEXT:    adde 4, 4, 7
+; ASM32PWR4-NEXT:    addc 3, 3, 6
+; ASM32PWR4-NEXT:    lwz 31, -4(1) # 4-byte Folded Reload
+; ASM32PWR4-NEXT:    addze 6, 4
+; ASM32PWR4-NEXT:    addc 4, 3, 11
+; ASM32PWR4-NEXT:    adde 3, 6, 5
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_ints_stack:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    add 3, 3, 4
+; ASM64PWR4-NEXT:    ld 4, 112(1)
+; ASM64PWR4-NEXT:    add 3, 3, 5
+; ASM64PWR4-NEXT:    add 3, 3, 6
+; ASM64PWR4-NEXT:    add 3, 3, 7
+; ASM64PWR4-NEXT:    lwa 12, 124(1)
+; ASM64PWR4-NEXT:    add 3, 3, 8
+; ASM64PWR4-NEXT:    add 3, 3, 9
+; ASM64PWR4-NEXT:    add 3, 3, 10
+; ASM64PWR4-NEXT:    extsw 3, 3
+; ASM64PWR4-NEXT:    lwz 5, 132(1)
+; ASM64PWR4-NEXT:    add 3, 3, 4
+; ASM64PWR4-NEXT:    add 3, 3, 12
+; ASM64PWR4-NEXT:    std 31, -8(1) # 8-byte Folded Spill
+; ASM64PWR4-NEXT:    add 3, 3, 5
+; ASM64PWR4-NEXT:    lwz 31, 140(1)
+; ASM64PWR4-NEXT:    lwa 11, 148(1)
+; ASM64PWR4-NEXT:    add 3, 3, 31
+; ASM64PWR4-NEXT:    add 3, 3, 11
+; ASM64PWR4-NEXT:    ld 4, 152(1)
+; ASM64PWR4-NEXT:    lwz 0, 164(1)
+; ASM64PWR4-NEXT:    add 3, 3, 4
+; ASM64PWR4-NEXT:    lwa 5, 172(1)
+; ASM64PWR4-NEXT:    add 3, 3, 0
+; ASM64PWR4-NEXT:    add 3, 3, 5
+; ASM64PWR4-NEXT:    ld 31, -8(1) # 8-byte Folded Reload
+; ASM64PWR4-NEXT:    blr
 entry:
   %add = add nsw i32 %i1, %i2
   %add1 = add nsw i32 %add, %i3
@@ -1385,79 +1280,6 @@ entry:
   ret i64 %add20
 }
 
-; CHECK-LABEL: name: test_ints_stack
-
-; 32BIT-LABEL: liveins:
-; 32BIT-DAG:   - { reg: '$r3', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r4', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r5', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r6', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r7', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r8', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r9', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r10', virtual-reg: '' }
-
-; 32BIT-LABEL: fixedStack:
-; 32BIT-DAG:   - { id: 9, type: default, offset: 56, size: 4
-; 32BIT-DAG:   - { id: 8, type: default, offset: 60, size: 4
-; 32BIT-DAG:   - { id: 7, type: default, offset: 64, size: 4
-; 32BIT-DAG:   - { id: 6, type: default, offset: 68, size: 4
-; 32BIT-DAG:   - { id: 5, type: default, offset: 72, size: 4
-; 32BIT-DAG:   - { id: 4, type: default, offset: 76, size: 4
-; 32BIT-DAG:   - { id: 3, type: default, offset: 80, size: 4
-; 32BIT-DAG:   - { id: 2, type: default, offset: 84, size: 4
-; 32BIT-DAG:   - { id: 1, type: default, offset: 88, size: 4
-; 32BIT-DAG:   - { id: 0, type: default, offset: 92, size: 4
-
-; 32BIT-LABEL: body:             |
-; 32BIT-DAG:    bb.0.entry:
-; 32BIT-DAG:      liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-
-; 64BIT-LABEL: liveins:
-; 64BIT-DAG:   - { reg: '$x3', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x4', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x5', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x6', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x7', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x8', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x9', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x10', virtual-reg: '' }
-
-; 64BIT-LABEL:  fixedStack:
-; 64BIT-DAG:   - { id: 7, type: default, offset: 112, size: 8
-; 64BIT-DAG:   - { id: 6, type: default, offset: 124, size: 4
-; 64BIT-DAG:   - { id: 5, type: default, offset: 132, size: 4
-; 64BIT-DAG:   - { id: 4, type: default, offset: 140, size: 4
-; 64BIT-DAG:   - { id: 3, type: default, offset: 148, size: 4
-; 64BIT-DAG:   - { id: 2, type: default, offset: 152, size: 8
-; 64BIT-DAG:   - { id: 1, type: default, offset: 164, size: 4
-; 64BIT-DAG:   - { id: 0, type: default, offset: 172, size: 4
-; 64BIT-DAG:   body:             |
-; 64BIT-DAG:    bb.0.entry:
-; 64BIT-DAG:     liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
-
-; CHECKASM-LABEL:  .test_ints_stack:
-
-; ASM32PWR4-DAG:   lwz [[REG1:[0-9]+]], 56(1)
-; ASM32PWR4-DAG:   lwz [[REG2:[0-9]+]], 60(1)
-; ASM32PWR4-DAG:   lwz [[REG3:[0-9]+]], 64(1)
-; ASM32PWR4-DAG:   lwz [[REG4:[0-9]+]], 68(1)
-; ASM32PWR4-DAG:   lwz [[REG5:[0-9]+]], 72(1)
-; ASM32PWR4-DAG:   lwz [[REG6:[0-9]+]], 76(1)
-; ASM32PWR4-DAG:   lwz [[REG7:[0-9]+]], 80(1)
-; ASM32PWR4-DAG:   lwz [[REG8:[0-9]+]], 84(1)
-; ASM32PWR4-DAG:   lwz [[REG9:[0-9]+]], 88(1)
-; ASM32PWR4-DAG:   lwz [[REG10:[0-9]+]], 92(1)
-
-; ASM64PWR4-DAG:   ld [[REG1:[0-9]+]], 112(1)
-; ASM64PWR4-DAG:   lwa [[REG2:[0-9]+]], 124(1)
-; ASM64PWR4-DAG:   lwz [[REG3:[0-9]+]], 132(1)
-; ASM64PWR4-DAG:   lwz [[REG4:[0-9]+]], 140(1)
-; ASM64PWR4-DAG:   lwa [[REG5:[0-9]+]], 148(1)
-; ASM64PWR4-DAG:   ld [[REG6:[0-9]+]], 152(1)
-; ASM64PWR4-DAG:   lwz [[REG7:[0-9]+]], 164(1)
-; ASM64PWR4-DAG:   lwa [[REG8:[0-9]+]], 172(1)
-
 @ll1 = common global i64 0, align 8
 @si1 = common global i16 0, align 2
 @ch = common global i8 0, align 1
@@ -1468,6 +1290,97 @@ entry:
 @i1 = common global i32 0, align 4
 
 define void @caller_ints_stack() {
+; ASM32PWR4-LABEL: caller_ints_stack:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -96(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C10(2) # @si1
+; ASM32PWR4-NEXT:    stw 0, 104(1)
+; ASM32PWR4-NEXT:    lwz 4, L..C11(2) # @ch
+; ASM32PWR4-NEXT:    lwz 6, L..C12(2) # @sint
+; ASM32PWR4-NEXT:    lwz 8, L..C13(2) # @ll2
+; ASM32PWR4-NEXT:    lwz 10, L..C14(2) # @uc1
+; ASM32PWR4-NEXT:    lwz 12, L..C15(2) # @i1
+; ASM32PWR4-NEXT:    lha 5, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C16(2) # @ll1
+; ASM32PWR4-NEXT:    lwz 11, 0(3)
+; ASM32PWR4-NEXT:    lwz 7, 4(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C17(2) # @ui
+; ASM32PWR4-NEXT:    lbz 4, 0(4)
+; ASM32PWR4-NEXT:    lwz 3, 0(3)
+; ASM32PWR4-NEXT:    lwz 6, 0(6)
+; ASM32PWR4-NEXT:    lwz 9, 0(8)
+; ASM32PWR4-NEXT:    lwz 8, 4(8)
+; ASM32PWR4-NEXT:    lbz 10, 0(10)
+; ASM32PWR4-NEXT:    lwz 12, 0(12)
+; ASM32PWR4-NEXT:    stw 10, 88(1)
+; ASM32PWR4-NEXT:    li 10, 8
+; ASM32PWR4-NEXT:    stw 8, 84(1)
+; ASM32PWR4-NEXT:    li 8, 6
+; ASM32PWR4-NEXT:    stw 9, 80(1)
+; ASM32PWR4-NEXT:    li 9, 7
+; ASM32PWR4-NEXT:    stw 6, 76(1)
+; ASM32PWR4-NEXT:    li 6, 4
+; ASM32PWR4-NEXT:    stw 3, 72(1)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    stw 4, 68(1)
+; ASM32PWR4-NEXT:    li 4, 2
+; ASM32PWR4-NEXT:    stw 5, 64(1)
+; ASM32PWR4-NEXT:    li 5, 3
+; ASM32PWR4-NEXT:    stw 7, 60(1)
+; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    stw 12, 92(1)
+; ASM32PWR4-NEXT:    stw 11, 56(1)
+; ASM32PWR4-NEXT:    bl .test_ints_stack
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 96
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: caller_ints_stack:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -176(1)
+; ASM64PWR4-NEXT:    ld 3, L..C9(2) # @si1
+; ASM64PWR4-NEXT:    std 0, 192(1)
+; ASM64PWR4-NEXT:    ld 4, L..C10(2) # @ch
+; ASM64PWR4-NEXT:    ld 6, L..C11(2) # @ll2
+; ASM64PWR4-NEXT:    ld 8, L..C12(2) # @uc1
+; ASM64PWR4-NEXT:    ld 9, L..C13(2) # @i1
+; ASM64PWR4-NEXT:    li 10, 8
+; ASM64PWR4-NEXT:    lha 7, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C14(2) # @ll1
+; ASM64PWR4-NEXT:    ld 11, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C15(2) # @ui
+; ASM64PWR4-NEXT:    lbz 5, 0(4)
+; ASM64PWR4-NEXT:    ld 4, L..C16(2) # @sint
+; ASM64PWR4-NEXT:    lwz 3, 0(3)
+; ASM64PWR4-NEXT:    lwz 4, 0(4)
+; ASM64PWR4-NEXT:    ld 6, 0(6)
+; ASM64PWR4-NEXT:    lbz 8, 0(8)
+; ASM64PWR4-NEXT:    lwz 9, 0(9)
+; ASM64PWR4-NEXT:    std 9, 168(1)
+; ASM64PWR4-NEXT:    li 9, 7
+; ASM64PWR4-NEXT:    std 8, 160(1)
+; ASM64PWR4-NEXT:    li 8, 6
+; ASM64PWR4-NEXT:    std 6, 152(1)
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    std 4, 144(1)
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    std 3, 136(1)
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    std 5, 128(1)
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    std 7, 120(1)
+; ASM64PWR4-NEXT:    li 7, 5
+; ASM64PWR4-NEXT:    std 11, 112(1)
+; ASM64PWR4-NEXT:    bl .test_ints_stack
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 176
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load i64, ptr @ll1, align 8
   %1 = load i16, ptr @si1, align 2
@@ -1481,267 +1394,123 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: name: caller_ints_stack
-
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   $r9 = LI 7
-; 32BIT-DAG:   $r10 = LI 8
-; 32BIT-DAG:   renamable $r[[REGLL1ADDR:[0-9]+]] = LWZtoc @ll1, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGLL1A:[0-9]+]] = LWZ 0, renamable $r[[REGLL1ADDR]] :: (dereferenceable load (s32) from @ll1, align 8)
-; 32BIT-DAG:   renamable $r[[REGLL1B:[0-9]+]] = LWZ 4, killed renamable $r[[REGLL1ADDR]] :: (dereferenceable load (s32) from @ll1 + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REGLL1A]], 56, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REGLL1B]], 60, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGSIADDR:[0-9]+]] = LWZtoc @si1, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGSI:[0-9]+]] = LHA 0, killed renamable $r[[REGSIADDR]] :: (dereferenceable load (s16) from @si1)
-; 32BIT-DAG:   STW killed renamable $r[[REGSI]], 64, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGCHADDR:[0-9]+]] = LWZtoc @ch, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGCH:[0-9]+]] = LBZ 0, killed renamable $r[[REGCHADDR]] :: (dereferenceable load (s8) from @ch)
-; 32BIT-DAG:   STW killed renamable $r[[REGCH]], 68, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGUIADDR:[0-9]+]] = LWZtoc @ui, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGUI:[0-9]+]] = LWZ 0, killed renamable $r[[REGUIADDR]] :: (dereferenceable load (s32) from @ui)
-; 32BIT-DAG:   STW killed renamable $r[[REGUI]], 72, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGSIADDR:[0-9]+]] = LWZtoc @sint, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGSI:[0-9]+]] = LWZ 0, killed renamable $r[[REGSIADDR]] :: (dereferenceable load (s32) from @sint)
-; 32BIT-DAG:   STW killed renamable $r[[REGSI]], 76, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGLL2ADDR:[0-9]+]] = LWZtoc @ll2, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGLL2A:[0-9]+]] = LWZ 0, renamable $r[[REGLL2ADDR]] :: (dereferenceable load (s32) from @ll2, align 8)
-; 32BIT-DAG:   renamable $r[[REGLL2B:[0-9]+]] = LWZ 4, killed renamable $r[[REGLL2ADDR]] :: (dereferenceable load (s32) from @ll2 + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REGLL2A]], 80, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REGLL2B]], 84, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGUCADDR:[0-9]+]] = LWZtoc @uc1, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGUC:[0-9]+]] = LBZ 0, killed renamable $r[[REGUCADDR]] :: (dereferenceable load (s8) from @uc1)
-; 32BIT-DAG:   STW killed renamable $r[[REGUC]], 88, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGIADDR:[0-9]+]] = LWZtoc @i1, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REGI:[0-9]+]] = LWZ 0, killed renamable $r[[REGIADDR]] :: (dereferenceable load (s32) from @i1)
-; 32BIT-DAG:   STW killed renamable $r[[REGI]], 92, $r1 :: (store (s32))
-; 32BIT-DAG:   ADJCALLSTACKDOWN 96, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_ints_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3
-; 32BIT-NEXT:  ADJCALLSTACKUP 96, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   $x9 = LI8 7
-; 64BIT-DAG:   $x10 = LI8 8
-; 64BIT-DAG:   renamable $x[[REGLL1ADDR:[0-9]+]] = LDtoc @ll1, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGLL1:[0-9]+]] = LD 0, killed renamable $x[[REGLL1ADDR]] :: (dereferenceable load (s64) from @ll1)
-; 64BIT-DAG:   STD killed renamable $x[[REGLL1]], 112, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGSIADDR:[0-9]+]] = LDtoc @si1, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGSI:[0-9]+]] = LHA8 0, killed renamable $x[[REGSIADDR]] :: (dereferenceable load (s16) from @si1)
-; 64BIT-DAG:   STD killed renamable $x[[REGSI]], 120, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGCHADDR:[0-9]+]] = LDtoc @ch, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGCH:[0-9]+]] = LBZ8 0, killed renamable $x[[REGCHADDR]] :: (dereferenceable load (s8) from @ch)
-; 64BIT-DAG:   STD killed renamable $x[[REGCH]], 128, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGUIADDR:[0-9]+]] = LDtoc @ui, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGUI:[0-9]+]] = LWZ8 0, killed renamable $x[[REGUIADDR]] :: (dereferenceable load (s32) from @ui)
-; 64BIT-DAG:   STD killed renamable $x[[REGUI]], 136, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGSIADDR:[0-9]+]] = LDtoc @sint, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGSI:[0-9]+]] = LWZ8 0, killed renamable $x[[REGSIADDR]] :: (dereferenceable load (s32) from @sint)
-; 64BIT-DAG:   STD killed renamable $x[[REGSI]], 144, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGLL2ADDR:[0-9]+]] = LDtoc @ll2, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGLL2:[0-9]+]] = LD 0, killed renamable $x[[REGLL2ADDR]] :: (dereferenceable load (s64) from @ll2)
-; 64BIT-DAG:   STD killed renamable $x[[REGLL2]], 152, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGUCADDR:[0-9]+]] = LDtoc @uc1, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGUC:[0-9]+]] = LBZ8 0, killed renamable $x[[REGUCADDR]] :: (dereferenceable load (s8) from @uc1)
-; 64BIT-DAG:   STD killed renamable $x[[REGUC]], 160, $x1 :: (store (s64))
-; 64BIT-DAG:   renamable $x[[REGIADDR:[0-9]+]] = LDtoc @i1, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGI:[0-9]+]] = LWZ8 0, killed renamable $x[[REGIADDR]] :: (dereferenceable load (s32) from @i1)
-; 64BIT-DAG:   STD killed renamable $x[[REGI]], 168, $x1 :: (store (s64))
-; 64BIT-DAG:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_ints_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
-; 64BIT-NEXT:  ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  BLR8 implicit $lr8, implicit $rm
-
-; CHECKASM-LABEL:  .caller_ints_stack:
-
-; ASM32PWR4:        mflr 0
-; ASM32PWR4-NEXT:   stwu 1, -96(1)
-; ASM32PWR4-DAG:    stw 0, 104(1)
-; ASM32PWR4-DAG:    li 3, 1
-; ASM32PWR4-DAG:    li 4, 2
-; ASM32PWR4-DAG:    li 5, 3
-; ASM32PWR4-DAG:    li 6, 4
-; ASM32PWR4-DAG:    li 7, 5
-; ASM32PWR4-DAG:    li 9, 7
-; ASM32PWR4-DAG:    li 8, 6
-; ASM32PWR4-DAG:    li 10, 8
-; ASM32PWR4-DAG:    lwz [[REG1:[0-9]+]], L..C10(2)
-; ASM32PWR4-DAG:    lwz [[REG2:[0-9]+]], L..C11(2)
-; ASM32PWR4-DAG:    lwz [[REG3:[0-9]+]], L..C12(2)
-; ASM32PWR4-DAG:    lwz [[REG4:[0-9]+]], L..C13(2)
-; ASM32PWR4-DAG:    lwz [[REG5:[0-9]+]], L..C14(2)
-; ASM32PWR4-DAG:    lwz [[REG6:[0-9]+]], L..C15(2)
-; ASM32PWR4-DAG:    lwz [[REG7:[0-9]+]], L..C16(2)
-; ASM32PWR4-DAG:    lwz [[REG8:[0-9]+]], L..C17(2)
-; ASM32PWR4-DAG:    lha 5, 0([[REG1]])
-; ASM32PWR4-DAG:    lwz 11, 0([[REG7]])
-; ASM32PWR4-DAG:    lwz 7, 4([[REG7]])
-; ASM32PWR4-DAG:    lbz 4, 0([[REG2]])
-; ASM32PWR4-DAG:    lwz 3, 0([[REG8]])
-; ASM32PWR4-DAG:    lwz 6, 0([[REG3]])
-; ASM32PWR4-DAG:    lwz 9, 0([[REG4]])
-; ASM32PWR4-DAG:    lwz 8, 4([[REG4]])
-; ASM32PWR4-DAG:    lbz 10, 0([[REG5]])
-; ASM32PWR4-DAG:    lwz 12, 0([[REG6]])
-; ASM32PWR4-DAG:    stw 11, 56(1)
-; ASM32PWR4-DAG:    stw 7, 60(1)
-; ASM32PWR4-DAG:    stw 5, 64(1)
-; ASM32PWR4-DAG:    stw 4, 68(1)
-; ASM32PWR4-DAG:    stw 3, 72(1)
-; ASM32PWR4-DAG:    stw 6, 76(1)
-; ASM32PWR4-DAG:    stw 9, 80(1)
-; ASM32PWR4-DAG:    stw 8, 84(1)
-; ASM32PWR4-DAG:    stw 10, 88(1)
-; ASM32PWR4-DAG:    stw 12, 92(1)
-; ASM32PWR4-DAG:    bl .test_ints_stack
-; ASM32PWR4-DAG:    nop
-; ASM32PWR4-DAG:    addi 1, 1, 96
-; ASM32PWR4-DAG:    lwz 0, 8(1)
-; ASM32PWR4-NEXT:   mtlr 0
-; ASM32PWR4-NEXT:   blr
-
-; ASM64PWR4:        mflr 0
-; ASM64PWR4-NEXT:   stdu 1, -176(1)
-; ASM64PWR4-DAG:    std 0, 192(1)
-; ASM64PWR4-DAG:    li 3, 1
-; ASM64PWR4-DAG:    li 4, 2
-; ASM64PWR4-DAG:    li 5, 3
-; ASM64PWR4-DAG:    li 6, 4
-; ASM64PWR4-DAG:    li 7, 5
-; ASM64PWR4-DAG:    li 8, 6
-; ASM64PWR4-DAG:    li 9, 7
-; ASM64PWR4-DAG:    li 10, 8
-; ASM64PWR4-DAG:    ld [[REG1:[0-9]+]], L..C9(2)
-; ASM64PWR4-DAG:    ld [[REG2:[0-9]+]], L..C10(2)
-; ASM64PWR4-DAG:    ld [[REG3:[0-9]+]], L..C11(2)
-; ASM64PWR4-DAG:    ld [[REG4:[0-9]+]], L..C12(2)
-; ASM64PWR4-DAG:    ld [[REG5:[0-9]+]], L..C13(2)
-; ASM64PWR4-DAG:    ld [[REG6:[0-9]+]], L..C14(2)
-; ASM64PWR4-DAG:    ld [[REG7:[0-9]+]], L..C15(2)
-; ASM64PWR4-DAG:    ld [[REG8:[0-9]+]], L..C16(2)
-; ASM64PWR4-DAG:    lha 7, 0([[REG1]])
-; ASM64PWR4-DAG:    lbz 5, 0([[REG2]])
-; ASM64PWR4-DAG:    ld 6, 0([[REG3]])
-; ASM64PWR4-DAG:    lbz 8, 0([[REG4]])
-; ASM64PWR4-DAG:    lwz 9, 0([[REG5]])
-; ASM64PWR4-DAG:    ld 11, 0([[REG6]])
-; ASM64PWR4-DAG:    lwz 3, 0([[REG7]])
-; ASM64PWR4-DAG:    lwz 4, 0([[REG8]])
-; ASM64PWR4-DAG:    std 11, 112(1)
-; ASM64PWR4-DAG:    std 7, 120(1)
-; ASM64PWR4-DAG:    std 5, 128(1)
-; ASM64PWR4-DAG:    std 3, 136(1)
-; ASM64PWR4-DAG:    std 4, 144(1)
-; ASM64PWR4-DAG:    std 6, 152(1)
-; ASM64PWR4-DAG:    std 8, 160(1)
-; ASM64PWR4-DAG:    std 9, 168(1)
-; ASM64PWR4-NEXT:   bl .test_ints_stack
-; ASM64PWR4-NEXT:   nop
-; ASM64PWR4-NEXT:   addi 1, 1, 176
-; ASM64PWR4-NEXT:   ld 0, 16(1)
-; ASM64PWR4-NEXT:   mtlr 0
-; ASM64PWR4-NEXT:   blr
-
 @globali1 = global i8 0, align 1
 
 define void @test_i1_stack(i32 %a, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i1 zeroext %b) {
+; ASM32PWR4-LABEL: test_i1_stack:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    lbz 3, 59(1)
+; ASM32PWR4-NEXT:    lwz 4, L..C18(2) # @globali1
+; ASM32PWR4-NEXT:    stb 3, 0(4)
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_i1_stack:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    lbz 3, 119(1)
+; ASM64PWR4-NEXT:    ld 4, L..C17(2) # @globali1
+; ASM64PWR4-NEXT:    stb 3, 0(4)
+; ASM64PWR4-NEXT:    blr
   entry:
     %frombool = zext i1 %b to i8
     store i8 %frombool, ptr @globali1, align 1
     ret void
 }
 
-; CHECK-LABEL:  name:   test_i1_stack
-
-; 32BIT-LABEL: fixedStack:
-; 32BIT-DAG:   - { id: 0, type: default, offset: 59, size: 1
-; 32BIT-DAG:   body:             |
-; 32BIT-DAG:    bb.0.entry:
-; 32BIT-DAG:     renamable $r[[REGB:[0-9]+]] = LBZ 0, %fixed-stack.0 :: (load (s8) from %fixed-stack.0)
-; 32BIT-DAG:     renamable $r[[REGBTOC:[0-9]+]] = LWZtoc @globali1, $r2 :: (load (s32) from got)
-; 32BIT-DAG:     STB killed renamable $r[[REGB]], 0, killed renamable $r[[REGBTOC]] :: (store (s8) into @globali1)
-
-; 64BIT-LABEL: fixedStack:
-; 64BIT-DAG:   - { id: 0, type: default, offset: 119, size: 1
-; 64BIT-DAG:   body:             |
-; 64BIT-DAG:     bb.0.entry:
-; 64BIT-DAG:       renamable $r[[REGB:[0-9]+]] = LBZ 0, %fixed-stack.0 :: (load (s8) from %fixed-stack.0)
-; 64BIT-DAG:       renamable $x[[REGBTOC:[0-9]+]] = LDtoc @globali1, $x2 :: (load (s64) from got)
-; 64BIT-DAG:       STB killed renamable $r[[SCRATCHREG:[0-9]+]], 0, killed renamable $x[[REGBTOC]] :: (store (s8) into @globali1)
-; 64BIT-DAG:       BLR8 implicit $lr8, implicit $rm
-
-; CHECKASM-LABEL:  test_i1_stack:
-
-; ASM32PWR4-DAG:   lbz [[REGB:[0-9]+]], 59(1)
-; ASM32PWR4-DAG:   lwz [[REGBTOC:[0-9]+]], L..C18(2)
-; ASM32PWR4-DAG:   stb [[SCRATCHREG:[0-9]+]], 0([[REGBTOC]])
-; ASM32PWR4-DAG:   blr
-
-; ASM64PWR4-DAG:   lbz [[REGB:[0-9]+]], 119(1)
-; ASM64PWR4-DAG:   ld [[REGBTOC:[0-9]+]], L..C17(2)
-; ASM64PWR4-DAG:   stb [[SCRATCHREG:[0-9]+]], 0([[REGBTOC]])
-; ASM64PWR4-DAG:   blr
-
 define void @call_test_i1_stack() {
+; ASM32PWR4-LABEL: call_test_i1_stack:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -64(1)
+; ASM32PWR4-NEXT:    li 11, 1
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    stw 0, 72(1)
+; ASM32PWR4-NEXT:    li 4, 2
+; ASM32PWR4-NEXT:    li 5, 3
+; ASM32PWR4-NEXT:    stw 11, 56(1)
+; ASM32PWR4-NEXT:    li 6, 4
+; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    li 8, 6
+; ASM32PWR4-NEXT:    li 9, 7
+; ASM32PWR4-NEXT:    li 10, 8
+; ASM32PWR4-NEXT:    bl .test_i1_stack
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 64
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: call_test_i1_stack:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    li 11, 1
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    std 11, 112(1)
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    li 7, 5
+; ASM64PWR4-NEXT:    li 8, 6
+; ASM64PWR4-NEXT:    li 9, 7
+; ASM64PWR4-NEXT:    li 10, 8
+; ASM64PWR4-NEXT:    bl .test_i1_stack
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
   entry:
     call void @test_i1_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i1 true)
     ret void
 }
 
-; CHECK-LABEL:  name:   call_test_i1_stack
-
-; 32BIT-DAG:   ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   $r9 = LI 7
-; 32BIT-DAG:   $r10 = LI 8
-; 32BIT-DAG:   renamable $r[[REGBOOLADDR:[0-9]+]] = LI 1
-; 32BIT-DAG:   STW killed renamable $r[[REGBOOLADDR]], 56, $r1 :: (store (s32))
-; 32BIT-DAG:   BL_NOP <mcsymbol .test_i1_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-DAG:   ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT-DAG:  ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:  $x3 = LI8 1
-; 64BIT-DAG:  $x4 = LI8 2
-; 64BIT-DAG:  $x5 = LI8 3
-; 64BIT-DAG:  $x6 = LI8 4
-; 64BIT-DAG:  $x7 = LI8 5
-; 64BIT-DAG:  $x8 = LI8 6
-; 64BIT-DAG:  $x9 = LI8 7
-; 64BIT-DAG:  $x10 = LI8 8
-; 64BIT-DAG:  renamable $x[[REGBOOLADDR:[0-9]+]] = LI8 1
-; 64BIT-DAG:  STD killed renamable $x[[REGBOOLADDR]], 112, $x1 :: (store (s64))
-; 64BIT-DAG:  BL8_NOP <mcsymbol .test_i1_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1
-; 64BIT-DAG:  ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_i1_stack:
-
-; ASM32PWR4-DAG:   mflr 0
-; ASM32PWR4-DAG:   li 3, 1
-; ASM32PWR4-DAG:   li 4, 2
-; ASM32PWR4-DAG:   li 5, 3
-; ASM32PWR4-DAG:   li 6, 4
-; ASM32PWR4-DAG:   li 7, 5
-; ASM32PWR4-DAG:   li 8, 6
-; ASM32PWR4-DAG:   li 9, 7
-; ASM32PWR4-DAG:   li 10, 8
-; ASM32PWR4-DAG:   stw [[REGB:[0-9]+]], 56(1)
-; ASM32PWR4-DAG:   li [[REGB]], 1
-; ASM32PWR4-DAG:   bl .test_i1
-
 define double @test_fpr_stack(double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %s10, double %l11, double %d12, double %d13, float %f14, double %d15, float %f16) {
+; ASM32PWR4-LABEL: test_fpr_stack:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    fadd 0, 1, 2
+; ASM32PWR4-NEXT:    lfs 1, 128(1)
+; ASM32PWR4-NEXT:    fadd 0, 0, 3
+; ASM32PWR4-NEXT:    lfd 2, 132(1)
+; ASM32PWR4-NEXT:    fadd 0, 0, 4
+; ASM32PWR4-NEXT:    fadd 0, 0, 5
+; ASM32PWR4-NEXT:    fadd 0, 0, 6
+; ASM32PWR4-NEXT:    fadd 0, 0, 7
+; ASM32PWR4-NEXT:    fadd 0, 0, 8
+; ASM32PWR4-NEXT:    fadd 0, 0, 9
+; ASM32PWR4-NEXT:    fadd 0, 0, 10
+; ASM32PWR4-NEXT:    fadd 0, 0, 11
+; ASM32PWR4-NEXT:    fadd 0, 0, 12
+; ASM32PWR4-NEXT:    fadd 0, 0, 13
+; ASM32PWR4-NEXT:    fadd 0, 0, 13
+; ASM32PWR4-NEXT:    fadd 0, 0, 1
+; ASM32PWR4-NEXT:    lfs 1, 140(1)
+; ASM32PWR4-NEXT:    fadd 0, 0, 2
+; ASM32PWR4-NEXT:    fadd 1, 0, 1
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: test_fpr_stack:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    fadd 0, 1, 2
+; ASM64PWR4-NEXT:    lfs 1, 152(1)
+; ASM64PWR4-NEXT:    fadd 0, 0, 3
+; ASM64PWR4-NEXT:    lfd 2, 160(1)
+; ASM64PWR4-NEXT:    fadd 0, 0, 4
+; ASM64PWR4-NEXT:    fadd 0, 0, 5
+; ASM64PWR4-NEXT:    fadd 0, 0, 6
+; ASM64PWR4-NEXT:    fadd 0, 0, 7
+; ASM64PWR4-NEXT:    fadd 0, 0, 8
+; ASM64PWR4-NEXT:    fadd 0, 0, 9
+; ASM64PWR4-NEXT:    fadd 0, 0, 10
+; ASM64PWR4-NEXT:    fadd 0, 0, 11
+; ASM64PWR4-NEXT:    fadd 0, 0, 12
+; ASM64PWR4-NEXT:    fadd 0, 0, 13
+; ASM64PWR4-NEXT:    fadd 0, 0, 13
+; ASM64PWR4-NEXT:    fadd 0, 0, 1
+; ASM64PWR4-NEXT:    lfs 1, 168(1)
+; ASM64PWR4-NEXT:    fadd 0, 0, 2
+; ASM64PWR4-NEXT:    fadd 1, 0, 1
+; ASM64PWR4-NEXT:    blr
   entry:
     %add = fadd double %d1, %d2
     %add1 = fadd double %add, %d3
@@ -1764,57 +1533,182 @@ define double @test_fpr_stack(double %d1, double %d2, double %d3, double %d4, do
     ret double %add16
   }
 
-; CHECK-LABEL: name: test_fpr_stack{{.*}}
-
-; CHECK-LABEL: liveins:
-; CHECK-DAG:   - { reg: '$f1', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f2', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f3', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f4', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f5', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f6', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f7', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f8', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f9', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f10', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f11', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f12', virtual-reg: '' }
-; CHECK-DAG:   - { reg: '$f13', virtual-reg: '' }
-
-; CHECK-LABEL: fixedStack:
-; 32BIT-DAG:   - { id: 2, type: default, offset: 128, size: 4
-; 32BIT-DAG:   - { id: 1, type: default, offset: 132, size: 8
-; 32BIT-DAG:   - { id: 0, type: default, offset: 140, size: 4
-
-; 64BIT-DAG:   - { id: 2, type: default, offset: 152, size: 4
-; 64BIT-DAG:   - { id: 1, type: default, offset: 160, size: 8
-; 64BIT-DAG:   - { id: 0, type: default, offset: 168, size: 4
-
-; CHECK-LABEL: body:             |
-; CHECK-DAG:    bb.0.entry:
-; CHECK-DAG:      liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
-
-; CHECKASM-LABEL:  .test_fpr_stack:
-
-; ASM32PWR4-DAG:   lfs  [[REG1:[0-9]+]], 128(1)
-; ASM32PWR4-DAG:   lfd  [[REG2:[0-9]+]], 132(1)
-; ASM32PWR4-DAG:   lfs  [[REG3:[0-9]+]], 140(1)
-; ASM32PWR4-DAG:   fadd 0, 0, [[REG1]]
-; ASM32PWR4-DAG:   fadd 0, 0, [[REG2]]
-; ASM32PWR4-DAG:   fadd 1, 0, [[REG3]]
-
-; ASM64PWR4-DAG:   lfs [[REG1:[0-9]+]], 152(1)
-; ASM64PWR4-DAG:   lfd [[REG2:[0-9]+]], 160(1)
-; ASM64PWR4-DAG:   lfs [[REG3:[0-9]+]], 168(1)
-; ASM64PWR4-DAG:   fadd 0, 0, [[REG1]]
-; ASM64PWR4-DAG:   fadd 0, 0, [[REG2]]
-; ASM64PWR4-DAG:   fadd 1, 0, [[REG3]]
-
 @f14 = common global float 0.000000e+00, align 4
 @d15 = common global double 0.000000e+00, align 8
 @f16 = common global float 0.000000e+00, align 4
 
 define void @caller_fpr_stack() {
+; ASM32PWR4-LABEL: caller_fpr_stack:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -144(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C19(2) # @d15
+; ASM32PWR4-NEXT:    lwz 4, L..C20(2) # @f14
+; ASM32PWR4-NEXT:    lwz 5, L..C21(2) # @f16
+; ASM32PWR4-NEXT:    stw 0, 152(1)
+; ASM32PWR4-NEXT:    lis 6, 16361
+; ASM32PWR4-NEXT:    ori 6, 6, 39321
+; ASM32PWR4-NEXT:    lfd 0, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, 0(4)
+; ASM32PWR4-NEXT:    lwz 4, 0(5)
+; ASM32PWR4-NEXT:    li 5, 0
+; ASM32PWR4-NEXT:    stw 5, 60(1)
+; ASM32PWR4-NEXT:    lis 5, 16352
+; ASM32PWR4-NEXT:    stw 5, 56(1)
+; ASM32PWR4-NEXT:    lis 5, 13107
+; ASM32PWR4-NEXT:    ori 5, 5, 13107
+; ASM32PWR4-NEXT:    stw 5, 68(1)
+; ASM32PWR4-NEXT:    lis 5, 16355
+; ASM32PWR4-NEXT:    ori 5, 5, 13107
+; ASM32PWR4-NEXT:    stw 5, 64(1)
+; ASM32PWR4-NEXT:    lis 5, 26214
+; ASM32PWR4-NEXT:    ori 5, 5, 26214
+; ASM32PWR4-NEXT:    stw 5, 76(1)
+; ASM32PWR4-NEXT:    lis 5, 16358
+; ASM32PWR4-NEXT:    ori 5, 5, 26214
+; ASM32PWR4-NEXT:    stw 5, 72(1)
+; ASM32PWR4-NEXT:    lis 5, -26215
+; ASM32PWR4-NEXT:    ori 5, 5, 39322
+; ASM32PWR4-NEXT:    stw 5, 84(1)
+; ASM32PWR4-NEXT:    stw 5, 100(1)
+; ASM32PWR4-NEXT:    lis 5, 16313
+; ASM32PWR4-NEXT:    ori 5, 5, 39321
+; ASM32PWR4-NEXT:    stw 5, 96(1)
+; ASM32PWR4-NEXT:    lis 5, -15729
+; ASM32PWR4-NEXT:    ori 5, 5, 23593
+; ASM32PWR4-NEXT:    stw 5, 108(1)
+; ASM32PWR4-NEXT:    lis 5, 16316
+; ASM32PWR4-NEXT:    ori 5, 5, 10485
+; ASM32PWR4-NEXT:    stw 5, 104(1)
+; ASM32PWR4-NEXT:    lis 5, -5243
+; ASM32PWR4-NEXT:    ori 5, 5, 7864
+; ASM32PWR4-NEXT:    stw 5, 116(1)
+; ASM32PWR4-NEXT:    lis 5, 16318
+; ASM32PWR4-NEXT:    ori 5, 5, 47185
+; ASM32PWR4-NEXT:    stw 6, 80(1)
+; ASM32PWR4-NEXT:    lis 6, -13108
+; ASM32PWR4-NEXT:    ori 6, 6, 52429
+; ASM32PWR4-NEXT:    stw 5, 112(1)
+; ASM32PWR4-NEXT:    lis 5, 2621
+; ASM32PWR4-NEXT:    ori 5, 5, 28836
+; ASM32PWR4-NEXT:    stw 6, 92(1)
+; ASM32PWR4-NEXT:    lis 6, 16364
+; ASM32PWR4-NEXT:    ori 6, 6, 52428
+; ASM32PWR4-NEXT:    stw 5, 124(1)
+; ASM32PWR4-NEXT:    lis 5, 16320
+; ASM32PWR4-NEXT:    ori 5, 5, 41943
+; ASM32PWR4-NEXT:    stw 6, 88(1)
+; ASM32PWR4-NEXT:    lwz 6, L..C22(2) # %const.0
+; ASM32PWR4-NEXT:    stw 5, 120(1)
+; ASM32PWR4-NEXT:    lwz 5, L..C23(2) # %const.1
+; ASM32PWR4-NEXT:    lfd 2, 0(6)
+; ASM32PWR4-NEXT:    lwz 6, L..C24(2) # %const.2
+; ASM32PWR4-NEXT:    lfd 3, 0(5)
+; ASM32PWR4-NEXT:    lwz 5, L..C25(2) # %const.3
+; ASM32PWR4-NEXT:    lfd 4, 0(6)
+; ASM32PWR4-NEXT:    lwz 6, L..C26(2) # %const.4
+; ASM32PWR4-NEXT:    lfd 6, 0(5)
+; ASM32PWR4-NEXT:    lwz 5, L..C27(2) # %const.5
+; ASM32PWR4-NEXT:    lfd 7, 0(6)
+; ASM32PWR4-NEXT:    lwz 6, L..C28(2) # %const.6
+; ASM32PWR4-NEXT:    lfd 8, 0(5)
+; ASM32PWR4-NEXT:    lwz 5, L..C29(2) # %const.7
+; ASM32PWR4-NEXT:    lfd 9, 0(6)
+; ASM32PWR4-NEXT:    lwz 6, L..C30(2) # %const.8
+; ASM32PWR4-NEXT:    lfd 1, 0(5)
+; ASM32PWR4-NEXT:    lwz 5, L..C31(2) # %const.9
+; ASM32PWR4-NEXT:    lfd 11, 0(6)
+; ASM32PWR4-NEXT:    lwz 6, L..C32(2) # %const.10
+; ASM32PWR4-NEXT:    fmr 10, 1
+; ASM32PWR4-NEXT:    lfd 12, 0(5)
+; ASM32PWR4-NEXT:    lwz 5, L..C33(2) # %const.11
+; ASM32PWR4-NEXT:    lfd 13, 0(6)
+; ASM32PWR4-NEXT:    lfs 5, 0(5)
+; ASM32PWR4-NEXT:    stfd 0, 132(1)
+; ASM32PWR4-NEXT:    stw 4, 140(1)
+; ASM32PWR4-NEXT:    stw 3, 128(1)
+; ASM32PWR4-NEXT:    bl .test_fpr_stack
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 144
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: caller_fpr_stack:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -176(1)
+; ASM64PWR4-NEXT:    ld 3, L..C18(2) # @f14
+; ASM64PWR4-NEXT:    std 0, 192(1)
+; ASM64PWR4-NEXT:    ld 4, L..C19(2) # @d15
+; ASM64PWR4-NEXT:    ld 5, L..C20(2) # @f16
+; ASM64PWR4-NEXT:    ld 6, L..C21(2) # %const.9
+; ASM64PWR4-NEXT:    lis 7, 16313
+; ASM64PWR4-NEXT:    lwz 3, 0(3)
+; ASM64PWR4-NEXT:    ld 4, 0(4)
+; ASM64PWR4-NEXT:    lwz 5, 0(5)
+; ASM64PWR4-NEXT:    stw 3, 152(1)
+; ASM64PWR4-NEXT:    ld 3, L..C22(2) # %const.0
+; ASM64PWR4-NEXT:    std 4, 160(1)
+; ASM64PWR4-NEXT:    ld 4, L..C23(2) # %const.1
+; ASM64PWR4-NEXT:    lfd 2, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C24(2) # %const.2
+; ASM64PWR4-NEXT:    lfd 3, 0(4)
+; ASM64PWR4-NEXT:    ld 4, L..C25(2) # %const.3
+; ASM64PWR4-NEXT:    lfd 4, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C26(2) # %const.4
+; ASM64PWR4-NEXT:    lfd 6, 0(4)
+; ASM64PWR4-NEXT:    ld 4, L..C27(2) # %const.5
+; ASM64PWR4-NEXT:    lfd 7, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C28(2) # %const.6
+; ASM64PWR4-NEXT:    lfd 8, 0(4)
+; ASM64PWR4-NEXT:    ld 4, L..C29(2) # %const.7
+; ASM64PWR4-NEXT:    lfd 9, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C30(2) # %const.8
+; ASM64PWR4-NEXT:    lfd 1, 0(4)
+; ASM64PWR4-NEXT:    lis 4, 16320
+; ASM64PWR4-NEXT:    ori 4, 4, 41943
+; ASM64PWR4-NEXT:    rldic 4, 4, 32, 2
+; ASM64PWR4-NEXT:    lfd 11, 0(3)
+; ASM64PWR4-NEXT:    lis 3, 16316
+; ASM64PWR4-NEXT:    fmr 10, 1
+; ASM64PWR4-NEXT:    ori 3, 3, 10485
+; ASM64PWR4-NEXT:    oris 4, 4, 2621
+; ASM64PWR4-NEXT:    stw 5, 168(1)
+; ASM64PWR4-NEXT:    lis 5, 16318
+; ASM64PWR4-NEXT:    rldic 3, 3, 32, 2
+; ASM64PWR4-NEXT:    ori 5, 5, 47185
+; ASM64PWR4-NEXT:    ori 4, 4, 28836
+; ASM64PWR4-NEXT:    lfd 12, 0(6)
+; ASM64PWR4-NEXT:    ld 6, L..C31(2) # %const.10
+; ASM64PWR4-NEXT:    oris 3, 3, 49807
+; ASM64PWR4-NEXT:    ori 3, 3, 23593
+; ASM64PWR4-NEXT:    std 4, 144(1)
+; ASM64PWR4-NEXT:    rldic 4, 5, 32, 2
+; ASM64PWR4-NEXT:    oris 4, 4, 60293
+; ASM64PWR4-NEXT:    ori 4, 4, 7864
+; ASM64PWR4-NEXT:    std 3, 128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C32(2) # %const.11
+; ASM64PWR4-NEXT:    ori 5, 7, 39321
+; ASM64PWR4-NEXT:    rldic 5, 5, 32, 2
+; ASM64PWR4-NEXT:    std 4, 136(1)
+; ASM64PWR4-NEXT:    lis 4, 4091
+; ASM64PWR4-NEXT:    ori 4, 4, 13107
+; ASM64PWR4-NEXT:    rldic 4, 4, 34, 2
+; ASM64PWR4-NEXT:    lfs 5, 0(3)
+; ASM64PWR4-NEXT:    oris 3, 5, 39321
+; ASM64PWR4-NEXT:    ori 3, 3, 39322
+; ASM64PWR4-NEXT:    lfd 13, 0(6)
+; ASM64PWR4-NEXT:    std 3, 120(1)
+; ASM64PWR4-NEXT:    oris 3, 4, 52428
+; ASM64PWR4-NEXT:    ori 3, 3, 52429
+; ASM64PWR4-NEXT:    std 3, 112(1)
+; ASM64PWR4-NEXT:    bl .test_fpr_stack
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 176
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
 entry:
   %0 = load float, ptr @f14, align 4
   %1 = load double, ptr @d15, align 8
@@ -1823,152 +1717,60 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: caller_fpr_stack
-
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.0, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.1, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.2, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.3, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.4, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.5, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.6, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.7, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.8, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.9, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.10, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.11, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 56, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 60, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 64, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 68, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 72, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 76, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 80, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[SCRATCHREG:[0-9]+]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 88, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 92, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 96, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 100, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 104, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 108, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 112, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 116, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 120, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 124, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 128, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGF1:[0-9]+]] = LWZtoc @f14, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGF1]] :: (load (s32) from @f14)
-; 32BIT-DAG:   STFD killed renamable $f0, 132, $r1 :: (store (s64))
-; 32BIT-DAG:   renamable $r[[REGD:[0-9]+]] = LWZtoc @d15, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f0 = LFD 0, killed renamable $r[[REGD]] :: (dereferenceable load (s64) from @d15)
-; 32BIT-DAG:   STW killed renamable $r[[SCRATCHREG:[0-9]+]], 140, $r1 :: (store (s32))
-; 32BIT-DAG:   renamable $r[[REGF2:[0-9]+]] = LWZtoc @f16, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZ 0, killed renamable $r[[REGF2]] :: (load (s32) from @f16)
-; 32BIT-DAG:   renamable $f1 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f2 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f3 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f4 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f5 = LFS 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s32) from constant-pool)
-; 32BIT-DAG:   renamable $f6 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f7 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f8 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f9 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   $f10 = COPY renamable $f1
-; 32BIT-DAG:   renamable $f11 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f12 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $f13 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_fpr_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1
-; 32BIT-NEXT:  ADJCALLSTACKUP 144, 0, implicit-def dead $r1, implicit $r1
-
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.1, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.2, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.[[SCRATCHREG:[0-9]+]], $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.4, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.5, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.6, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.7, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.8, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.9, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.10, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REGF1:[0-9]+]] = LDtoc @f14, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $x[[REGF1]] :: (load (s32) from @f14)
-; 64BIT-DAG:   renamable $x[[REGF2:[0-9]+]] = LDtoc @f16, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $r5 = LWZ 0, killed renamable $x[[REGF2]] :: (load (s32) from @f16)
-; 64BIT-DAG:   renamable $x[[REGD:[0-9]+]] = LDtoc @d15, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x4 = LD 0, killed renamable $x[[REGD]] :: (load (s64) from @d15)
-; 64BIT-DAG:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   renamable $f1 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f2 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f[[SCRATCHREG:[0-9]+]] = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f4 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f5 = LFS 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s32) from constant-pool)
-; 64BIT-DAG:   renamable $f6 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f7 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f8 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f9 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   $f10 = COPY renamable $f1
-; 64BIT-DAG:   renamable $f11 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f12 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f13 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   BL8_NOP <mcsymbol .test_fpr_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $x2, implicit-def $r1, implicit-def dead $f1
-; 64BIT-NEXT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
-
-; CHECKASM-LABEL:  .caller_fpr_stack:
-
-; ASM32PWR4:       mflr 0
-; ASM32PWR4-NEXT:  stwu 1, -144(1)
-; ASM32PWR4-DAG:   stw 0, 152(1)
-; ASM32PWR4-DAG:   lwz [[REGF1ADDR:[0-9]+]], L..C20(2)
-; ASM32PWR4-DAG:   lwz [[REGF1:[0-9]+]], 0([[REGF1ADDR]])
-; ASM32PWR4-DAG:   lwz [[REGDADDR:[0-9]+]], L..C19(2)
-; ASM32PWR4-DAG:   lfd [[REGD:[0-9]+]], 0([[REGDADDR]])
-; ASM32PWR4-DAG:   lwz [[REGF2ADDR:[0-9]+]], L..C21(2)
-; ASM32PWR4-DAG:   lwz [[REGF2:[0-9]+]], 0([[REGF2ADDR]])
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 56(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 60(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 64(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 68(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 72(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 76(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 80(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 84(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 88(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 92(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 96(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 100(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 108(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 104(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 112(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 116(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 120(1)
-; ASM32PWR4-DAG:   stw [[SCRATCHREG:[0-9]+]], 124(1)
-; ASM32PWR4-DAG:   stw [[REGF1]], 128(1)
-; ASM32PWR4-DAG:   stfd [[REGD]], 132(1)
-; ASM32PWR4-DAG:   stw [[REGF2]], 140(1)
-; ASM32PWR4-NEXT:  bl .test_fpr_stack
-
-; ASM64PWR4:       mflr 0
-; ASM64PWR4-NEXT:  stdu 1, -176(1)
-; ASM64PWR4-DAG:   std 0, 192(1)
-; ASM64PWR4-DAG:   ld [[REGF1ADDR:[0-9]+]], L..C18(2)
-; ASM64PWR4-DAG:   lwz [[REGF1:[0-9]+]], 0([[REGF1ADDR]])
-; ASM64PWR4-DAG:   ld [[REGDADDR:[0-9]+]], L..C19(2)
-; ASM64PWR4-DAG:   ld [[REGD:[0-9]+]], 0([[REGDADDR]])
-; ASM64PWR4-DAG:   ld [[REGF2ADDR:[0-9]+]], L..C20(2)
-; ASM64PWR4-DAG:   lwz [[REGF2:[0-9]+]], 0([[REGF2ADDR]])
-; ASM64PWR4-DAG:   std [[SCRATCHREG:[0-9]+]], 112(1)
-; ASM64PWR4-DAG:   std [[SCRATCHREG:[0-9]+]], 120(1)
-; ASM64PWR4-DAG:   std [[SCRATCHREG:[0-9]+]], 128(1)
-; ASM64PWR4-DAG:   std [[SCRATCHREG:[0-9]+]], 136(1)
-; ASM64PWR4-DAG:   std [[SCRATCHREG:[0-9]+]], 144(1)
-; ASM64PWR4-DAG:   stw [[REGF1]], 152(1)
-; ASM64PWR4-DAG:   std [[REGD]], 160(1)
-; ASM64PWR4-DAG:   stw [[REGF2]], 168(1)
-; ASM64PWR4-NEXT:  bl .test_fpr_stack
-
 define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroext %c1, i16 signext %s1, i64 %ll1, i32 %i1, i32 %i2, i32 %i3) {
+; ASM32PWR4-LABEL: mix_callee:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    lwz 4, 60(1)
+; ASM32PWR4-NEXT:    lis 8, 17200
+; ASM32PWR4-NEXT:    fadd 1, 1, 2
+; ASM32PWR4-NEXT:    fadd 1, 1, 3
+; ASM32PWR4-NEXT:    lwz 5, 56(1)
+; ASM32PWR4-NEXT:    lwz 3, 68(1)
+; ASM32PWR4-NEXT:    add 4, 5, 4
+; ASM32PWR4-NEXT:    lwz 5, L..C34(2) # %const.0
+; ASM32PWR4-NEXT:    fadd 1, 1, 4
+; ASM32PWR4-NEXT:    lwz 6, 72(1)
+; ASM32PWR4-NEXT:    add 3, 4, 3
+; ASM32PWR4-NEXT:    lwz 7, 76(1)
+; ASM32PWR4-NEXT:    add 3, 3, 6
+; ASM32PWR4-NEXT:    stw 8, -16(1)
+; ASM32PWR4-NEXT:    add 3, 3, 7
+; ASM32PWR4-NEXT:    lwz 8, 80(1)
+; ASM32PWR4-NEXT:    add 3, 3, 8
+; ASM32PWR4-NEXT:    lfs 0, 0(5)
+; ASM32PWR4-NEXT:    xoris 3, 3, 32768
+; ASM32PWR4-NEXT:    stw 3, -12(1)
+; ASM32PWR4-NEXT:    addi 3, 1, -4
+; ASM32PWR4-NEXT:    lfd 2, -16(1)
+; ASM32PWR4-NEXT:    fsub 0, 2, 0
+; ASM32PWR4-NEXT:    fadd 0, 0, 1
+; ASM32PWR4-NEXT:    fctiwz 0, 0
+; ASM32PWR4-NEXT:    stfiwx 0, 0, 3
+; ASM32PWR4-NEXT:    lwz 3, -4(1)
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: mix_callee:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    lwz 3, 116(1)
+; ASM64PWR4-NEXT:    add 4, 7, 8
+; ASM64PWR4-NEXT:    fadd 0, 1, 2
+; ASM64PWR4-NEXT:    add 4, 4, 9
+; ASM64PWR4-NEXT:    fadd 0, 0, 3
+; ASM64PWR4-NEXT:    add 4, 4, 10
+; ASM64PWR4-NEXT:    lwz 5, 124(1)
+; ASM64PWR4-NEXT:    add 3, 4, 3
+; ASM64PWR4-NEXT:    add 3, 3, 5
+; ASM64PWR4-NEXT:    fadd 0, 0, 4
+; ASM64PWR4-NEXT:    extsw 3, 3
+; ASM64PWR4-NEXT:    std 3, -16(1)
+; ASM64PWR4-NEXT:    addi 3, 1, -4
+; ASM64PWR4-NEXT:    lfd 1, -16(1)
+; ASM64PWR4-NEXT:    fcfid 1, 1
+; ASM64PWR4-NEXT:    fadd 0, 1, 0
+; ASM64PWR4-NEXT:    fctiwz 0, 0
+; ASM64PWR4-NEXT:    stfiwx 0, 0, 3
+; ASM64PWR4-NEXT:    lwz 3, -4(1)
+; ASM64PWR4-NEXT:    blr
   entry:
     %add = fadd double %d1, %d2
     %add1 = fadd double %add, %d3
@@ -1991,137 +1793,149 @@ define i32 @mix_callee(double %d1, double %d2, double %d3, double %d4, i8 zeroex
     ret i32 %conv16
   }
 
-; CHECK-LABEL: mix_callee
-
-; 32BIT-LABEL: liveins:
-; 32BIT-DAG:   - { reg: '$f1', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f2', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f3', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f4', virtual-reg: '' }
-
-; 32BIT-LABEL: fixedStack:
-; 32BIT-DAG:   - { id: 6, type: default, offset: 56, size: 4
-; 32BIT-DAG:   - { id: 5, type: default, offset: 60, size: 4
-; 32BIT-DAG:   - { id: 4, type: default, offset: 64, size: 4
-; 32BIT-DAG:   - { id: 3, type: default, offset: 68, size: 4
-; 32BIT-DAG:   - { id: 2, type: default, offset: 72, size: 4
-; 32BIT-DAG:   - { id: 1, type: default, offset: 76, size: 4
-; 32BIT-DAG:   - { id: 0, type: default, offset: 80, size: 4
-
-; 32BIT-LABEL: body:             |
-; 32BIT-DAG:     bb.0.entry:
-; 32BIT-DAG:     liveins: $f1, $f2, $f3, $f4
-
-; 64BIT-LABEL:  liveins:
-; 64BIT-DAG:    - { reg: '$f1', virtual-reg: '' }
-; 64BIT-DAG:    - { reg: '$f2', virtual-reg: '' }
-; 64BIT-DAG:    - { reg: '$f3', virtual-reg: '' }
-; 64BIT-DAG:    - { reg: '$f4', virtual-reg: '' }
-; 64BIT-DAG:    - { reg: '$x7', virtual-reg: '' }
-; 64BIT-DAG:    - { reg: '$x8', virtual-reg: '' }
-; 64BIT-DAG:    - { reg: '$x9', virtual-reg: '' }
-; 64BIT-DAG:    - { reg: '$x10', virtual-reg: '' }
-
-; 64BIT-LABEL: fixedStack:
-; 64BIT-DAG:   - { id: 1, type: default, offset: 116, size: 4
-; 64BIT-DAG:   - { id: 0, type: default, offset: 124, size: 4
-
-; 64BIT-LABEL: body:             |
-; 64BIT-DAG:    bb.0.entry:
-; 64BIT-DAG:     liveins: $f1, $f2, $f3, $f4, $x7, $x8, $x9, $x10
-
-; CHECKASM-LABEL:   .mix_callee
-
-; ASM32PWR4-DAG:   lwz [[REG1:[0-9]+]], 56(1)
-; ASM32PWR4-DAG:   lwz [[REG2:[0-9]+]], 60(1)
-; ASM32PWR4-DAG:   lwz [[REG4:[0-9]+]], 68(1)
-; ASM32PWR4-DAG:   lwz [[REG5:[0-9]+]], 72(1)
-; ASM32PWR4-DAG:   lwz [[REG6:[0-9]+]], 76(1)
-; ASM32PWR4-DAG:   lwz [[REG7:[0-9]+]], 80(1)
-; ASM32PWR4-DAG:   blr
-
-; ASM64PWR-DAG:    ld [[REG1:[0-9]+]], 112(1)
-; ASM64PWR-DAG:    ld [[REG2:[0-9]+]], 120(1)
-; ASM64PWR-DAG:    fadd 0, 0, [[REG1]]
-; ASM64PWR-DAG:    add 3, 3, [[REG2]]
-; ASM64PWR-DAG:    blr
-
 define void @caller_mix() {
+; ASM32PWR4-LABEL: caller_mix:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -96(1)
+; ASM32PWR4-NEXT:    li 3, 60
+; ASM32PWR4-NEXT:    stw 0, 104(1)
+; ASM32PWR4-NEXT:    stw 3, 80(1)
+; ASM32PWR4-NEXT:    li 3, 50
+; ASM32PWR4-NEXT:    stw 3, 76(1)
+; ASM32PWR4-NEXT:    li 3, 40
+; ASM32PWR4-NEXT:    stw 3, 72(1)
+; ASM32PWR4-NEXT:    li 3, 0
+; ASM32PWR4-NEXT:    stw 3, 64(1)
+; ASM32PWR4-NEXT:    li 3, 2
+; ASM32PWR4-NEXT:    stw 3, 60(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C35(2) # %const.0
+; ASM32PWR4-NEXT:    lfd 1, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C36(2) # %const.1
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C37(2) # %const.2
+; ASM32PWR4-NEXT:    lfd 3, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C38(2) # %const.3
+; ASM32PWR4-NEXT:    lfd 4, 0(3)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    stw 3, 56(1)
+; ASM32PWR4-NEXT:    lis 3, 457
+; ASM32PWR4-NEXT:    ori 3, 3, 50048
+; ASM32PWR4-NEXT:    stw 3, 68(1)
+; ASM32PWR4-NEXT:    bl .mix_callee
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 96
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: caller_mix:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -128(1)
+; ASM64PWR4-NEXT:    ld 3, L..C33(2) # %const.0
+; ASM64PWR4-NEXT:    ld 4, L..C34(2) # %const.1
+; ASM64PWR4-NEXT:    lis 5, 457
+; ASM64PWR4-NEXT:    li 7, 1
+; ASM64PWR4-NEXT:    std 0, 144(1)
+; ASM64PWR4-NEXT:    ori 9, 5, 50048
+; ASM64PWR4-NEXT:    li 8, 2
+; ASM64PWR4-NEXT:    lfd 1, 0(3)
+; ASM64PWR4-NEXT:    ld 3, L..C35(2) # %const.2
+; ASM64PWR4-NEXT:    li 10, 40
+; ASM64PWR4-NEXT:    lfd 2, 0(4)
+; ASM64PWR4-NEXT:    ld 4, L..C36(2) # %const.3
+; ASM64PWR4-NEXT:    lfd 3, 0(3)
+; ASM64PWR4-NEXT:    li 3, 60
+; ASM64PWR4-NEXT:    lfd 4, 0(4)
+; ASM64PWR4-NEXT:    li 4, 50
+; ASM64PWR4-NEXT:    std 3, 120(1)
+; ASM64PWR4-NEXT:    std 4, 112(1)
+; ASM64PWR4-NEXT:    bl .mix_callee
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    addi 1, 1, 128
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
   entry:
 %call = call i32 @mix_callee(double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, i8 zeroext 1, i16 signext 2, i64 30000000, i32 40, i32 50, i32 60)
     ret void
   }
 
-; CHECK-LABEL: name: caller_mix
-
-; 32BIT-DAG:   ADJCALLSTACKDOWN 84, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.0, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f1 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.1, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f2 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.2, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f3 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LWZtoc %const.3, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $f4 = LFD 0, killed renamable $r[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LI 1
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LI 2
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LIS 457
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LI 0
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LI 40
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LI 50
-; 32BIT-DAG:   renamable $r[[SCRATCHREG:[0-9]+]] = LI 60
-; 32BIT-DAG:   STW killed renamable $r[[REG1:[0-9]+]], 56, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REG2:[0-9]+]], 60, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REG3:[0-9]+]], 64, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REG4:[0-9]+]], 68, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REG5:[0-9]+]], 72, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REG6:[0-9]+]], 76, $r1 :: (store (s32))
-; 32BIT-DAG:   STW killed renamable $r[[REG7:[0-9]+]], 80, $r1 :: (store (s32))
-; 32BIT-DAG:   BL_NOP <mcsymbol .mix_callee>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $r2, implicit-def $r1, implicit-def dead $r3
-; 32BIT-DAG:   ADJCALLSTACKUP 84, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  BLR implicit $lr, implicit $rm
-
-; 64BIT-DAG:   ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.1, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.2, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LDtocCPT %const.3, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $f1 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f2 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f3 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $f4 = LFD 0, killed renamable $x[[SCRATCHREG:[0-9]+]] :: (load (s64) from constant-pool)
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LI8 50
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LI8 60
-; 64BIT-DAG:   renamable $x[[SCRATCHREG:[0-9]+]] = LIS8 457
-; 64BIT-DAG:   $x7 = LI8 1
-; 64BIT-DAG:   $x8 = LI8 2
-; 64BIT-DAG:   $x10 = LI8 40
-; 64BIT-DAG:   STD killed renamable $x[[REG1:[0-9]+]], 112, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG2:[0-9]+]], 120, $x1 :: (store (s64))
-; 64BIT:       ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  BLR8 implicit $lr8, implicit $rm
-
-; CHEKASM-LABEL:   .caller_mix
-
-; ASM32PWR4:        mflr 0
-; ASM32PWR4-DAG:    stw [[REG1:[0-9]+]], 56(1)
-; ASM32PWR4-DAG:    stw [[REG2:[0-9]+]], 60(1)
-; ASM32PWR4-DAG:    stw [[REG3:[0-9]+]], 64(1)
-; ASM32PWR4-DAG:    stw [[REG4:[0-9]+]], 68(1)
-; ASM32PWR4-DAG:    stw [[REG5:[0-9]+]], 72(1)
-; ASM32PWR4-DAG:    stw [[REG6:[0-9]+]], 76(1)
-; ASM32PWR4-DAG:    stw [[REG7:[0-9]+]], 80(1)
-; ASM32PWR4-DAG:    bl .mix_callee
-; ASM32PWR4-DAG:    blr
-
-; ASM64PWR4:        mflr 0
-; ASM64PWR4-DAG:    std [[REG1:[0-9]+]], 112(1)
-; ASM64PWR4-DAG:    std [[REG2:[0-9]+]], 120(1)
-; ASM64PWR4-DAG:    bl .mix_callee
-; ASM64PWR4-DAG:    blr
-
-
   define i32 @mix_floats(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13, double %d14) {
+; ASM32PWR4-LABEL: mix_floats:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    add 3, 3, 4
+; ASM32PWR4-NEXT:    lwz 4, L..C39(2) # %const.0
+; ASM32PWR4-NEXT:    lis 11, 17200
+; ASM32PWR4-NEXT:    stfd 31, -8(1) # 8-byte Folded Spill
+; ASM32PWR4-NEXT:    add 3, 3, 5
+; ASM32PWR4-NEXT:    add 3, 3, 6
+; ASM32PWR4-NEXT:    add 3, 3, 7
+; ASM32PWR4-NEXT:    stw 11, -24(1)
+; ASM32PWR4-NEXT:    add 3, 3, 8
+; ASM32PWR4-NEXT:    add 3, 3, 9
+; ASM32PWR4-NEXT:    add 3, 3, 10
+; ASM32PWR4-NEXT:    lfs 0, 0(4)
+; ASM32PWR4-NEXT:    xoris 3, 3, 32768
+; ASM32PWR4-NEXT:    stw 3, -20(1)
+; ASM32PWR4-NEXT:    addi 3, 1, -12
+; ASM32PWR4-NEXT:    lfd 31, -24(1)
+; ASM32PWR4-NEXT:    fsub 0, 31, 0
+; ASM32PWR4-NEXT:    fadd 0, 0, 1
+; ASM32PWR4-NEXT:    lfd 1, 160(1)
+; ASM32PWR4-NEXT:    fadd 0, 0, 2
+; ASM32PWR4-NEXT:    fadd 0, 0, 3
+; ASM32PWR4-NEXT:    fadd 0, 0, 4
+; ASM32PWR4-NEXT:    fadd 0, 0, 5
+; ASM32PWR4-NEXT:    fadd 0, 0, 6
+; ASM32PWR4-NEXT:    fadd 0, 0, 7
+; ASM32PWR4-NEXT:    fadd 0, 0, 8
+; ASM32PWR4-NEXT:    fadd 0, 0, 9
+; ASM32PWR4-NEXT:    fadd 0, 0, 10
+; ASM32PWR4-NEXT:    fadd 0, 0, 11
+; ASM32PWR4-NEXT:    fadd 0, 0, 12
+; ASM32PWR4-NEXT:    fadd 0, 0, 13
+; ASM32PWR4-NEXT:    fadd 0, 0, 1
+; ASM32PWR4-NEXT:    fctiwz 0, 0
+; ASM32PWR4-NEXT:    stfiwx 0, 0, 3
+; ASM32PWR4-NEXT:    lwz 3, -12(1)
+; ASM32PWR4-NEXT:    lfd 31, -8(1) # 8-byte Folded Reload
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: mix_floats:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    add 3, 3, 4
+; ASM64PWR4-NEXT:    add 3, 3, 5
+; ASM64PWR4-NEXT:    add 3, 3, 6
+; ASM64PWR4-NEXT:    add 3, 3, 7
+; ASM64PWR4-NEXT:    add 3, 3, 8
+; ASM64PWR4-NEXT:    add 3, 3, 9
+; ASM64PWR4-NEXT:    add 3, 3, 10
+; ASM64PWR4-NEXT:    extsw 3, 3
+; ASM64PWR4-NEXT:    std 3, -16(1)
+; ASM64PWR4-NEXT:    addi 3, 1, -4
+; ASM64PWR4-NEXT:    lfd 0, -16(1)
+; ASM64PWR4-NEXT:    fcfid 0, 0
+; ASM64PWR4-NEXT:    fadd 0, 0, 1
+; ASM64PWR4-NEXT:    lfd 1, 216(1)
+; ASM64PWR4-NEXT:    fadd 0, 0, 2
+; ASM64PWR4-NEXT:    fadd 0, 0, 3
+; ASM64PWR4-NEXT:    fadd 0, 0, 4
+; ASM64PWR4-NEXT:    fadd 0, 0, 5
+; ASM64PWR4-NEXT:    fadd 0, 0, 6
+; ASM64PWR4-NEXT:    fadd 0, 0, 7
+; ASM64PWR4-NEXT:    fadd 0, 0, 8
+; ASM64PWR4-NEXT:    fadd 0, 0, 9
+; ASM64PWR4-NEXT:    fadd 0, 0, 10
+; ASM64PWR4-NEXT:    fadd 0, 0, 11
+; ASM64PWR4-NEXT:    fadd 0, 0, 12
+; ASM64PWR4-NEXT:    fadd 0, 0, 13
+; ASM64PWR4-NEXT:    fadd 0, 0, 1
+; ASM64PWR4-NEXT:    fctiwz 0, 0
+; ASM64PWR4-NEXT:    stfiwx 0, 0, 3
+; ASM64PWR4-NEXT:    lwz 3, -4(1)
+; ASM64PWR4-NEXT:    blr
   entry:
     %add = add nsw i32 %i1, %i2
     %add1 = add nsw i32 %add, %i3
@@ -2149,203 +1963,240 @@ define void @caller_mix() {
     ret i32 %conv21
   }
 
-; CHECK-LABEL:   mix_floats
-
-; 32BIT-LABEL:  liveins:
-; 32BIT-DAG:   - { reg: '$r3', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r4', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r5', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r6', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r7', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r8', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r9', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$r10', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f1', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f2', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f3', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f4', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f5', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f6', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f7', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f8', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f9', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f10', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f11', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f12', virtual-reg: '' }
-; 32BIT-DAG:   - { reg: '$f13', virtual-reg: '' }
-
-; 32BIT-LABEL: fixedStack:
-; 32BIT-DAG:   - { id: 0, type: default, offset: 160, size: 8
-
-; 32BIT-LABEL: body:             |
-; 32BIT-DAG:     bb.0.entry:
-; 32BIT-DAG:       liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-
-; 64BIT-DAG:   liveins:
-; 64BIT-DAG:   - { reg: '$x3', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x4', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x5', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x6', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x7', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x8', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x9', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$x10', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f1', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f2', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f3', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f4', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f5', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f6', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f7', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f8', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f9', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f10', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f11', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f12', virtual-reg: '' }
-; 64BIT-DAG:   - { reg: '$f13', virtual-reg: '' }
-
-; 64BIT-LABEL: fixedStack:
-; 64BIT-DAG:   - { id: 0, type: default, offset: 216, size: 8
-
-; 64BIT-LABEL: body:             |
-; 64BIT-DAG:     bb.0.entry:
-; 64BIT-DAG:       liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
-
-; CHECKASM-LABEL: .mix_floats:
-
-; ASM32PWR4-DAG:   lfd [[REGF:[0-9]+]], 160(1)
-; ASM32PWR4-DAG:   fadd 0, 0, [[REGF]]
-; ASM32PWR4-DAG:   blr
-
-; ASM64PWR4-DAG:   lfd [[REG1:[0-9]+]], 216(1)
-; ASM64PWR4-DAG:   fadd 0, 0, [[REG1]]
-; ASM64PWR4-DAG:   blr
-
   define void @mix_floats_caller() {
+; ASM32PWR4-LABEL: mix_floats_caller:
+; ASM32PWR4:       # %bb.0: # %entry
+; ASM32PWR4-NEXT:    mflr 0
+; ASM32PWR4-NEXT:    stwu 1, -176(1)
+; ASM32PWR4-NEXT:    li 3, 0
+; ASM32PWR4-NEXT:    stw 0, 184(1)
+; ASM32PWR4-NEXT:    lis 4, 16352
+; ASM32PWR4-NEXT:    lis 5, 16339
+; ASM32PWR4-NEXT:    lis 6, 16364
+; ASM32PWR4-NEXT:    stw 3, 92(1)
+; ASM32PWR4-NEXT:    ori 5, 5, 13107
+; ASM32PWR4-NEXT:    ori 6, 6, 52428
+; ASM32PWR4-NEXT:    stw 3, 132(1)
+; ASM32PWR4-NEXT:    lis 3, 16368
+; ASM32PWR4-NEXT:    li 8, 6
+; ASM32PWR4-NEXT:    li 9, 7
+; ASM32PWR4-NEXT:    li 10, 8
+; ASM32PWR4-NEXT:    stw 3, 128(1)
+; ASM32PWR4-NEXT:    lis 3, -26215
+; ASM32PWR4-NEXT:    ori 3, 3, 39322
+; ASM32PWR4-NEXT:    stw 4, 88(1)
+; ASM32PWR4-NEXT:    lis 4, 16313
+; ASM32PWR4-NEXT:    ori 4, 4, 39321
+; ASM32PWR4-NEXT:    stw 3, 60(1)
+; ASM32PWR4-NEXT:    stw 3, 68(1)
+; ASM32PWR4-NEXT:    stw 3, 84(1)
+; ASM32PWR4-NEXT:    stw 3, 116(1)
+; ASM32PWR4-NEXT:    stw 3, 140(1)
+; ASM32PWR4-NEXT:    lis 3, 16369
+; ASM32PWR4-NEXT:    ori 3, 3, 39321
+; ASM32PWR4-NEXT:    stw 4, 56(1)
+; ASM32PWR4-NEXT:    lis 4, 16329
+; ASM32PWR4-NEXT:    ori 4, 4, 39321
+; ASM32PWR4-NEXT:    stw 3, 136(1)
+; ASM32PWR4-NEXT:    lis 3, 16371
+; ASM32PWR4-NEXT:    ori 3, 3, 13107
+; ASM32PWR4-NEXT:    stw 4, 64(1)
+; ASM32PWR4-NEXT:    lis 4, 13107
+; ASM32PWR4-NEXT:    ori 4, 4, 13107
+; ASM32PWR4-NEXT:    stw 3, 144(1)
+; ASM32PWR4-NEXT:    lis 3, 16372
+; ASM32PWR4-NEXT:    ori 3, 3, 52428
+; ASM32PWR4-NEXT:    stw 4, 76(1)
+; ASM32PWR4-NEXT:    stw 4, 100(1)
+; ASM32PWR4-NEXT:    stw 4, 148(1)
+; ASM32PWR4-NEXT:    lwz 4, L..C40(2) # %const.0
+; ASM32PWR4-NEXT:    stw 3, 152(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C41(2) # %const.1
+; ASM32PWR4-NEXT:    lfd 1, 0(4)
+; ASM32PWR4-NEXT:    lwz 4, L..C42(2) # %const.2
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C43(2) # %const.3
+; ASM32PWR4-NEXT:    stw 5, 72(1)
+; ASM32PWR4-NEXT:    lis 5, 16345
+; ASM32PWR4-NEXT:    ori 5, 5, 39321
+; ASM32PWR4-NEXT:    stw 5, 80(1)
+; ASM32PWR4-NEXT:    lis 5, 16355
+; ASM32PWR4-NEXT:    ori 5, 5, 13107
+; ASM32PWR4-NEXT:    lfd 3, 0(4)
+; ASM32PWR4-NEXT:    lwz 4, L..C44(2) # %const.4
+; ASM32PWR4-NEXT:    lfd 4, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C45(2) # %const.5
+; ASM32PWR4-NEXT:    stw 5, 96(1)
+; ASM32PWR4-NEXT:    lis 5, 26214
+; ASM32PWR4-NEXT:    ori 7, 5, 26214
+; ASM32PWR4-NEXT:    lis 5, 16358
+; ASM32PWR4-NEXT:    lfd 6, 0(4)
+; ASM32PWR4-NEXT:    lwz 4, L..C46(2) # %const.6
+; ASM32PWR4-NEXT:    ori 5, 5, 26214
+; ASM32PWR4-NEXT:    lfd 7, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C47(2) # %const.7
+; ASM32PWR4-NEXT:    stw 5, 104(1)
+; ASM32PWR4-NEXT:    lis 5, 16361
+; ASM32PWR4-NEXT:    ori 5, 5, 39321
+; ASM32PWR4-NEXT:    lfd 8, 0(4)
+; ASM32PWR4-NEXT:    lwz 4, L..C48(2) # %const.8
+; ASM32PWR4-NEXT:    lfd 9, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C49(2) # %const.9
+; ASM32PWR4-NEXT:    stw 5, 112(1)
+; ASM32PWR4-NEXT:    lis 5, -13108
+; ASM32PWR4-NEXT:    ori 5, 5, 52429
+; ASM32PWR4-NEXT:    stw 5, 124(1)
+; ASM32PWR4-NEXT:    stw 5, 156(1)
+; ASM32PWR4-NEXT:    lwz 5, L..C50(2) # %const.12
+; ASM32PWR4-NEXT:    lfd 11, 0(4)
+; ASM32PWR4-NEXT:    lwz 4, L..C51(2) # %const.10
+; ASM32PWR4-NEXT:    lfd 12, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C52(2) # %const.11
+; ASM32PWR4-NEXT:    lfd 13, 0(4)
+; ASM32PWR4-NEXT:    lis 4, 16374
+; ASM32PWR4-NEXT:    ori 11, 4, 26214
+; ASM32PWR4-NEXT:    li 4, 2
+; ASM32PWR4-NEXT:    lfs 5, 0(3)
+; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    lfs 10, 0(5)
+; ASM32PWR4-NEXT:    li 5, 3
+; ASM32PWR4-NEXT:    stw 7, 108(1)
+; ASM32PWR4-NEXT:    stw 6, 120(1)
+; ASM32PWR4-NEXT:    li 6, 4
+; ASM32PWR4-NEXT:    stw 7, 164(1)
+; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    stw 11, 160(1)
+; ASM32PWR4-NEXT:    bl .mix_floats
+; ASM32PWR4-NEXT:    nop
+; ASM32PWR4-NEXT:    addi 1, 1, 176
+; ASM32PWR4-NEXT:    lwz 0, 8(1)
+; ASM32PWR4-NEXT:    mtlr 0
+; ASM32PWR4-NEXT:    blr
+;
+; ASM64PWR4-LABEL: mix_floats_caller:
+; ASM64PWR4:       # %bb.0: # %entry
+; ASM64PWR4-NEXT:    mflr 0
+; ASM64PWR4-NEXT:    stdu 1, -240(1)
+; ASM64PWR4-NEXT:    li 3, 1023
+; ASM64PWR4-NEXT:    std 0, 256(1)
+; ASM64PWR4-NEXT:    ld 4, L..C37(2) # %const.0
+; ASM64PWR4-NEXT:    ld 8, L..C38(2) # %const.6
+; ASM64PWR4-NEXT:    lis 5, 16371
+; ASM64PWR4-NEXT:    ld 6, L..C39(2) # %const.3
+; ASM64PWR4-NEXT:    ld 9, L..C40(2) # %const.9
+; ASM64PWR4-NEXT:    ld 10, L..C41(2) # %const.11
+; ASM64PWR4-NEXT:    rldic 3, 3, 52, 2
+; ASM64PWR4-NEXT:    lis 11, 4091
+; ASM64PWR4-NEXT:    std 3, 184(1)
+; ASM64PWR4-NEXT:    li 3, 511
+; ASM64PWR4-NEXT:    lis 12, 16361
+; ASM64PWR4-NEXT:    rldic 3, 3, 53, 2
+; ASM64PWR4-NEXT:    lfd 1, 0(4)
+; ASM64PWR4-NEXT:    ld 4, L..C42(2) # %const.2
+; ASM64PWR4-NEXT:    lis 0, 16345
+; ASM64PWR4-NEXT:    std 3, 144(1)
+; ASM64PWR4-NEXT:    ld 3, L..C43(2) # %const.1
+; ASM64PWR4-NEXT:    lfd 2, 0(3)
+; ASM64PWR4-NEXT:    lis 3, 16374
+; ASM64PWR4-NEXT:    ori 7, 3, 26214
+; ASM64PWR4-NEXT:    ori 3, 5, 13107
+; ASM64PWR4-NEXT:    ld 5, L..C44(2) # %const.5
+; ASM64PWR4-NEXT:    lfd 8, 0(8)
+; ASM64PWR4-NEXT:    ld 8, L..C45(2) # %const.8
+; ASM64PWR4-NEXT:    rldimi 7, 7, 32, 0
+; ASM64PWR4-NEXT:    rlwimi 7, 7, 16, 0, 15
+; ASM64PWR4-NEXT:    rldimi 3, 3, 32, 0
+; ASM64PWR4-NEXT:    lfd 3, 0(4)
+; ASM64PWR4-NEXT:    ld 4, L..C46(2) # %const.4
+; ASM64PWR4-NEXT:    rlwimi 3, 3, 16, 0, 15
+; ASM64PWR4-NEXT:    lfd 4, 0(6)
+; ASM64PWR4-NEXT:    lis 6, 16355
+; ASM64PWR4-NEXT:    lfd 7, 0(5)
+; ASM64PWR4-NEXT:    ori 5, 6, 13107
+; ASM64PWR4-NEXT:    ld 6, L..C47(2) # %const.7
+; ASM64PWR4-NEXT:    rldimi 5, 5, 32, 0
+; ASM64PWR4-NEXT:    rlwimi 5, 5, 16, 0, 15
+; ASM64PWR4-NEXT:    lfd 11, 0(8)
+; ASM64PWR4-NEXT:    ld 8, L..C48(2) # %const.10
+; ASM64PWR4-NEXT:    lfd 6, 0(4)
+; ASM64PWR4-NEXT:    lis 4, 16358
+; ASM64PWR4-NEXT:    ori 4, 4, 26214
+; ASM64PWR4-NEXT:    rldimi 4, 4, 32, 0
+; ASM64PWR4-NEXT:    lfd 9, 0(6)
+; ASM64PWR4-NEXT:    lis 6, 16339
+; ASM64PWR4-NEXT:    rlwimi 4, 4, 16, 0, 15
+; ASM64PWR4-NEXT:    ori 6, 6, 13107
+; ASM64PWR4-NEXT:    lfd 12, 0(9)
+; ASM64PWR4-NEXT:    lis 9, 4093
+; ASM64PWR4-NEXT:    ori 9, 9, 13107
+; ASM64PWR4-NEXT:    lfd 13, 0(8)
+; ASM64PWR4-NEXT:    lis 8, 16369
+; ASM64PWR4-NEXT:    ori 8, 8, 39321
+; ASM64PWR4-NEXT:    rldimi 6, 6, 32, 0
+; ASM64PWR4-NEXT:    std 31, 232(1) # 8-byte Folded Spill
+; ASM64PWR4-NEXT:    ld 31, L..C49(2) # %const.12
+; ASM64PWR4-NEXT:    rldic 9, 9, 34, 2
+; ASM64PWR4-NEXT:    rlwimi 6, 6, 16, 0, 15
+; ASM64PWR4-NEXT:    oris 9, 9, 52428
+; ASM64PWR4-NEXT:    lfs 5, 0(10)
+; ASM64PWR4-NEXT:    lis 10, 16329
+; ASM64PWR4-NEXT:    ori 10, 10, 39321
+; ASM64PWR4-NEXT:    std 7, 216(1)
+; ASM64PWR4-NEXT:    ori 7, 11, 13107
+; ASM64PWR4-NEXT:    ori 11, 12, 39321
+; ASM64PWR4-NEXT:    ori 12, 0, 39321
+; ASM64PWR4-NEXT:    std 4, 160(1)
+; ASM64PWR4-NEXT:    rldic 4, 8, 32, 2
+; ASM64PWR4-NEXT:    rldic 7, 7, 34, 2
+; ASM64PWR4-NEXT:    oris 4, 4, 39321
+; ASM64PWR4-NEXT:    std 30, 224(1) # 8-byte Folded Spill
+; ASM64PWR4-NEXT:    lis 30, 16313
+; ASM64PWR4-NEXT:    rldic 8, 11, 32, 2
+; ASM64PWR4-NEXT:    rldic 11, 12, 32, 2
+; ASM64PWR4-NEXT:    std 3, 200(1)
+; ASM64PWR4-NEXT:    ori 3, 30, 39321
+; ASM64PWR4-NEXT:    ori 4, 4, 39322
+; ASM64PWR4-NEXT:    rldic 3, 3, 32, 2
+; ASM64PWR4-NEXT:    std 5, 152(1)
+; ASM64PWR4-NEXT:    rldic 5, 10, 32, 2
+; ASM64PWR4-NEXT:    oris 5, 5, 39321
+; ASM64PWR4-NEXT:    oris 3, 3, 39321
+; ASM64PWR4-NEXT:    std 6, 128(1)
+; ASM64PWR4-NEXT:    oris 6, 7, 52428
+; ASM64PWR4-NEXT:    ori 7, 9, 52429
+; ASM64PWR4-NEXT:    li 9, 7
+; ASM64PWR4-NEXT:    lfs 10, 0(31)
+; ASM64PWR4-NEXT:    li 10, 8
+; ASM64PWR4-NEXT:    std 7, 208(1)
+; ASM64PWR4-NEXT:    oris 7, 8, 39321
+; ASM64PWR4-NEXT:    oris 8, 11, 39321
+; ASM64PWR4-NEXT:    ori 11, 3, 39322
+; ASM64PWR4-NEXT:    li 3, 1
+; ASM64PWR4-NEXT:    std 4, 192(1)
+; ASM64PWR4-NEXT:    ori 4, 6, 52429
+; ASM64PWR4-NEXT:    ori 6, 8, 39322
+; ASM64PWR4-NEXT:    std 4, 176(1)
+; ASM64PWR4-NEXT:    ori 4, 7, 39322
+; ASM64PWR4-NEXT:    ori 7, 5, 39322
+; ASM64PWR4-NEXT:    li 5, 3
+; ASM64PWR4-NEXT:    li 8, 6
+; ASM64PWR4-NEXT:    std 4, 168(1)
+; ASM64PWR4-NEXT:    li 4, 2
+; ASM64PWR4-NEXT:    std 6, 136(1)
+; ASM64PWR4-NEXT:    li 6, 4
+; ASM64PWR4-NEXT:    std 7, 120(1)
+; ASM64PWR4-NEXT:    li 7, 5
+; ASM64PWR4-NEXT:    std 11, 112(1)
+; ASM64PWR4-NEXT:    bl .mix_floats
+; ASM64PWR4-NEXT:    nop
+; ASM64PWR4-NEXT:    ld 31, 232(1) # 8-byte Folded Reload
+; ASM64PWR4-NEXT:    ld 30, 224(1) # 8-byte Folded Reload
+; ASM64PWR4-NEXT:    addi 1, 1, 240
+; ASM64PWR4-NEXT:    ld 0, 16(1)
+; ASM64PWR4-NEXT:    mtlr 0
+; ASM64PWR4-NEXT:    blr
   entry:
     %call = call i32 @mix_floats(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, double 5.000000e-01, double 6.000000e-01, double 0x3FE6666666666666, double 8.000000e-01, double 9.000000e-01, double 1.000000e+00, double 1.100000e+00, double 1.200000e+00, double 1.300000e+00, double 1.400000e+00)
     ret void
   }
 
-; CHECK-LABEL: mix_floats_caller
-
-; 32BIT-DAG:   ADJCALLSTACKDOWN 168, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   $r9 = LI 7
-; 32BIT-DAG:   $r10 = LI 8
-; 32BIT-DAG:   STW killed renamable $r[[REG1:[0-9]+]], 56, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG2:[0-9]+]], 60, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG3:[0-9]+]], 64, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG4:[0-9]+]], 68, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG5:[0-9]+]], 72, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG6:[0-9]+]], 76, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG7:[0-9]+]], 80, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG8:[0-9]+]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG9:[0-9]+]], 88, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG10:[0-9]+]], 92, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG11:[0-9]+]], 96, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG12:[0-9]+]], 100, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG13:[0-9]+]], 104, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG14:[0-9]+]], 108, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG15:[0-9]+]], 112, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG16:[0-9]+]], 116, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG17:[0-9]+]], 120, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG18:[0-9]+]], 128, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW renamable $r[[REG19:[0-9]+]], 124, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG20:[0-9]+]], 132, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG21:[0-9]+]], 136, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG22:[0-9]+]], 140, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG23:[0-9]+]], 144, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG24:[0-9]+]], 148, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG25:[0-9]+]], 152, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG26:[0-9]+]], 156, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG27:[0-9]+]], 160, $r1 :: (store (s32), align 8)
-; 32BIT-DAG:   STW killed renamable $r[[REG28:[0-9]+]], 164, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-; 32BIT-NEXT:  BL_NOP <mcsymbol .mix_floats>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $r3
-; 32BIT-NEXT:   ADJCALLSTACKUP 168, 0, implicit-def dead $r1, implicit $r1
-
-
-; 64BIT-DAG:   ADJCALLSTACKDOWN 224, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   $x9 = LI8 7
-; 64BIT-DAG:   $x10 = LI8 8
-; 64BIT-DAG:   STD killed renamable $x[[REG1:[0-9]+]], 112, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG2:[0-9]+]], 120, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG3:[0-9]+]], 128, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG4:[0-9]+]], 136, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG5:[0-9]+]], 144, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG6:[0-9]+]], 152, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG7:[0-9]+]], 160, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG8:[0-9]+]], 168, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG9:[0-9]+]], 176, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG10:[0-9]+]], 184, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG12:[0-9]+]], 192, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG13:[0-9]+]], 200, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG14:[0-9]+]], 208, $x1 :: (store (s64))
-; 64BIT-DAG:   STD killed renamable $x[[REG15:[0-9]+]], 216, $x1 :: (store (s64))
-; 64BIT-DAG:   BL8_NOP <mcsymbol .mix_floats>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit $f10, implicit $f11, implicit $f12, implicit $f13, implicit $x2, implicit-def $r1, implicit-def dead $x3
-; 64BIT-NEXT:  ADJCALLSTACKUP 224, 0, implicit-def dead $r1, implicit $r1
-
-; CHEKASM-LABEL:    .mix_floats_caller:
-
-; ASM32PWR4:       mflr 0
-; ASM32PWR4-NEXT:  stwu 1, -176(1)
-; ASM32PWR4-DAG:   stw 0, 184(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 56(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 60(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 64(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 68(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 72(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 76(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 80(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 84(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 88(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 92(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 96(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 100(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 104(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 108(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 112(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 116(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 120(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 124(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 128(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 132(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 136(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 140(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 144(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 148(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 152(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 156(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 160(1)
-; ASM32PWR4-DAG:   stw [[REG:[0-9]+]], 164(1)
-; ASM32PWR4:       bl .mix_floats
-
-; ASM64PWR4:      mflr 0
-; ASM64PWR4-NEXT: stdu 1, -240(1)
-; ASM64PWR4-DAG:  std 0, 256(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 112(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 120(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 128(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 136(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 144(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 152(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 160(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 168(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 176(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 184(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 192(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 200(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 208(1)
-; ASM64PWR4-DAG:  std [[REG:[0-9]+]], 216(1)
-; ASM64PWR4:      bl .mix_floats

From 9557fcca563dba3dd31769c297bb3b97d6e614f9 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Fri, 1 Dec 2023 09:32:22 +0100
Subject: [PATCH 09/72] [libc] Fix lint message (#73956)

---
 libc/src/__support/CMakeLists.txt                 |  2 ++
 libc/src/__support/str_to_num_result.h            | 13 ++++++++-----
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel |  3 +--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index a76b22960f5a50..decd6ed2dbd2bd 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -79,6 +79,8 @@ add_header_library(
   str_to_num_result
   HDRS
     str_to_num_result.h
+  DEPENDS
+    libc.src.__support.macros.attributes
 )
 
 add_header_library(
diff --git a/libc/src/__support/str_to_num_result.h b/libc/src/__support/str_to_num_result.h
index 9ba704c690655e..b32fbdeeb580fe 100644
--- a/libc/src/__support/str_to_num_result.h
+++ b/libc/src/__support/str_to_num_result.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_STR_TO_NUM_RESULT_H
 #define LLVM_LIBC_SRC___SUPPORT_STR_TO_NUM_RESULT_H
 
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE {
@@ -18,15 +20,16 @@ template <typename T> struct StrToNumResult {
   int error;
   ptrdiff_t parsed_len;
 
-  constexpr StrToNumResult(T value) : value(value), error(0), parsed_len(0) {}
-  constexpr StrToNumResult(T value, ptrdiff_t parsed_len)
+  LIBC_INLINE constexpr StrToNumResult(T value)
+      : value(value), error(0), parsed_len(0) {}
+  LIBC_INLINE constexpr StrToNumResult(T value, ptrdiff_t parsed_len)
       : value(value), error(0), parsed_len(parsed_len) {}
-  constexpr StrToNumResult(T value, ptrdiff_t parsed_len, int error)
+  LIBC_INLINE constexpr StrToNumResult(T value, ptrdiff_t parsed_len, int error)
       : value(value), error(error), parsed_len(parsed_len) {}
 
-  constexpr bool has_error() { return error != 0; }
+  LIBC_INLINE constexpr bool has_error() { return error != 0; }
 
-  constexpr operator T() { return value; }
+  LIBC_INLINE constexpr operator T() { return value; }
 };
 } // namespace LIBC_NAMESPACE
 
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index a0a6a4366ea753..fdd620a4d415c0 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -474,8 +474,7 @@ libc_support_library(
 libc_support_library(
     name = "__support_str_to_num_result",
     hdrs = ["src/__support/str_to_num_result.h"],
-    deps = [
-    ],
+    deps = [":__support_macros_attributes"],
 )
 
 libc_support_library(

From 1726b65e4c273d55dd54838a742b03caff4abcdd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Fri, 1 Dec 2023 08:45:13 +0000
Subject: [PATCH 10/72] [MLIR][Vector] Refactor tests for contract -> OP
 transforms (4/N) (#73807)

This patch refactors tests for:

    vector.contract -> vector.outerproduct

for matvec operations (b += Ax). Summary of changes:
  * add 2 missing cases (masked + scalable) when the operation kind is
    `maxf`.

This is a part of a larger effort to add cases with scalable vectors to
tests for the Vector dialect.

Implements #72834.
---
 ...act-to-outerproduct-matvec-transforms.mlir | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
index e84a43feaff39d..8fed1f8fb34154 100644
--- a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
@@ -186,6 +186,52 @@ func.func @matvec_mk_k_m_max(%A: vector<2x2xf32>,
   return %0 : vector<2xf32>
 }
 
+// CHECK-LABEL:   func.func @masked_matvec_mk_k_m_max(
+// CHECK-SAME:      %{{.*}}: vector<2x3xf32>,
+// CHECK-SAME:      %{{.*}}: vector<3xf32>,
+// CHECK-SAME:      %{{.*}}: vector<2xf32>,
+// CHECK-SAME:      %[[IN_MASK:.*]]: vector<2x3xi1>) -> vector<2xf32>
+// CHECK:           %[[T_MASK:.*]] = vector.transpose %[[IN_MASK]], [1, 0] : vector<2x3xi1> to vector<3x2xi1>
+// CHECK:           %[[MASK0:.*]] = vector.extract %[[T_MASK]][0] : vector<2xi1> from vector<3x2xi1>
+// CHECK:           vector.mask %[[MASK0]] { vector.outerproduct {{.*}} {kind = #vector.kind<maxf>} : vector<2xf32>, f32 } : vector<2xi1> -> vector<2xf32>
+
+// CHECK:           %[[MASK1:.*]] = vector.extract %[[T_MASK]][1] : vector<2xi1> from vector<3x2xi1>
+// CHECK:           vector.mask %[[MASK1]] { vector.outerproduct {{.*}} {kind = #vector.kind<maxf>} : vector<2xf32>, f32 } : vector<2xi1> -> vector<2xf32>
+
+// CHECK:           %[[MASK2:.*]] = vector.extract %[[T_MASK]][2] : vector<2xi1> from vector<3x2xi1>
+// CHECK:           vector.mask %[[MASK2]] { vector.outerproduct {{.*}} {kind = #vector.kind<maxf>} : vector<2xf32>, f32 } : vector<2xi1> -> vector<2xf32>
+func.func @masked_matvec_mk_k_m_max(%A: vector<2x3xf32>,
+                                    %x: vector<3xf32>,
+                                    %b: vector<2xf32>,
+                                    %m: vector<2x3xi1>) -> vector<2xf32> {
+  %0 = vector.mask %m { vector.contract #matvecmax_trait %A, %x, %b
+          : vector<2x3xf32>, vector<3xf32> into vector<2xf32> } : vector<2x3xi1> -> vector<2xf32>
+  return %0 : vector<2xf32>
+}
+
+// CHECK-LABEL:   func.func @masked_matvec_mk_k_m_max_scalable_parallel_dim(
+// CHECK-SAME:      %{{.*}}: vector<[2]x3xf32>,
+// CHECK-SAME:      %{{.*}}: vector<3xf32>,
+// CHECK-SAME:      %{{.*}}: vector<[2]xf32>,
+// CHECK-SAME:      %[[IN_MASK:.*]]: vector<[2]x3xi1>) -> vector<[2]xf32>
+// CHECK:           %[[T_MASK:.*]] = vector.transpose %[[IN_MASK]], [1, 0] : vector<[2]x3xi1> to vector<3x[2]xi1>
+// CHECK:           %[[MASK0:.*]] = vector.extract %[[T_MASK]][0] : vector<[2]xi1> from vector<3x[2]xi1>
+// CHECK:           vector.mask %[[MASK0]] { vector.outerproduct {{.*}} {kind = #vector.kind<maxf>} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32>
+
+// CHECK:           %[[MASK1:.*]] = vector.extract %[[T_MASK]][1] : vector<[2]xi1> from vector<3x[2]xi1>
+// CHECK:           vector.mask %[[MASK1]] { vector.outerproduct {{.*}} {kind = #vector.kind<maxf>} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32>
+
+// CHECK:           %[[MASK2:.*]] = vector.extract %[[T_MASK]][2] : vector<[2]xi1> from vector<3x[2]xi1>
+// CHECK:           vector.mask %[[MASK2]] { vector.outerproduct {{.*}} {kind = #vector.kind<maxf>} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32>
+func.func @masked_matvec_mk_k_m_max_scalable_parallel_dim(%A: vector<[2]x3xf32>,
+                                                          %x: vector<3xf32>,
+                                                          %b: vector<[2]xf32>,
+                                                          %m: vector<[2]x3xi1>) -> vector<[2]xf32> {
+  %0 = vector.mask %m { vector.contract #matvecmax_trait %A, %x, %b
+          : vector<[2]x3xf32>, vector<3xf32> into vector<[2]xf32> } : vector<[2]x3xi1> -> vector<[2]xf32>
+  return %0 : vector<[2]xf32>
+}
+
 // ============================================================================
 //  Matvec 2 (plain + masked + scalable)
 // ============================================================================

From 2c976a1fac5c0d6fe1cd7c3637f3d16cc378f52b Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Fri, 1 Dec 2023 09:47:26 +0100
Subject: [PATCH 11/72] [libc] Fix _Float16 detection for x86 (#73947)

---
 libc/src/__support/macros/properties/CMakeLists.txt | 1 +
 libc/src/__support/macros/properties/float.h        | 3 ++-
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel   | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt
index e37cdb78bfa2c5..ee87ce68c9da37 100644
--- a/libc/src/__support/macros/properties/CMakeLists.txt
+++ b/libc/src/__support/macros/properties/CMakeLists.txt
@@ -31,5 +31,6 @@ add_header_library(
   DEPENDS
     .architectures
     .compiler
+    .cpu_features
     .os
 )
diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h
index 7e00ddc8f0cd32..4bafc3777a4714 100644
--- a/libc/src/__support/macros/properties/float.h
+++ b/libc/src/__support/macros/properties/float.h
@@ -13,6 +13,7 @@
 
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/compiler.h"
+#include "src/__support/macros/properties/cpu_features.h"
 #include "src/__support/macros/properties/os.h"
 
 #include <float.h> // LDBL_MANT_DIG
@@ -30,7 +31,7 @@
 #endif
 
 // float16 support.
-#if defined(LIBC_TARGET_ARCH_IS_X86_64)
+#if defined(LIBC_TARGET_ARCH_IS_X86_64) && defined(LIBC_TARGET_CPU_HAS_SSE2)
 #if (defined(LIBC_COMPILER_CLANG_VER) && (LIBC_COMPILER_CLANG_VER >= 1500)) || \
     (defined(LIBC_COMPILER_GCC_VER) && (LIBC_COMPILER_GCC_VER >= 1201))
 #define LIBC_COMPILER_HAS_C23_FLOAT16
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index fdd620a4d415c0..d53ca202101537 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -86,6 +86,7 @@ libc_support_library(
     deps = [
         ":__support_macros_properties_architectures",
         ":__support_macros_properties_compiler",
+        ":__support_macros_properties_cpu_features",
         ":__support_macros_properties_os",
     ],
 )

From d55692d60d218f402ce107520daabed15f2d9ef6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 10 Nov 2023 22:49:21 +0900
Subject: [PATCH 12/72] Reapply "ValueTracking: Identify implied fp classes by
 general fcmp (#66505)"

This reverts commit 96a0d714d58e48c363ee6abbbcdfd7a6ce646ac1.

Avoid assert with dynamic denormal-fp-math. We don't recognize compares
with 0 as an exact class test if we don't know the denormal mode. We could
try to do better here, but it's probably not worth it.

Fixes asserts reported after 1adce7d8e47e2438f99f91607760b825e5e3cc37
---
 llvm/include/llvm/Analysis/ValueTracking.h    |  21 +
 llvm/lib/Analysis/ValueTracking.cpp           | 185 +++++++-
 .../Attributor/nofpclass-implied-by-fcmp.ll   | 446 +++++++++---------
 .../assume-fcmp-constant-implies-class.ll     | 270 ++++-------
 4 files changed, 500 insertions(+), 422 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index f353eec8c89bb9..82c87edd6297cd 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -214,6 +214,27 @@ std::pair<Value *, FPClassTest> fcmpToClassTest(CmpInst::Predicate Pred,
                                                 const APFloat *ConstRHS,
                                                 bool LookThroughSrc = true);
 
+/// Compute the possible floating-point classes that \p LHS could be based on an
+/// fcmp returning true. Returns { TestedValue, ClassesIfTrue, ClassesIfFalse }
+///
+/// If the compare returns an exact class test, ClassesIfTrue == ~ClassesIfFalse
+///
+/// This is a less exact version of fcmpToClassTest (e.g. fcmpToClassTest will
+/// only succeed for a test of x > 0 implies positive, but not x > 1).
+///
+/// If \p LookThroughSrc is true, consider the input value when computing the
+/// mask. This may look through sign bit operations.
+///
+/// If \p LookThroughSrc is false, ignore the source value (i.e. a non-null
+/// first tuple element will always be LHS). A null first element means that
+/// nothing could be inferred from the compare.
+std::tuple<Value *, FPClassTest, FPClassTest>
+fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
+                 const APFloat *ConstRHS, bool LookThroughSrc = true);
+std::tuple<Value *, FPClassTest, FPClassTest>
+fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
+                 Value *RHS, bool LookThroughSrc = true);
+
 struct KnownFPClass {
   /// Floating-point classes the value could be one of.
   FPClassTest KnownFPClasses = fcAllFlags;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index ef8fa5826deb94..9cfe7315a7a4dc 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -4164,6 +4164,147 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS,
   return {Src, Mask};
 }
 
+std::tuple<Value *, FPClassTest, FPClassTest>
+llvm::fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
+                       const APFloat *ConstRHS, bool LookThroughSrc) {
+  auto [Val, ClassMask] =
+      fcmpToClassTest(Pred, F, LHS, ConstRHS, LookThroughSrc);
+  if (Val)
+    return {Val, ClassMask, ~ClassMask};
+
+  FPClassTest RHSClass = ConstRHS->classify();
+
+  // If we see a zero here, we are using dynamic denormal-fp-math, and can't
+  // treat comparisons to 0 as an exact class test.
+  //
+  // TODO: We could do better and still recognize non-equality cases.
+  if (RHSClass == fcPosZero || RHSClass == fcNegZero)
+    return {nullptr, fcAllFlags, fcAllFlags};
+
+  assert((RHSClass == fcPosNormal || RHSClass == fcNegNormal ||
+          RHSClass == fcPosSubnormal || RHSClass == fcNegSubnormal) &&
+         "should have been recognized as an exact class test");
+
+  const bool IsNegativeRHS = (RHSClass & fcNegative) == RHSClass;
+  const bool IsPositiveRHS = (RHSClass & fcPositive) == RHSClass;
+
+  assert(IsNegativeRHS == ConstRHS->isNegative());
+  assert(IsPositiveRHS == !ConstRHS->isNegative());
+
+  Value *Src = LHS;
+  const bool IsFabs = LookThroughSrc && match(LHS, m_FAbs(m_Value(Src)));
+
+  if (IsFabs)
+    RHSClass = llvm::inverse_fabs(RHSClass);
+
+  if (Pred == FCmpInst::FCMP_OEQ)
+    return {Src, RHSClass, fcAllFlags};
+
+  // ueq is true for NaN or equality; if it is false the value is ordered.
+  if (Pred == FCmpInst::FCMP_UEQ)
+    return {Src, RHSClass | fcNan, ~fcNan};
+
+  // one is false for NaN or equality, so the false side must allow NaN.
+  if (Pred == FCmpInst::FCMP_ONE)
+    return {Src, ~fcNan, RHSClass | fcNan};
+
+  if (Pred == FCmpInst::FCMP_UNE)
+    return {Src, fcAllFlags, RHSClass};
+
+  if (IsNegativeRHS) {
+    // TODO: Handle fneg(fabs)
+    if (IsFabs) {
+      // fabs(x) o> -k -> fcmp ord x, x
+      // fabs(x) u> -k -> true
+      // fabs(x) o< -k -> false
+      // fabs(x) u< -k -> fcmp uno x, x
+      switch (Pred) {
+      case FCmpInst::FCMP_OGT:
+      case FCmpInst::FCMP_OGE:
+        return {Src, ~fcNan, fcNan};
+      case FCmpInst::FCMP_UGT:
+      case FCmpInst::FCMP_UGE:
+        return {Src, fcAllFlags, fcNone};
+      case FCmpInst::FCMP_OLT:
+      case FCmpInst::FCMP_OLE:
+        return {Src, fcNone, fcAllFlags};
+      case FCmpInst::FCMP_ULT:
+      case FCmpInst::FCMP_ULE:
+        return {Src, fcNan, ~fcNan};
+      default:
+        break;
+      }
+
+      return {nullptr, fcAllFlags, fcAllFlags};
+    }
+
+    FPClassTest ClassesLE = fcNegInf | fcNegNormal;
+    FPClassTest ClassesGE = fcPositive | fcNegZero | fcNegSubnormal;
+
+    if (ConstRHS->isDenormal())
+      ClassesLE |= fcNegSubnormal;
+    else
+      ClassesGE |= fcNegNormal;
+
+    switch (Pred) {
+    case FCmpInst::FCMP_OGT:
+    case FCmpInst::FCMP_OGE:
+      return {Src, ClassesGE, ~ClassesGE | RHSClass};
+    case FCmpInst::FCMP_UGT:
+    case FCmpInst::FCMP_UGE:
+      return {Src, ClassesGE | fcNan, ~(ClassesGE | fcNan) | RHSClass};
+    case FCmpInst::FCMP_OLT:
+    case FCmpInst::FCMP_OLE:
+      return {Src, ClassesLE, ~ClassesLE | RHSClass};
+    case FCmpInst::FCMP_ULT:
+    case FCmpInst::FCMP_ULE:
+      return {Src, ClassesLE | fcNan, ~(ClassesLE | fcNan) | RHSClass};
+    default:
+      break;
+    }
+  } else if (IsPositiveRHS) {
+    // The class of the RHS itself can fall on either side of the comparison.
+    FPClassTest ClassesGE = fcPosNormal | fcPosInf;
+    FPClassTest ClassesLE = fcNegative | fcPosZero | fcPosSubnormal;
+    if (ConstRHS->isDenormal())
+      ClassesGE |= fcPosSubnormal;
+    else
+      ClassesLE |= fcPosNormal;
+    if (IsFabs) {
+      ClassesGE = llvm::inverse_fabs(ClassesGE);
+      ClassesLE = llvm::inverse_fabs(ClassesLE);
+    }
+
+    switch (Pred) {
+    case FCmpInst::FCMP_OGT:
+    case FCmpInst::FCMP_OGE:
+      return {Src, ClassesGE, ~ClassesGE | RHSClass};
+    case FCmpInst::FCMP_UGT:
+    case FCmpInst::FCMP_UGE:
+      return {Src, ClassesGE | fcNan, ~(ClassesGE | fcNan) | RHSClass};
+    case FCmpInst::FCMP_OLT:
+    case FCmpInst::FCMP_OLE:
+      return {Src, ClassesLE, ~ClassesLE | RHSClass};
+    case FCmpInst::FCMP_ULT:
+    case FCmpInst::FCMP_ULE:
+      return {Src, ClassesLE | fcNan, ~(ClassesLE | fcNan) | RHSClass};
+    default:
+      break;
+    }
+  }
+
+  return {nullptr, fcAllFlags, fcAllFlags};
+}
+
+std::tuple<Value *, FPClassTest, FPClassTest>
+llvm::fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
+                       Value *RHS, bool LookThroughSrc) {
+  const APFloat *ConstRHS;
+  if (!match(RHS, m_APFloatAllowUndef(ConstRHS)))
+    return {nullptr, fcAllFlags, fcAllFlags}; // Not a constant: no info.
+  return fcmpImpliesClass(Pred, F, LHS, ConstRHS, LookThroughSrc);
+}
+
 static FPClassTest computeKnownFPClassFromAssumes(const Value *V,
                                                   const SimplifyQuery &Q) {
   FPClassTest KnownFromAssume = fcAllFlags;
@@ -4188,18 +4329,21 @@ static FPClassTest computeKnownFPClassFromAssumes(const Value *V,
     Value *LHS, *RHS;
     uint64_t ClassVal = 0;
     if (match(I->getArgOperand(0), m_FCmp(Pred, m_Value(LHS), m_Value(RHS)))) {
-      auto [TestedValue, TestedMask] =
-          fcmpToClassTest(Pred, *F, LHS, RHS, true);
-      // First see if we can fold in fabs/fneg into the test.
-      if (TestedValue == V)
-        KnownFromAssume &= TestedMask;
-      else {
-        // Try again without the lookthrough if we found a different source
-        // value.
-        auto [TestedValue, TestedMask] =
-            fcmpToClassTest(Pred, *F, LHS, RHS, false);
-        if (TestedValue == V)
-          KnownFromAssume &= TestedMask;
+      const APFloat *CRHS;
+      if (match(RHS, m_APFloat(CRHS))) {
+        // First see if we can fold in fabs/fneg into the test.
+        auto [CmpVal, MaskIfTrue, MaskIfFalse] =
+            fcmpImpliesClass(Pred, *F, LHS, CRHS, true);
+        if (CmpVal == V)
+          KnownFromAssume &= MaskIfTrue;
+        else {
+          // Try again without the lookthrough if we found a different source
+          // value.
+          auto [CmpVal, MaskIfTrue, MaskIfFalse] =
+              fcmpImpliesClass(Pred, *F, LHS, CRHS, false);
+          if (CmpVal == V)
+            KnownFromAssume &= MaskIfTrue;
+        }
       }
     } else if (match(I->getArgOperand(0),
                      m_Intrinsic<Intrinsic::is_fpclass>(
@@ -4347,7 +4491,8 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
     FPClassTest FilterRHS = fcAllFlags;
 
     Value *TestedValue = nullptr;
-    FPClassTest TestedMask = fcNone;
+    FPClassTest MaskIfTrue = fcAllFlags;
+    FPClassTest MaskIfFalse = fcAllFlags;
     uint64_t ClassVal = 0;
     const Function *F = cast<Instruction>(Op)->getFunction();
     CmpInst::Predicate Pred;
@@ -4359,20 +4504,22 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
       // TODO: In some degenerate cases we can infer something if we try again
       // without looking through sign operations.
       bool LookThroughFAbsFNeg = CmpLHS != LHS && CmpLHS != RHS;
-      std::tie(TestedValue, TestedMask) =
-          fcmpToClassTest(Pred, *F, CmpLHS, CmpRHS, LookThroughFAbsFNeg);
+      std::tie(TestedValue, MaskIfTrue, MaskIfFalse) =
+          fcmpImpliesClass(Pred, *F, CmpLHS, CmpRHS, LookThroughFAbsFNeg);
     } else if (match(Cond,
                      m_Intrinsic<Intrinsic::is_fpclass>(
                          m_Value(TestedValue), m_ConstantInt(ClassVal)))) {
-      TestedMask = static_cast<FPClassTest>(ClassVal);
+      FPClassTest TestedMask = static_cast<FPClassTest>(ClassVal);
+      MaskIfTrue = TestedMask;
+      MaskIfFalse = ~TestedMask;
     }
 
     if (TestedValue == LHS) {
       // match !isnan(x) ? x : y
-      FilterLHS = TestedMask;
-    } else if (TestedValue == RHS) {
+      FilterLHS = MaskIfTrue;
+    } else if (TestedValue == RHS) { // && IsExactClass
       // match !isnan(x) ? y : x
-      FilterRHS = ~TestedMask;
+      FilterRHS = MaskIfFalse;
     }
 
     KnownFPClass Known2;
diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
index ea594398c58014..d19b0ee3dc2dd5 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
@@ -11,8 +11,8 @@ declare void @llvm.assume(i1 noundef)
 
 ; can't be +inf
 define float @clamp_is_ogt_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ogt_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-LABEL: define nofpclass(pinf) float @clamp_is_ogt_1_to_1(
+; CHECK-SAME: float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:    [[IS_OGT_1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -23,8 +23,8 @@ define float @clamp_is_ogt_1_to_1(float %arg) {
 }
 
 define float @clamp_is_ogt_1_to_1_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ogt_1_to_1_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(pinf) float @clamp_is_ogt_1_to_1_commute(
+; CHECK-SAME: float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULE_1:%.*]] = fcmp ule float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULE_1]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -36,8 +36,8 @@ define float @clamp_is_ogt_1_to_1_commute(float %arg) {
 
 ; can't be +inf or nan
 define float @clamp_is_ugt_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ugt_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan pinf) float @clamp_is_ugt_1_to_1(
+; CHECK-SAME: float nofpclass(nan pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGT_1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -49,8 +49,8 @@ define float @clamp_is_ugt_1_to_1(float %arg) {
 
 ; can't be +inf or nan
 define float @clamp_is_ugt_1_to_1_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ugt_1_to_1_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan pinf) float @clamp_is_ugt_1_to_1_commute(
+; CHECK-SAME: float nofpclass(nan pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLE_1:%.*]] = fcmp ole float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_1]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -62,8 +62,8 @@ define float @clamp_is_ugt_1_to_1_commute(float %arg) {
 
 ; can't be +inf
 define float @clamp_is_oge_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_oge_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(pinf) float @clamp_is_oge_1_to_1(
+; CHECK-SAME: float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGE_1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -74,8 +74,8 @@ define float @clamp_is_oge_1_to_1(float %arg) {
 }
 
 define float @clamp_is_oge_1_to_1_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_oge_1_to_1_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(pinf) float @clamp_is_oge_1_to_1_commute(
+; CHECK-SAME: float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULT_1:%.*]] = fcmp ult float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULT_1]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -87,8 +87,8 @@ define float @clamp_is_oge_1_to_1_commute(float %arg) {
 
 ; can't be +inf or nan
 define float @clamp_is_uge_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_uge_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan pinf) float @clamp_is_uge_1_to_1(
+; CHECK-SAME: float nofpclass(nan pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGT_1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -100,8 +100,8 @@ define float @clamp_is_uge_1_to_1(float %arg) {
 
 ; can't be negative, zero, or denormal
 define float @clamp_is_olt_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_olt_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) float @clamp_is_olt_1_to_1(
+; CHECK-SAME: float nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLT_1:%.*]] = fcmp olt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLT_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -113,8 +113,8 @@ define float @clamp_is_olt_1_to_1(float %arg) {
 
 ; can't be negative, zero, or denormal
 define float @clamp_is_olt_1_to_1_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_olt_1_to_1_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) float @clamp_is_olt_1_to_1_commute(
+; CHECK-SAME: float nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGE_1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGE_1]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -126,8 +126,8 @@ define float @clamp_is_olt_1_to_1_commute(float %arg) {
 
 ; can't be negative or zero, nan or denormal
 define float @clamp_is_ult_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ult_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf zero sub nnorm) float @clamp_is_ult_1_to_1(
+; CHECK-SAME: float nofpclass(nan ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULT_1:%.*]] = fcmp ult float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULT_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -139,8 +139,8 @@ define float @clamp_is_ult_1_to_1(float %arg) {
 
 ; can't be negative or zero, nan or denormal
 define float @clamp_is_ult_1_to_1_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ult_1_to_1_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf zero sub nnorm) float @clamp_is_ult_1_to_1_commute(
+; CHECK-SAME: float nofpclass(nan ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGE_1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_1]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -152,8 +152,8 @@ define float @clamp_is_ult_1_to_1_commute(float %arg) {
 
 ; can't be negative, zero or denormal
 define float @clamp_is_ole_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ole_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) float @clamp_is_ole_1_to_1(
+; CHECK-SAME: float nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLE_1:%.*]] = fcmp ole float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -165,8 +165,8 @@ define float @clamp_is_ole_1_to_1(float %arg) {
 
 ; can't be negative or zero, nan or denormal
 define float @clamp_is_ule_1_to_1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ule_1_to_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf zero sub nnorm) float @clamp_is_ule_1_to_1(
+; CHECK-SAME: float nofpclass(nan ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULE_1:%.*]] = fcmp ule float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULE_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -178,8 +178,8 @@ define float @clamp_is_ule_1_to_1(float %arg) {
 
 ; can't be negative or denormal
 define float @clamp_is_olt_1_to_0(float %arg) {
-; CHECK-LABEL: define float @clamp_is_olt_1_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_is_olt_1_to_0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLT_1:%.*]] = fcmp olt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLT_1]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -191,8 +191,8 @@ define float @clamp_is_olt_1_to_0(float %arg) {
 
 ; can't be negative, nan or denormal
 define float @clamp_is_ult_1_to_0(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ult_1_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_is_ult_1_to_0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULT_1:%.*]] = fcmp olt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULT_1]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -204,8 +204,8 @@ define float @clamp_is_ult_1_to_0(float %arg) {
 
 ; can't be negative or denormal
 define float @clamp_is_ole_1_to_0(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ole_1_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_is_ole_1_to_0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLE_1:%.*]] = fcmp ole float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_1]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -217,8 +217,8 @@ define float @clamp_is_ole_1_to_0(float %arg) {
 
 ; can't be negative or denormal
 define float @clamp_is_ole_1_to_0_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ole_1_to_0_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_is_ole_1_to_0_commute(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGT_1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_1]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -230,8 +230,8 @@ define float @clamp_is_ole_1_to_0_commute(float %arg) {
 
 ; can't be negative or denormal
 define float @clamp_is_ule_1_to_0(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ule_1_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_is_ule_1_to_0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULE_1:%.*]] = fcmp ole float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULE_1]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -243,8 +243,8 @@ define float @clamp_is_ule_1_to_0(float %arg) {
 
 ; can't be positive, zero or denormal
 define float @clamp_is_ogt_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ogt_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(pinf zero sub pnorm) float @clamp_is_ogt_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGT_NEG1:%.*]] = fcmp ogt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_NEG1]], float -1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -256,8 +256,8 @@ define float @clamp_is_ogt_neg1_to_neg1(float %arg) {
 
 ; can't be positive, zero, nan or denormal
 define float @clamp_is_ugt_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ugt_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan pinf zero sub pnorm) float @clamp_is_ugt_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(nan pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGT_NEG1:%.*]] = fcmp ugt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_NEG1]], float -1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -269,8 +269,8 @@ define float @clamp_is_ugt_neg1_to_neg1(float %arg) {
 
 ; can't be positive or denormal
 define float @clamp_is_ogt_neg1_to_0(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ogt_neg1_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(pinf nzero sub pnorm) float @clamp_is_ogt_neg1_to_0(
+; CHECK-SAME: float nofpclass(pinf nzero sub pnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGT_NEG1:%.*]] = fcmp ogt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_NEG1]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -282,8 +282,8 @@ define float @clamp_is_ogt_neg1_to_0(float %arg) {
 
 ; can't be positive, nan or denormal
 define float @clamp_is_ugt_neg1_to_0(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ugt_neg1_to_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan pinf nzero sub pnorm) float @clamp_is_ugt_neg1_to_0(
+; CHECK-SAME: float nofpclass(nan pinf nzero sub pnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGT_NEG1:%.*]] = fcmp ugt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_NEG1]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -295,8 +295,8 @@ define float @clamp_is_ugt_neg1_to_0(float %arg) {
 
 ; can't be -inf
 define float @clamp_is_olt_neg1_to_neg1_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_olt_neg1_to_neg1_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf) float @clamp_is_olt_neg1_to_neg1_commute(
+; CHECK-SAME: float nofpclass(ninf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGE_NEG1:%.*]] = fcmp uge float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGE_NEG1]], float [[ARG]], float -1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -308,8 +308,8 @@ define float @clamp_is_olt_neg1_to_neg1_commute(float %arg) {
 
 ; can't be -inf
 define float @clamp_is_olt_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_olt_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf) float @clamp_is_olt_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(ninf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLT_NEG1]], float -1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -321,8 +321,8 @@ define float @clamp_is_olt_neg1_to_neg1(float %arg) {
 
 ; can't be -inf or nan
 define float @clamp_is_ult_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ult_neg1_to_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf) float @clamp_is_ult_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(nan ninf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULT_NEG1]], float -1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -334,8 +334,8 @@ define float @clamp_is_ult_neg1_to_neg1(float %arg) {
 
 ; can't be -inf or nan
 define float @clamp_is_ult_neg1_to_neg1_commute(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ult_neg1_to_neg1_commute(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf) float @clamp_is_ult_neg1_to_neg1_commute(
+; CHECK-SAME: float nofpclass(nan ninf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGE_NEG1:%.*]] = fcmp oge float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_NEG1]], float [[ARG]], float -1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -351,8 +351,8 @@ define float @clamp_is_ult_neg1_to_neg1_commute(float %arg) {
 
 ; Must be 1, only posnormal
 define float @fcmp_oeq_1_else_1(float %arg) {
-; CHECK-LABEL: define float @fcmp_oeq_1_else_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @fcmp_oeq_1_else_1(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OEQ_1:%.*]] = fcmp oeq float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OEQ_1]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -364,8 +364,8 @@ define float @fcmp_oeq_1_else_1(float %arg) {
 
 ; Don't know anything
 define float @fcmp_one_1_else_1(float %arg) {
-; CHECK-LABEL: define float @fcmp_one_1_else_1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan) float @fcmp_one_1_else_1(
+; CHECK-SAME: float nofpclass(nan) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ONE_1:%.*]] = fcmp one float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_1]], float [[ARG]], float 1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -377,8 +377,8 @@ define float @fcmp_one_1_else_1(float %arg) {
 
 ; must be 1
 define float @fcmp_one_1_1_else_arg(float %arg) {
-; CHECK-LABEL: define float @fcmp_one_1_1_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @fcmp_one_1_1_else_arg(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ONE_1:%.*]] = fcmp one float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -390,8 +390,8 @@ define float @fcmp_one_1_1_else_arg(float %arg) {
 
 ; must be 1
 define float @fcmp_une_1_1_else_arg(float %arg) {
-; CHECK-LABEL: define float @fcmp_une_1_1_else_arg(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @fcmp_une_1_1_else_arg(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UNE_1:%.*]] = fcmp une float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UNE_1]], float 1.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -403,8 +403,8 @@ define float @fcmp_une_1_1_else_arg(float %arg) {
 
 ; Must be -1, only negnormal
 define float @fcmp_oeq_neg1_else_neg1(float %arg) {
-; CHECK-LABEL: define float @fcmp_oeq_neg1_else_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub pnorm) float @fcmp_oeq_neg1_else_neg1(
+; CHECK-SAME: float nofpclass(nan inf zero sub pnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OEQ_NEG1:%.*]] = fcmp oeq float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OEQ_NEG1]], float [[ARG]], float -1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -416,8 +416,8 @@ define float @fcmp_oeq_neg1_else_neg1(float %arg) {
 
 ; Don't know anything
 define float @fcmp_one_neg1_else_neg1(float %arg) {
-; CHECK-LABEL: define float @fcmp_one_neg1_else_neg1(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan) float @fcmp_one_neg1_else_neg1(
+; CHECK-SAME: float nofpclass(nan) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ONE_NEG1:%.*]] = fcmp one float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_NEG1]], float [[ARG]], float -1.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -512,8 +512,8 @@ define float @if_fcmp_one_0_1_else_arg(float %arg) {
 }
 
 define float @if_fcmp_one_1_arg_else_0(float %arg) {
-; CHECK-LABEL: define float @if_fcmp_one_1_arg_else_0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan) float @if_fcmp_one_1_arg_else_0(
+; CHECK-SAME: float nofpclass(nan) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ONE_1:%.*]] = fcmp one float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_1]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -524,8 +524,8 @@ define float @if_fcmp_one_1_arg_else_0(float %arg) {
 }
 
 define float @fcmp_fabs_oeq_1_else_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_oeq_1_else_1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @fcmp_fabs_oeq_1_else_1(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    [[FABS_IS_OEQ_1:%.*]] = fcmp oeq float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OEQ_1]], float [[ARG]], float 1.000000e+00
@@ -552,8 +552,8 @@ define float @fcmp_fabs_oeq_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_ueq_1_0_else_arg(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_ueq_1_0_else_arg(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @fcmp_fabs_ueq_1_0_else_arg(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UEQ_1:%.*]] = fcmp ueq float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UEQ_1]], float 0.000000e+00, float [[ARG]]
@@ -566,8 +566,8 @@ define float @fcmp_fabs_ueq_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_one_1_arg_else_0(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_one_1_arg_else_0(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @fcmp_fabs_one_1_arg_else_0(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_1:%.*]] = fcmp one float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_1]], float [[ARG]], float 0.000000e+00
@@ -594,8 +594,8 @@ define float @fcmp_fabs_une_1_arg_else_0(float %arg) {
 }
 
 define float @fcmp_fabs_one_1_0_else_arg(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_one_1_0_else_arg(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero sub nnorm) float @fcmp_fabs_one_1_0_else_arg(
+; CHECK-SAME: float nofpclass(nan inf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_1:%.*]] = fcmp one float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_1]], float 0.000000e+00, float [[ARG]]
@@ -608,8 +608,8 @@ define float @fcmp_fabs_one_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_une_1_0_else_arg(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @fcmp_fabs_une_1_0_else_arg(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero sub nnorm) float @fcmp_fabs_une_1_0_else_arg(
+; CHECK-SAME: float nofpclass(nan inf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UNE_1:%.*]] = fcmp une float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UNE_1]], float 0.000000e+00, float [[ARG]]
@@ -622,8 +622,8 @@ define float @fcmp_fabs_une_1_0_else_arg(float %arg) {
 }
 
 define float @fcmp_fabs_one_1_neg2_else_arg(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @fcmp_fabs_one_1_neg2_else_arg(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub) float @fcmp_fabs_one_1_neg2_else_arg(
+; CHECK-SAME: float nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_1:%.*]] = fcmp one float [[FABS_ARG]], 1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_1]], float -2.000000e+00, float [[ARG]]
@@ -640,8 +640,8 @@ define float @fcmp_fabs_one_1_neg2_else_arg(float %arg) {
 ;---------------------------------------------------------------------
 
 define float @clamp_olt_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_olt_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub norm) float @clamp_olt_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(ninf nzero nsub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLT_LARGEST_DENORMAL:%.*]] = fcmp olt float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLT_LARGEST_DENORMAL]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -652,8 +652,8 @@ define float @clamp_olt_largest_denormal_0.0(float %arg) {
 }
 
 define float @clamp_ole_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_ole_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero nsub norm) float @clamp_ole_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(ninf nzero nsub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OLE_LARGEST_DENORMAL:%.*]] = fcmp ole float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_LARGEST_DENORMAL]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -664,8 +664,8 @@ define float @clamp_ole_largest_denormal_0.0(float %arg) {
 }
 
 define float @clamp_ult_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_ult_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub norm) float @clamp_ult_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULT_LARGEST_DENORMAL:%.*]] = fcmp ult float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULT_LARGEST_DENORMAL]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -676,8 +676,8 @@ define float @clamp_ult_largest_denormal_0.0(float %arg) {
 }
 
 define float @clamp_ule_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_ule_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub norm) float @clamp_ule_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_ULE_LARGEST_DENORMAL:%.*]] = fcmp ule float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ULE_LARGEST_DENORMAL]], float 0.000000e+00, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -688,8 +688,8 @@ define float @clamp_ule_largest_denormal_0.0(float %arg) {
 }
 
 define float @clamp_ogt_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_ogt_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_ogt_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGT_LARGEST_DENORMAL:%.*]] = fcmp ugt float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_LARGEST_DENORMAL]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -700,8 +700,8 @@ define float @clamp_ogt_largest_denormal_0.0(float %arg) {
 }
 
 define float @clamp_oge_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_oge_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero sub nnorm) float @clamp_oge_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(nan ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGE_LARGEST_DENORMAL:%.*]] = fcmp oge float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_LARGEST_DENORMAL]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -712,8 +712,8 @@ define float @clamp_oge_largest_denormal_0.0(float %arg) {
 }
 
 define float @clamp_ugt_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_ugt_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_ugt_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGT_LARGEST_DENORMAL:%.*]] = fcmp ugt float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_LARGEST_DENORMAL]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -724,8 +724,8 @@ define float @clamp_ugt_largest_denormal_0.0(float %arg) {
 }
 
 define float @clamp_uge_largest_denormal_0.0(float %arg) {
-; CHECK-LABEL: define float @clamp_uge_largest_denormal_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_uge_largest_denormal_0.0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGE_LARGEST_DENORMAL:%.*]] = fcmp uge float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGE_LARGEST_DENORMAL]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -736,8 +736,8 @@ define float @clamp_uge_largest_denormal_0.0(float %arg) {
 }
 
 define float @fcmp_oeq_largest_denormal_arg_else_0.0(float %arg) {
-; CHECK-LABEL: define float @fcmp_oeq_largest_denormal_arg_else_0.0(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub norm) float @fcmp_oeq_largest_denormal_arg_else_0.0(
+; CHECK-SAME: float nofpclass(nan inf nzero nsub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OEQ_LARGEST_DENORMAL:%.*]] = fcmp oeq float [[ARG]], 0x380FFFFFC0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OEQ_LARGEST_DENORMAL]], float [[ARG]], float 0.000000e+00
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -893,8 +893,8 @@ define float @clamp_fabs_value_ule_1_to_1_copysign(float %arg) {
 
 ; Can't be +inf
 define float @clamp_is_ogt_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ogt_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(pinf) float @clamp_is_ogt_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGT_LARGEST_NORMAL:%.*]] = fcmp ogt float [[ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -906,8 +906,8 @@ define float @clamp_is_ogt_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf
 define float @clamp_is_oge_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_is_oge_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(pinf) float @clamp_is_oge_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_OGE_LARGEST_NORMAL:%.*]] = fcmp oge float [[ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -919,8 +919,8 @@ define float @clamp_is_oge_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or nan
 define float @clamp_is_ugt_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_is_ugt_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan pinf) float @clamp_is_ugt_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(nan pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGT_LARGEST_NORMAL:%.*]] = fcmp ugt float [[ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -932,8 +932,8 @@ define float @clamp_is_ugt_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or nan
 define float @clamp_is_uge_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define float @clamp_is_uge_largest_normal_to_largest_normal(
-; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan pinf) float @clamp_is_uge_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(nan pinf) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[IS_UGE_LARGEST_NORMAL:%.*]] = fcmp uge float [[ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGE_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
 ; CHECK-NEXT:    ret float [[SELECT]]
@@ -945,8 +945,8 @@ define float @clamp_is_uge_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf
 define float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGT_LARGEST_NORMAL:%.*]] = fcmp ogt float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -960,8 +960,8 @@ define float @clamp_fabs_is_ogt_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf
 define float @clamp_fabs_is_oge_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_oge_largest_normal_to_largest_normal(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @clamp_fabs_is_oge_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGE_LARGEST_NORMAL:%.*]] = fcmp oge float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGE_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -975,8 +975,8 @@ define float @clamp_fabs_is_oge_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf or nan
 define float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UGT_LARGEST_NORMAL:%.*]] = fcmp ugt float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -990,8 +990,8 @@ define float @clamp_fabs_is_ugt_largest_normal_to_largest_normal(float %arg) {
 
 ; Can't be +inf or -inf or nan
 define float @clamp_fabs_is_uge_largest_normal_to_largest_normal(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_uge_largest_normal_to_largest_normal(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @clamp_fabs_is_uge_largest_normal_to_largest_normal(
+; CHECK-SAME: float nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UGT_LARGEST_NORMAL:%.*]] = fcmp uge float [[FABS_ARG]], 0x47EFFFFFE0000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UGT_LARGEST_NORMAL]], float 0x47EFFFFFE0000000, float [[ARG]]
@@ -1009,8 +1009,8 @@ define float @clamp_fabs_is_uge_largest_normal_to_largest_normal(float %arg) {
 
 ; can't be negative or positive subnormal
 define float @clamp_fabs_ogt_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_ogt_smallest_normal_to_zero(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @clamp_fabs_ogt_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OGT_SMALLEST_NORMAL:%.*]] = fcmp ogt float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OGT_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1055,8 +1055,8 @@ define float @clamp_fabs_olt_smallest_normal_to_zero(float %arg) {
 
 ; can't be negative or subnormal
 define float @clamp_fabs_ole_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_ole_smallest_normal_to_zero(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_fabs_ole_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLE_SMALLEST_NORMAL:%.*]] = fcmp ole float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1083,8 +1083,8 @@ define float @clamp_fabs_is_is_olt_smallest_normal_to_0(float %arg) {
 }
 
 define float @clamp_fabs_is_is_ole_smallest_normal_to_0(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_is_is_ole_smallest_normal_to_0(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(ninf nzero sub nnorm) float @clamp_fabs_is_is_ole_smallest_normal_to_0(
+; CHECK-SAME: float nofpclass(ninf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OLE_SMALLEST_NORMAL:%.*]] = fcmp ole float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_OLE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1111,8 +1111,8 @@ define float @clamp_fabs_oeq_smallest_normal_to_zero(float %arg) {
 }
 
 define float @clamp_fabs_one_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_one_smallest_normal_to_zero(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero sub nnorm) float @clamp_fabs_one_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(nan inf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_ONE_SMALLEST_NORMAL:%.*]] = fcmp one float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_ONE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1125,8 +1125,8 @@ define float @clamp_fabs_one_smallest_normal_to_zero(float %arg) {
 }
 
 define float @clamp_fabs_ueq_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_ueq_smallest_normal_to_zero(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @clamp_fabs_ueq_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UEQ_SMALLEST_NORMAL:%.*]] = fcmp ueq float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UEQ_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1139,8 +1139,8 @@ define float @clamp_fabs_ueq_smallest_normal_to_zero(float %arg) {
 }
 
 define float @clamp_fabs_une_smallest_normal_to_zero(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @clamp_fabs_une_smallest_normal_to_zero(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero sub nnorm) float @clamp_fabs_une_smallest_normal_to_zero(
+; CHECK-SAME: float nofpclass(nan inf nzero sub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UNE_SMALLEST_NORMAL:%.*]] = fcmp une float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[IS_UNE_SMALLEST_NORMAL]], float 0.000000e+00, float [[ARG]]
@@ -1179,8 +1179,8 @@ define float @clamp_fabs_ole_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ult_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ult_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub) float @clamp_fabs_ult_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ULT_NEG1:%.*]] = fcmp ult float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ULT_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1193,8 +1193,8 @@ define float @clamp_fabs_ult_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ule_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ule_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub) float @clamp_fabs_ule_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ULE_NEG1:%.*]] = fcmp ule float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ULE_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1207,8 +1207,8 @@ define float @clamp_fabs_ule_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ogt_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ogt_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(inf zero sub pnorm) float @clamp_fabs_ogt_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(inf zero sub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OGT_NEG1:%.*]] = fcmp ogt float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OGT_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1221,8 +1221,8 @@ define float @clamp_fabs_ogt_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_oge_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_oge_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(inf zero sub pnorm) float @clamp_fabs_oge_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(inf zero sub norm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_OGE_NEG1:%.*]] = fcmp oge float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_OGE_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1236,7 +1236,7 @@ define float @clamp_fabs_oge_neg1_to_neg1(float %arg) {
 
 define float @clamp_fabs_ugt_neg1_to_neg1(float %arg) {
 ; CHECK-LABEL: define noundef nofpclass(nan inf zero sub pnorm) float @clamp_fabs_ugt_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(all) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float -1.000000e+00
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1247,7 +1247,7 @@ define float @clamp_fabs_ugt_neg1_to_neg1(float %arg) {
 
 define float @clamp_fabs_uge_neg1_to_neg1(float %arg) {
 ; CHECK-LABEL: define noundef nofpclass(nan inf zero sub pnorm) float @clamp_fabs_uge_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(all) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float -1.000000e+00
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1268,8 +1268,8 @@ define float @clamp_fabs_oeq_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_ueq_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_ueq_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub) float @clamp_fabs_ueq_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_UEQ_NEG1:%.*]] = fcmp ueq float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_UEQ_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1282,8 +1282,8 @@ define float @clamp_fabs_ueq_neg1_to_neg1(float %arg) {
 }
 
 define float @clamp_fabs_one_neg1_to_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub) float @clamp_fabs_one_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub pnorm) float @clamp_fabs_one_neg1_to_neg1(
+; CHECK-SAME: float nofpclass(all) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[FABS_IS_ONE_NEG1:%.*]] = fcmp one float [[FABS_ARG]], -1.000000e+00
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[FABS_IS_ONE_NEG1]], float -1.000000e+00, float [[ARG]]
@@ -1297,7 +1297,7 @@ define float @clamp_fabs_one_neg1_to_neg1(float %arg) {
 
 define float @clamp_fabs_une_neg1_to_neg1(float %arg) {
 ; CHECK-LABEL: define noundef nofpclass(nan inf zero sub pnorm) float @clamp_fabs_une_neg1_to_neg1(
-; CHECK-SAME: float nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: float nofpclass(all) [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float -1.000000e+00
 ;
   %fabs.arg = call float @llvm.fabs.f32(float %arg)
@@ -1311,8 +1311,8 @@ define float @clamp_fabs_une_neg1_to_neg1(float %arg) {
 ;---------------------------------------------------------------------
 
 define float @ret_assumed_ogt_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ogt_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-LABEL: define nofpclass(nan ninf zero sub nnorm) float @ret_assumed_ogt_1(
+; CHECK-SAME: float returned nofpclass(nan ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:    [[OGT_1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGT_1]]) #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1323,8 +1323,8 @@ define float @ret_assumed_ogt_1(float %arg) {
 }
 
 define float @ret_assumed_oge_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_oge_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf zero sub nnorm) float @ret_assumed_oge_1(
+; CHECK-SAME: float returned nofpclass(nan ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OGE_1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGE_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1335,8 +1335,8 @@ define float @ret_assumed_oge_1(float %arg) {
 }
 
 define float @ret_assumed_olt_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_olt_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan pinf) float @ret_assumed_olt_1(
+; CHECK-SAME: float returned nofpclass(nan pinf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OLT_1:%.*]] = fcmp olt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLT_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1347,8 +1347,8 @@ define float @ret_assumed_olt_1(float %arg) {
 }
 
 define float @ret_assumed_ole_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ole_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan pinf) float @ret_assumed_ole_1(
+; CHECK-SAME: float returned nofpclass(nan pinf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OLE_1:%.*]] = fcmp ole float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLE_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1359,8 +1359,8 @@ define float @ret_assumed_ole_1(float %arg) {
 }
 
 define float @ret_assumed_ugt_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ugt_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) float @ret_assumed_ugt_1(
+; CHECK-SAME: float returned nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[UGT_1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGT_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1371,8 +1371,8 @@ define float @ret_assumed_ugt_1(float %arg) {
 }
 
 define float @ret_assumed_uge_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_uge_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) float @ret_assumed_uge_1(
+; CHECK-SAME: float returned nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[UGE_1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGE_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1383,8 +1383,8 @@ define float @ret_assumed_uge_1(float %arg) {
 }
 
 define float @ret_assumed_ult_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ult_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_assumed_ult_1(
+; CHECK-SAME: float returned nofpclass(pinf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ULT_1:%.*]] = fcmp ult float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULT_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1395,8 +1395,8 @@ define float @ret_assumed_ult_1(float %arg) {
 }
 
 define float @ret_assumed_ule_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ule_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(pinf) float @ret_assumed_ule_1(
+; CHECK-SAME: float returned nofpclass(pinf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ULE_1:%.*]] = fcmp ule float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULE_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1407,8 +1407,8 @@ define float @ret_assumed_ule_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ogt_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ogt_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf zero sub nnorm) float @ret_assumed_fabs_ogt_1(
+; CHECK-SAME: float returned nofpclass(nan ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OGT_1:%.*]] = fcmp ogt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGT_1]]) #[[ATTR5]]
@@ -1421,8 +1421,8 @@ define float @ret_assumed_fabs_ogt_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_oge_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_oge_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf zero sub nnorm) float @ret_assumed_fabs_oge_1(
+; CHECK-SAME: float returned nofpclass(nan ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OGE_1:%.*]] = fcmp oge float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGE_1]]) #[[ATTR5]]
@@ -1435,8 +1435,8 @@ define float @ret_assumed_fabs_oge_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_olt_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_olt_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @ret_assumed_fabs_olt_1(
+; CHECK-SAME: float returned nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OLT_1:%.*]] = fcmp olt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLT_1]]) #[[ATTR5]]
@@ -1449,8 +1449,8 @@ define float @ret_assumed_fabs_olt_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ole_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ole_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @ret_assumed_fabs_ole_1(
+; CHECK-SAME: float returned nofpclass(nan inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OLE_1:%.*]] = fcmp olt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLE_1]]) #[[ATTR5]]
@@ -1463,8 +1463,8 @@ define float @ret_assumed_fabs_ole_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ugt_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ugt_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) float @ret_assumed_fabs_ugt_1(
+; CHECK-SAME: float returned nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UGT_1:%.*]] = fcmp ugt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGT_1]]) #[[ATTR5]]
@@ -1477,8 +1477,8 @@ define float @ret_assumed_fabs_ugt_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_uge_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_uge_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) float @ret_assumed_fabs_uge_1(
+; CHECK-SAME: float returned nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UGE_1:%.*]] = fcmp ugt float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGE_1]]) #[[ATTR5]]
@@ -1491,8 +1491,8 @@ define float @ret_assumed_fabs_uge_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ult_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ult_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_assumed_fabs_ult_1(
+; CHECK-SAME: float returned nofpclass(inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ULT_1:%.*]] = fcmp ult float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULT_1]]) #[[ATTR5]]
@@ -1505,8 +1505,8 @@ define float @ret_assumed_fabs_ult_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ule_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ule_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_assumed_fabs_ule_1(
+; CHECK-SAME: float returned nofpclass(inf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ULE_1:%.*]] = fcmp ule float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULE_1]]) #[[ATTR5]]
@@ -1519,8 +1519,8 @@ define float @ret_assumed_fabs_ule_1(float %arg) {
 }
 
 define float @ret_assumed_ogt_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ogt_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf) float @ret_assumed_ogt_neg1(
+; CHECK-SAME: float returned nofpclass(nan ninf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OGT_NEG1:%.*]] = fcmp ogt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGT_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1531,8 +1531,8 @@ define float @ret_assumed_ogt_neg1(float %arg) {
 }
 
 define float @ret_assumed_oge_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_oge_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf) float @ret_assumed_oge_neg1(
+; CHECK-SAME: float returned nofpclass(nan ninf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OGE_NEG1:%.*]] = fcmp ogt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OGE_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1543,8 +1543,8 @@ define float @ret_assumed_oge_neg1(float %arg) {
 }
 
 define float @ret_assumed_olt_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_olt_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan pinf zero sub pnorm) float @ret_assumed_olt_neg1(
+; CHECK-SAME: float returned nofpclass(nan pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLT_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1555,8 +1555,8 @@ define float @ret_assumed_olt_neg1(float %arg) {
 }
 
 define float @ret_assumed_ole_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ole_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan pinf zero sub pnorm) float @ret_assumed_ole_neg1(
+; CHECK-SAME: float returned nofpclass(nan pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OLE_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1567,8 +1567,8 @@ define float @ret_assumed_ole_neg1(float %arg) {
 }
 
 define float @ret_assumed_ugt_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ugt_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf) float @ret_assumed_ugt_neg1(
+; CHECK-SAME: float returned nofpclass(ninf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[UGT_NEG1:%.*]] = fcmp ugt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGT_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1579,8 +1579,8 @@ define float @ret_assumed_ugt_neg1(float %arg) {
 }
 
 define float @ret_assumed_uge_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_uge_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(ninf) float @ret_assumed_uge_neg1(
+; CHECK-SAME: float returned nofpclass(ninf) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[UGE_NEG1:%.*]] = fcmp uge float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UGE_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1591,8 +1591,8 @@ define float @ret_assumed_uge_neg1(float %arg) {
 }
 
 define float @ret_assumed_ult_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ult_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(pinf zero sub pnorm) float @ret_assumed_ult_neg1(
+; CHECK-SAME: float returned nofpclass(pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULT_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1603,8 +1603,8 @@ define float @ret_assumed_ult_neg1(float %arg) {
 }
 
 define float @ret_assumed_ule_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ule_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(pinf zero sub pnorm) float @ret_assumed_ule_neg1(
+; CHECK-SAME: float returned nofpclass(pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ULE_NEG1:%.*]] = fcmp ule float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ULE_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1615,8 +1615,8 @@ define float @ret_assumed_ule_neg1(float %arg) {
 }
 
 define float @ret_assumed_oeq_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_oeq_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @ret_assumed_oeq_1(
+; CHECK-SAME: float returned nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[OEQ_1:%.*]] = fcmp oeq float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OEQ_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1627,8 +1627,8 @@ define float @ret_assumed_oeq_1(float %arg) {
 }
 
 define float @ret_assumed_ueq_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_ueq_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @ret_assumed_ueq_1(
+; CHECK-SAME: float returned nofpclass(inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[UEQ_1:%.*]] = fcmp ueq float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UEQ_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1639,8 +1639,8 @@ define float @ret_assumed_ueq_1(float %arg) {
 }
 
 define float @ret_assumed_one_1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_one_1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan) float @ret_assumed_one_1(
+; CHECK-SAME: float returned nofpclass(nan) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ONE_1:%.*]] = fcmp one float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ONE_1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1651,8 +1651,8 @@ define float @ret_assumed_one_1(float %arg) {
 }
 
 define float @ret_assumed_one_neg1(float %arg) {
-; CHECK-LABEL: define float @ret_assumed_one_neg1(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan) float @ret_assumed_one_neg1(
+; CHECK-SAME: float returned nofpclass(nan) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ONE_NEG1:%.*]] = fcmp one float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ONE_NEG1]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -1687,8 +1687,8 @@ define float @ret_assumed_une_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_oeq_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_oeq_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @ret_assumed_fabs_oeq_1(
+; CHECK-SAME: float returned nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[OEQ_1:%.*]] = fcmp oeq float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[OEQ_1]]) #[[ATTR5]]
@@ -1701,8 +1701,8 @@ define float @ret_assumed_fabs_oeq_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ueq_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ueq_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @ret_assumed_fabs_ueq_1(
+; CHECK-SAME: float returned nofpclass(inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UEQ_1:%.*]] = fcmp ueq float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UEQ_1]]) #[[ATTR5]]
@@ -1715,8 +1715,8 @@ define float @ret_assumed_fabs_ueq_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_one_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_one_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_assumed_fabs_one_1(
+; CHECK-SAME: float returned nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ONE_1:%.*]] = fcmp one float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ONE_1]]) #[[ATTR5]]
@@ -1729,8 +1729,8 @@ define float @ret_assumed_fabs_one_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_une_1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_une_1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_assumed_fabs_une_1(
+; CHECK-SAME: float returned nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UNE_1:%.*]] = fcmp one float [[ARG_FABS]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UNE_1]]) #[[ATTR5]]
@@ -1743,8 +1743,8 @@ define float @ret_assumed_fabs_une_1(float %arg) {
 }
 
 define float @ret_assumed_fabs_oeq_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_oeq_neg1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(all) float @ret_assumed_fabs_oeq_neg1(
+; CHECK-SAME: float returned nofpclass(all) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef false) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
@@ -1755,8 +1755,8 @@ define float @ret_assumed_fabs_oeq_neg1(float %arg) {
 }
 
 define float @ret_assumed_fabs_ueq_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_ueq_neg1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(inf zero sub norm) float @ret_assumed_fabs_ueq_neg1(
+; CHECK-SAME: float returned nofpclass(inf zero sub norm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[UEQ_NEG1:%.*]] = fcmp ueq float [[ARG_FABS]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[UEQ_NEG1]]) #[[ATTR5]]
@@ -1769,8 +1769,8 @@ define float @ret_assumed_fabs_ueq_neg1(float %arg) {
 }
 
 define float @ret_assumed_fabs_one_neg1(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_assumed_fabs_one_neg1(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_assumed_fabs_one_neg1(
+; CHECK-SAME: float returned nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[ARG_FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[ONE_NEG1:%.*]] = fcmp one float [[ARG_FABS]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[ONE_NEG1]]) #[[ATTR5]]
@@ -2228,8 +2228,8 @@ define float @ret_assumed_uge_known_negative(float %arg, float %unknown) {
 ;---------------------------------------------------------------------
 
 define float @assume_oeq_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_oeq_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @assume_oeq_smallest_normal(
+; CHECK-SAME: float returned nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[IS_OEQ_SMALLEST_NORMAL:%.*]] = fcmp oeq float [[ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_OEQ_SMALLEST_NORMAL]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -2240,8 +2240,8 @@ define float @assume_oeq_smallest_normal(float %arg) {
 }
 
 define float @assume_one_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_one_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan) float @assume_one_smallest_normal(
+; CHECK-SAME: float returned nofpclass(nan) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[IS_ONE_SMALLEST_NORMAL:%.*]] = fcmp one float [[ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_ONE_SMALLEST_NORMAL]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -2252,8 +2252,8 @@ define float @assume_one_smallest_normal(float %arg) {
 }
 
 define float @assume_ueq_smallest_normal(float %arg) {
-; CHECK-LABEL: define float @assume_ueq_smallest_normal(
-; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @assume_ueq_smallest_normal(
+; CHECK-SAME: float returned nofpclass(inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[IS_UEQ_SMALLEST_NORMAL:%.*]] = fcmp ueq float [[ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_UEQ_SMALLEST_NORMAL]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
@@ -2300,8 +2300,8 @@ define float @assume_uno_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_oeq_smallest_normal(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_oeq_smallest_normal(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @assume_fabs_oeq_smallest_normal(
+; CHECK-SAME: float returned nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_OEQ_SMALLEST_NORMAL:%.*]] = fcmp oeq float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_OEQ_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2314,8 +2314,8 @@ define float @assume_fabs_oeq_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_one_smallest_normal(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_one_smallest_normal(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @assume_fabs_one_smallest_normal(
+; CHECK-SAME: float returned nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_ONE_SMALLEST_NORMAL:%.*]] = fcmp one float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_ONE_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2328,8 +2328,8 @@ define float @assume_fabs_one_smallest_normal(float %arg) {
 }
 
 define float @assume_fabs_ueq_smallest_normal(float %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ueq_smallest_normal(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(inf zero sub nnorm) float @assume_fabs_ueq_smallest_normal(
+; CHECK-SAME: float returned nofpclass(inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[FABS_ARG:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]]
 ; CHECK-NEXT:    [[IS_UEQ_SMALLEST_NORMAL:%.*]] = fcmp ueq float [[FABS_ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_UEQ_SMALLEST_NORMAL]]) #[[ATTR5]]
@@ -2384,8 +2384,8 @@ define float @assume_fabs_uno_smallest_normal(float %arg) {
 }
 
 define float @assume_oeq_smallest_normal_known_pos(float nofpclass(ninf nsub nnorm nzero) %arg) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_oeq_smallest_normal_known_pos(
-; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
+; CHECK-LABEL: define nofpclass(nan inf zero sub nnorm) float @assume_oeq_smallest_normal_known_pos(
+; CHECK-SAME: float returned nofpclass(nan inf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT:    [[IS_OEQ_SMALLEST_NORMAL:%.*]] = fcmp oeq float [[ARG]], 0x3810000000000000
 ; CHECK-NEXT:    call void @llvm.assume(i1 noundef [[IS_OEQ_SMALLEST_NORMAL]]) #[[ATTR5]]
 ; CHECK-NEXT:    ret float [[ARG]]
diff --git a/llvm/test/Transforms/InstSimplify/assume-fcmp-constant-implies-class.ll b/llvm/test/Transforms/InstSimplify/assume-fcmp-constant-implies-class.ll
index 7970f3ce6bf09d..8d5ac063108c23 100644
--- a/llvm/test/Transforms/InstSimplify/assume-fcmp-constant-implies-class.ll
+++ b/llvm/test/Transforms/InstSimplify/assume-fcmp-constant-implies-class.ll
@@ -17,8 +17,7 @@ define i1 @assume_olt_neg1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -31,8 +30,7 @@ define i1 @assume_olt_neg1__ogt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -45,8 +43,7 @@ define i1 @assume_olt_neg1__oge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -59,8 +56,7 @@ define i1 @assume_olt_neg1__olt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -73,8 +69,7 @@ define i1 @assume_olt_neg1__ole_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -87,8 +82,7 @@ define i1 @assume_olt_neg1__one_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -101,8 +95,7 @@ define i1 @assume_olt_neg1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -115,8 +108,7 @@ define i1 @assume_olt_neg1__ueq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -129,8 +121,7 @@ define i1 @assume_olt_neg1__ugt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -143,8 +134,7 @@ define i1 @assume_olt_neg1__uge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -157,8 +147,7 @@ define i1 @assume_olt_neg1__ult_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -171,8 +160,7 @@ define i1 @assume_olt_neg1__ule_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -185,8 +173,7 @@ define i1 @assume_olt_neg1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -199,8 +186,7 @@ define i1 @assume_olt_neg1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -217,8 +203,7 @@ define i1 @assume_ole_neg1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -231,8 +216,7 @@ define i1 @assume_ole_neg1__ogt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -245,8 +229,7 @@ define i1 @assume_ole_neg1__oge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -259,8 +242,7 @@ define i1 @assume_ole_neg1__olt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -273,8 +255,7 @@ define i1 @assume_ole_neg1__ole_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -287,8 +268,7 @@ define i1 @assume_ole_neg1__one_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -301,8 +281,7 @@ define i1 @assume_ole_neg1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -315,8 +294,7 @@ define i1 @assume_ole_neg1__ueq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -329,8 +307,7 @@ define i1 @assume_ole_neg1__ugt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -343,8 +320,7 @@ define i1 @assume_ole_neg1__uge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -357,8 +333,7 @@ define i1 @assume_ole_neg1__ult_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -371,8 +346,7 @@ define i1 @assume_ole_neg1__ule_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -385,8 +359,7 @@ define i1 @assume_ole_neg1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -399,8 +372,7 @@ define i1 @assume_ole_neg1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_NEG1:%.*]] = fcmp ole float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.neg1 = fcmp ole float %arg, -1.0
   call void @llvm.assume(i1 %ole.neg1)
@@ -501,8 +473,7 @@ define i1 @assume_ogt_neg1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_NEG1:%.*]] = fcmp ogt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.neg1 = fcmp ogt float %arg, -1.0
   call void @llvm.assume(i1 %ogt.neg1)
@@ -599,8 +570,7 @@ define i1 @assume_ogt_neg1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_NEG1:%.*]] = fcmp ogt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.neg1 = fcmp ogt float %arg, -1.0
   call void @llvm.assume(i1 %ogt.neg1)
@@ -701,8 +671,7 @@ define i1 @assume_oge_neg1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_NEG1:%.*]] = fcmp oge float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.neg1 = fcmp oge float %arg, -1.0
   call void @llvm.assume(i1 %oge.neg1)
@@ -799,8 +768,7 @@ define i1 @assume_oge_neg1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_NEG1:%.*]] = fcmp oge float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.neg1 = fcmp oge float %arg, -1.0
   call void @llvm.assume(i1 %oge.neg1)
@@ -1217,8 +1185,7 @@ define i1 @assume_ule_neg1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULE_NEG1:%.*]] = fcmp ule float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ule.neg1 = fcmp ule float %arg, -1.0
   call void @llvm.assume(i1 %ule.neg1)
@@ -1231,8 +1198,7 @@ define i1 @assume_ule_neg1__ogt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULE_NEG1:%.*]] = fcmp ule float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ule.neg1 = fcmp ule float %arg, -1.0
   call void @llvm.assume(i1 %ule.neg1)
@@ -1245,8 +1211,7 @@ define i1 @assume_ule_neg1__oge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULE_NEG1:%.*]] = fcmp ule float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ule.neg1 = fcmp ule float %arg, -1.0
   call void @llvm.assume(i1 %ule.neg1)
@@ -1357,8 +1322,7 @@ define i1 @assume_ule_neg1__ult_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULE_NEG1:%.*]] = fcmp ule float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ule.neg1 = fcmp ule float %arg, -1.0
   call void @llvm.assume(i1 %ule.neg1)
@@ -1371,8 +1335,7 @@ define i1 @assume_ule_neg1__ule_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULE_NEG1:%.*]] = fcmp ule float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ule.neg1 = fcmp ule float %arg, -1.0
   call void @llvm.assume(i1 %ule.neg1)
@@ -1385,8 +1348,7 @@ define i1 @assume_ule_neg1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULE_NEG1:%.*]] = fcmp ule float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULE_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ule.neg1 = fcmp ule float %arg, -1.0
   call void @llvm.assume(i1 %ule.neg1)
@@ -1417,8 +1379,7 @@ define i1 @assume_ult_neg1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ult.neg1 = fcmp ult float %arg, -1.0
   call void @llvm.assume(i1 %ult.neg1)
@@ -1431,8 +1392,7 @@ define i1 @assume_ult_neg1__ogt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ult.neg1 = fcmp ult float %arg, -1.0
   call void @llvm.assume(i1 %ult.neg1)
@@ -1445,8 +1405,7 @@ define i1 @assume_ult_neg1__oge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ult.neg1 = fcmp ult float %arg, -1.0
   call void @llvm.assume(i1 %ult.neg1)
@@ -1557,8 +1516,7 @@ define i1 @assume_ult_neg1__ult_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ult.neg1 = fcmp ult float %arg, -1.0
   call void @llvm.assume(i1 %ult.neg1)
@@ -1571,8 +1529,7 @@ define i1 @assume_ult_neg1__ule_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ult.neg1 = fcmp ult float %arg, -1.0
   call void @llvm.assume(i1 %ult.neg1)
@@ -1585,8 +1542,7 @@ define i1 @assume_ult_neg1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[ULT_NEG1:%.*]] = fcmp ult float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ULT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ult.neg1 = fcmp ult float %arg, -1.0
   call void @llvm.assume(i1 %ult.neg1)
@@ -1824,8 +1780,7 @@ define i1 @assume_olt_pos1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_POS1:%.*]] = fcmp olt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.pos1 = fcmp olt float %arg, 1.0
   call void @llvm.assume(i1 %olt.pos1)
@@ -1922,8 +1877,7 @@ define i1 @assume_olt_pos1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_POS1:%.*]] = fcmp olt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.pos1 = fcmp olt float %arg, 1.0
   call void @llvm.assume(i1 %olt.pos1)
@@ -2024,8 +1978,7 @@ define i1 @assume_ole_pos1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_POS1:%.*]] = fcmp ole float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ole.pos1 = fcmp ole float %arg, 1.0
   call void @llvm.assume(i1 %ole.pos1)
@@ -2122,8 +2075,7 @@ define i1 @assume_ole_pos1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLE_POS1:%.*]] = fcmp ole float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ole.pos1 = fcmp ole float %arg, 1.0
   call void @llvm.assume(i1 %ole.pos1)
@@ -2140,8 +2092,7 @@ define i1 @assume_ogt_pos1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2154,8 +2105,7 @@ define i1 @assume_ogt_pos1__ogt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2168,8 +2118,7 @@ define i1 @assume_ogt_pos1__oge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2182,8 +2131,7 @@ define i1 @assume_ogt_pos1__olt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2196,8 +2144,7 @@ define i1 @assume_ogt_pos1__ole_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2210,8 +2157,7 @@ define i1 @assume_ogt_pos1__one_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2224,8 +2170,7 @@ define i1 @assume_ogt_pos1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2238,8 +2183,7 @@ define i1 @assume_ogt_pos1__ueq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2252,8 +2196,7 @@ define i1 @assume_ogt_pos1__ugt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2266,8 +2209,7 @@ define i1 @assume_ogt_pos1__uge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2280,8 +2222,7 @@ define i1 @assume_ogt_pos1__ult_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2294,8 +2235,7 @@ define i1 @assume_ogt_pos1__ule_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2308,8 +2248,7 @@ define i1 @assume_ogt_pos1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2322,8 +2261,7 @@ define i1 @assume_ogt_pos1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGT_POS1:%.*]] = fcmp ogt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ogt.pos1 = fcmp ogt float %arg, 1.0
   call void @llvm.assume(i1 %ogt.pos1)
@@ -2340,8 +2278,7 @@ define i1 @assume_oge_pos1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2354,8 +2291,7 @@ define i1 @assume_oge_pos1__ogt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2368,8 +2304,7 @@ define i1 @assume_oge_pos1__oge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2382,8 +2317,7 @@ define i1 @assume_oge_pos1__olt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2396,8 +2330,7 @@ define i1 @assume_oge_pos1__ole_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2410,8 +2343,7 @@ define i1 @assume_oge_pos1__one_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2424,8 +2356,7 @@ define i1 @assume_oge_pos1__ord_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2438,8 +2369,7 @@ define i1 @assume_oge_pos1__ueq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2452,8 +2382,7 @@ define i1 @assume_oge_pos1__ugt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2466,8 +2395,7 @@ define i1 @assume_oge_pos1__uge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2480,8 +2408,7 @@ define i1 @assume_oge_pos1__ult_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2494,8 +2421,7 @@ define i1 @assume_oge_pos1__ule_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2508,8 +2434,7 @@ define i1 @assume_oge_pos1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2522,8 +2447,7 @@ define i1 @assume_oge_pos1__uno_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OGE_POS1:%.*]] = fcmp oge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uno float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %oge.pos1 = fcmp oge float %arg, 1.0
   call void @llvm.assume(i1 %oge.pos1)
@@ -2540,8 +2464,7 @@ define i1 @assume_ugt_pos1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGT_POS1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ugt.pos1 = fcmp ugt float %arg, 1.0
   call void @llvm.assume(i1 %ugt.pos1)
@@ -2582,8 +2505,7 @@ define i1 @assume_ugt_pos1__olt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGT_POS1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ugt.pos1 = fcmp ugt float %arg, 1.0
   call void @llvm.assume(i1 %ugt.pos1)
@@ -2596,8 +2518,7 @@ define i1 @assume_ugt_pos1__ole_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGT_POS1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %ugt.pos1 = fcmp ugt float %arg, 1.0
   call void @llvm.assume(i1 %ugt.pos1)
@@ -2652,8 +2573,7 @@ define i1 @assume_ugt_pos1__ugt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGT_POS1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ugt.pos1 = fcmp ugt float %arg, 1.0
   call void @llvm.assume(i1 %ugt.pos1)
@@ -2666,8 +2586,7 @@ define i1 @assume_ugt_pos1__uge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGT_POS1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ugt.pos1 = fcmp ugt float %arg, 1.0
   call void @llvm.assume(i1 %ugt.pos1)
@@ -2708,8 +2627,7 @@ define i1 @assume_ugt_pos1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGT_POS1:%.*]] = fcmp ugt float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGT_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %ugt.pos1 = fcmp ugt float %arg, 1.0
   call void @llvm.assume(i1 %ugt.pos1)
@@ -2740,8 +2658,7 @@ define i1 @assume_uge_pos1__oeq_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGE_POS1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %uge.pos1 = fcmp uge float %arg, 1.0
   call void @llvm.assume(i1 %uge.pos1)
@@ -2782,8 +2699,7 @@ define i1 @assume_uge_pos1__olt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGE_POS1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %uge.pos1 = fcmp uge float %arg, 1.0
   call void @llvm.assume(i1 %uge.pos1)
@@ -2796,8 +2712,7 @@ define i1 @assume_uge_pos1__ole_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGE_POS1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %uge.pos1 = fcmp uge float %arg, 1.0
   call void @llvm.assume(i1 %uge.pos1)
@@ -2852,8 +2767,7 @@ define i1 @assume_uge_pos1__ugt_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGE_POS1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %uge.pos1 = fcmp uge float %arg, 1.0
   call void @llvm.assume(i1 %uge.pos1)
@@ -2866,8 +2780,7 @@ define i1 @assume_uge_pos1__uge_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGE_POS1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp uge float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %uge.pos1 = fcmp uge float %arg, 1.0
   call void @llvm.assume(i1 %uge.pos1)
@@ -2908,8 +2821,7 @@ define i1 @assume_uge_pos1__une_0(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[UGE_POS1:%.*]] = fcmp uge float [[ARG]], 1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[UGE_POS1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une float [[ARG]], 0.000000e+00
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %uge.pos1 = fcmp uge float %arg, 1.0
   call void @llvm.assume(i1 %uge.pos1)
@@ -2940,8 +2852,7 @@ define i1 @assume_olt_neg1__oeq_inf(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ARG]], 0x7FF0000000000000
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)
@@ -2954,8 +2865,7 @@ define i1 @assume_olt_neg1__one_inf(float %arg) {
 ; CHECK-SAME: float [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[OLT_NEG1:%.*]] = fcmp olt float [[ARG]], -1.000000e+00
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OLT_NEG1]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one float [[ARG]], 0x7FF0000000000000
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 true
 ;
   %olt.neg1 = fcmp olt float %arg, -1.0
   call void @llvm.assume(i1 %olt.neg1)

From 5a4ca51a91ff28b1d6bdde5403144c29b86e4b54 Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Fri, 1 Dec 2023 10:03:02 +0100
Subject: [PATCH 13/72] [mlir] notify insertion of parent op first when cloning
 (#73806)

When cloning an operation with a region, the builder previously notified
the listener about the insertion of the operations cloned inside the
region before the insertion of the cloned operation itself.

When cloning inside a rewrite pass, this could cause issues if a pattern
is expected to be applied to a cloned parent operation before patterns
are applied to the cloned operations it contains (patterns are attempted
in the order in which the cloned operations are notified).
---
 mlir/lib/IR/Builders.cpp       |  7 ++++---
 mlir/test/IR/test-clone.mlir   | 23 +++++++++++++++++++----
 mlir/test/lib/IR/TestClone.cpp |  8 ++++++++
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index ab20f4863e11c2..2cabfcd24d3559 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -527,7 +527,8 @@ LogicalResult OpBuilder::tryFold(Operation *op,
 
 Operation *OpBuilder::clone(Operation &op, IRMapping &mapper) {
   Operation *newOp = op.clone(mapper);
-  // The `insert` call below handles the notification for inserting `newOp`
+  newOp = insert(newOp);
+  // The `insert` call above handles the notification for inserting `newOp`
   // itself. But if `newOp` has any regions, we need to notify the listener
   // about any ops that got inserted inside those regions as part of cloning.
   if (listener) {
@@ -535,9 +536,9 @@ Operation *OpBuilder::clone(Operation &op, IRMapping &mapper) {
       listener->notifyOperationInserted(walkedOp);
     };
     for (Region &region : newOp->getRegions())
-      region.walk(walkFn);
+      region.walk<WalkOrder::PreOrder>(walkFn);
   }
-  return insert(newOp);
+  return newOp;
 }
 
 Operation *OpBuilder::clone(Operation &op) {
diff --git a/mlir/test/IR/test-clone.mlir b/mlir/test/IR/test-clone.mlir
index 575098b642e8ea..0c07593aef32d9 100644
--- a/mlir/test/IR/test-clone.mlir
+++ b/mlir/test/IR/test-clone.mlir
@@ -1,20 +1,35 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(test-clone))" -split-input-file
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(test-clone))" | FileCheck %s
 
 module {
   func.func @fixpoint(%arg1 : i32) -> i32 {
     %r = "test.use"(%arg1) ({
-       "test.yield"(%arg1) : (i32) -> ()
+      %r2 = "test.use2"(%arg1) ({
+         "test.yield2"(%arg1) : (i32) -> ()
+      }) : (i32) -> i32
+      "test.yield"(%r2) : (i32) -> ()
     }) : (i32) -> i32
     return %r : i32
   }
 }
 
+// CHECK: notifyOperationInserted: test.use
+// CHECK-NEXT: notifyOperationInserted: test.use2
+// CHECK-NEXT: notifyOperationInserted: test.yield2
+// CHECK-NEXT: notifyOperationInserted: test.yield
+// CHECK-NEXT: notifyOperationInserted: func.return
+
 // CHECK:   func @fixpoint(%[[arg0:.+]]: i32) -> i32 {
 // CHECK-NEXT:     %[[i0:.+]] = "test.use"(%[[arg0]]) ({
-// CHECK-NEXT:       "test.yield"(%arg0) : (i32) -> ()
+// CHECK-NEXT:       %[[r2:.+]] = "test.use2"(%[[arg0]]) ({
+// CHECK-NEXT:         "test.yield2"(%[[arg0]]) : (i32) -> ()
+// CHECK-NEXT:       }) : (i32) -> i32
+// CHECK-NEXT:       "test.yield"(%[[r2]]) : (i32) -> ()
 // CHECK-NEXT:     }) : (i32) -> i32
 // CHECK-NEXT:     %[[i1:.+]] = "test.use"(%[[i0]]) ({
-// CHECK-NEXT:       "test.yield"(%[[i0]]) : (i32) -> ()
+// CHECK-NEXT:       %[[r2:.+]] = "test.use2"(%[[i0]]) ({
+// CHECK-NEXT:         "test.yield2"(%[[i0]]) : (i32) -> ()
+// CHECK-NEXT:       }) : (i32) -> i32
+// CHECK-NEXT:       "test.yield"(%[[r2]]) : (i32) -> ()
 // CHECK-NEXT:     }) : (i32) -> i32
 // CHECK-NEXT:     return %[[i1]] : i32
 // CHECK-NEXT:   }
diff --git a/mlir/test/lib/IR/TestClone.cpp b/mlir/test/lib/IR/TestClone.cpp
index 70238608a67c2b..13a0cfeb402a9c 100644
--- a/mlir/test/lib/IR/TestClone.cpp
+++ b/mlir/test/lib/IR/TestClone.cpp
@@ -14,6 +14,12 @@ using namespace mlir;
 
 namespace {
 
+struct DumpNotifications : public OpBuilder::Listener {
+  void notifyOperationInserted(Operation *op) override {
+    llvm::outs() << "notifyOperationInserted: " << op->getName() << "\n";
+  }
+};
+
 /// This is a test pass which clones the body of a function. Specifically
 /// this pass replaces f(x) to instead return f(f(x)) in which the cloned body
 /// takes the result of the first operation return as an input.
@@ -50,6 +56,8 @@ struct ClonePass
     }
 
     OpBuilder builder(op->getContext());
+    DumpNotifications dumpNotifications;
+    builder.setListener(&dumpNotifications);
     builder.setInsertionPointToEnd(&regionEntry);
     SmallVector<Operation *> toClone;
     for (Operation &inst : regionEntry)

From a224ddc9b4458b1b9cf0a758c974a554f0f17dc4 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen@gmail.com>
Date: Fri, 1 Dec 2023 10:21:50 +0100
Subject: [PATCH 14/72] [mlir][nvvm] Introduce `cp.async.bulk.commit.group`

This PR introduces the `cp.async.bulk.commit.group` op, which emits the
PTX instruction `cp.async.bulk.commit_group;`.
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td       | 9 +++++++++
 mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 8 ++++++++
 2 files changed, 17 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 54826f4196993d..ecad1a16eb6c59 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1420,6 +1420,15 @@ def NVVM_MmaOp : NVVM_Op<"mma.sync", [AttrSizedOperandSegments]> {
 // NVVM TMA Ops
 //===----------------------------------------------------------------------===//
 
+def NVVM_CpAsyncBulkCommitGroupOp : NVVM_PTXBuilder_Op<"cp.async.bulk.commit.group">,
+  Arguments<(ins )> {
+  let assemblyFormat = "attr-dict";
+  let extraClassDefinition = [{
+    std::string $cppClass::getPtx() { return std::string("cp.async.bulk.commit_group;"); }
+  }];
+}
+
+
 def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : 
   NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", 
   [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>, 
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 7da4e98c40e54b..5482cc194192dd 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -621,3 +621,11 @@ func.func @set_max_register() {
   nvvm.setmaxregister decrease 40
   func.return
 }
+
+// -----
+
+func.func @cp_bulk_commit() {
+  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.commit_group;"
+  nvvm.cp.async.bulk.commit.group
+  func.return
+}

From 4d1dc7770a6411b87cc488dd982c034f1b4ff7a7 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <Ramkumar.Ramachandra@imgtec.com>
Date: Fri, 1 Dec 2023 09:22:13 +0000
Subject: [PATCH 15/72] AMDGPU/load-global-i32: regenerate test using UTC (NFC)
 (#73962)

Fix the RUN lines so that UTC runs cleanly, and regenerate the test
load-global-i32.ll using utils/update_llc_test_checks.py.
---
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 4935 +++++++++++++++++--
 1 file changed, 4462 insertions(+), 473 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index c4d9b4b2bb5ebb..55f0773f7e05ae 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,113 +1,825 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI-NOHSA -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=GCNX3-NOHSA -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCNX3-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s
 
-; FUNC-LABEL: {{^}}global_load_i32:
-; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
-; GCN-HSA: {{flat|global}}_load_dword
-
-; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dword v2, v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    flat_store_dword v[0:1], v2
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    global_store_dword v0, v1, s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load i32, ptr addrspace(1) %in
   store i32 %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v2i32:
-; GCN-NOHSA: buffer_load_dwordx2
-; GCN-HSA: {{flat|global}}_load_dwordx2
-
-; EG: VTX_READ_64
 define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v2i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v2i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v2i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v2i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v2i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <2 x i32>, ptr addrspace(1) %in
   store <2 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v3i32:
-; SI-NOHSA: buffer_load_dwordx4
-; GCNX3-NOHSA: buffer_load_dwordx3
-; GCNX3-HSA: {{flat|global}}_load_dwordx3
-
-; EG: VTX_READ_128
 define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v3i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v3i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v3i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v3i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T2.X, T0.Z,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v3i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx3 v[0:2], v3, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i32>, ptr addrspace(1) %in
   store <3 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v4i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; EG: VTX_READ_128
 define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v4i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v4i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v4i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v4i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v4i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <4 x i32>, ptr addrspace(1) %in
   store <4 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v8i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; EG: VTX_READ_128
-; EG: VTX_READ_128
 define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v8i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v8i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v8i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v8i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 1 @6
+; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
+; EG-NEXT:    ALU clause starting at 10:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 11:
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v8i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <8 x i32>, ptr addrspace(1) %in
   store <8 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v9i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dword
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dword
 define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v9i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dword v8, off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v9i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    flat_load_dword v14, v[8:9]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dword v[12:13], v14
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v9i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dword v8, off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v9i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 8, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @8
+; EG-NEXT:    ALU 1, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 8:
+; EG-NEXT:     VTX_READ_128 T4.XYZW, T2.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T2.X, 16, #1
+; EG-NEXT:     VTX_READ_32 T3.X, T3.X, 32, #1
+; EG-NEXT:    ALU clause starting at 14:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T2.X, KC0[2].Z,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     MOV * T3.X, PS,
+; EG-NEXT:    ALU clause starting at 23:
+; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v9i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dword v9, v8, s[2:3] offset:32
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dword v8, v9, s[0:1] offset:32
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <9 x i32>, ptr addrspace(1) %in
   store <9 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v10i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx2
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx2
 define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v10i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v10i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[14:15], v[8:9]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v10i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v10i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @8
+; EG-NEXT:    ALU 7, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 8:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
+; EG-NEXT:    ALU clause starting at 14:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 15:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v10i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v10, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v10, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx2 v[8:9], v10, s[2:3] offset:32
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <10 x i32>, ptr addrspace(1) %in
   store <10 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v11i32:
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; GCNX3-NOHSA: buffer_load_dwordx4
-; GCNX3-NOHSA: buffer_load_dwordx4
-; GCNX3-NOHSA: buffer_load_dwordx3
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx3
 define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v11i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dword v10, off, s[4:7], 0 offset:40
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v11i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    flat_load_dwordx3 v[8:10], v[8:9]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[4:7]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx3 v[15:16], v[8:10]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v11i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v11i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @8
+; EG-NEXT:    ALU 12, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 8:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
+; EG-NEXT:    ALU clause starting at 14:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 15:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T4.X, T0.Z,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v11i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v11, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v11, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx3 v[8:10], v11, s[2:3] offset:32
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[4:7], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx3 v11, v[8:10], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <11 x i32>, ptr addrspace(1) %in
   store <11 x i32> %ld, ptr addrspace(1) %out
@@ -115,533 +827,3810 @@ entry:
 }
 
 
-; FUNC-LABEL: {{^}}global_load_v12i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
 define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v12i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v12i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v12i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v12i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 7, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @8
+; EG-NEXT:    ALU 1, @22, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 8:
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T2.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T4.XYZW, T2.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T2.X, 32, #1
+; EG-NEXT:    ALU clause starting at 14:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T2.X, KC0[2].Z,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 22:
+; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v12i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v12, s[2:3] offset:32
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <12 x i32>, ptr addrspace(1) %in
   store <12 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v16i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; EG: VTX_READ_128
-; EG: VTX_READ_128
-; EG: VTX_READ_128
-; EG: VTX_READ_128
 define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v16i32:
+; SI-NOHSA:       ; %bb.0: ; %entry
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v16i32:
+; GCNX3-HSA:       ; %bb.0: ; %entry
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s4
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[8:11]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[12:15]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v16i32:
+; GCNX3-NOHSA:       ; %bb.0: ; %entry
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v16i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 11, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 3 @8
+; EG-NEXT:    ALU 1, @28, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 8:
+; EG-NEXT:     VTX_READ_128 T4.XYZW, T3.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T5.XYZW, T3.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T6.XYZW, T3.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T3.X, 16, #1
+; EG-NEXT:    ALU clause starting at 16:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T3.X, KC0[2].Z,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 28:
+; EG-NEXT:     LSHR * T7.X, T0.W, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v16i32:
+; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3] offset:32
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:48
+; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v16, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v16, s[2:3] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:48
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_endpgm
 entry:
   %ld = load <16 x i32>, ptr addrspace(1) %in
   store <16 x i32> %ld, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_zextload_i32_to_i64:
-; GCN-NOHSA-DAG: buffer_load_dword v[[LO:[0-9]+]],
-; GCN-HSA-DAG: {{flat|global}}_load_dword v[[LO:[0-9]+]],
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-
-; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_zextload_i32_to_i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_zextload_i32_to_i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     MOV * T0.Y, 0.0,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_zextload_i32_to_i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dword v0, v1, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load i32, ptr addrspace(1) %in
   %ext = zext i32 %ld to i64
   store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_sextload_i32_to_i64:
-; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]]
-; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]]
-; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
-; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
-
-
-; EG: MEM_RAT
-; EG: VTX_READ_32
-; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
-; EG: 31
 define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_sextload_i32_to_i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_sextload_i32_to_i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
+; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+;
+; GCN-HSA-LABEL: global_sextload_i32_to_i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dword v0, v2, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load i32, ptr addrspace(1) %in
   %ext = sext i32 %ld to i64
   store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_zextload_v1i32_to_v1i64:
-; GCN-NOHSA: buffer_load_dword
-; GCN-NOHSA: buffer_store_dwordx2
-
-; GCN-HSA: {{flat|global}}_load_dword
-; GCN-HSA: {{flat|global}}_store_dwordx2
 define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_zextload_v1i32_to_v1i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     MOV * T0.Y, 0.0,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dword v0, v1, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <1 x i32>, ptr addrspace(1) %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_sextload_v1i32_to_v1i64:
-; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]]
-; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]]
-; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
-; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
 define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_sextload_v1i32_to_v1i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
+; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+;
+; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dword v0, v2, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <1 x i32>, ptr addrspace(1) %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_zextload_v2i32_to_v2i64:
-; GCN-NOHSA: buffer_load_dwordx2
-; GCN-NOHSA: buffer_store_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx2
-; GCN-HSA: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v3
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_zextload_v2i32_to_v2i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 5, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     MOV T1.X, T0.X,
+; EG-NEXT:     MOV T1.Y, 0.0,
+; EG-NEXT:     MOV T1.Z, T0.Y,
+; EG-NEXT:     MOV T1.W, 0.0,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx2 v[2:3], v1, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <2 x i32>, ptr addrspace(1) %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_sextload_v2i32_to_v2i64:
-; GCN-NOHSA: buffer_load_dwordx2
-; GCN-HSA: {{flat|global}}_load_dwordx2
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v1
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v1
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_sextload_v2i32_to_v2i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR * T1.Y, T0.X, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T1.X, T0.X,
+; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     MOV * T1.Z, T0.Y,
+;
+; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <2 x i32>, ptr addrspace(1) %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_zextload_v4i32_to_v4i64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v3
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GCNX3-HSA-NEXT:    s_nop 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v5, 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_zextload_v4i32_to_v4i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     MOV T1.X, T0.Z,
+; EG-NEXT:     MOV T1.Y, 0.0,
+; EG-NEXT:     MOV * T2.X, T0.X,
+; EG-NEXT:     MOV T2.Y, 0.0,
+; EG-NEXT:     MOV T1.Z, T0.W,
+; EG-NEXT:     MOV T1.W, 0.0,
+; EG-NEXT:     MOV * T2.Z, T0.Y,
+; EG-NEXT:     MOV * T2.W, 0.0,
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <4 x i32>, ptr addrspace(1) %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_sextload_v4i32_to_v4i64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, v3
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, v1
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[3:6]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v7, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v9, v3
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v3, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v5, v1
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_sextload_v4i32_to_v4i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 0 @6
+; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 6:
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 8:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 9:
+; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ASHR T1.Y, T0.X, literal.y,
+; EG-NEXT:     ASHR T3.W, T0.W, literal.y,
+; EG-NEXT:     MOV * T1.X, T0.X,
+; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     ASHR * T3.Y, T0.Z, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T3.X, T0.Z,
+; EG-NEXT:     MOV T1.Z, T0.Y,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T3.Z, T0.W,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v11, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v1
+; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[7:10], s[0:1] offset:16
+; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[3:6], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <4 x i32>, ptr addrspace(1) %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_zextload_v8i32_to_v8i64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, 0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v11, v9
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v3
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v1
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v3
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
+; GCNX3-HSA-NEXT:    s_nop 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v1
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v7
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v5
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v9, 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v11, v9
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v3
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v1
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_zextload_v8i32_to_v8i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 1 @8
+; EG-NEXT:    ALU 26, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 8:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 13:
+; EG-NEXT:     MOV T2.X, T1.Z,
+; EG-NEXT:     MOV T2.Y, 0.0,
+; EG-NEXT:     MOV * T3.X, T1.X,
+; EG-NEXT:     MOV * T3.Y, 0.0,
+; EG-NEXT:     MOV T4.X, T0.Z,
+; EG-NEXT:     MOV T4.Y, 0.0,
+; EG-NEXT:     MOV * T5.X, T0.X,
+; EG-NEXT:     MOV T5.Y, 0.0,
+; EG-NEXT:     MOV T2.Z, T1.W,
+; EG-NEXT:     MOV T2.W, 0.0,
+; EG-NEXT:     MOV * T3.Z, T1.Y,
+; EG-NEXT:     MOV * T3.W, 0.0,
+; EG-NEXT:     MOV T4.Z, T0.W,
+; EG-NEXT:     MOV T4.W, 0.0,
+; EG-NEXT:     MOV * T5.Z, T0.Y,
+; EG-NEXT:     MOV * T5.W, 0.0,
+; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
+; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v1, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v9
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <8 x i32>, ptr addrspace(1) %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_sextload_v8i32_to_v8i64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v19, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v21, v7
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v15, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v17, v5
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v11, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v13, v3
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v0
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v3
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v1
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v7
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v19, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v21, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v15, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v17, v5
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v11, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v13, v3
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v7, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v9, v1
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_sextload_v8i32_to_v8i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 1 @8
+; EG-NEXT:    ALU 31, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 8:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 12:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 13:
+; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:     ASHR * T4.W, T0.Y, literal.z,
+; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
+; EG-NEXT:     ASHR T4.Y, T0.X, literal.y,
+; EG-NEXT:     ASHR T6.W, T0.W, literal.y,
+; EG-NEXT:     MOV * T4.X, T0.X,
+; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     ASHR T6.Y, T0.Z, literal.x,
+; EG-NEXT:     ASHR * T7.W, T1.Y, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T6.X, T0.Z,
+; EG-NEXT:     ASHR T7.Y, T1.X, literal.x,
+; EG-NEXT:     MOV T4.Z, T0.Y,
+; EG-NEXT:     ASHR T8.W, T1.W, literal.x,
+; EG-NEXT:     MOV * T7.X, T1.X,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T8.Y, T1.Z, literal.x,
+; EG-NEXT:     MOV * T6.Z, T0.W,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T8.X, T1.Z,
+; EG-NEXT:     MOV T7.Z, T1.Y,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T8.Z, T1.W,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v23, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v23, s[2:3] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v1
+; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[19:22], s[0:1] offset:48
+; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[15:18], s[0:1] offset:32
+; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[11:14], s[0:1] offset:16
+; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[7:10], s[0:1]
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <8 x i32>, ptr addrspace(1) %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_sextload_v16i32_to_v16i64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v1
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v1
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v6
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v5
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v10
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v13
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v14
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v15
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v27, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v9
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v9
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v11
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v11
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v13
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v13
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v15
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v5
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s1
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[8:11]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v3
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[4:7]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v24, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v26, v5
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v20, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v22, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v11
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v10
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v32, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v34, v13
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v14
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v15
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_sextload_v16i32_to_v16i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 3 @12
+; EG-NEXT:    ALU 64, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T11.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T9.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T4.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 12:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT:    ALU clause starting at 20:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 21:
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:     ASHR * T10.W, T0.W, literal.z,
+; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
+; EG-NEXT:     ASHR T10.Y, T0.Z, literal.y,
+; EG-NEXT:     ASHR T12.W, T0.Y, literal.y,
+; EG-NEXT:     MOV * T10.X, T0.Z,
+; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     ASHR T12.Y, T0.X, literal.x,
+; EG-NEXT:     ASHR * T13.W, T3.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T12.X, T0.X,
+; EG-NEXT:     ASHR T13.Y, T3.Z, literal.x,
+; EG-NEXT:     MOV T10.Z, T0.W,
+; EG-NEXT:     ASHR T14.W, T3.Y, literal.x,
+; EG-NEXT:     MOV * T13.X, T3.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T14.Y, T3.X, literal.x,
+; EG-NEXT:     MOV T12.Z, T0.Y,
+; EG-NEXT:     ASHR * T0.W, T2.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T14.X, T3.X,
+; EG-NEXT:     ASHR T0.Y, T2.Z, literal.x,
+; EG-NEXT:     MOV T13.Z, T3.W,
+; EG-NEXT:     ASHR T15.W, T2.Y, literal.x,
+; EG-NEXT:     MOV * T0.X, T2.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T15.Y, T2.X, literal.x,
+; EG-NEXT:     MOV T14.Z, T3.Y,
+; EG-NEXT:     ASHR * T3.W, T1.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T15.X, T2.X,
+; EG-NEXT:     ASHR T3.Y, T1.Z, literal.x,
+; EG-NEXT:     MOV T0.Z, T2.W,
+; EG-NEXT:     ASHR T16.W, T1.Y, literal.x,
+; EG-NEXT:     MOV * T3.X, T1.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T16.Y, T1.X, literal.x,
+; EG-NEXT:     MOV * T15.Z, T2.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T16.X, T1.X,
+; EG-NEXT:     MOV T3.Z, T1.W,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T16.Z, T1.Y,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v36, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
+; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v36, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, v5
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, v7
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v3
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v11
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v10
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, v9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
+; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v34, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v15
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1]
+; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <16 x i32>, ptr addrspace(1) %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_zextload_v16i32_to_v16i64
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
-; GCN-HSA: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v9
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v17
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v18
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v19
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v13
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v14
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v15
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 48
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s4
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v27, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v3
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v5
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v7
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v9
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v11
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v13
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v15
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v17, 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v19, v17
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v1
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(4)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v5
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v7
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v9
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v10
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v11
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v13
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v14
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v15
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_zextload_v16i32_to_v16i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 3 @12
+; EG-NEXT:    ALU 55, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T15.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T14.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T13.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 12:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
+; EG-NEXT:    ALU clause starting at 20:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 21:
+; EG-NEXT:     MOV T4.X, T1.X,
+; EG-NEXT:     MOV T4.Y, 0.0,
+; EG-NEXT:     MOV * T5.X, T1.Z,
+; EG-NEXT:     MOV * T5.Y, 0.0,
+; EG-NEXT:     MOV T6.X, T0.X,
+; EG-NEXT:     MOV T6.Y, 0.0,
+; EG-NEXT:     MOV * T7.X, T0.Z,
+; EG-NEXT:     MOV * T7.Y, 0.0,
+; EG-NEXT:     MOV T8.X, T3.X,
+; EG-NEXT:     MOV T8.Y, 0.0,
+; EG-NEXT:     MOV * T9.X, T3.Z,
+; EG-NEXT:     MOV * T9.Y, 0.0,
+; EG-NEXT:     MOV T10.X, T2.X,
+; EG-NEXT:     MOV T10.Y, 0.0,
+; EG-NEXT:     MOV * T11.X, T2.Z,
+; EG-NEXT:     MOV T11.Y, 0.0,
+; EG-NEXT:     MOV T4.Z, T1.Y,
+; EG-NEXT:     MOV T4.W, 0.0,
+; EG-NEXT:     MOV * T5.Z, T1.W,
+; EG-NEXT:     MOV * T5.W, 0.0,
+; EG-NEXT:     MOV T6.Z, T0.Y,
+; EG-NEXT:     MOV T6.W, 0.0,
+; EG-NEXT:     MOV * T7.Z, T0.W,
+; EG-NEXT:     MOV * T7.W, 0.0,
+; EG-NEXT:     MOV T8.Z, T3.Y,
+; EG-NEXT:     MOV T8.W, 0.0,
+; EG-NEXT:     MOV * T9.Z, T3.W,
+; EG-NEXT:     MOV * T9.W, 0.0,
+; EG-NEXT:     MOV T10.Z, T2.Y,
+; EG-NEXT:     MOV T10.W, 0.0,
+; EG-NEXT:     MOV * T11.Z, T2.W,
+; EG-NEXT:     MOV T11.W, 0.0,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PS, literal.x,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT:     LSHR * T15.X, PV.W, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3] offset:48
+; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v1, s[2:3] offset:32
+; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v1, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx4 v[16:19], v1, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v9
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v13
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v15
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v17
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v19
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <16 x i32>, ptr addrspace(1) %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_sextload_v32i32_to_v32i64:
-
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA-DAG: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-; GCN-DAG: v_ashrrev_i32
-
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-; GCN-NOHSA: buffer_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
 define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; SI-NOHSA-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; SI-NOHSA-NEXT:    s_mov_b32 s14, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s15, 0xe8f000
+; SI-NOHSA-NEXT:    s_add_u32 s12, s12, s3
+; SI-NOHSA-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v31
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v30
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v43, 31, v13
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v41, 31, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v40, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v42, v13
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v36, v14
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v38, v15
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v29
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v28
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v28
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v29
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v30
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v31
+; SI-NOHSA-NEXT:    buffer_store_dword v44, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dword v45, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    buffer_store_dword v46, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    buffer_store_dword v47, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v5
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v5
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v12, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v14, v7
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v51, 31, v1
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v49, 31, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v48, v0
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v50, v1
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v18
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v17
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v52, v16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v54, v17
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v22
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v21
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v20
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v56, v20
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v58, v21
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v22
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v23
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v27
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v26
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v63, 31, v25
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v61, 31, v24
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v60, v24
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v62, v25
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v26
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v27
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v11
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v10
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v11
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
+; SI-NOHSA-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x70
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[0:1]
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x60
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x50
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 64
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[8:9]
+; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v37, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v36, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v29
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v28
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, v28
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, v29
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[32:35]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v37, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v36, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v31
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v30
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, v30
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, v31
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[32:35]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v25
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v35, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v24
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, v24
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, v25
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v37, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v36, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v27
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v26
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, v26
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, v27
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v39, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v38, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x80
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v21
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v20
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, v20
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, v21
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v23
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v22
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, v22
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, v23
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[24:27]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(10)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v15
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v14
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v13
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, v13
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, v15
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v5
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x90
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[20:23]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v7
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v24, 31, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, v7
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[23:26]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v17
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[4:7]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v19
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v24, 31, v18
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, v18
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, v19
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[23:26]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v9
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, v9
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[15:18]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v11
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, v1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v3
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v2
+; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, v3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v11
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v10
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v43, 31, v13
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v41, 31, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v40, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v42, v13
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v36, v14
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v38, v15
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v9
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v12, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v14, v9
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v32, v10
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v34, v11
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v5
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v44, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v46, v5
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(4)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v51, 31, v1
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v49, 31, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v48, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v50, v1
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v18
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v17
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v16
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v52, v16
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v54, v17
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v22
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v21
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v20
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v56, v20
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v58, v21
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v22
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v23
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v27
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v26
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v25
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v24
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v31
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v30
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v37, v31
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v29
+; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v28
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v12, v28
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v14, v29
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v35, v30
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v39, v24
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v41, v25
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v19, v26
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v21, v27
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_sextload_v32i32_to_v32i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 33, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 7 @20
+; EG-NEXT:    ALU 96, @70, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T12.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T10.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T9.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T8.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T7.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 20:
+; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T13.XYZW, T11.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T14.XYZW, T11.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T15.XYZW, T11.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T16.XYZW, T11.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T17.XYZW, T11.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T18.XYZW, T11.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT:    ALU clause starting at 36:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
+; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
+; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T11.X, KC0[2].Z,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 70:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
+; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
+; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:     ASHR * T22.W, T11.W, literal.z,
+; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
+; EG-NEXT:     ASHR T22.Y, T11.Z, literal.y,
+; EG-NEXT:     ASHR T24.W, T11.Y, literal.y,
+; EG-NEXT:     MOV * T22.X, T11.Z,
+; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
+; EG-NEXT:     ASHR T24.Y, T11.X, literal.x,
+; EG-NEXT:     ASHR * T25.W, T18.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T24.X, T11.X,
+; EG-NEXT:     ASHR T25.Y, T18.Z, literal.x,
+; EG-NEXT:     MOV T22.Z, T11.W,
+; EG-NEXT:     ASHR T26.W, T18.Y, literal.x,
+; EG-NEXT:     MOV * T25.X, T18.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T26.Y, T18.X, literal.x,
+; EG-NEXT:     MOV T24.Z, T11.Y,
+; EG-NEXT:     ASHR * T11.W, T17.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T26.X, T18.X,
+; EG-NEXT:     ASHR T11.Y, T17.Z, literal.x,
+; EG-NEXT:     MOV T25.Z, T18.W,
+; EG-NEXT:     ASHR T27.W, T17.Y, literal.x,
+; EG-NEXT:     MOV * T11.X, T17.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T27.Y, T17.X, literal.x,
+; EG-NEXT:     MOV T26.Z, T18.Y,
+; EG-NEXT:     ASHR * T18.W, T16.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T27.X, T17.X,
+; EG-NEXT:     ASHR T18.Y, T16.Z, literal.x,
+; EG-NEXT:     MOV T11.Z, T17.W,
+; EG-NEXT:     ASHR T28.W, T16.Y, literal.x,
+; EG-NEXT:     MOV * T18.X, T16.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T28.Y, T16.X, literal.x,
+; EG-NEXT:     MOV T27.Z, T17.Y,
+; EG-NEXT:     ASHR * T17.W, T15.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T28.X, T16.X,
+; EG-NEXT:     ASHR T17.Y, T15.Z, literal.x,
+; EG-NEXT:     MOV T18.Z, T16.W,
+; EG-NEXT:     ASHR T29.W, T15.Y, literal.x,
+; EG-NEXT:     MOV * T17.X, T15.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T29.Y, T15.X, literal.x,
+; EG-NEXT:     MOV T28.Z, T16.Y,
+; EG-NEXT:     ASHR * T16.W, T14.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T29.X, T15.X,
+; EG-NEXT:     ASHR T16.Y, T14.Z, literal.x,
+; EG-NEXT:     MOV T17.Z, T15.W,
+; EG-NEXT:     ASHR T30.W, T14.Y, literal.x,
+; EG-NEXT:     MOV * T16.X, T14.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T30.Y, T14.X, literal.x,
+; EG-NEXT:     MOV T29.Z, T15.Y,
+; EG-NEXT:     ASHR * T15.W, T13.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T30.X, T14.X,
+; EG-NEXT:     ASHR T15.Y, T13.Z, literal.x,
+; EG-NEXT:     MOV T16.Z, T14.W,
+; EG-NEXT:     ASHR T31.W, T13.Y, literal.x,
+; EG-NEXT:     MOV * T15.X, T13.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T31.Y, T13.X, literal.x,
+; EG-NEXT:     MOV T30.Z, T14.Y,
+; EG-NEXT:     ASHR * T14.W, T12.W, literal.x,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T31.X, T13.X,
+; EG-NEXT:     ASHR T14.Y, T12.Z, literal.x,
+; EG-NEXT:     MOV T15.Z, T13.W,
+; EG-NEXT:     ASHR T32.W, T12.Y, literal.x,
+; EG-NEXT:     MOV * T14.X, T12.Z,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T32.Y, T12.X, literal.x,
+; EG-NEXT:     MOV * T31.Z, T13.Y,
+; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT:     MOV T32.X, T12.X,
+; EG-NEXT:     MOV T14.Z, T12.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
+; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T32.Z, T12.Y,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
+; GCN-GFX900-HSA:       ; %bb.0:
+; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; GCN-GFX900-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-GFX900-HSA-NEXT:    s_add_u32 s8, s8, s7
+; GCN-GFX900-HSA-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-GFX900-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v2
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v25, v2
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v27, v3
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v32, 31, v7
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v30, 31, v6
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v5
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v34, 31, v4
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v33, v4
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v35, v5
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v29, v6
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v31, v7
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-GFX900-HSA-NEXT:    buffer_store_dword v25, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT:    buffer_store_dword v26, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT:    buffer_store_dword v27, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT:    buffer_store_dword v28, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v12
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v11
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v10
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v9
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v37, v9
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v39, v10
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v25, v11
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v27, v12
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v15
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v44, 31, v14
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v13
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v41, v13
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v43, v14
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v9, v15
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v11, v16
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v20
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v19
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v48, 31, v18
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v46, 31, v17
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v45, v17
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v47, v18
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v13, v19
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v15, v20
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v24
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v23
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v56, 31, v22
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v54, 31, v21
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v53, v21
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v55, v22
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v17, v23
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v19, v24
+; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3]
+; GCN-GFX900-HSA-NEXT:    s_nop 0
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
+; GCN-GFX900-HSA-NEXT:    buffer_load_dword v32, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT:    s_nop 0
+; GCN-GFX900-HSA-NEXT:    buffer_load_dword v33, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT:    buffer_load_dword v34, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT:    buffer_load_dword v35, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v60, 31, v52
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v58, 31, v51
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v50
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v49
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v0, v49
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v2, v50
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v57, v51
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v59, v52
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v24
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v23
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v22
+; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v21
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v4, v21
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v6, v22
+; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v28, v23
+; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v30, v24
+; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
+; GCN-GFX900-HSA-NEXT:    s_endpgm
+;
+; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64:
+; GCN-GFX908-HSA:       ; %bb.0:
+; GCN-GFX908-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-GFX908-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(6)
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v25, v2
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v2
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v27, v3
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a0, v25
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a1, v26
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a2, v27
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a3, v28
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(4)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v12
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v11
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v10
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v9
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v37, v9
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v39, v10
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v25, v11
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v27, v12
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v15
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v44, 31, v14
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v13
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v41, v13
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v43, v14
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v9, v15
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v11, v16
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(2)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v20
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v19
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v48, 31, v18
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v46, 31, v17
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v45, v17
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v47, v18
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v13, v19
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v15, v20
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v24
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v23
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v56, 31, v22
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v54, 31, v21
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v53, v21
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v55, v22
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v17, v23
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v19, v24
+; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3]
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v32, 31, v7
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v5
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v34, 31, v4
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v33, v4
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v35, v5
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v30, 31, v6
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v29, v6
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v31, v7
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v35, a3
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v4, v0
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v34, a2
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v33, a1
+; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v32, a0
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v60, 31, v52
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v58, 31, v51
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v50
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v49
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v0, v49
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v2, v50
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v57, v51
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v59, v52
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
+; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(3)
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v24
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v23
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v22
+; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v21
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v4, v21
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v6, v22
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v28, v23
+; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v30, v24
+; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
+; GCN-GFX908-HSA-NEXT:    s_endpgm
   %ld = load <32 x i32>, ptr addrspace(1) %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_zextload_v32i32_to_v32i64:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
+; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
+; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v8
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v9
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v32
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v33
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v34
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v35
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v28
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v29
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v30
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v31
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v24
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v25
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v26
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v27
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v20
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v21
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v22
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v23
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v16
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v17
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v12
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v13
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v14
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v15
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s8, s2, 48
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
+; GCNX3-HSA-NEXT:    s_addc_u32 s9, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s10, s2, 64
+; GCNX3-HSA-NEXT:    s_addc_u32 s11, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s12, s2, 0x50
+; GCNX3-HSA-NEXT:    s_addc_u32 s13, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s14, s2, 0x60
+; GCNX3-HSA-NEXT:    s_addc_u32 s15, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 0x70
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s15
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s13
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s11
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s9
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v28
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v29
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v30
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v31
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v33
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v34
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v35
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v35, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x80
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v24
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v25
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x90
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
+; GCNX3-HSA-NEXT:    s_nop 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v26
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v27
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v27, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s3
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(10)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v20
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v21
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v22
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v23
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(11)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v16
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v17
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
+; GCNX3-HSA-NEXT:    s_nop 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v18
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v19
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v12
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v13
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v14
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v15
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v9
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v29, 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v31, v29
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v1
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v2
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v3
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(8)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v4
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v5
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:192
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v6
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v7
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(9)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v10
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v11
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(10)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v12
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v13
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:128
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v14
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v15
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(11)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v16
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v17
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v18
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v19
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(12)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v20
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v21
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v22
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v23
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(13)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v24
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v25
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v26
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v27
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(14)
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v32
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v33
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
+; GCNX3-NOHSA-NEXT:    s_nop 0
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v34
+; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v35
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_zextload_v32i32_to_v32i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 2 @22
+; EG-NEXT:    ALU 10, @39, KC0[], KC1[]
+; EG-NEXT:    TEX 4 @28
+; EG-NEXT:    ALU 100, @50, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T31.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T30.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T29.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T28.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T27.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T26.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T13.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T11.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T10.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 22:
+; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 96, #1
+; EG-NEXT:    Fetch clause starting at 28:
+; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T11.XYZW, T0.X, 16, #1
+; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T13.XYZW, T0.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 64, #1
+; EG-NEXT:    ALU clause starting at 38:
+; EG-NEXT:     MOV * T0.X, KC0[2].Z,
+; EG-NEXT:    ALU clause starting at 39:
+; EG-NEXT:     MOV T4.X, T1.X,
+; EG-NEXT:     MOV T4.Y, 0.0,
+; EG-NEXT:     MOV * T5.X, T1.Z,
+; EG-NEXT:     MOV * T5.Y, 0.0,
+; EG-NEXT:     MOV T6.X, T3.X,
+; EG-NEXT:     MOV T6.Y, 0.0,
+; EG-NEXT:     MOV * T7.X, T3.Z,
+; EG-NEXT:     MOV * T7.Y, 0.0,
+; EG-NEXT:     MOV T8.X, T2.X,
+; EG-NEXT:     MOV T8.Y, 0.0,
+; EG-NEXT:     MOV * T9.X, T2.Z,
+; EG-NEXT:    ALU clause starting at 50:
+; EG-NEXT:     MOV * T9.Y, 0.0,
+; EG-NEXT:     MOV T14.X, T0.X,
+; EG-NEXT:     MOV T14.Y, 0.0,
+; EG-NEXT:     MOV * T15.X, T0.Z,
+; EG-NEXT:     MOV * T15.Y, 0.0,
+; EG-NEXT:     MOV T16.X, T13.X,
+; EG-NEXT:     MOV T16.Y, 0.0,
+; EG-NEXT:     MOV * T17.X, T13.Z,
+; EG-NEXT:     MOV * T17.Y, 0.0,
+; EG-NEXT:     MOV T18.X, T12.X,
+; EG-NEXT:     MOV T18.Y, 0.0,
+; EG-NEXT:     MOV * T19.X, T12.Z,
+; EG-NEXT:     MOV * T19.Y, 0.0,
+; EG-NEXT:     MOV T20.X, T11.X,
+; EG-NEXT:     MOV T20.Y, 0.0,
+; EG-NEXT:     MOV * T21.X, T11.Z,
+; EG-NEXT:     MOV * T21.Y, 0.0,
+; EG-NEXT:     MOV T22.X, T10.X,
+; EG-NEXT:     MOV T22.Y, 0.0,
+; EG-NEXT:     MOV * T23.X, T10.Z,
+; EG-NEXT:     MOV T23.Y, 0.0,
+; EG-NEXT:     MOV T4.Z, T1.Y,
+; EG-NEXT:     MOV T4.W, 0.0,
+; EG-NEXT:     MOV * T5.Z, T1.W,
+; EG-NEXT:     MOV * T5.W, 0.0,
+; EG-NEXT:     MOV T6.Z, T3.Y,
+; EG-NEXT:     MOV T6.W, 0.0,
+; EG-NEXT:     MOV * T7.Z, T3.W,
+; EG-NEXT:     MOV * T7.W, 0.0,
+; EG-NEXT:     MOV T8.Z, T2.Y,
+; EG-NEXT:     MOV T8.W, 0.0,
+; EG-NEXT:     MOV * T9.Z, T2.W,
+; EG-NEXT:     MOV * T9.W, 0.0,
+; EG-NEXT:     MOV T14.Z, T0.Y,
+; EG-NEXT:     MOV T14.W, 0.0,
+; EG-NEXT:     MOV * T15.Z, T0.W,
+; EG-NEXT:     MOV * T15.W, 0.0,
+; EG-NEXT:     MOV T16.Z, T13.Y,
+; EG-NEXT:     MOV T16.W, 0.0,
+; EG-NEXT:     MOV * T17.Z, T13.W,
+; EG-NEXT:     MOV * T17.W, 0.0,
+; EG-NEXT:     MOV T18.Z, T12.Y,
+; EG-NEXT:     MOV T18.W, 0.0,
+; EG-NEXT:     MOV * T19.Z, T12.W,
+; EG-NEXT:     MOV * T19.W, 0.0,
+; EG-NEXT:     MOV T20.Z, T11.Y,
+; EG-NEXT:     MOV T20.W, 0.0,
+; EG-NEXT:     MOV * T21.Z, T11.W,
+; EG-NEXT:     MOV * T21.W, 0.0,
+; EG-NEXT:     MOV T22.Z, T10.Y,
+; EG-NEXT:     MOV T22.W, 0.0,
+; EG-NEXT:     MOV * T23.Z, T10.W,
+; EG-NEXT:     MOV T23.W, 0.0,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PS, literal.x,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
+; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
+; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
+; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
+; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
+; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
+; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
+; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
+; EG-NEXT:     LSHR * T31.X, PV.W, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3] offset:112
+; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v1, s[2:3] offset:96
+; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v1, s[2:3] offset:80
+; GCN-HSA-NEXT:    global_load_dwordx4 v[16:19], v1, s[2:3] offset:64
+; GCN-HSA-NEXT:    global_load_dwordx4 v[20:23], v1, s[2:3] offset:48
+; GCN-HSA-NEXT:    global_load_dwordx4 v[24:27], v1, s[2:3] offset:32
+; GCN-HSA-NEXT:    global_load_dwordx4 v[28:31], v1, s[2:3] offset:16
+; GCN-HSA-NEXT:    global_load_dwordx4 v[32:35], v1, s[2:3]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:224
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:240
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v9
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:192
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:208
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v13
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:160
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v15
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:176
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v17
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:128
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v19
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:144
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v21
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v22
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v23
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v24
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v25
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v27
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(13)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v28
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v29
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v30
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v31
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(14)
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v33
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v35
+; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <32 x i32>, ptr addrspace(1) %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_load_v32i32:
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
-
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-; GCN-HSA: {{flat|global}}_load_dwordx4
-
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-; GCN-NOHSA-DAG: buffer_store_dwordx4
-
-; GCN-NOT: accvgpr
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
-; GCN-HSA-DAG: {{flat|global}}_store_dwordx4
 define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-NOHSA-LABEL: global_load_v32i32:
+; SI-NOHSA:       ; %bb.0:
+; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
+; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
+; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
+; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
+; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
+; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
+; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
+; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NOHSA-NEXT:    s_endpgm
+;
+; GCNX3-HSA-LABEL: global_load_v32i32:
+; GCNX3-HSA:       ; %bb.0:
+; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s5
+; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s4
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x50
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s4
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 64
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s4
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x70
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s6
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s7
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 0x60
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s1
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s0
+; GCNX3-HSA-NEXT:    s_add_u32 s4, s0, 0x70
+; GCNX3-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
+; GCNX3-HSA-NEXT:    s_nop 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s6, s0, 0x50
+; GCNX3-HSA-NEXT:    s_addc_u32 s7, s1, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s8, s0, 32
+; GCNX3-HSA-NEXT:    s_addc_u32 s9, s1, 0
+; GCNX3-HSA-NEXT:    s_add_u32 s10, s0, 48
+; GCNX3-HSA-NEXT:    s_addc_u32 s11, s1, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s10
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s11
+; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 16
+; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GCNX3-HSA-NEXT:    s_nop 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, s8
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, s9
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[8:11]
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, s6
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s4
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, s7
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s1
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s5
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s0
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[12:15]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[20:23]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
+; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[28:31]
+; GCNX3-HSA-NEXT:    s_endpgm
+;
+; GCNX3-NOHSA-LABEL: global_load_v32i32:
+; GCNX3-NOHSA:       ; %bb.0:
+; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
+; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
+; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
+; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: global_load_v32i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 23, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    TEX 7 @12
+; EG-NEXT:    ALU 1, @52, KC0[], KC1[]
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T3.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T2.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T1.X, 0
+; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
+; EG-NEXT:    CF_END
+; EG-NEXT:    Fetch clause starting at 12:
+; EG-NEXT:     VTX_READ_128 T8.XYZW, T7.X, 96, #1
+; EG-NEXT:     VTX_READ_128 T9.XYZW, T7.X, 112, #1
+; EG-NEXT:     VTX_READ_128 T10.XYZW, T7.X, 64, #1
+; EG-NEXT:     VTX_READ_128 T11.XYZW, T7.X, 80, #1
+; EG-NEXT:     VTX_READ_128 T12.XYZW, T7.X, 32, #1
+; EG-NEXT:     VTX_READ_128 T13.XYZW, T7.X, 48, #1
+; EG-NEXT:     VTX_READ_128 T14.XYZW, T7.X, 0, #1
+; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 16, #1
+; EG-NEXT:    ALU clause starting at 28:
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
+; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
+; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
+; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
+; EG-NEXT:     MOV * T7.X, KC0[2].Z,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT:    ALU clause starting at 52:
+; EG-NEXT:     LSHR * T15.X, T0.W, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-HSA-LABEL: global_load_v32i32:
+; GCN-HSA:       ; %bb.0:
+; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, 0
+; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3] offset:96
+; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:112
+; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:64
+; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:80
+; GCN-HSA-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:32
+; GCN-HSA-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:48
+; GCN-HSA-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3]
+; GCN-HSA-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:16
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
+; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:16
+; GCN-HSA-NEXT:    s_endpgm
   %ld = load <32 x i32>, ptr addrspace(1) %in
   store <32 x i32> %ld, ptr addrspace(1) %out
   ret void

From bb98227db19ae4d80af7a25a9423aae2aeaec61d Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Fri, 1 Dec 2023 10:30:15 +0100
Subject: [PATCH 16/72] [libc][NFC] Remove named_pair (#73952)

`named_pair` does not provide enough value to deserve its own header.
---
 libc/src/__support/CMakeLists.txt              |  8 --------
 libc/src/__support/math_extras.h               | 11 ++++++++---
 libc/src/__support/named_pair.h                | 18 ------------------
 libc/src/__support/number_pair.h               |  6 ++++--
 .../llvm-project-overlay/libc/BUILD.bazel      |  7 -------
 5 files changed, 12 insertions(+), 38 deletions(-)
 delete mode 100644 libc/src/__support/named_pair.h

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index decd6ed2dbd2bd..ba80965b5aaaf3 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -10,12 +10,6 @@ add_header_library(
     libc.src.__support.CPP.new
 )
 
-add_header_library(
-  named_pair
-  HDRS
-    named_pair.h
-)
-
 add_header_library(
   common
   HDRS
@@ -40,7 +34,6 @@ add_header_library(
   HDRS
     math_extras.h
   DEPENDS
-    .named_pair
     libc.src.__support.CPP.type_traits
     libc.src.__support.macros.attributes
     libc.src.__support.macros.config
@@ -187,7 +180,6 @@ add_header_library(
   HDRS
     number_pair.h
   DEPENDS
-    .named_pair
     libc.src.__support.CPP.type_traits
 )
 
diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h
index cc22aa49d02601..860cdda8586d1e 100644
--- a/libc/src/__support/math_extras.h
+++ b/libc/src/__support/math_extras.h
@@ -10,7 +10,6 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H
 #define LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H
 
-#include "named_pair.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
@@ -18,7 +17,10 @@
 namespace LIBC_NAMESPACE {
 
 // Add with carry
-DEFINE_NAMED_PAIR_TEMPLATE(SumCarry, sum, carry);
+template <typename T> struct SumCarry {
+  T sum;
+  T carry;
+};
 
 // This version is always valid for constexpr.
 template <typename T>
@@ -91,7 +93,10 @@ add_with_carry<unsigned long long>(unsigned long long a, unsigned long long b,
 #endif // LIBC_HAS_BUILTIN(__builtin_addc)
 
 // Subtract with borrow
-DEFINE_NAMED_PAIR_TEMPLATE(DiffBorrow, diff, borrow);
+template <typename T> struct DiffBorrow {
+  T diff;
+  T borrow;
+};
 
 // This version is always valid for constexpr.
 template <typename T>
diff --git a/libc/src/__support/named_pair.h b/libc/src/__support/named_pair.h
deleted file mode 100644
index bd7dccf9810c7f..00000000000000
--- a/libc/src/__support/named_pair.h
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- Utilities for pairs of numbers. -------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_NAMED_PAIR_H
-#define LLVM_LIBC_SRC___SUPPORT_NAMED_PAIR_H
-
-#define DEFINE_NAMED_PAIR_TEMPLATE(Name, FirstField, SecondField)              \
-  template <typename T1, typename T2 = T1> struct Name {                       \
-    T1 FirstField;                                                             \
-    T2 SecondField;                                                            \
-  }
-
-#endif // LLVM_LIBC_SRC___SUPPORT_NAMED_PAIR_H
diff --git a/libc/src/__support/number_pair.h b/libc/src/__support/number_pair.h
index 5e553d817994b4..12e730836af2c6 100644
--- a/libc/src/__support/number_pair.h
+++ b/libc/src/__support/number_pair.h
@@ -10,13 +10,15 @@
 #define LLVM_LIBC_SRC___SUPPORT_NUMBER_PAIR_H
 
 #include "CPP/type_traits.h"
-#include "named_pair.h"
 
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE {
 
-DEFINE_NAMED_PAIR_TEMPLATE(NumberPair, lo, hi);
+template <typename T> struct NumberPair {
+  T lo;
+  T hi;
+};
 
 template <typename T>
 cpp::enable_if_t<cpp::is_integral_v<T> && cpp::is_unsigned_v<T>, NumberPair<T>>
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index d53ca202101537..46d81987e7b32f 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -432,7 +432,6 @@ libc_support_library(
     hdrs = ["src/__support/number_pair.h"],
     deps = [
         ":__support_cpp_type_traits",
-        ":__support_named_pair",
     ],
 )
 
@@ -587,11 +586,6 @@ libc_support_library(
     ],
 )
 
-libc_support_library(
-    name = "__support_named_pair",
-    hdrs = ["src/__support/named_pair.h"],
-)
-
 libc_support_library(
     name = "__support_bit",
     hdrs = ["src/__support/bit.h"],
@@ -608,7 +602,6 @@ libc_support_library(
         ":__support_cpp_type_traits",
         ":__support_macros_attributes",
         ":__support_macros_config",
-        ":__support_named_pair",
     ],
 )
 

From 5fe7ae848cc6cb2afc3aab332743ffa2bb635fc3 Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.devereau@arm.com>
Date: Mon, 20 Nov 2023 10:49:27 +0000
Subject: [PATCH 17/72] [AArch64][SME2] Add ldr_zt, str_zt builtins and
 intrinsics (#72849)

Adds the builtins:
void svldr_zt(uint64_t zt, const void *rn)
void svstr_zt(uint64_t zt, void *rn)

And the intrinsics:
declare void @llvm.aarch64.sme.ldr.zt(i32, ptr)
declare void @llvm.aarch64.sme.str.zt(i32, ptr)

Patch by: Kerry McLaughlin kerry.mclaughlin@arm.com
---
 clang/include/clang/Basic/arm_sme.td          |  8 ++++
 .../acle_sme2_ldr_str_zt.c                    | 41 +++++++++++++++++++
 .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp |  7 +++-
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  7 +++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 18 ++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 +
 .../Target/AArch64/AArch64RegisterInfo.cpp    |  6 +++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  4 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    | 23 ++++++++---
 .../CodeGen/AArch64/sme2-intrinsics-zt0.ll    | 27 ++++++++++++
 10 files changed, 134 insertions(+), 9 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index d55deeaa40bbcd..7aae3c832bb1fe 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -314,3 +314,11 @@ let TargetGuard = "sme2" in {
 
   def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
 }
+
+//
+// Spill and fill of ZT0
+//
+let TargetGuard = "sme2" in {
+  def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
new file mode 100644
index 00000000000000..126a4fc1045853
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
@@ -0,0 +1,41 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+// LDR ZT0
+
+// CHECK-LABEL: @test_svldr_zt(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z13test_svldr_ztPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_shared_za __arm_preserves_za {
+  svldr_zt(0, base);
+}
+
+// STR ZT0
+
+// CHECK-LABEL: @test_svstr_zt(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z13test_svstr_ztPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svstr_zt(void *base) __arm_streaming_compatible __arm_shared_za __arm_preserves_za {
+  svstr_zt(0, base);
+}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 4c35a238d9f9e2..70987ad395f735 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
-// RUN:    -target-feature +sve2 -target-feature +sme2 -target-feature +sve -fsyntax-only -verify %s
+// RUN:    -target-feature +sve2 -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -fsyntax-only -verify %s
 
 // REQUIRES: aarch64-registered-target
 
@@ -19,3 +19,8 @@ void test_outer_product(svbool_t pred, svint16_t s16, svuint16_t u16, svint32_t
   svbmops_za32_u32_m(4, pred, pred, u32, u32); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
   svbmops_za32_s32_m(4, pred, pred, s32, s32); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
 }
+
+void test_ldr_str_zt(const void *const_base, void *base) __arm_streaming_compatible __arm_shared_za __arm_preserves_za {
+  svldr_zt(1, const_base); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  svstr_zt(1, base);       // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 136512db123b30..2f49e9a6b37cc3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -326,9 +326,14 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
     return false;
   }
 
-  template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) {
+  template <unsigned BaseReg, unsigned Max>
+  bool ImmToTile(SDValue N, SDValue &Imm) {
     if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
       uint64_t C = CI->getZExtValue();
+
+      if (C > Max)
+        return false;
+
       Imm = CurDAG->getRegister(BaseReg + C, MVT::Other);
       return true;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cb093a1613110e..4379c3fde6f3c5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2753,6 +2753,20 @@ AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
+MachineBasicBlock *AArch64TargetLowering::EmitZTSpillFill(MachineInstr &MI,
+                                                          MachineBasicBlock *BB,
+                                                          bool IsSpill) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB;
+  unsigned Opc = IsSpill ? AArch64::STR_TX : AArch64::LDR_TX;
+  auto Rs = IsSpill ? RegState::Kill : RegState::Define;
+  MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+  MIB.addReg(MI.getOperand(0).getReg(), Rs);
+  MIB.add(MI.getOperand(1)); // Base
+  MI.eraseFromParent();      // The pseudo is gone now.
+  return BB;
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
                                    MachineInstr &MI,
@@ -2869,6 +2883,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
   case AArch64::LDR_ZA_PSEUDO:
     return EmitFill(MI, BB);
+  case AArch64::LDR_TX_PSEUDO:
+    return EmitZTSpillFill(MI, BB, /*IsSpill=*/false);
+  case AArch64::STR_TX_PSEUDO:
+    return EmitZTSpillFill(MI, BB, /*IsSpill=*/true);
   case AArch64::ZERO_M_PSEUDO:
     return EmitZero(MI, BB);
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 25d7cb6d212d1f..009f8744b408a9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -623,6 +623,8 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
                                  MachineInstr &MI, MachineBasicBlock *BB,
                                  bool HasTile) const;
+  MachineBasicBlock *EmitZTSpillFill(MachineInstr &MI, MachineBasicBlock *BB,
+                                     bool IsSpill) const;
   MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ed64a7b4984c17..24ba9dd95004c6 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -440,6 +440,12 @@ AArch64RegisterInfo::getStrictlyReservedRegs(const MachineFunction &MF) const {
       Reserved.set(SubReg);
   }
 
+  if (MF.getSubtarget<AArch64Subtarget>().hasSME2()) {
+    for (MCSubRegIterator SubReg(AArch64::ZT0, this, /*self=*/true);
+         SubReg.isValid(); ++SubReg)
+      Reserved.set(*SubReg);
+  }
+
   markSuperRegs(Reserved, AArch64::FPCR);
 
   if (MF.getFunction().getCallingConv() == CallingConv::GRAAL) {
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index bb9464a8d2e1cf..fcfa5f82a3809c 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -541,8 +541,8 @@ defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops
 
 def ZERO_T : sme2_zero_zt<"zero", 0b0001>;
 
-def LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100>;
-def STR_TX : sme2_spill_fill_vector<"str", 0b11111100>;
+defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, int_aarch64_sme_ldr_zt>;
+defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, int_aarch64_sme_str_zt>;
 
 def MOVT_XTI : sme2_movt_zt_to_scalar<"movt", 0b0011111>;
 def MOVT_TIX : sme2_movt_scalar_to_zt<"movt", 0b0011111>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 6c9b1f11a4decd..ef9c323e25bc35 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -10,11 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-def imm_to_tile8   : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAB0>", []>;
-def imm_to_tile16  : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAH0>", []>;
-def imm_to_tile32  : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAS0>", []>;
-def imm_to_tile64  : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAD0>", []>;
-def imm_to_tile128 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAQ0>", []>;
+def imm_to_tile8   : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAB0, 0>",  []>;
+def imm_to_tile16  : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAH0, 1>",  []>;
+def imm_to_tile32  : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAS0, 3>",  []>;
+def imm_to_tile64  : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAD0, 7>",  []>;
+def imm_to_tile128 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAQ0, 15>", []>;
+def imm_to_zt      : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZT0,  0>",  []>;
 
 def tileslice8   : ComplexPattern<i32 , 2, "SelectSMETileSlice<15, 1>", []>;
 def tileslice16  : ComplexPattern<i32 , 2, "SelectSMETileSlice<7,  1>", []>;
@@ -3137,6 +3138,18 @@ class sme2_spill_fill_vector<string mnemonic, bits<8> opc>
   let mayStore    = opc{7};
 }
 
+
+multiclass sme2_spill_fill_vector<string mnemonic, bits<8> opc, SDPatternOperator op> {
+  def NAME : sme2_spill_fill_vector<mnemonic, opc>;
+  def NAME # _PSEUDO
+      : Pseudo<(outs), (ins ZTR:$ZTt, GPR64sp:$base), []>, Sched<[]> {
+    // Translated to actual instruction in AArch64ISelLowering.cpp
+    let usesCustomInserter = 1;
+  }
+  def : Pat<(op (imm_to_zt untyped:$tile), GPR64sp:$base),
+            (!cast<Instruction>(NAME # _PSEUDO) $tile, $base)>;
+}
+
 //===----------------------------------------------------------------------===///
 // SME2 move to/from lookup table
 class sme2_movt_zt_to_scalar<string mnemonic, bits<7> opc>
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll
new file mode 100644
index 00000000000000..30205d86f2fb20
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zt0.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+; LDR
+
+define void @ldr_zt0(ptr %ptr) {
+; CHECK-LABEL: ldr_zt0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr zt0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptr)
+  ret void;
+}
+
+; STR
+
+define void @str_zt0(ptr %ptr) {
+; CHECK-LABEL: str_zt0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str zt0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.str.zt(i32 0, ptr %ptr)
+  ret void;
+}
+
+declare void @llvm.aarch64.sme.ldr.zt(i32, ptr)
+declare void @llvm.aarch64.sme.str.zt(i32, ptr)

From de55a2843fae6afd4b0589d81496096a4ff73cbd Mon Sep 17 00:00:00 2001
From: XinWang10 <108658776+XinWang10@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:39:25 +0800
Subject: [PATCH 18/72] [X86][MC] Support Enc/Dec for EGPR for promoted BMI
 instructions (#73899)

R16-R31 were added to the GPRs in
https://github.com/llvm/llvm-project/pull/70958.
This patch supports the encoding/decoding of promoted BMI instructions
in EVEX space.


RFC:
https://discourse.llvm.org/t/rfc-design-for-apx-feature-egpr-and-ndd-support/73031/4
---
 .../X86/MCTargetDesc/X86MCCodeEmitter.cpp     |   6 +-
 llvm/lib/Target/X86/X86InstrArithmetic.td     |  58 +++++--
 llvm/lib/Target/X86/X86InstrMisc.td           | 152 ++++++++++--------
 llvm/lib/Target/X86/X86InstrShiftRotate.td    |  59 ++++---
 llvm/test/MC/Disassembler/X86/apx/andn.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/bextr.txt   |  18 +++
 llvm/test/MC/Disassembler/X86/apx/blsi.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/blsmsk.txt  |  18 +++
 llvm/test/MC/Disassembler/X86/apx/blsr.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/bzhi.txt    |  18 +++
 .../MC/Disassembler/X86/apx/evex-format.txt   |  12 ++
 llvm/test/MC/Disassembler/X86/apx/mulx.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/pdep.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/pext.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/rorx.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/sarx.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/shlx.txt    |  18 +++
 llvm/test/MC/Disassembler/X86/apx/shrx.txt    |  18 +++
 llvm/test/MC/X86/apx/andn-att.s               |  20 +++
 llvm/test/MC/X86/apx/andn-intel.s             |  17 ++
 llvm/test/MC/X86/apx/bextr-att.s              |  20 +++
 llvm/test/MC/X86/apx/bextr-intel.s            |  17 ++
 llvm/test/MC/X86/apx/blsi-att.s               |  20 +++
 llvm/test/MC/X86/apx/blsi-intel.s             |  17 ++
 llvm/test/MC/X86/apx/blsmsk-att.s             |  20 +++
 llvm/test/MC/X86/apx/blsmsk-intel.s           |  17 ++
 llvm/test/MC/X86/apx/blsr-att.s               |  20 +++
 llvm/test/MC/X86/apx/blsr-intel.s             |  17 ++
 llvm/test/MC/X86/apx/bzhi-att.s               |  20 +++
 llvm/test/MC/X86/apx/bzhi-intel.s             |  17 ++
 llvm/test/MC/X86/apx/evex-format-att.s        |  12 ++
 llvm/test/MC/X86/apx/evex-format-intel.s      |  12 ++
 llvm/test/MC/X86/apx/mulx-att.s               |  20 +++
 llvm/test/MC/X86/apx/mulx-intel.s             |  17 ++
 llvm/test/MC/X86/apx/pdep-att.s               |  20 +++
 llvm/test/MC/X86/apx/pdep-intel.s             |  17 ++
 llvm/test/MC/X86/apx/pext-att.s               |  20 +++
 llvm/test/MC/X86/apx/pext-intel.s             |  17 ++
 llvm/test/MC/X86/apx/rorx-att.s               |  20 +++
 llvm/test/MC/X86/apx/rorx-intel.s             |  17 ++
 llvm/test/MC/X86/apx/sarx-att.s               |  20 +++
 llvm/test/MC/X86/apx/sarx-intel.s             |  17 ++
 llvm/test/MC/X86/apx/shlx-att.s               |  20 +++
 llvm/test/MC/X86/apx/shlx-intel.s             |  17 ++
 llvm/test/MC/X86/apx/shrx-att.s               |  20 +++
 llvm/test/MC/X86/apx/shrx-intel.s             |  17 ++
 llvm/test/TableGen/x86-fold-tables.inc        |  26 +++
 47 files changed, 942 insertions(+), 110 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/andn.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/bextr.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/blsi.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/blsmsk.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/blsr.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/bzhi.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/mulx.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/pdep.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/pext.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/rorx.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/sarx.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/shlx.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/shrx.txt
 create mode 100644 llvm/test/MC/X86/apx/andn-att.s
 create mode 100644 llvm/test/MC/X86/apx/andn-intel.s
 create mode 100644 llvm/test/MC/X86/apx/bextr-att.s
 create mode 100644 llvm/test/MC/X86/apx/bextr-intel.s
 create mode 100644 llvm/test/MC/X86/apx/blsi-att.s
 create mode 100644 llvm/test/MC/X86/apx/blsi-intel.s
 create mode 100644 llvm/test/MC/X86/apx/blsmsk-att.s
 create mode 100644 llvm/test/MC/X86/apx/blsmsk-intel.s
 create mode 100644 llvm/test/MC/X86/apx/blsr-att.s
 create mode 100644 llvm/test/MC/X86/apx/blsr-intel.s
 create mode 100644 llvm/test/MC/X86/apx/bzhi-att.s
 create mode 100644 llvm/test/MC/X86/apx/bzhi-intel.s
 create mode 100644 llvm/test/MC/X86/apx/mulx-att.s
 create mode 100644 llvm/test/MC/X86/apx/mulx-intel.s
 create mode 100644 llvm/test/MC/X86/apx/pdep-att.s
 create mode 100644 llvm/test/MC/X86/apx/pdep-intel.s
 create mode 100644 llvm/test/MC/X86/apx/pext-att.s
 create mode 100644 llvm/test/MC/X86/apx/pext-intel.s
 create mode 100644 llvm/test/MC/X86/apx/rorx-att.s
 create mode 100644 llvm/test/MC/X86/apx/rorx-intel.s
 create mode 100644 llvm/test/MC/X86/apx/sarx-att.s
 create mode 100644 llvm/test/MC/X86/apx/sarx-intel.s
 create mode 100644 llvm/test/MC/X86/apx/shlx-att.s
 create mode 100644 llvm/test/MC/X86/apx/shlx-intel.s
 create mode 100644 llvm/test/MC/X86/apx/shrx-att.s
 create mode 100644 llvm/test/MC/X86/apx/shrx-intel.s

diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 1f130c22298ed4..b6ebbcf56aef73 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1115,10 +1115,10 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
   case X86II::MRMSrcMem4VOp3: {
     // Instruction format for 4VOp3:
     //   src1(ModR/M), MemAddr, src3(VEX_4V)
-    Prefix.setR(MI, CurOp++);
+    Prefix.setRR2(MI, CurOp++);
     Prefix.setBB2(MI, MemOperand + X86::AddrBaseReg);
     Prefix.setXX2(MI, MemOperand + X86::AddrIndexReg);
-    Prefix.set4V(MI, CurOp + X86::AddrNumOperands);
+    Prefix.set4VV2(MI, CurOp + X86::AddrNumOperands);
     break;
   }
   case X86II::MRMSrcMemOp4: {
@@ -1189,7 +1189,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     //   src1(ModR/M), src2(ModR/M), src3(VEX_4V)
     Prefix.setRR2(MI, CurOp++);
     Prefix.setBB2(MI, CurOp++);
-    Prefix.set4V(MI, CurOp++);
+    Prefix.set4VV2(MI, CurOp++);
     break;
   }
   case X86II::MRMSrcRegOp4: {
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 48188da291ded0..56cbc13eaaec8d 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -1289,21 +1289,34 @@ def : Pat<(X86testpat (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
 //
 multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
                     PatFrag ld_frag, X86FoldableSchedWrite sched> {
+let Predicates = [HasBMI, NoEGPR] in {
   def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
-            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-            [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
-            Sched<[sched]>;
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
+           VEX_4V, Sched<[sched]>;
   def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
-            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-            [(set RC:$dst, EFLAGS,
-             (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
-           Sched<[sched.Folded, sched.ReadAfterFold]>;
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, EFLAGS,
+              (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
+           VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+let Predicates = [HasBMI, HasEGPR, In64BitMode] in {
+  def rr_EVEX : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                  !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
+                EVEX_4V, Sched<[sched]>;
+  def rm_EVEX : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                  !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set RC:$dst, EFLAGS,
+                   (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
+                EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
 }
 
 // Complexity is reduced to give and with immediate a chance to match first.
-let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in {
-  defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS, VEX_4V;
-  defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, VEX_4V, REX_W;
+let Defs = [EFLAGS], AddedComplexity = -6 in {
+  defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS;
+  defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, REX_W;
 }
 
 let Predicates = [HasBMI], AddedComplexity = -6 in {
@@ -1323,6 +1336,7 @@ let Predicates = [HasBMI], AddedComplexity = -6 in {
 multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
                     X86FoldableSchedWrite sched> {
 let hasSideEffects = 0 in {
+let Predicates = [HasBMI2, NoEGPR] in {
   def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
              []>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>;
@@ -1346,15 +1360,27 @@ let hasSideEffects = 0 in {
   def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
                     []>, Sched<[sched.Folded]>;
 }
+let Predicates = [HasBMI2, HasEGPR, In64BitMode] in
+  def rr#_EVEX : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+                   !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+                   []>, T8XD, EVEX_4V, Sched<[WriteIMulH, sched]>;
+let Predicates = [HasBMI2, HasEGPR, In64BitMode], mayLoad = 1 in
+  def rm#_EVEX : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+                   !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+                   []>, T8XD, EVEX_4V,
+                 Sched<[WriteIMulHLd, sched.Folded,
+                        // Memory operand.
+                        ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                        // Implicit read of EDX/RDX
+                        sched.ReadAfterFold]>;
 }
-
-let Predicates = [HasBMI2] in {
-  let Uses = [EDX] in
-    defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteMULX32>;
-  let Uses = [RDX] in
-    defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteMULX64>, REX_W;
 }
 
+let Uses = [EDX] in
+  defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteMULX32>;
+let Uses = [RDX] in
+  defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteMULX64>, REX_W;
+
 //===----------------------------------------------------------------------===//
 // ADCX and ADOX Instructions
 //
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 32aa82fc93ca30..764d4bd6da2a1d 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -1214,19 +1214,19 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
 
 multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
                   RegisterClass RC, X86MemOperand x86memop,
-                  X86FoldableSchedWrite sched> {
+                  X86FoldableSchedWrite sched, string Suffix = ""> {
 let hasSideEffects = 0 in {
-  def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
-             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
-             T8PS, VEX_4V, Sched<[sched]>;
+  def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+                  T8PS, VEX_4V, Sched<[sched]>;
   let mayLoad = 1 in
-  def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
-             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
-             T8PS, VEX_4V, Sched<[sched.Folded]>;
+  def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
+                  T8PS, VEX_4V, Sched<[sched.Folded]>;
 }
 }
 
-let Predicates = [HasBMI], Defs = [EFLAGS] in {
+let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in {
   defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>;
   defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, REX_W;
   defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>;
@@ -1235,6 +1235,15 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
   defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, REX_W;
 }
 
+let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in {
+  defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX;
+  defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX;
+  defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX;
+  defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX;
+  defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX;
+  defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX;
+}
+
 //===----------------------------------------------------------------------===//
 // Pattern fragments to auto generate BMI instructions.
 //===----------------------------------------------------------------------===//
@@ -1292,56 +1301,50 @@ let Predicates = [HasBMI] in {
             (BLSI64rr GR64:$src)>;
 }
 
-multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
-                     X86MemOperand x86memop, SDNode OpNode,
-                     PatFrag ld_frag, X86FoldableSchedWrite Sched> {
-  def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
-             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
-             T8PS, VEX, Sched<[Sched]>;
-  def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
-             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
-              (implicit EFLAGS)]>, T8PS, VEX,
-             Sched<[Sched.Folded,
-                    // x86memop:$src1
-                    ReadDefault, ReadDefault, ReadDefault, ReadDefault,
-                    ReadDefault,
-                    // RC:$src2
-                    Sched.ReadAfterFold]>;
+multiclass bmi4VOp3_base<bits<8> opc, string mnemonic, RegisterClass RC,
+                         X86MemOperand x86memop, SDPatternOperator OpNode,
+                         PatFrag ld_frag, X86FoldableSchedWrite Sched,
+                         string Suffix = ""> {
+  def rr#Suffix : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+                  T8PS, VEX, Sched<[Sched]>;
+let mayLoad = 1 in
+  def rm#Suffix : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
+                     (implicit EFLAGS)]>, T8PS, VEX,
+                  Sched<[Sched.Folded,
+                         // x86memop:$src1
+                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                         ReadDefault,
+                         // RC:$src2
+                         Sched.ReadAfterFold]>;
 }
 
-let Predicates = [HasBMI], Defs = [EFLAGS] in {
-  defm BEXTR32 : bmi_bextr<0xF7, "bextr{l}", GR32, i32mem,
-                           X86bextr, loadi32, WriteBEXTR>;
-  defm BEXTR64 : bmi_bextr<0xF7, "bextr{q}", GR64, i64mem,
-                           X86bextr, loadi64, WriteBEXTR>, REX_W;
-}
-
-multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
-                    X86MemOperand x86memop, SDNode Int,
-                    PatFrag ld_frag, X86FoldableSchedWrite Sched> {
-  def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
-             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
-             T8PS, VEX, Sched<[Sched]>;
-  def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
-             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
-              (implicit EFLAGS)]>, T8PS, VEX,
-             Sched<[Sched.Folded,
-                    // x86memop:$src1
-                    ReadDefault, ReadDefault, ReadDefault, ReadDefault,
-                    ReadDefault,
-                    // RC:$src2
-                    Sched.ReadAfterFold]>;
-}
-
-let Predicates = [HasBMI2], Defs = [EFLAGS] in {
-  defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
-                         X86bzhi, loadi32, WriteBZHI>;
-  defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
-                         X86bzhi, loadi64, WriteBZHI>, REX_W;
+let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in {
+  defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem,
+                               X86bextr, loadi32, WriteBEXTR>;
+  defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem,
+                               X86bextr, loadi64, WriteBEXTR>, REX_W;
+}
+let Predicates = [HasBMI2, NoEGPR], Defs = [EFLAGS] in {
+  defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem,
+                              X86bzhi, loadi32, WriteBZHI>;
+  defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem,
+                              X86bzhi, loadi64, WriteBZHI>, REX_W;
+}
+let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in {
+  defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem,
+                               X86bextr, loadi32, WriteBEXTR, "_EVEX">, EVEX;
+  defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem,
+                               X86bextr, loadi64, WriteBEXTR, "_EVEX">, EVEX, REX_W;
+}
+let Predicates = [HasBMI2, HasEGPR], Defs = [EFLAGS] in {
+  defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem,
+                              X86bzhi, loadi32, WriteBZHI, "_EVEX">, EVEX;
+  defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem,
+                              X86bzhi, loadi64, WriteBZHI, "_EVEX">, EVEX, REX_W;
 }
 
 def CountTrailingOnes : SDNodeXForm<imm, [{
@@ -1383,19 +1386,19 @@ let Predicates = [HasBMI2, NoTBM] in {
 }
 
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
-                         X86MemOperand x86memop, SDNode OpNode,
-                         PatFrag ld_frag> {
-  def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
-             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
-             VEX_4V, Sched<[WriteALU]>;
-  def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
-             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
-             VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
-}
-
-let Predicates = [HasBMI2] in {
+                         X86MemOperand x86memop, SDPatternOperator OpNode,
+                         PatFrag ld_frag, string Suffix = ""> {
+  def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
+                  VEX_4V, Sched<[WriteALU]>;
+  def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
+                  VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+}
+
+let Predicates = [HasBMI2, NoEGPR] in {
   defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
                                X86pdep, loadi32>, T8XD;
   defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
@@ -1406,6 +1409,17 @@ let Predicates = [HasBMI2] in {
                                X86pext, loadi64>, T8XS, REX_W;
 }
 
+let Predicates = [HasBMI2, HasEGPR] in {
+  defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+                               X86pdep, loadi32, "_EVEX">, T8XD, EVEX;
+  defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+                               X86pdep, loadi64, "_EVEX">, T8XD, REX_W, EVEX;
+  defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+                               X86pext, loadi32, "_EVEX">, T8XS, EVEX;
+  defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+                               X86pext, loadi64, "_EVEX">, T8XS, REX_W, EVEX;
+}
+
 //===----------------------------------------------------------------------===//
 // Lightweight Profiling Instructions
 
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index e416e4495e2277..48bf23f8cbf7b2 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -824,38 +824,40 @@ def ROT64L2R_imm8  : SDNodeXForm<imm, [{
 
 // NOTE: We use WriteShift for these rotates as they avoid the stalls
 // of many of the older x86 rotate instructions.
-multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop,
+                      string Suffix = ""> {
 let hasSideEffects = 0 in {
-  def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
-               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-               []>, TAXD, VEX, Sched<[WriteShift]>;
+  def ri#Suffix : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+                  TAXD, VEX, Sched<[WriteShift]>;
   let mayLoad = 1 in
-  def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
-               (ins x86memop:$src1, u8imm:$src2),
-               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-               []>, TAXD, VEX, Sched<[WriteShiftLd]>;
+  def mi#Suffix : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
+                      (ins x86memop:$src1, u8imm:$src2),
+                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+                  TAXD, VEX, Sched<[WriteShiftLd]>;
 }
 }
 
-multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop,
+                     string Suffix = ""> {
 let hasSideEffects = 0 in {
-  def rr : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
-             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
-             VEX, Sched<[WriteShift]>;
+  def rr#Suffix : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                    !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+                    VEX, Sched<[WriteShift]>;
   let mayLoad = 1 in
-  def rm : I<0xF7, MRMSrcMem4VOp3,
-             (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
-             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
-             VEX, Sched<[WriteShift.Folded,
-                         // x86memop:$src1
-                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
-                         ReadDefault,
-                         // RC:$src2
-                         WriteShift.ReadAfterFold]>;
+  def rm#Suffix : I<0xF7, MRMSrcMem4VOp3,
+                    (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+                    !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+                  VEX, Sched<[WriteShift.Folded,
+                              // x86memop:$src1
+                              ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                              ReadDefault,
+                              // RC:$src2
+                              WriteShift.ReadAfterFold]>;
 }
 }
 
-let Predicates = [HasBMI2] in {
+let Predicates = [HasBMI2, NoEGPR] in {
   defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
   defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, REX_W;
   defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
@@ -864,7 +866,20 @@ let Predicates = [HasBMI2] in {
   defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, REX_W;
   defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
   defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, REX_W;
+}
 
+let Predicates = [HasBMI2, HasEGPR] in {
+  defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem, "_EVEX">, EVEX;
+  defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem, "_EVEX">, REX_W, EVEX;
+  defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8XS, EVEX;
+  defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem, "_EVEX">, T8XS, REX_W, EVEX;
+  defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem, "_EVEX">, T8XD, EVEX;
+  defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem, "_EVEX">, T8XD, REX_W, EVEX;
+  defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem, "_EVEX">, T8PD, EVEX;
+  defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8PD, REX_W, EVEX;
+}
+
+let Predicates = [HasBMI2] in {
   // Prefer RORX which is non-destructive and doesn't update EFLAGS.
   let AddedComplexity = 10 in {
     def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
diff --git a/llvm/test/MC/Disassembler/X86/apx/andn.txt b/llvm/test/MC/Disassembler/X86/apx/andn.txt
new file mode 100644
index 00000000000000..8b943d2a0ac44c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/andn.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   andnl	%r18d, %r22d, %r26d
+# INTEL: andn	r26d, r22d, r18d
+0x62,0x6a,0x4c,0x00,0xf2,0xd2
+
+# ATT:   andnq	%r19, %r23, %r27
+# INTEL: andn	r27, r23, r19
+0x62,0x6a,0xc4,0x00,0xf2,0xdb
+
+# ATT:   andnl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: andn	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0x68,0x00,0xf2,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   andnq	291(%r28,%r29,4), %r19, %r23
+# INTEL: andn	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0xe0,0x00,0xf2,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/bextr.txt b/llvm/test/MC/Disassembler/X86/apx/bextr.txt
new file mode 100644
index 00000000000000..abd92864b315e3
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/bextr.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   bextrl	%r18d, %r22d, %r26d
+# INTEL: bextr	r26d, r22d, r18d
+0x62,0x6a,0x6c,0x00,0xf7,0xd6
+
+# ATT:   bextrl	%r18d, 291(%r28,%r29,4), %r22d
+# INTEL: bextr	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8a,0x68,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   bextrq	%r19, %r23, %r27
+# INTEL: bextr	r27, r23, r19
+0x62,0x6a,0xe4,0x00,0xf7,0xdf
+
+# ATT:   bextrq	%r19, 291(%r28,%r29,4), %r23
+# INTEL: bextr	r23, qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8a,0xe0,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/blsi.txt b/llvm/test/MC/Disassembler/X86/apx/blsi.txt
new file mode 100644
index 00000000000000..254ec90caea515
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/blsi.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   blsil	%r18d, %r22d
+# INTEL: blsi	r22d, r18d
+0x62,0xfa,0x4c,0x00,0xf3,0xda
+
+# ATT:   blsiq	%r19, %r23
+# INTEL: blsi	r23, r19
+0x62,0xfa,0xc4,0x00,0xf3,0xdb
+
+# ATT:   blsil	291(%r28,%r29,4), %r18d
+# INTEL: blsi	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x9a,0x68,0x00,0xf3,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   blsiq	291(%r28,%r29,4), %r19
+# INTEL: blsi	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x9a,0xe0,0x00,0xf3,0x9c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/blsmsk.txt b/llvm/test/MC/Disassembler/X86/apx/blsmsk.txt
new file mode 100644
index 00000000000000..5e47d3d3d625eb
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/blsmsk.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   blsmskl	%r18d, %r22d
+# INTEL: blsmsk	r22d, r18d
+0x62,0xfa,0x4c,0x00,0xf3,0xd2
+
+# ATT:   blsmskq	%r19, %r23
+# INTEL: blsmsk	r23, r19
+0x62,0xfa,0xc4,0x00,0xf3,0xd3
+
+# ATT:   blsmskl	291(%r28,%r29,4), %r18d
+# INTEL: blsmsk	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x9a,0x68,0x00,0xf3,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   blsmskq	291(%r28,%r29,4), %r19
+# INTEL: blsmsk	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x9a,0xe0,0x00,0xf3,0x94,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/blsr.txt b/llvm/test/MC/Disassembler/X86/apx/blsr.txt
new file mode 100644
index 00000000000000..37df4306da26ed
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/blsr.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   blsrl	%r18d, %r22d
+# INTEL: blsr	r22d, r18d
+0x62,0xfa,0x4c,0x00,0xf3,0xca
+
+# ATT:   blsrq	%r19, %r23
+# INTEL: blsr	r23, r19
+0x62,0xfa,0xc4,0x00,0xf3,0xcb
+
+# ATT:   blsrl	291(%r28,%r29,4), %r18d
+# INTEL: blsr	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x9a,0x68,0x00,0xf3,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   blsrq	291(%r28,%r29,4), %r19
+# INTEL: blsr	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x9a,0xe0,0x00,0xf3,0x8c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/bzhi.txt b/llvm/test/MC/Disassembler/X86/apx/bzhi.txt
new file mode 100644
index 00000000000000..44f496e3cc0840
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/bzhi.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   bzhil	%r18d, %r22d, %r26d
+# INTEL: bzhi	r26d, r22d, r18d
+0x62,0x6a,0x6c,0x00,0xf5,0xd6
+
+# ATT:   bzhil	%r18d, 291(%r28,%r29,4), %r22d
+# INTEL: bzhi	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8a,0x68,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   bzhiq	%r19, %r23, %r27
+# INTEL: bzhi	r27, r23, r19
+0x62,0x6a,0xe4,0x00,0xf5,0xdf
+
+# ATT:   bzhiq	%r19, 291(%r28,%r29,4), %r23
+# INTEL: bzhi	r23, qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8a,0xe0,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
index ee2c2c5bdf909c..389b22cb4a223d 100644
--- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
@@ -62,8 +62,20 @@
 # INTEL: vpslldq	zmm0, zmmword ptr [r16 + r17], 0
 0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00
 
+## MRMSrcMem4VOp3
+
+# ATT:   bzhiq	%r19, 291(%r28,%r29,4), %r23
+# INTEL: bzhi	r23, qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8a,0xe0,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00
+
 ## MRMDestReg
 
 # ATT:   vextractps	$1, %xmm16, %r16d
 # INTEL: vextractps	r16d, xmm16, 1
 0x62,0xeb,0x7d,0x08,0x17,0xc0,0x01
+
+## MRMSrcReg4VOp3
+
+# ATT:   bzhiq	%r19, %r23, %r27
+# INTEL: bzhi	r27, r23, r19
+0x62,0x6a,0xe4,0x00,0xf5,0xdf
diff --git a/llvm/test/MC/Disassembler/X86/apx/mulx.txt b/llvm/test/MC/Disassembler/X86/apx/mulx.txt
new file mode 100644
index 00000000000000..5d9b53b99a71b6
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/mulx.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   mulxl	%r18d, %r22d, %r26d
+# INTEL: mulx	r26d, r22d, r18d
+0x62,0x6a,0x4f,0x00,0xf6,0xd2
+
+# ATT:   mulxq	%r19, %r23, %r27
+# INTEL: mulx	r27, r23, r19
+0x62,0x6a,0xc7,0x00,0xf6,0xdb
+
+# ATT:   mulxl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: mulx	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   mulxq	291(%r28,%r29,4), %r19, %r23
+# INTEL: mulx	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/pdep.txt b/llvm/test/MC/Disassembler/X86/apx/pdep.txt
new file mode 100644
index 00000000000000..87268fe5e27dd8
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/pdep.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   pdepl	%r18d, %r22d, %r26d
+# INTEL: pdep	r26d, r22d, r18d
+0x62,0x6a,0x4f,0x00,0xf5,0xd2
+
+# ATT:   pdepq	%r19, %r23, %r27
+# INTEL: pdep	r27, r23, r19
+0x62,0x6a,0xc7,0x00,0xf5,0xdb
+
+# ATT:   pdepl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: pdep	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   pdepq	291(%r28,%r29,4), %r19, %r23
+# INTEL: pdep	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/pext.txt b/llvm/test/MC/Disassembler/X86/apx/pext.txt
new file mode 100644
index 00000000000000..6c5860aa812812
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/pext.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   pextl	%r18d, %r22d, %r26d
+# INTEL: pext	r26d, r22d, r18d
+0x62,0x6a,0x4e,0x00,0xf5,0xd2
+
+# ATT:   pextq	%r19, %r23, %r27
+# INTEL: pext	r27, r23, r19
+0x62,0x6a,0xc6,0x00,0xf5,0xdb
+
+# ATT:   pextl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: pext	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   pextq	291(%r28,%r29,4), %r19, %r23
+# INTEL: pext	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/rorx.txt b/llvm/test/MC/Disassembler/X86/apx/rorx.txt
new file mode 100644
index 00000000000000..9860deaea86bdd
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/rorx.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   rorxl	$123, %r18d, %r22d
+# INTEL: rorx	r22d, r18d, 123
+0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b
+
+# ATT:   rorxq	$123, %r19, %r23
+# INTEL: rorx	r23, r19, 123
+0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b
+
+# ATT:   rorxl	$123, 291(%r28,%r29,4), %r18d
+# INTEL: rorx	r18d, dword ptr [r28 + 4*r29 + 291], 123
+0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b
+
+# ATT:   rorxq	$123, 291(%r28,%r29,4), %r19
+# INTEL: rorx	r19, qword ptr [r28 + 4*r29 + 291], 123
+0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b
diff --git a/llvm/test/MC/Disassembler/X86/apx/sarx.txt b/llvm/test/MC/Disassembler/X86/apx/sarx.txt
new file mode 100644
index 00000000000000..20018f4d4b1283
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/sarx.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   sarxl	%r18d, %r22d, %r26d
+# INTEL: sarx	r26d, r22d, r18d
+0x62,0x6a,0x6e,0x00,0xf7,0xd6
+
+# ATT:   sarxl	%r18d, 291(%r28,%r29,4), %r22d
+# INTEL: sarx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   sarxq	%r19, %r23, %r27
+# INTEL: sarx	r27, r23, r19
+0x62,0x6a,0xe6,0x00,0xf7,0xdf
+
+# ATT:   sarxq	%r19, 291(%r28,%r29,4), %r23
+# INTEL: sarx	r23, qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/shlx.txt b/llvm/test/MC/Disassembler/X86/apx/shlx.txt
new file mode 100644
index 00000000000000..f6d6250bd06318
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/shlx.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   shlxl	%r18d, %r22d, %r26d
+# INTEL: shlx	r26d, r22d, r18d
+0x62,0x6a,0x6d,0x00,0xf7,0xd6
+
+# ATT:   shlxl	%r18d, 291(%r28,%r29,4), %r22d
+# INTEL: shlx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   shlxq	%r19, %r23, %r27
+# INTEL: shlx	r27, r23, r19
+0x62,0x6a,0xe5,0x00,0xf7,0xdf
+
+# ATT:   shlxq	%r19, 291(%r28,%r29,4), %r23
+# INTEL: shlx	r23, qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/shrx.txt b/llvm/test/MC/Disassembler/X86/apx/shrx.txt
new file mode 100644
index 00000000000000..09750e05c127e6
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/shrx.txt
@@ -0,0 +1,18 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   shrxl	%r18d, %r22d, %r26d
+# INTEL: shrx	r26d, r22d, r18d
+0x62,0x6a,0x6f,0x00,0xf7,0xd6
+
+# ATT:   shrxl	%r18d, 291(%r28,%r29,4), %r22d
+# INTEL: shrx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   shrxq	%r19, %r23, %r27
+# INTEL: shrx	r27, r23, r19
+0x62,0x6a,0xe7,0x00,0xf7,0xdf
+
+# ATT:   shrxq	%r19, 291(%r28,%r29,4), %r23
+# INTEL: shrx	r23, qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/X86/apx/andn-att.s b/llvm/test/MC/X86/apx/andn-att.s
new file mode 100644
index 00000000000000..d68cee8bcf1f72
--- /dev/null
+++ b/llvm/test/MC/X86/apx/andn-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: andnl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x4c,0x00,0xf2,0xd2]
+         andnl	%r18d, %r22d, %r26d
+
+# CHECK: andnq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xc4,0x00,0xf2,0xdb]
+         andnq	%r19, %r23, %r27
+
+# CHECK: andnl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8a,0x68,0x00,0xf2,0xb4,0xac,0x23,0x01,0x00,0x00]
+         andnl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: andnq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf2,0xbc,0xac,0x23,0x01,0x00,0x00]
+         andnq	291(%r28,%r29,4), %r19, %r23
diff --git a/llvm/test/MC/X86/apx/andn-intel.s b/llvm/test/MC/X86/apx/andn-intel.s
new file mode 100644
index 00000000000000..583e6e763b1eca
--- /dev/null
+++ b/llvm/test/MC/X86/apx/andn-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: andn	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x4c,0x00,0xf2,0xd2]
+         andn	r26d, r22d, r18d
+
+# CHECK: andn	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xc4,0x00,0xf2,0xdb]
+         andn	r27, r23, r19
+
+# CHECK: andn	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0x68,0x00,0xf2,0xb4,0xac,0x23,0x01,0x00,0x00]
+         andn	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: andn	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf2,0xbc,0xac,0x23,0x01,0x00,0x00]
+         andn	r23, r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/bextr-att.s b/llvm/test/MC/X86/apx/bextr-att.s
new file mode 100644
index 00000000000000..6095ffa389a34c
--- /dev/null
+++ b/llvm/test/MC/X86/apx/bextr-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: bextrl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x6c,0x00,0xf7,0xd6]
+         bextrl	%r18d, %r22d, %r26d
+
+# CHECK: bextrl	%r18d, 291(%r28,%r29,4), %r22d
+# CHECK: encoding: [0x62,0x8a,0x68,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         bextrl	%r18d, 291(%r28,%r29,4), %r22d
+
+# CHECK: bextrq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf7,0xdf]
+         bextrq	%r19, %r23, %r27
+
+# CHECK: bextrq	%r19, 291(%r28,%r29,4), %r23
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         bextrq	%r19, 291(%r28,%r29,4), %r23
diff --git a/llvm/test/MC/X86/apx/bextr-intel.s b/llvm/test/MC/X86/apx/bextr-intel.s
new file mode 100644
index 00000000000000..af70c00c1d631d
--- /dev/null
+++ b/llvm/test/MC/X86/apx/bextr-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: bextr	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x6c,0x00,0xf7,0xd6]
+         bextr	r26d, r22d, r18d
+
+# CHECK: bextr	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8a,0x68,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         bextr	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: bextr	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf7,0xdf]
+         bextr	r27, r23, r19
+
+# CHECK: bextr	r23, qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         bextr	r23, qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/MC/X86/apx/blsi-att.s b/llvm/test/MC/X86/apx/blsi-att.s
new file mode 100644
index 00000000000000..65b2fd2b4d09b6
--- /dev/null
+++ b/llvm/test/MC/X86/apx/blsi-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: blsil	%r18d, %r22d
+# CHECK: encoding: [0x62,0xfa,0x4c,0x00,0xf3,0xda]
+         blsil	%r18d, %r22d
+
+# CHECK: blsiq	%r19, %r23
+# CHECK: encoding: [0x62,0xfa,0xc4,0x00,0xf3,0xdb]
+         blsiq	%r19, %r23
+
+# CHECK: blsil	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x9a,0x68,0x00,0xf3,0x9c,0xac,0x23,0x01,0x00,0x00]
+         blsil	291(%r28,%r29,4), %r18d
+
+# CHECK: blsiq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x9a,0xe0,0x00,0xf3,0x9c,0xac,0x23,0x01,0x00,0x00]
+         blsiq	291(%r28,%r29,4), %r19
diff --git a/llvm/test/MC/X86/apx/blsi-intel.s b/llvm/test/MC/X86/apx/blsi-intel.s
new file mode 100644
index 00000000000000..edf5711cc74b57
--- /dev/null
+++ b/llvm/test/MC/X86/apx/blsi-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: blsi	r22d, r18d
+# CHECK: encoding: [0x62,0xfa,0x4c,0x00,0xf3,0xda]
+         blsi	r22d, r18d
+
+# CHECK: blsi	r23, r19
+# CHECK: encoding: [0x62,0xfa,0xc4,0x00,0xf3,0xdb]
+         blsi	r23, r19
+
+# CHECK: blsi	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0x68,0x00,0xf3,0x9c,0xac,0x23,0x01,0x00,0x00]
+         blsi	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: blsi	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0xe0,0x00,0xf3,0x9c,0xac,0x23,0x01,0x00,0x00]
+         blsi	r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/blsmsk-att.s b/llvm/test/MC/X86/apx/blsmsk-att.s
new file mode 100644
index 00000000000000..710fcabddcc3ab
--- /dev/null
+++ b/llvm/test/MC/X86/apx/blsmsk-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: blsmskl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xfa,0x4c,0x00,0xf3,0xd2]
+         blsmskl	%r18d, %r22d
+
+# CHECK: blsmskq	%r19, %r23
+# CHECK: encoding: [0x62,0xfa,0xc4,0x00,0xf3,0xd3]
+         blsmskq	%r19, %r23
+
+# CHECK: blsmskl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x9a,0x68,0x00,0xf3,0x94,0xac,0x23,0x01,0x00,0x00]
+         blsmskl	291(%r28,%r29,4), %r18d
+
+# CHECK: blsmskq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x9a,0xe0,0x00,0xf3,0x94,0xac,0x23,0x01,0x00,0x00]
+         blsmskq	291(%r28,%r29,4), %r19
diff --git a/llvm/test/MC/X86/apx/blsmsk-intel.s b/llvm/test/MC/X86/apx/blsmsk-intel.s
new file mode 100644
index 00000000000000..bb8197d3d41026
--- /dev/null
+++ b/llvm/test/MC/X86/apx/blsmsk-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: blsmsk	r22d, r18d
+# CHECK: encoding: [0x62,0xfa,0x4c,0x00,0xf3,0xd2]
+         blsmsk	r22d, r18d
+
+# CHECK: blsmsk	r23, r19
+# CHECK: encoding: [0x62,0xfa,0xc4,0x00,0xf3,0xd3]
+         blsmsk	r23, r19
+
+# CHECK: blsmsk	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0x68,0x00,0xf3,0x94,0xac,0x23,0x01,0x00,0x00]
+         blsmsk	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: blsmsk	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0xe0,0x00,0xf3,0x94,0xac,0x23,0x01,0x00,0x00]
+         blsmsk	r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/blsr-att.s b/llvm/test/MC/X86/apx/blsr-att.s
new file mode 100644
index 00000000000000..c9ca56149cf1a8
--- /dev/null
+++ b/llvm/test/MC/X86/apx/blsr-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: blsrl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xfa,0x4c,0x00,0xf3,0xca]
+         blsrl	%r18d, %r22d
+
+# CHECK: blsrq	%r19, %r23
+# CHECK: encoding: [0x62,0xfa,0xc4,0x00,0xf3,0xcb]
+         blsrq	%r19, %r23
+
+# CHECK: blsrl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x9a,0x68,0x00,0xf3,0x8c,0xac,0x23,0x01,0x00,0x00]
+         blsrl	291(%r28,%r29,4), %r18d
+
+# CHECK: blsrq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x9a,0xe0,0x00,0xf3,0x8c,0xac,0x23,0x01,0x00,0x00]
+         blsrq	291(%r28,%r29,4), %r19
diff --git a/llvm/test/MC/X86/apx/blsr-intel.s b/llvm/test/MC/X86/apx/blsr-intel.s
new file mode 100644
index 00000000000000..acbfb81964614e
--- /dev/null
+++ b/llvm/test/MC/X86/apx/blsr-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: blsr	r22d, r18d
+# CHECK: encoding: [0x62,0xfa,0x4c,0x00,0xf3,0xca]
+         blsr	r22d, r18d
+
+# CHECK: blsr	r23, r19
+# CHECK: encoding: [0x62,0xfa,0xc4,0x00,0xf3,0xcb]
+         blsr	r23, r19
+
+# CHECK: blsr	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0x68,0x00,0xf3,0x8c,0xac,0x23,0x01,0x00,0x00]
+         blsr	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: blsr	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0xe0,0x00,0xf3,0x8c,0xac,0x23,0x01,0x00,0x00]
+         blsr	r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/bzhi-att.s b/llvm/test/MC/X86/apx/bzhi-att.s
new file mode 100644
index 00000000000000..635cfa14e6b4f5
--- /dev/null
+++ b/llvm/test/MC/X86/apx/bzhi-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: bzhil	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x6c,0x00,0xf5,0xd6]
+         bzhil	%r18d, %r22d, %r26d
+
+# CHECK: bzhil	%r18d, 291(%r28,%r29,4), %r22d
+# CHECK: encoding: [0x62,0x8a,0x68,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00]
+         bzhil	%r18d, 291(%r28,%r29,4), %r22d
+
+# CHECK: bzhiq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf5,0xdf]
+         bzhiq	%r19, %r23, %r27
+
+# CHECK: bzhiq	%r19, 291(%r28,%r29,4), %r23
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         bzhiq	%r19, 291(%r28,%r29,4), %r23
diff --git a/llvm/test/MC/X86/apx/bzhi-intel.s b/llvm/test/MC/X86/apx/bzhi-intel.s
new file mode 100644
index 00000000000000..f7ab72dd717ee7
--- /dev/null
+++ b/llvm/test/MC/X86/apx/bzhi-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: bzhi	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x6c,0x00,0xf5,0xd6]
+         bzhi	r26d, r22d, r18d
+
+# CHECK: bzhi	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8a,0x68,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00]
+         bzhi	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: bzhi	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf5,0xdf]
+         bzhi	r27, r23, r19
+
+# CHECK: bzhi	r23, qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         bzhi	r23, qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/MC/X86/apx/evex-format-att.s b/llvm/test/MC/X86/apx/evex-format-att.s
index aedd09e7e698df..0b2e860d6ba090 100644
--- a/llvm/test/MC/X86/apx/evex-format-att.s
+++ b/llvm/test/MC/X86/apx/evex-format-att.s
@@ -60,8 +60,20 @@
 # CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00]
          vpslldq	$0, (%r16,%r17), %zmm0
 
+## MRMSrcMem4VOp3
+
+# CHECK: bzhiq	%r19, 291(%r28,%r29,4), %r23
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         bzhiq	%r19, 291(%r28,%r29,4), %r23
+
 ## MRMDestReg
 
 # CHECK: vextractps	$1, %xmm16, %r16d
 # CHECK: encoding: [0x62,0xeb,0x7d,0x08,0x17,0xc0,0x01]
          vextractps	$1, %xmm16, %r16d
+
+## MRMSrcReg4VOp3
+
+# CHECK: bzhiq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf5,0xdf]
+         bzhiq	%r19, %r23, %r27
diff --git a/llvm/test/MC/X86/apx/evex-format-intel.s b/llvm/test/MC/X86/apx/evex-format-intel.s
index aa11a879f4b4c6..ececb7137b1101 100644
--- a/llvm/test/MC/X86/apx/evex-format-intel.s
+++ b/llvm/test/MC/X86/apx/evex-format-intel.s
@@ -60,8 +60,20 @@
 # CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00]
          vpslldq	zmm0, zmmword ptr [r16 + r17], 0
 
+## MRMSrcMem4VOp3
+
+# CHECK: bzhi	r23, qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8a,0xe0,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         bzhi	r23, qword ptr [r28 + 4*r29 + 291], r19
+
 ## MRMDestReg
 
 # CHECK: vextractps	r16d, xmm16, 1
 # CHECK: encoding: [0x62,0xeb,0x7d,0x08,0x17,0xc0,0x01]
          vextractps	r16d, xmm16, 1
+
+## MRMSrcReg4VOp3
+
+# CHECK: bzhi	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf5,0xdf]
+         bzhi	r27, r23, r19
diff --git a/llvm/test/MC/X86/apx/mulx-att.s b/llvm/test/MC/X86/apx/mulx-att.s
new file mode 100644
index 00000000000000..976a79f469cd6f
--- /dev/null
+++ b/llvm/test/MC/X86/apx/mulx-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: mulxl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf6,0xd2]
+         mulxl	%r18d, %r22d, %r26d
+
+# CHECK: mulxq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf6,0xdb]
+         mulxq	%r19, %r23, %r27
+
+# CHECK: mulxl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00]
+         mulxl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: mulxq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00]
+         mulxq	291(%r28,%r29,4), %r19, %r23
diff --git a/llvm/test/MC/X86/apx/mulx-intel.s b/llvm/test/MC/X86/apx/mulx-intel.s
new file mode 100644
index 00000000000000..3db587502915df
--- /dev/null
+++ b/llvm/test/MC/X86/apx/mulx-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: mulx	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf6,0xd2]
+         mulx	r26d, r22d, r18d
+
+# CHECK: mulx	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf6,0xdb]
+         mulx	r27, r23, r19
+
+# CHECK: mulx	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf6,0xb4,0xac,0x23,0x01,0x00,0x00]
+         mulx	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: mulx	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf6,0xbc,0xac,0x23,0x01,0x00,0x00]
+         mulx	r23, r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/pdep-att.s b/llvm/test/MC/X86/apx/pdep-att.s
new file mode 100644
index 00000000000000..c319b17e47f6fc
--- /dev/null
+++ b/llvm/test/MC/X86/apx/pdep-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: pdepl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf5,0xd2]
+         pdepl	%r18d, %r22d, %r26d
+
+# CHECK: pdepq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf5,0xdb]
+         pdepq	%r19, %r23, %r27
+
+# CHECK: pdepl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00]
+         pdepl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: pdepq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         pdepq	291(%r28,%r29,4), %r19, %r23
diff --git a/llvm/test/MC/X86/apx/pdep-intel.s b/llvm/test/MC/X86/apx/pdep-intel.s
new file mode 100644
index 00000000000000..0f9e828c021c3a
--- /dev/null
+++ b/llvm/test/MC/X86/apx/pdep-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: pdep	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x4f,0x00,0xf5,0xd2]
+         pdep	r26d, r22d, r18d
+
+# CHECK: pdep	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xc7,0x00,0xf5,0xdb]
+         pdep	r27, r23, r19
+
+# CHECK: pdep	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00]
+         pdep	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: pdep	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         pdep	r23, r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/pext-att.s b/llvm/test/MC/X86/apx/pext-att.s
new file mode 100644
index 00000000000000..c07fa1ac2082af
--- /dev/null
+++ b/llvm/test/MC/X86/apx/pext-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: pextl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x4e,0x00,0xf5,0xd2]
+         pextl	%r18d, %r22d, %r26d
+
+# CHECK: pextq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xc6,0x00,0xf5,0xdb]
+         pextq	%r19, %r23, %r27
+
+# CHECK: pextl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00]
+         pextl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: pextq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         pextq	291(%r28,%r29,4), %r19, %r23
diff --git a/llvm/test/MC/X86/apx/pext-intel.s b/llvm/test/MC/X86/apx/pext-intel.s
new file mode 100644
index 00000000000000..9a7e7d93094a42
--- /dev/null
+++ b/llvm/test/MC/X86/apx/pext-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: pext	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x4e,0x00,0xf5,0xd2]
+         pext	r26d, r22d, r18d
+
+# CHECK: pext	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xc6,0x00,0xf5,0xdb]
+         pext	r27, r23, r19
+
+# CHECK: pext	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf5,0xb4,0xac,0x23,0x01,0x00,0x00]
+         pext	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: pext	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf5,0xbc,0xac,0x23,0x01,0x00,0x00]
+         pext	r23, r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/rorx-att.s b/llvm/test/MC/X86/apx/rorx-att.s
new file mode 100644
index 00000000000000..fb613d95c7cb43
--- /dev/null
+++ b/llvm/test/MC/X86/apx/rorx-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: rorxl	$123, %r18d, %r22d
+# CHECK: encoding: [0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b]
+         rorxl	$123, %r18d, %r22d
+
+# CHECK: rorxq	$123, %r19, %r23
+# CHECK: encoding: [0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b]
+         rorxq	$123, %r19, %r23
+
+# CHECK: rorxl	$123, 291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b]
+         rorxl	$123, 291(%r28,%r29,4), %r18d
+
+# CHECK: rorxq	$123, 291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b]
+         rorxq	$123, 291(%r28,%r29,4), %r19
diff --git a/llvm/test/MC/X86/apx/rorx-intel.s b/llvm/test/MC/X86/apx/rorx-intel.s
new file mode 100644
index 00000000000000..d3e63559cba579
--- /dev/null
+++ b/llvm/test/MC/X86/apx/rorx-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: rorx	r22d, r18d, 123
+# CHECK: encoding: [0x62,0xeb,0x7f,0x08,0xf0,0xf2,0x7b]
+         rorx	r22d, r18d, 123
+
+# CHECK: rorx	r23, r19, 123
+# CHECK: encoding: [0x62,0xeb,0xff,0x08,0xf0,0xfb,0x7b]
+         rorx	r23, r19, 123
+
+# CHECK: rorx	r18d, dword ptr [r28 + 4*r29 + 291], 123
+# CHECK: encoding: [0x62,0x8b,0x7b,0x08,0xf0,0x94,0xac,0x23,0x01,0x00,0x00,0x7b]
+         rorx	r18d, dword ptr [r28 + 4*r29 + 291], 123
+
+# CHECK: rorx	r19, qword ptr [r28 + 4*r29 + 291], 123
+# CHECK: encoding: [0x62,0x8b,0xfb,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00,0x7b]
+         rorx	r19, qword ptr [r28 + 4*r29 + 291], 123
diff --git a/llvm/test/MC/X86/apx/sarx-att.s b/llvm/test/MC/X86/apx/sarx-att.s
new file mode 100644
index 00000000000000..a174903d976cbf
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sarx-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: sarxl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x6e,0x00,0xf7,0xd6]
+         sarxl	%r18d, %r22d, %r26d
+
+# CHECK: sarxl	%r18d, 291(%r28,%r29,4), %r22d
+# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         sarxl	%r18d, 291(%r28,%r29,4), %r22d
+
+# CHECK: sarxq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xe6,0x00,0xf7,0xdf]
+         sarxq	%r19, %r23, %r27
+
+# CHECK: sarxq	%r19, 291(%r28,%r29,4), %r23
+# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         sarxq	%r19, 291(%r28,%r29,4), %r23
diff --git a/llvm/test/MC/X86/apx/sarx-intel.s b/llvm/test/MC/X86/apx/sarx-intel.s
new file mode 100644
index 00000000000000..962b6ec313b987
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sarx-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: sarx	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x6e,0x00,0xf7,0xd6]
+         sarx	r26d, r22d, r18d
+
+# CHECK: sarx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8a,0x6a,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         sarx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: sarx	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xe6,0x00,0xf7,0xdf]
+         sarx	r27, r23, r19
+
+# CHECK: sarx	r23, qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8a,0xe2,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         sarx	r23, qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/MC/X86/apx/shlx-att.s b/llvm/test/MC/X86/apx/shlx-att.s
new file mode 100644
index 00000000000000..4e28119f083056
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shlx-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: shlxl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x6d,0x00,0xf7,0xd6]
+         shlxl	%r18d, %r22d, %r26d
+
+# CHECK: shlxl	%r18d, 291(%r28,%r29,4), %r22d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         shlxl	%r18d, 291(%r28,%r29,4), %r22d
+
+# CHECK: shlxq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xe5,0x00,0xf7,0xdf]
+         shlxq	%r19, %r23, %r27
+
+# CHECK: shlxq	%r19, 291(%r28,%r29,4), %r23
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         shlxq	%r19, 291(%r28,%r29,4), %r23
diff --git a/llvm/test/MC/X86/apx/shlx-intel.s b/llvm/test/MC/X86/apx/shlx-intel.s
new file mode 100644
index 00000000000000..9f16918a712dc5
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shlx-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: shlx	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x6d,0x00,0xf7,0xd6]
+         shlx	r26d, r22d, r18d
+
+# CHECK: shlx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         shlx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: shlx	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xe5,0x00,0xf7,0xdf]
+         shlx	r27, r23, r19
+
+# CHECK: shlx	r23, qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         shlx	r23, qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/MC/X86/apx/shrx-att.s b/llvm/test/MC/X86/apx/shrx-att.s
new file mode 100644
index 00000000000000..d9bb5f84af73d4
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shrx-att.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-4: error:
+# ERROR-NOT: error:
+# CHECK: shrxl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0x6a,0x6f,0x00,0xf7,0xd6]
+         shrxl	%r18d, %r22d, %r26d
+
+# CHECK: shrxl	%r18d, 291(%r28,%r29,4), %r22d
+# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         shrxl	%r18d, 291(%r28,%r29,4), %r22d
+
+# CHECK: shrxq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0x6a,0xe7,0x00,0xf7,0xdf]
+         shrxq	%r19, %r23, %r27
+
+# CHECK: shrxq	%r19, 291(%r28,%r29,4), %r23
+# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         shrxq	%r19, 291(%r28,%r29,4), %r23
diff --git a/llvm/test/MC/X86/apx/shrx-intel.s b/llvm/test/MC/X86/apx/shrx-intel.s
new file mode 100644
index 00000000000000..385c530a1108be
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shrx-intel.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: shrx	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0x6a,0x6f,0x00,0xf7,0xd6]
+         shrx	r26d, r22d, r18d
+
+# CHECK: shrx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8a,0x6b,0x00,0xf7,0xb4,0xac,0x23,0x01,0x00,0x00]
+         shrx	r22d, dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: shrx	r27, r23, r19
+# CHECK: encoding: [0x62,0x6a,0xe7,0x00,0xf7,0xdf]
+         shrx	r27, r23, r19
+
+# CHECK: shrx	r23, qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8a,0xe3,0x00,0xf7,0xbc,0xac,0x23,0x01,0x00,0x00]
+         shrx	r23, qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index dcf650434c8169..b2609f01e86a21 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -411,7 +411,9 @@ static const X86FoldTableEntry Table1[] = {
   {X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16},
   {X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16},
   {X86::BEXTR32rr, X86::BEXTR32rm, 0},
+  {X86::BEXTR32rr_EVEX, X86::BEXTR32rm_EVEX, 0},
   {X86::BEXTR64rr, X86::BEXTR64rm, 0},
+  {X86::BEXTR64rr_EVEX, X86::BEXTR64rm_EVEX, 0},
   {X86::BEXTRI32ri, X86::BEXTRI32mi, 0},
   {X86::BEXTRI64ri, X86::BEXTRI64mi, 0},
   {X86::BLCFILL32rr, X86::BLCFILL32rm, 0},
@@ -427,13 +429,19 @@ static const X86FoldTableEntry Table1[] = {
   {X86::BLSFILL32rr, X86::BLSFILL32rm, 0},
   {X86::BLSFILL64rr, X86::BLSFILL64rm, 0},
   {X86::BLSI32rr, X86::BLSI32rm, 0},
+  {X86::BLSI32rr_EVEX, X86::BLSI32rm_EVEX, 0},
   {X86::BLSI64rr, X86::BLSI64rm, 0},
+  {X86::BLSI64rr_EVEX, X86::BLSI64rm_EVEX, 0},
   {X86::BLSIC32rr, X86::BLSIC32rm, 0},
   {X86::BLSIC64rr, X86::BLSIC64rm, 0},
   {X86::BLSMSK32rr, X86::BLSMSK32rm, 0},
+  {X86::BLSMSK32rr_EVEX, X86::BLSMSK32rm_EVEX, 0},
   {X86::BLSMSK64rr, X86::BLSMSK64rm, 0},
+  {X86::BLSMSK64rr_EVEX, X86::BLSMSK64rm_EVEX, 0},
   {X86::BLSR32rr, X86::BLSR32rm, 0},
+  {X86::BLSR32rr_EVEX, X86::BLSR32rm_EVEX, 0},
   {X86::BLSR64rr, X86::BLSR64rm, 0},
+  {X86::BLSR64rr_EVEX, X86::BLSR64rm_EVEX, 0},
   {X86::BSF16rr, X86::BSF16rm, 0},
   {X86::BSF32rr, X86::BSF32rm, 0},
   {X86::BSF64rr, X86::BSF64rm, 0},
@@ -441,7 +449,9 @@ static const X86FoldTableEntry Table1[] = {
   {X86::BSR32rr, X86::BSR32rm, 0},
   {X86::BSR64rr, X86::BSR64rm, 0},
   {X86::BZHI32rr, X86::BZHI32rm, 0},
+  {X86::BZHI32rr_EVEX, X86::BZHI32rm_EVEX, 0},
   {X86::BZHI64rr, X86::BZHI64rm, 0},
+  {X86::BZHI64rr_EVEX, X86::BZHI64rm_EVEX, 0},
   {X86::CMP16rr, X86::CMP16rm, 0},
   {X86::CMP32rr, X86::CMP32rm, 0},
   {X86::CMP64rr, X86::CMP64rm, 0},
@@ -582,7 +592,9 @@ static const X86FoldTableEntry Table1[] = {
   {X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16},
   {X86::RCPSSr, X86::RCPSSm, 0},
   {X86::RORX32ri, X86::RORX32mi, 0},
+  {X86::RORX32ri_EVEX, X86::RORX32mi_EVEX, 0},
   {X86::RORX64ri, X86::RORX64mi, 0},
+  {X86::RORX64ri_EVEX, X86::RORX64mi_EVEX, 0},
   {X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16},
   {X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16},
   {X86::ROUNDSDr, X86::ROUNDSDm, 0},
@@ -590,11 +602,17 @@ static const X86FoldTableEntry Table1[] = {
   {X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16},
   {X86::RSQRTSSr, X86::RSQRTSSm, 0},
   {X86::SARX32rr, X86::SARX32rm, 0},
+  {X86::SARX32rr_EVEX, X86::SARX32rm_EVEX, 0},
   {X86::SARX64rr, X86::SARX64rm, 0},
+  {X86::SARX64rr_EVEX, X86::SARX64rm_EVEX, 0},
   {X86::SHLX32rr, X86::SHLX32rm, 0},
+  {X86::SHLX32rr_EVEX, X86::SHLX32rm_EVEX, 0},
   {X86::SHLX64rr, X86::SHLX64rm, 0},
+  {X86::SHLX64rr_EVEX, X86::SHLX64rm_EVEX, 0},
   {X86::SHRX32rr, X86::SHRX32rm, 0},
+  {X86::SHRX32rr_EVEX, X86::SHRX32rm_EVEX, 0},
   {X86::SHRX64rr, X86::SHRX64rm, 0},
+  {X86::SHRX64rr_EVEX, X86::SHRX64rm_EVEX, 0},
   {X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16},
   {X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16},
   {X86::SQRTSDr, X86::SQRTSDm, 0},
@@ -1332,7 +1350,9 @@ static const X86FoldTableEntry Table2[] = {
   {X86::AND64rr, X86::AND64rm, 0},
   {X86::AND8rr, X86::AND8rm, 0},
   {X86::ANDN32rr, X86::ANDN32rm, 0},
+  {X86::ANDN32rr_EVEX, X86::ANDN32rm_EVEX, 0},
   {X86::ANDN64rr, X86::ANDN64rm, 0},
+  {X86::ANDN64rr_EVEX, X86::ANDN64rm_EVEX, 0},
   {X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16},
   {X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16},
   {X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16},
@@ -1479,7 +1499,9 @@ static const X86FoldTableEntry Table2[] = {
   {X86::MULSSrr, X86::MULSSrm, 0},
   {X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE},
   {X86::MULX32rr, X86::MULX32rm, 0},
+  {X86::MULX32rr_EVEX, X86::MULX32rm_EVEX, 0},
   {X86::MULX64rr, X86::MULX64rm, 0},
+  {X86::MULX64rr_EVEX, X86::MULX64rm_EVEX, 0},
   {X86::OR16rr, X86::OR16rm, 0},
   {X86::OR32rr, X86::OR32rm, 0},
   {X86::OR64rr, X86::OR64rm, 0},
@@ -1516,9 +1538,13 @@ static const X86FoldTableEntry Table2[] = {
   {X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16},
   {X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16},
   {X86::PDEP32rr, X86::PDEP32rm, 0},
+  {X86::PDEP32rr_EVEX, X86::PDEP32rm_EVEX, 0},
   {X86::PDEP64rr, X86::PDEP64rm, 0},
+  {X86::PDEP64rr_EVEX, X86::PDEP64rm_EVEX, 0},
   {X86::PEXT32rr, X86::PEXT32rm, 0},
+  {X86::PEXT32rr_EVEX, X86::PEXT32rm_EVEX, 0},
   {X86::PEXT64rr, X86::PEXT64rm, 0},
+  {X86::PEXT64rr_EVEX, X86::PEXT64rm_EVEX, 0},
   {X86::PFACCrr, X86::PFACCrm, 0},
   {X86::PFADDrr, X86::PFADDrm, 0},
   {X86::PFCMPEQrr, X86::PFCMPEQrm, 0},

From f42ce1621f5f4129fb37c4a1af958e1d47344107 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Fri, 1 Dec 2023 10:08:00 +0000
Subject: [PATCH 19/72] [mlir][sve][nfc] Update a test to use
 transform-interpreter (#73771)

This is a follow-up to #70040, which missed the test updated here.

Includes a few additional NFC changes in preparation for extending this
test.
---
 .../Dialect/Linalg/CPU/ArmSVE/matmul.mlir     | 70 +++++++++++--------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
index 2024da2a585d99..d771d32d548bbe 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -1,8 +1,14 @@
-// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule \
-// RUN:   -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
-// RUN:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm | \
-// RUN: %mcr_aarch64_cmd -e=matmul_f32 -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | \
-// RUN: FileCheck %s
+// DEFINE: %{compile} =  mlir-opt %s \
+// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE:    -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// DEFINE:    -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = matmul_f32
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
+// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s
 
 func.func @matmul_f32() {
   // Matrix dimensions
@@ -40,29 +46,37 @@ func.func @matmul_f32() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%module_op: !transform.any_op):
-  // Step 1: Tile
-  %matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
-  %func_op = get_parent_op %matmul : (!transform.any_op) -> !transform.op<"func.func">
-  %module_with_tiled_loops, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-  // Step 2: Vectorize
-  %tiled_matmul = transform.structured.match ops{["linalg.matmul"]} in %module_with_tiled_loops : (!transform.any_op) -> !transform.any_op
-  transform.structured.vectorize %tiled_matmul vector_sizes [2, [4], 1] : !transform.any_op
-
-  // Step 3: Lower vector.multi_reduction to vector.contract (+ some helpful patterns)
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.vector.reduction_to_contract
-    transform.apply_patterns.vector.transfer_permutation_patterns
-    transform.apply_patterns.vector.lower_masked_transfers
-  } : !transform.op<"func.func">
-
-  // Step 4: Lower vector.contract to vector.fma
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
-    transform.apply_patterns.vector.lower_outerproduct
-  } : !transform.op<"func.func">
+module attributes {transform.with_named_sequence} {
+transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+
+    // Step 1: Tile
+    %module_with_tiled_loops, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 2: Vectorize
+    %tiled_matmul = transform.structured.match ops{["linalg.matmul"]} in %module_with_tiled_loops
+      : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %tiled_matmul vector_sizes [2, [4], 1] : !transform.any_op
+
+    // Step 3: Lower vector.multi_reduction to vector.contract (+ some helpful patterns)
+    %func = transform.structured.match ops{["func.func"]} in %module
+      : (!transform.any_op) -> !transform.op<"func.func">
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.reduction_to_contract
+      transform.apply_patterns.vector.transfer_permutation_patterns
+      transform.apply_patterns.vector.lower_masked_transfers
+    } : !transform.op<"func.func">
+
+    // Step 4: Lower vector.contract to vector.fma
+    transform.apply_patterns to %func {
+      transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+      transform.apply_patterns.vector.lower_outerproduct
+    } : !transform.op<"func.func">
+
+    transform.yield
+  }
 }
 
 func.func private @printMemrefF32(%ptr : tensor<*xf32>)

From 1ee41b415398cde51c055a7b1a4d419350e7038f Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Fri, 1 Dec 2023 11:29:05 +0100
Subject: [PATCH 20/72] [libc++][NFC] Update the remaining old license headers

---
 libcxx/test/libcxx/numerics/bit.ops.pass.cpp              | 8 ++++----
 .../memory/ptr.align/assume_aligned.power2.verify.cpp     | 7 +++----
 .../sequences/forwardlist/forwardlist.spec/equal.pass.cpp | 7 +++----
 .../forwardlist/forwardlist.spec/member_swap.pass.cpp     | 7 +++----
 .../forwardlist/forwardlist.spec/non_member_swap.pass.cpp | 7 +++----
 .../forwardlist/forwardlist.spec/relational.pass.cpp      | 7 +++----
 .../forwardlist/forwardlist.spec/swap_noexcept.pass.cpp   | 7 +++----
 .../support.dynamic/destroying_delete_t.pass.cpp          | 7 +++----
 .../destroying_delete_t_declaration.pass.cpp              | 7 +++----
 .../language.support/support.dynamic/nothrow_t.pass.cpp   | 7 +++----
 .../language.support/support.dynamic/nothrow_t.verify.cpp | 7 +++----
 .../test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp   | 8 ++++----
 .../test/std/numerics/bit/bit.pow.two/bit_ceil.verify.cpp | 8 ++++----
 .../test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp  | 8 ++++----
 .../test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp  | 8 ++++----
 .../std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp  | 8 ++++----
 .../std/numerics/bit/bitops.count/countl_one.pass.cpp     | 8 ++++----
 .../std/numerics/bit/bitops.count/countl_zero.pass.cpp    | 8 ++++----
 .../std/numerics/bit/bitops.count/countr_one.pass.cpp     | 8 ++++----
 .../std/numerics/bit/bitops.count/countr_zero.pass.cpp    | 8 ++++----
 .../test/std/numerics/bit/bitops.count/popcount.pass.cpp  | 8 ++++----
 libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp     | 8 ++++----
 libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp     | 8 ++++----
 .../std/thread/thread.mutex/thread.lock/types.verify.cpp  | 7 +++----
 .../func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp   | 7 +++----
 .../func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp | 7 +++----
 .../func.wrap.func/func.wrap.func.con/deduct_ptr.pass.cpp | 7 +++----
 .../util.smartptr.shared.spec/swap.pass.cpp               | 7 +++----
 .../util.smartptr.weak.spec/swap.pass.cpp                 | 7 +++----
 .../meta.trans.other/common_reference.compile.pass.cpp    | 7 +++----
 30 files changed, 103 insertions(+), 120 deletions(-)

diff --git a/libcxx/test/libcxx/numerics/bit.ops.pass.cpp b/libcxx/test/libcxx/numerics/bit.ops.pass.cpp
index 2a509db1d79a02..d3ca8b2f8030bd 100644
--- a/libcxx/test/libcxx/numerics/bit.ops.pass.cpp
+++ b/libcxx/test/libcxx/numerics/bit.ops.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // Test the __XXXX routines in the <bit> header.
 // These are not supposed to be exhaustive tests, just sanity checks.
 
diff --git a/libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.power2.verify.cpp b/libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.power2.verify.cpp
index 92c9e927a546e5..b206fe31ea1966 100644
--- a/libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.power2.verify.cpp
+++ b/libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.power2.verify.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
index a727487ed0d9c5..b21035f7dd7468 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
index 6b16d66fedb2ec..e7eea2e87bfec3 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
index e46a55cf81e4f1..54d26d0a649179 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
index 29a180a9661242..9ca19486a54b9c 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.pass.cpp
index b4568837a2e015..76f8b6213908b0 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/language.support/support.dynamic/destroying_delete_t.pass.cpp b/libcxx/test/std/language.support/support.dynamic/destroying_delete_t.pass.cpp
index 7f52e2d8d50800..95d2c41d7bfebc 100644
--- a/libcxx/test/std/language.support/support.dynamic/destroying_delete_t.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/destroying_delete_t.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/language.support/support.dynamic/destroying_delete_t_declaration.pass.cpp b/libcxx/test/std/language.support/support.dynamic/destroying_delete_t_declaration.pass.cpp
index b98af1bfe1e45f..1270853a1ccabf 100644
--- a/libcxx/test/std/language.support/support.dynamic/destroying_delete_t_declaration.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/destroying_delete_t_declaration.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/language.support/support.dynamic/nothrow_t.pass.cpp b/libcxx/test/std/language.support/support.dynamic/nothrow_t.pass.cpp
index 3a6231329f0964..bfc41cb141aac2 100644
--- a/libcxx/test/std/language.support/support.dynamic/nothrow_t.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/nothrow_t.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/language.support/support.dynamic/nothrow_t.verify.cpp b/libcxx/test/std/language.support/support.dynamic/nothrow_t.verify.cpp
index 50dd63a6350f00..f2b345e809695b 100644
--- a/libcxx/test/std/language.support/support.dynamic/nothrow_t.verify.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/nothrow_t.verify.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp
index 4e794f129f3660..5e37db95ab090e 100644
--- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.verify.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.verify.cpp
index 9424b0b24f8aad..d37de690a48dba 100644
--- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.verify.cpp
+++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_ceil.verify.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp
index 06ee38cf8261b2..38a46fcc122274 100644
--- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_floor.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp
index cfb9a163b4fa7e..baf2032a4a1f00 100644
--- a/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bit.pow.two/bit_width.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp
index 7b23627a3d02ed..81dca301e21fb3 100644
--- a/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bit.pow.two/has_single_bit.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp
index bbce57b9caea14..92268cf563b471 100644
--- a/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bitops.count/countl_one.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp
index f103450eb834f7..9d5d361662e8c8 100644
--- a/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bitops.count/countl_zero.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp
index 8e8ef1d535a54a..63b60640ac0480 100644
--- a/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bitops.count/countr_one.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp
index d0fdf921a09db0..1df1d883a12e1f 100644
--- a/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bitops.count/countr_zero.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp
index b8759c440432ee..588c5e0cf7af27 100644
--- a/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bitops.count/popcount.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp
index a1be03453abe3d..50e498b5761e54 100644
--- a/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bitops.rot/rotl.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp
index 89fef32c36d4ff..00c9e617d2edf3 100644
--- a/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp
+++ b/libcxx/test/std/numerics/bit/bitops.rot/rotr.pass.cpp
@@ -1,11 +1,11 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // template <class T>
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/types.verify.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/types.verify.cpp
index 623e5c3d22e75b..1688470e1ac4d1 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/types.verify.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/types.verify.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
index 00d183168d4b02..ef43ab9b64b5b5 100644
--- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp
index 520e5f055a71f6..8a42d3be3571c0 100644
--- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_ptr.pass.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_ptr.pass.cpp
index cc61a75c84f988..ed4e0e96de3b46 100644
--- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_ptr.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_ptr.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.spec/swap.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.spec/swap.pass.cpp
index 47ae5dd8f7297d..94986eaa9e3ff3 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.spec/swap.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.spec/swap.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.weak/util.smartptr.weak.spec/swap.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.weak/util.smartptr.weak.spec/swap.pass.cpp
index 98429fd740638d..d6fcd882cd6675 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.weak/util.smartptr.weak.spec/swap.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.weak/util.smartptr.weak.spec/swap.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp
index 9332865eaa2450..04a1451863c90a 100644
--- a/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.trans/meta.trans.other/common_reference.compile.pass.cpp
@@ -1,9 +1,8 @@
 //===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 

From f7d91faa790630eca506a29faa560d6783edcbc0 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Fri, 1 Dec 2023 10:39:01 +0000
Subject: [PATCH 21/72] [mlir][ArmSME] Add option to only enable streaming
 mode/ZA if required (#73931)

This adds an `only-if-required-by-ops` flag to the `enable-arm-streaming`
pass. This flag defaults to `false` (which preserves the original
behaviour), however, if set to `true` the pass will only add the
selected ZA/streaming mode to functions that contain ops that implement
`ArmSMETileOpInterface`.

This simplifies enabling these modes, as we can now first try lowering
ops to ArmSME, then only if we succeed, add the relevant function
attributes.
---
 .../mlir/Dialect/ArmSME/Transforms/Passes.h   |  2 +-
 .../mlir/Dialect/ArmSME/Transforms/Passes.td  |  6 ++++-
 .../ArmSME/Transforms/EnableArmStreaming.cpp  | 25 ++++++++++++++++---
 .../Dialect/ArmSME/enable-arm-streaming.mlir  | 16 ++++++++++++
 4 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h
index 11a7385fe311dd..21a97e9cbc794c 100644
--- a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.h
@@ -27,7 +27,7 @@ namespace arm_sme {
 /// Pass to enable Armv9 Streaming SVE mode.
 std::unique_ptr<Pass> createEnableArmStreamingPass(
     const ArmStreamingMode = ArmStreamingMode::Streaming,
-    const ArmZaMode = ArmZaMode::Disabled);
+    const ArmZaMode = ArmZaMode::Disabled, bool onlyIfRequiredByOps = false);
 
 /// Pass that allocates tile IDs to ArmSME operations.
 std::unique_ptr<Pass> createTileAllocationPass();
diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
index 3253b47e62abdd..7b9c74e0b8f60e 100644
--- a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
@@ -73,7 +73,11 @@ def EnableArmStreaming
                             "new-za",
                             "The function has ZA state. The ZA state is "
                             "created on entry and destroyed on exit.")
-           )}]>
+           )}]>,
+    Option<"onlyIfRequiredByOps", "only-if-required-by-ops", "bool",
+           /*default=*/"false",
+           "Only apply the selected streaming/ZA modes if the function "
+           " contains ops that require them.">
   ];
   let dependentDialects = ["func::FuncDialect"];
 }
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp b/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp
index c3a1a1c9a3fb49..79a6caffb6ee0b 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/EnableArmStreaming.cpp
@@ -33,6 +33,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/ArmSME/IR/ArmSME.h"
 #include "mlir/Dialect/ArmSME/Transforms/Passes.h"
 #include "mlir/Dialect/ArmSME/Transforms/PassesEnums.cpp.inc"
 
@@ -56,12 +57,28 @@ constexpr StringLiteral
 
 struct EnableArmStreamingPass
     : public arm_sme::impl::EnableArmStreamingBase<EnableArmStreamingPass> {
-  EnableArmStreamingPass(ArmStreamingMode streamingMode, ArmZaMode zaMode) {
+  EnableArmStreamingPass(ArmStreamingMode streamingMode, ArmZaMode zaMode,
+                         bool onlyIfRequiredByOps) {
     this->streamingMode = streamingMode;
     this->zaMode = zaMode;
+    this->onlyIfRequiredByOps = onlyIfRequiredByOps;
   }
   void runOnOperation() override {
     auto op = getOperation();
+
+    if (onlyIfRequiredByOps) {
+      bool foundTileOp = false;
+      op.walk([&](Operation *op) {
+        if (llvm::isa<ArmSMETileOpInterface>(op)) {
+          foundTileOp = true;
+          return WalkResult::interrupt();
+        }
+        return WalkResult::advance();
+      });
+      if (!foundTileOp)
+        return;
+    }
+
     if (op->getAttr(kEnableArmStreamingIgnoreAttr) ||
         streamingMode == ArmStreamingMode::Disabled)
       return;
@@ -81,6 +98,8 @@ struct EnableArmStreamingPass
 } // namespace
 
 std::unique_ptr<Pass> mlir::arm_sme::createEnableArmStreamingPass(
-    const ArmStreamingMode streamingMode, const ArmZaMode zaMode) {
-  return std::make_unique<EnableArmStreamingPass>(streamingMode, zaMode);
+    const ArmStreamingMode streamingMode, const ArmZaMode zaMode,
+    bool onlyIfRequiredByOps) {
+  return std::make_unique<EnableArmStreamingPass>(streamingMode, zaMode,
+                                                  onlyIfRequiredByOps);
 }
diff --git a/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir b/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir
index 70119b08c3e91a..b1188acbc0b2d7 100644
--- a/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir
+++ b/mlir/test/Dialect/ArmSME/enable-arm-streaming.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-opt %s -enable-arm-streaming -verify-diagnostics | FileCheck %s
 // RUN: mlir-opt %s -enable-arm-streaming=streaming-mode=streaming-locally -verify-diagnostics | FileCheck %s -check-prefix=CHECK-LOCALLY
 // RUN: mlir-opt %s -enable-arm-streaming=za-mode=new-za -verify-diagnostics | FileCheck %s -check-prefix=CHECK-ENABLE-ZA
+// RUN: mlir-opt %s -enable-arm-streaming=only-if-required-by-ops -verify-diagnostics | FileCheck %s -check-prefix=IF-REQUIRED
 
 // CHECK-LABEL: @arm_streaming
 // CHECK-SAME: attributes {arm_streaming}
@@ -17,3 +18,18 @@ func.func @arm_streaming() { return }
 // CHECK-ENABLE-ZA-LABEL: @not_arm_streaming
 // CHECK-ENABLE-ZA-SAME: attributes {enable_arm_streaming_ignore}
 func.func @not_arm_streaming() attributes {enable_arm_streaming_ignore} { return }
+
+// CHECK-LABEL: @requires_arm_streaming
+// CHECK-SAME: attributes {arm_streaming}
+// IF-REQUIRED: @requires_arm_streaming
+// IF-REQUIRED-SAME: attributes {arm_streaming}
+func.func @requires_arm_streaming() {
+  %tile = arm_sme.get_tile : vector<[4]x[4]xi32>
+  return
+}
+
+// CHECK-LABEL: @does_not_require_arm_streaming
+// CHECK-SAME: attributes {arm_streaming}
+// IF-REQUIRED: @does_not_require_arm_streaming
+// IF-REQUIRED-NOT: arm_streaming
+func.func @does_not_require_arm_streaming() { return }

From da1aff2b2a3192f5e32fa350de19aac0b89fed18 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 1 Dec 2023 10:40:24 +0000
Subject: [PATCH 22/72] [llvm][PowerPC] Correct handling of spill slots for SPE
 when EXPENSIVE_CHECKS is enabled (#73940)

The old code modified the container while iterating over it, which
tripped one of libstdc++'s debug checks.

Instead, just assign to the item via the reference we already have.

This fixes the following expensive checks failures on my machine:
  LLVM :: CodeGen/PowerPC/fp-strict.ll
  LLVM :: CodeGen/PowerPC/pr55463.ll
  LLVM :: CodeGen/PowerPC/register-pressure.ll
  LLVM :: CodeGen/PowerPC/spe.ll

Which are some of the tests noted by #68594.
---
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 24 +++++++-------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index eb3bf3b2690b22..245e78641ed654 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -2334,24 +2334,16 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots(
     // In case of SPE we only have SuperRegs and CRs
     // in our CalleSaveInfo vector.
 
-    unsigned Idx = 0;
     for (auto &CalleeSaveReg : CSI) {
-      const MCPhysReg &Reg = CalleeSaveReg.getReg();
-      const MCPhysReg &Lower = RegInfo->getSubReg(Reg, 1);
-      const MCPhysReg &Higher = RegInfo->getSubReg(Reg, 2);
-
-      // Check only for SuperRegs.
-      if (Lower) {
-        if (MRI.isPhysRegModified(Higher)) {
-          Idx++;
-          continue;
-        } else {
+      MCPhysReg Reg = CalleeSaveReg.getReg();
+      MCPhysReg Lower = RegInfo->getSubReg(Reg, 1);
+      MCPhysReg Higher = RegInfo->getSubReg(Reg, 2);
+
+      if ( // Check only for SuperRegs.
+          Lower &&
           // Replace Reg if only lower-32 bits modified
-          CSI.erase(CSI.begin() + Idx);
-          CSI.insert(CSI.begin() + Idx, CalleeSavedInfo(Lower));
-        }
-      }
-      Idx++;
+          !MRI.isPhysRegModified(Higher))
+        CalleeSaveReg = CalleeSavedInfo(Lower);
     }
   }
 

From 289fe74ddbb4c8aa7128f60db6b20c119922b542 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov <serebrennikov.vladislav@gmail.com>
Date: Fri, 1 Dec 2023 13:35:23 +0300
Subject: [PATCH 23/72] [clang][NFC] Fill in historical data on when C++ DRs
 100-199 were fixed

---
 clang/test/CXX/drs/dr1xx.cpp | 20 ++++++++++----------
 clang/www/cxx_dr_status.html | 20 ++++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/clang/test/CXX/drs/dr1xx.cpp b/clang/test/CXX/drs/dr1xx.cpp
index 60e80a4c0e1c4f..50236eb7c9499d 100644
--- a/clang/test/CXX/drs/dr1xx.cpp
+++ b/clang/test/CXX/drs/dr1xx.cpp
@@ -72,7 +72,7 @@ namespace dr107 { // dr107: yes
   extern "C" S operator+(S, S) { return S(); }
 }
 
-namespace dr108 { // dr108: yes
+namespace dr108 { // dr108: 2.9
   template<typename T> struct A {
     struct B { typedef int X; };
     B::X x;
@@ -143,7 +143,7 @@ namespace dr114 { // dr114: yes
   } b; // expected-error {{abstract}}
 }
 
-namespace dr115 { // dr115: yes
+namespace dr115 { // dr115: 3.0
   template<typename T> int f(T); // expected-note +{{}}
   template<typename T> int g(T); // expected-note +{{}}
   template<typename T> int g(T, int); // expected-note +{{}}
@@ -480,7 +480,7 @@ namespace dr140 { // dr140: yes
   void g(int n) { n = 2; }
 }
 
-namespace dr141 { // dr141: yes
+namespace dr141 { // dr141: 3.1
   template<typename T> void f();
   template<typename T> struct S { int n; }; // expected-note {{'::dr141::S<int>::n' declared here}}
   struct A : S<int> {
@@ -518,7 +518,7 @@ namespace dr141 { // dr141: yes
   void i() { C<X>().i(); } // ok!!
 }
 
-namespace dr142 { // dr142: yes
+namespace dr142 { // dr142: 2.8
   class B { // expected-note +{{here}}
   public:
     int mi; // expected-note +{{here}}
@@ -602,7 +602,7 @@ namespace dr148 { // dr148: yes
 
 // dr149: na
 
-namespace dr151 { // dr151: yes
+namespace dr151 { // dr151: 3.1
   struct X {};
   typedef int X::*p;
 #if __cplusplus < 201103L
@@ -655,7 +655,7 @@ namespace dr159 { // dr159: 3.5
 
 // dr160: na
 
-namespace dr161 { // dr161: yes
+namespace dr161 { // dr161: 3.1
   class A {
   protected:
     struct B { int n; } b; // expected-note 2{{here}}
@@ -724,7 +724,7 @@ namespace dr165 { // dr165: no
   void N::g() {}
 }
 
-namespace dr166 { // dr166: yes
+namespace dr166 { // dr166: 2.9
   namespace A { class X; }
 
   template<typename T> int f(T t) { return t.n; }
@@ -827,7 +827,7 @@ namespace dr173 { // dr173: yes
 
 // dr174: sup 1012
 
-namespace dr175 { // dr175: yes
+namespace dr175 { // dr175: 2.8
   struct A {}; // expected-note {{here}}
   struct B : private A {}; // expected-note {{constrained by private inheritance}}
   struct C : B {
@@ -836,7 +836,7 @@ namespace dr175 { // dr175: yes
   };
 }
 
-namespace dr176 { // dr176: yes
+namespace dr176 { // dr176: 3.1
   template<typename T> class Y;
   template<> class Y<int> {
     void f() {
@@ -904,7 +904,7 @@ namespace dr179 { // dr179: yes
   int n = &f - &f; // expected-error {{arithmetic on pointers to the function type 'void ()'}}
 }
 
-namespace dr180 { // dr180: yes
+namespace dr180 { // dr180: 2.8
   template<typename T> struct X : T, T::some_base {
     X() : T::some_type_that_might_be_T(), T::some_base() {}
     friend class T::some_class;
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 7cf657a47d6409..141b2aa515ad9a 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -685,7 +685,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/108.html">108</a></td>
     <td>TC1</td>
     <td>Are classes nested in templates dependent?</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.9</td>
   </tr>
   <tr id="109">
     <td><a href="https://cplusplus.github.io/CWG/issues/109.html">109</a></td>
@@ -727,7 +727,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/115.html">115</a></td>
     <td>CD1</td>
     <td>Address of template-id</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.0</td>
   </tr>
   <tr id="116">
     <td><a href="https://cplusplus.github.io/CWG/issues/116.html">116</a></td>
@@ -883,13 +883,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/141.html">141</a></td>
     <td>CD1</td>
     <td>Non-member function templates in member access expressions</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="142">
     <td><a href="https://cplusplus.github.io/CWG/issues/142.html">142</a></td>
     <td>TC1</td>
     <td>Injection-related errors in access example</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="143">
     <td><a href="https://cplusplus.github.io/CWG/issues/143.html">143</a></td>
@@ -943,7 +943,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/151.html">151</a></td>
     <td>TC1</td>
     <td>Terminology of zero-initialization</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="152">
     <td><a href="https://cplusplus.github.io/CWG/issues/152.html">152</a></td>
@@ -1003,7 +1003,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/161.html">161</a></td>
     <td>TC1</td>
     <td>Access to protected nested type</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="162">
     <td><a href="https://cplusplus.github.io/CWG/issues/162.html">162</a></td>
@@ -1033,7 +1033,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/166.html">166</a></td>
     <td>TC1</td>
     <td>Friend declarations of <I>template-id</I>s</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.9</td>
   </tr>
   <tr id="167">
     <td><a href="https://cplusplus.github.io/CWG/issues/167.html">167</a></td>
@@ -1087,13 +1087,13 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/175.html">175</a></td>
     <td>CD1</td>
     <td>Class name injection and base name access</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="176">
     <td><a href="https://cplusplus.github.io/CWG/issues/176.html">176</a></td>
     <td>TC1</td>
     <td>Name injection and templates</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="177">
     <td><a href="https://cplusplus.github.io/CWG/issues/177.html">177</a></td>
@@ -1117,7 +1117,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/180.html">180</a></td>
     <td>CD1</td>
     <td><TT>typename</TT> and elaborated types</td>
-    <td class="full" align="center">Yes</td>
+    <td class="full" align="center">Clang 2.8</td>
   </tr>
   <tr id="181">
     <td><a href="https://cplusplus.github.io/CWG/issues/181.html">181</a></td>

From 5a1020bb0083ebfcf5d8879ba99c21bf214fcb56 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 11:40:52 +0100
Subject: [PATCH 24/72] [InstSimplify] Add test for disjoint or miscompile
 (NFC)

The absorption case is already handled correctly, but the
idempotence case is not.
---
 llvm/test/Transforms/InstCombine/select.ll  | 43 ++++++++++++++++-----
 llvm/test/Transforms/InstSimplify/select.ll | 38 ++++++++++++++++++
 2 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index b3764cfb97d407..f1ccd4747bd1ce 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -1919,9 +1919,9 @@ define i32 @select_dominating_cond_inverted_multiple_duplicating_preds(i1 %cond,
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_FALSE:%.*]], label [[IF_TRUE:%.*]]
 ; CHECK:       if.true:
 ; CHECK-NEXT:    switch i32 [[COND2:%.*]], label [[SWITCH_CASE_1:%.*]] [
-; CHECK-NEXT:    i32 1, label [[MERGE:%.*]]
-; CHECK-NEXT:    i32 2, label [[MERGE]]
-; CHECK-NEXT:    i32 3, label [[MERGE]]
+; CHECK-NEXT:      i32 1, label [[MERGE:%.*]]
+; CHECK-NEXT:      i32 2, label [[MERGE]]
+; CHECK-NEXT:      i32 3, label [[MERGE]]
 ; CHECK-NEXT:    ]
 ; CHECK:       switch.case.1:
 ; CHECK-NEXT:    br label [[MERGE]]
@@ -2172,13 +2172,13 @@ define i32 @test_invoke_neg(i32 %x, i32 %y) nounwind uwtable ssp personality ptr
 ; CHECK-LABEL: @test_invoke_neg(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = invoke i1 @foo()
-; CHECK-NEXT:    to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
+; CHECK-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
 ; CHECK:       invoke.cont:
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], i32 [[X:%.*]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ; CHECK:       lpad:
 ; CHECK-NEXT:    [[LP:%.*]] = landingpad { i1, i32 }
-; CHECK-NEXT:    filter [0 x i1] zeroinitializer
+; CHECK-NEXT:            filter [0 x i1] zeroinitializer
 ; CHECK-NEXT:    unreachable
 ;
 entry:
@@ -2205,14 +2205,14 @@ define i32 @test_invoke_2_neg(i1 %cond, i32 %x, i32 %y) nounwind uwtable ssp per
 ; CHECK-NEXT:    br label [[MERGE:%.*]]
 ; CHECK:       if.false:
 ; CHECK-NEXT:    [[RESULT:%.*]] = invoke i32 @bar()
-; CHECK-NEXT:    to label [[MERGE]] unwind label [[LPAD:%.*]]
+; CHECK-NEXT:            to label [[MERGE]] unwind label [[LPAD:%.*]]
 ; CHECK:       merge:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, [[IF_TRUE]] ], [ [[RESULT]], [[IF_FALSE]] ]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], i32 1, i32 [[PHI]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ; CHECK:       lpad:
 ; CHECK-NEXT:    [[LP:%.*]] = landingpad { i1, i32 }
-; CHECK-NEXT:    filter [0 x i1] zeroinitializer
+; CHECK-NEXT:            filter [0 x i1] zeroinitializer
 ; CHECK-NEXT:    unreachable
 ;
 entry:
@@ -2242,8 +2242,8 @@ define i32 @select_phi_same_condition_switch(i1 %cond, i32 %x, i32 %y) {
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.true:
 ; CHECK-NEXT:    switch i32 [[X:%.*]], label [[EXIT:%.*]] [
-; CHECK-NEXT:    i32 1, label [[MERGE:%.*]]
-; CHECK-NEXT:    i32 2, label [[MERGE]]
+; CHECK-NEXT:      i32 1, label [[MERGE:%.*]]
+; CHECK-NEXT:      i32 2, label [[MERGE]]
 ; CHECK-NEXT:    ]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 0
@@ -2903,6 +2903,31 @@ define ptr @select_replacement_gep_inbounds(ptr %base, i64 %offset) {
   ret ptr %sel
 }
 
+define i8 @replace_false_op_eq_shl_or_disjoint(i8 %x) {
+; CHECK-LABEL: @replace_false_op_eq_shl_or_disjoint(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[X:%.*]], 3
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[SHL]], [[X]]
+; CHECK-NEXT:    ret i8 [[OR]]
+;
+  %eq0 = icmp eq i8 %x, -1
+  %shl = shl i8 %x, 3
+  %or = or disjoint i8 %x, %shl
+  %sel = select i1 %eq0, i8 -1, i8 %or
+  ret i8 %sel
+}
+
+; FIXME: This is a miscompile.
+define i8 @select_or_disjoint_eq(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_or_disjoint_eq(
+; CHECK-NEXT:    [[OR:%.*]] = or disjoint i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[OR]]
+;
+  %cmp = icmp eq i8 %x, %y
+  %or = or disjoint i8 %x, %y
+  %sel = select i1 %cmp, i8 %x, i8 %or
+  ret i8 %sel
+}
+
 define <2 x i1> @partial_true_undef_condval(<2 x i1> %x) {
 ; CHECK-LABEL: @partial_true_undef_condval(
 ; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 poison>
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 16901b88893387..473d8b8b036808 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -1429,6 +1429,21 @@ define i8 @replace_false_op_eq_shl_or(i8 %x) {
   ret i8 %sel
 }
 
+define i8 @replace_false_op_eq_shl_or_disjoint(i8 %x) {
+; CHECK-LABEL: @replace_false_op_eq_shl_or_disjoint(
+; CHECK-NEXT:    [[EQ0:%.*]] = icmp eq i8 [[X:%.*]], -1
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[X]], 3
+; CHECK-NEXT:    [[OR:%.*]] = or disjoint i8 [[X]], [[SHL]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[EQ0]], i8 -1, i8 [[OR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %eq0 = icmp eq i8 %x, -1
+  %shl = shl i8 %x, 3
+  %or = or disjoint i8 %x, %shl
+  %sel = select i1 %eq0, i8 -1, i8 %or
+  ret i8 %sel
+}
+
 ; negative test - wrong cmp predicate
 
 define i8 @replace_false_op_sgt_neg_and(i8 %x) {
@@ -1698,3 +1713,26 @@ define i8 @select_xor_cmp_unmatched_operands(i8 %0, i8 %1, i8 %c) {
   %5 = select i1 %3, i8 0, i8 %4
   ret i8 %5
 }
+
+define i8 @select_or_eq(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_or_eq(
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[OR]]
+;
+  %cmp = icmp eq i8 %x, %y
+  %or = or i8 %x, %y
+  %sel = select i1 %cmp, i8 %x, i8 %or
+  ret i8 %sel
+}
+
+; FIXME: This is a miscompile.
+define i8 @select_or_disjoint_eq(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_or_disjoint_eq(
+; CHECK-NEXT:    [[OR:%.*]] = or disjoint i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[OR]]
+;
+  %cmp = icmp eq i8 %x, %y
+  %or = or disjoint i8 %x, %y
+  %sel = select i1 %cmp, i8 %x, i8 %or
+  ret i8 %sel
+}

From cd31cf5989aaf6a187aaf3af4f94207c55a70d0f Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 11:43:17 +0100
Subject: [PATCH 25/72] [InstSimplify] Fix or disjoint miscompile with op
 replacement

Make sure %x does not get folded to "or disjoint %x, %x" without
dropping the flag, as this would be a derefinement.
---
 llvm/lib/Analysis/InstructionSimplify.cpp   | 11 ++++++++++-
 llvm/test/Transforms/InstCombine/select.ll  |  3 +--
 llvm/test/Transforms/InstSimplify/select.ll |  7 ++++---
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 9f3b3f25ec3f15..cef9f6ec179ba5 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4331,8 +4331,17 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
 
       // x & x -> x, x | x -> x
       if ((Opcode == Instruction::And || Opcode == Instruction::Or) &&
-          NewOps[0] == NewOps[1])
+          NewOps[0] == NewOps[1]) {
+        // or disjoint x, x results in poison.
+        if (auto *PDI = dyn_cast<PossiblyDisjointInst>(BO)) {
+          if (PDI->isDisjoint()) {
+            if (!DropFlags)
+              return nullptr;
+            DropFlags->push_back(BO);
+          }
+        }
         return NewOps[0];
+      }
 
       // x - x -> 0, x ^ x -> 0. This is non-refining, because x is non-poison
       // by assumption and this case never wraps, so nowrap flags can be
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index f1ccd4747bd1ce..6f24758effac2f 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -2916,10 +2916,9 @@ define i8 @replace_false_op_eq_shl_or_disjoint(i8 %x) {
   ret i8 %sel
 }
 
-; FIXME: This is a miscompile.
 define i8 @select_or_disjoint_eq(i8 %x, i8 %y) {
 ; CHECK-LABEL: @select_or_disjoint_eq(
-; CHECK-NEXT:    [[OR:%.*]] = or disjoint i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i8 [[OR]]
 ;
   %cmp = icmp eq i8 %x, %y
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 473d8b8b036808..b9c79f02245ccf 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -1725,11 +1725,12 @@ define i8 @select_or_eq(i8 %x, i8 %y) {
   ret i8 %sel
 }
 
-; FIXME: This is a miscompile.
 define i8 @select_or_disjoint_eq(i8 %x, i8 %y) {
 ; CHECK-LABEL: @select_or_disjoint_eq(
-; CHECK-NEXT:    [[OR:%.*]] = or disjoint i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    ret i8 [[OR]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or disjoint i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[OR]]
+; CHECK-NEXT:    ret i8 [[SEL]]
 ;
   %cmp = icmp eq i8 %x, %y
   %or = or disjoint i8 %x, %y

From 89b0044ca9a6fb233f8d6dd16db6bd4acc3d3f61 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 12:14:57 +0100
Subject: [PATCH 26/72] [InstSimplify] Add test for implied cond with equal ops
 and constant (NFC)

---
 llvm/test/Transforms/InstSimplify/implies.ll | 26 ++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll
index 41f1995428473a..75044f4d9a356f 100644
--- a/llvm/test/Transforms/InstSimplify/implies.ll
+++ b/llvm/test/Transforms/InstSimplify/implies.ll
@@ -499,4 +499,30 @@ define i1 @lshr_value(i32 %length.i, i32 %i, i32 %v) {
   ret i1 %res
 }
 
+define i1 @same_ops_with_constant(i8 %x) {
+; CHECK-LABEL: @same_ops_with_constant(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 [[X:%.*]], 5
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 [[X]], 5
+; CHECK-NEXT:    [[RES:%.*]] = icmp ule i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %cmp1 = icmp sgt i8 %x, 5
+  %cmp2 = icmp ugt i8 %x, 5
+  %res = icmp ule i1 %cmp1, %cmp2
+  ret i1 %res
+}
+
+define i1 @same_ops_with_constant_wrong_sign(i8 %x) {
+; CHECK-LABEL: @same_ops_with_constant_wrong_sign(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 [[X:%.*]], -5
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 [[X]], -5
+; CHECK-NEXT:    [[RES:%.*]] = icmp ule i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %cmp1 = icmp sgt i8 %x, -5
+  %cmp2 = icmp ugt i8 %x, -5
+  %res = icmp ule i1 %cmp1, %cmp2
+  ret i1 %res
+}
+
 declare void @llvm.assume(i1)

From 460faa0c87f0a9496cdaf6c856aff1886e29afe3 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 12:15:58 +0100
Subject: [PATCH 27/72] [InstSimplify] Check common operand with constant
 earlier

If both icmps have the same operands and the RHS is constant, we
would currently go into the isImpliedCondMatchingOperands() code
path, instead of the isImpliedCondCommonOperandWithConstants()
path. Both are correct, but the latter can produce more accurate
results if the implication is dependent on the sign.
---
 llvm/lib/Analysis/ValueTracking.cpp          | 10 +++++-----
 llvm/test/Transforms/InstSimplify/implies.ll |  5 +----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 9cfe7315a7a4dc..d8a72c9f7b989d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8352,17 +8352,17 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
   CmpInst::Predicate LPred =
       LHSIsTrue ? LHS->getPredicate() : LHS->getInversePredicate();
 
-  // Can we infer anything when the two compares have matching operands?
-  bool AreSwappedOps;
-  if (areMatchingOperands(L0, L1, R0, R1, AreSwappedOps))
-    return isImpliedCondMatchingOperands(LPred, RPred, AreSwappedOps);
-
   // Can we infer anything when the 0-operands match and the 1-operands are
   // constants (not necessarily matching)?
   const APInt *LC, *RC;
   if (L0 == R0 && match(L1, m_APInt(LC)) && match(R1, m_APInt(RC)))
     return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
 
+  // Can we infer anything when the two compares have matching operands?
+  bool AreSwappedOps;
+  if (areMatchingOperands(L0, L1, R0, R1, AreSwappedOps))
+    return isImpliedCondMatchingOperands(LPred, RPred, AreSwappedOps);
+
   // L0 = R0 = L1 + R1, L0 >=u L1 implies R0 >=u R1, L0 <u L1 implies R0 <u R1
   if (ICmpInst::isUnsigned(LPred) && ICmpInst::isUnsigned(RPred)) {
     if (L0 == R1) {
diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll
index 75044f4d9a356f..d72dad95bfbd09 100644
--- a/llvm/test/Transforms/InstSimplify/implies.ll
+++ b/llvm/test/Transforms/InstSimplify/implies.ll
@@ -501,10 +501,7 @@ define i1 @lshr_value(i32 %length.i, i32 %i, i32 %v) {
 
 define i1 @same_ops_with_constant(i8 %x) {
 ; CHECK-LABEL: @same_ops_with_constant(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i8 [[X:%.*]], 5
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8 [[X]], 5
-; CHECK-NEXT:    [[RES:%.*]] = icmp ule i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[RES]]
+; CHECK-NEXT:    ret i1 true
 ;
   %cmp1 = icmp sgt i8 %x, 5
   %cmp2 = icmp ugt i8 %x, 5

From 9468de48fcd413aa0895a78bd6f1aeb161b39294 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <Ramkumar.Ramachandra@imgtec.com>
Date: Fri, 1 Dec 2023 11:29:19 +0000
Subject: [PATCH 28/72] TargetInstrInfo: make getOperandLatency return optional
 (NFC) (#73769)

getOperandLatency has the following behavior: it returns -1 as a special
value, negative numbers other than -1 on some target-specific overrides,
or a valid non-negative latency. This behavior can be surprising, as
some callers do arithmetic on these negative values. Change the
interface of getOperandLatency to return a std::optional<unsigned> to
prevent surprises in callers. While at it, change the interface of
getInstrLatency to return unsigned instead of int.

This change was inspired by a refactoring in
TargetSchedModel::computeOperandLatency.
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  18 +--
 llvm/include/llvm/MC/MCInstrItineraries.h     |  40 +++---
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |   9 +-
 llvm/lib/CodeGen/TargetInstrInfo.cpp          |  23 ++--
 llvm/lib/CodeGen/TargetSchedule.cpp           |  32 +++--
 llvm/lib/MC/MCDisassembler/Disassembler.cpp   |  11 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp      | 117 +++++++++---------
 llvm/lib/Target/ARM/ARMBaseInstrInfo.h        |  76 ++++++------
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp  |  14 +--
 llvm/lib/Target/Hexagon/HexagonInstrInfo.h    |   9 +-
 llvm/lib/Target/Hexagon/HexagonSubtarget.cpp  |  27 ++--
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp      |  23 ++--
 llvm/lib/Target/PowerPC/PPCInstrInfo.h        |  16 +--
 13 files changed, 209 insertions(+), 206 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 58355a32315b23..282fecc3ea81c0 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1706,9 +1706,9 @@ class TargetInstrInfo : public MCInstrInfo {
     return Opcode <= TargetOpcode::COPY;
   }
 
-  virtual int getOperandLatency(const InstrItineraryData *ItinData,
-                                SDNode *DefNode, unsigned DefIdx,
-                                SDNode *UseNode, unsigned UseIdx) const;
+  virtual std::optional<unsigned>
+  getOperandLatency(const InstrItineraryData *ItinData, SDNode *DefNode,
+                    unsigned DefIdx, SDNode *UseNode, unsigned UseIdx) const;
 
   /// Compute and return the use operand latency of a given pair of def and use.
   /// In most cases, the static scheduling itinerary was enough to determine the
@@ -1718,10 +1718,10 @@ class TargetInstrInfo : public MCInstrInfo {
   /// This is a raw interface to the itinerary that may be directly overridden
   /// by a target. Use computeOperandLatency to get the best estimate of
   /// latency.
-  virtual int getOperandLatency(const InstrItineraryData *ItinData,
-                                const MachineInstr &DefMI, unsigned DefIdx,
-                                const MachineInstr &UseMI,
-                                unsigned UseIdx) const;
+  virtual std::optional<unsigned>
+  getOperandLatency(const InstrItineraryData *ItinData,
+                    const MachineInstr &DefMI, unsigned DefIdx,
+                    const MachineInstr &UseMI, unsigned UseIdx) const;
 
   /// Compute the instruction latency of a given instruction.
   /// If the instruction has higher cost when predicated, it's returned via
@@ -1732,8 +1732,8 @@ class TargetInstrInfo : public MCInstrInfo {
 
   virtual unsigned getPredicationCost(const MachineInstr &MI) const;
 
-  virtual int getInstrLatency(const InstrItineraryData *ItinData,
-                              SDNode *Node) const;
+  virtual unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                                   SDNode *Node) const;
 
   /// Return the default expected latency for a def based on its opcode.
   unsigned defaultDefLatency(const MCSchedModel &SchedModel,
diff --git a/llvm/include/llvm/MC/MCInstrItineraries.h b/llvm/include/llvm/MC/MCInstrItineraries.h
index 652922feddc338..b17c41ce3aa4a1 100644
--- a/llvm/include/llvm/MC/MCInstrItineraries.h
+++ b/llvm/include/llvm/MC/MCInstrItineraries.h
@@ -17,6 +17,7 @@
 
 #include "llvm/MC/MCSchedule.h"
 #include <algorithm>
+#include <optional>
 
 namespace llvm {
 
@@ -162,18 +163,19 @@ class InstrItineraryData {
     return Latency;
   }
 
-  /// Return the cycle for the given class and operand.  Return -1 if no
-  /// cycle is specified for the operand.
-  int getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const {
+  /// Return the cycle for the given class and operand. Return std::nullopt if
+  /// the information is not available for the operand.
+  std::optional<unsigned> getOperandCycle(unsigned ItinClassIndx,
+                                          unsigned OperandIdx) const {
     if (isEmpty())
-      return -1;
+      return std::nullopt;
 
     unsigned FirstIdx = Itineraries[ItinClassIndx].FirstOperandCycle;
     unsigned LastIdx = Itineraries[ItinClassIndx].LastOperandCycle;
     if ((FirstIdx + OperandIdx) >= LastIdx)
-      return -1;
+      return std::nullopt;
 
-    return (int)OperandCycles[FirstIdx + OperandIdx];
+    return OperandCycles[FirstIdx + OperandIdx];
   }
 
   /// Return true if there is a pipeline forwarding between instructions
@@ -201,25 +203,27 @@ class InstrItineraryData {
 
   /// Compute and return the use operand latency of a given itinerary
   /// class and operand index if the value is produced by an instruction of the
-  /// specified itinerary class and def operand index.
-  int getOperandLatency(unsigned DefClass, unsigned DefIdx,
-                        unsigned UseClass, unsigned UseIdx) const {
+  /// specified itinerary class and def operand index. Return std::nullopt if
+  /// the information is not available for the operand.
+  std::optional<unsigned> getOperandLatency(unsigned DefClass, unsigned DefIdx,
+                                            unsigned UseClass,
+                                            unsigned UseIdx) const {
     if (isEmpty())
-      return -1;
+      return std::nullopt;
 
-    int DefCycle = getOperandCycle(DefClass, DefIdx);
-    if (DefCycle == -1)
-      return -1;
+    std::optional<unsigned> DefCycle = getOperandCycle(DefClass, DefIdx);
+    std::optional<unsigned> UseCycle = getOperandCycle(UseClass, UseIdx);
+    if (!DefCycle || !UseCycle)
+      return std::nullopt;
 
-    int UseCycle = getOperandCycle(UseClass, UseIdx);
-    if (UseCycle == -1)
-      return -1;
+    if (UseCycle > *DefCycle + 1)
+      return std::nullopt;
 
-    UseCycle = DefCycle - UseCycle + 1;
+    UseCycle = *DefCycle - *UseCycle + 1;
     if (UseCycle > 0 &&
         hasPipelineForwarding(DefClass, DefIdx, UseClass, UseIdx))
       // FIXME: This assumes one cycle benefit for every pipeline forwarding.
-      --UseCycle;
+      UseCycle = *UseCycle - 1;
     return UseCycle;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 0579c1664d5c9a..4d6d350c46f5af 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -659,7 +659,8 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
   if (Use->isMachineOpcode())
     // Adjust the use operand index by num of defs.
     OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
-  int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
+  std::optional<unsigned> Latency =
+      TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
   if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
       !BB->succ_empty()) {
     unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
@@ -667,10 +668,10 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
       // This copy is a liveout value. It is likely coalesced, so reduce the
       // latency so not to penalize the def.
       // FIXME: need target specific adjustment here?
-      Latency = Latency - 1;
+      Latency = *Latency - 1;
   }
-  if (Latency >= 0)
-    dep.setLatency(Latency);
+  if (Latency)
+    dep.setLatency(*Latency);
 }
 
 void ScheduleDAGSDNodes::dumpNode(const SUnit &SU) const {
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index ac056fea8c3794..fbb7c81fa1f86f 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1379,15 +1379,15 @@ bool TargetInstrInfo::getMemOperandWithOffset(
 //  SelectionDAG latency interface.
 //===----------------------------------------------------------------------===//
 
-int
+std::optional<unsigned>
 TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
                                    SDNode *DefNode, unsigned DefIdx,
                                    SDNode *UseNode, unsigned UseIdx) const {
   if (!ItinData || ItinData->isEmpty())
-    return -1;
+    return std::nullopt;
 
   if (!DefNode->isMachineOpcode())
-    return -1;
+    return std::nullopt;
 
   unsigned DefClass = get(DefNode->getMachineOpcode()).getSchedClass();
   if (!UseNode->isMachineOpcode())
@@ -1396,8 +1396,8 @@ TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
 }
 
-int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
-                                     SDNode *N) const {
+unsigned TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                          SDNode *N) const {
   if (!ItinData || ItinData->isEmpty())
     return 1;
 
@@ -1461,8 +1461,9 @@ bool TargetInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
     return false;
 
   unsigned DefClass = DefMI.getDesc().getSchedClass();
-  int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
-  return (DefCycle != -1 && DefCycle <= 1);
+  std::optional<unsigned> DefCycle =
+      ItinData->getOperandCycle(DefClass, DefIdx);
+  return DefCycle && *DefCycle <= 1;
 }
 
 bool TargetInstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
@@ -1580,11 +1581,9 @@ unsigned TargetInstrInfo::getCallFrameSizeAt(MachineInstr &MI) const {
 
 /// Both DefMI and UseMI must be valid.  By default, call directly to the
 /// itinerary. This may be overriden by the target.
-int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
-                                       const MachineInstr &DefMI,
-                                       unsigned DefIdx,
-                                       const MachineInstr &UseMI,
-                                       unsigned UseIdx) const {
+std::optional<unsigned> TargetInstrInfo::getOperandLatency(
+    const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+    unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const {
   unsigned DefClass = DefMI.getDesc().getSchedClass();
   unsigned UseClass = UseMI.getDesc().getSchedClass();
   return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp
index 3cedb38de2ad8d..a25d4ff78f4d96 100644
--- a/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -168,16 +168,20 @@ static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) {
   return UseIdx;
 }
 
-// Top-level API for clients that know the operand indices.
+// Top-level API for clients that know the operand indices. This doesn't need to
+// return std::optional<unsigned>, as it always returns a valid latency.
 unsigned TargetSchedModel::computeOperandLatency(
   const MachineInstr *DefMI, unsigned DefOperIdx,
   const MachineInstr *UseMI, unsigned UseOperIdx) const {
 
+  const unsigned InstrLatency = computeInstrLatency(DefMI);
+  const unsigned DefaultDefLatency = TII->defaultDefLatency(SchedModel, *DefMI);
+
   if (!hasInstrSchedModel() && !hasInstrItineraries())
-    return TII->defaultDefLatency(SchedModel, *DefMI);
+    return InstrLatency;
 
   if (hasInstrItineraries()) {
-    int OperLatency = 0;
+    std::optional<unsigned> OperLatency;
     if (UseMI) {
       OperLatency = TII->getOperandLatency(&InstrItins, *DefMI, DefOperIdx,
                                            *UseMI, UseOperIdx);
@@ -186,21 +190,13 @@ unsigned TargetSchedModel::computeOperandLatency(
       unsigned DefClass = DefMI->getDesc().getSchedClass();
       OperLatency = InstrItins.getOperandCycle(DefClass, DefOperIdx);
     }
-    if (OperLatency >= 0)
-      return OperLatency;
-
-    // No operand latency was found.
-    unsigned InstrLatency = TII->getInstrLatency(&InstrItins, *DefMI);
-
-    // Expected latency is the max of the stage latency and itinerary props.
-    // Rather than directly querying InstrItins stage latency, we call a TII
-    // hook to allow subtargets to specialize latency. This hook is only
-    // applicable to the InstrItins model. InstrSchedModel should model all
-    // special cases without TII hooks.
-    InstrLatency =
-        std::max(InstrLatency, TII->defaultDefLatency(SchedModel, *DefMI));
-    return InstrLatency;
+
+    // Expected latency is the max of InstrLatency and DefaultDefLatency, if we
+    // didn't find an operand latency.
+    return OperLatency ? *OperLatency
+                       : std::max(InstrLatency, DefaultDefLatency);
   }
+
   // hasInstrSchedModel()
   const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
   unsigned DefIdx = findDefIdx(DefMI, DefOperIdx);
@@ -237,7 +233,7 @@ unsigned TargetSchedModel::computeOperandLatency(
   // FIXME: Automatically giving all implicit defs defaultDefLatency is
   // undesirable. We should only do it for defs that are known to the MC
   // desc like flags. Truly implicit defs should get 1 cycle latency.
-  return DefMI->isTransient() ? 0 : TII->defaultDefLatency(SchedModel, *DefMI);
+  return DefMI->isTransient() ? 0 : DefaultDefLatency;
 }
 
 unsigned
diff --git a/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index 067b951fbfccb3..5e5a163c290244 100644
--- a/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -180,12 +180,13 @@ static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
   const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
   unsigned SCClass = Desc.getSchedClass();
 
-  int Latency = 0;
-  for (unsigned OpIdx = 0, OpIdxEnd = Inst.getNumOperands(); OpIdx != OpIdxEnd;
-       ++OpIdx)
-    Latency = std::max(Latency, IID.getOperandCycle(SCClass, OpIdx));
+  unsigned Latency = 0;
 
-  return Latency;
+  for (unsigned Idx = 0, IdxEnd = Inst.getNumOperands(); Idx != IdxEnd; ++Idx)
+    if (std::optional<unsigned> OperCycle = IID.getOperandCycle(SCClass, Idx))
+      Latency = std::max(Latency, *OperCycle);
+
+  return (int)Latency;
 }
 
 /// Gets latency information for \p Inst, based on \p DC information.
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index c09879fd9c2beb..94f34b12769660 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -3872,17 +3872,16 @@ unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
   llvm_unreachable("Didn't find the number of microops");
 }
 
-int
+std::optional<unsigned>
 ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
-                                  const MCInstrDesc &DefMCID,
-                                  unsigned DefClass,
+                                  const MCInstrDesc &DefMCID, unsigned DefClass,
                                   unsigned DefIdx, unsigned DefAlign) const {
   int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
   if (RegNo <= 0)
     // Def is the address writeback.
     return ItinData->getOperandCycle(DefClass, DefIdx);
 
-  int DefCycle;
+  unsigned DefCycle;
   if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
     // (regno / 2) + (regno % 2) + 1
     DefCycle = RegNo / 2 + 1;
@@ -3913,17 +3912,16 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
   return DefCycle;
 }
 
-int
+std::optional<unsigned>
 ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
-                                 const MCInstrDesc &DefMCID,
-                                 unsigned DefClass,
+                                 const MCInstrDesc &DefMCID, unsigned DefClass,
                                  unsigned DefIdx, unsigned DefAlign) const {
   int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
   if (RegNo <= 0)
     // Def is the address writeback.
     return ItinData->getOperandCycle(DefClass, DefIdx);
 
-  int DefCycle;
+  unsigned DefCycle;
   if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
     // 4 registers would be issued: 1, 2, 1.
     // 5 registers would be issued: 1, 2, 2.
@@ -3948,16 +3946,15 @@ ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
   return DefCycle;
 }
 
-int
+std::optional<unsigned>
 ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
-                                  const MCInstrDesc &UseMCID,
-                                  unsigned UseClass,
+                                  const MCInstrDesc &UseMCID, unsigned UseClass,
                                   unsigned UseIdx, unsigned UseAlign) const {
   int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
   if (RegNo <= 0)
     return ItinData->getOperandCycle(UseClass, UseIdx);
 
-  int UseCycle;
+  unsigned UseCycle;
   if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
     // (regno / 2) + (regno % 2) + 1
     UseCycle = RegNo / 2 + 1;
@@ -3988,16 +3985,15 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
   return UseCycle;
 }
 
-int
+std::optional<unsigned>
 ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
-                                 const MCInstrDesc &UseMCID,
-                                 unsigned UseClass,
+                                 const MCInstrDesc &UseMCID, unsigned UseClass,
                                  unsigned UseIdx, unsigned UseAlign) const {
   int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
   if (RegNo <= 0)
     return ItinData->getOperandCycle(UseClass, UseIdx);
 
-  int UseCycle;
+  unsigned UseCycle;
   if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
     UseCycle = RegNo / 2;
     if (UseCycle < 2)
@@ -4017,12 +4013,10 @@ ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
   return UseCycle;
 }
 
-int
-ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
-                                    const MCInstrDesc &DefMCID,
-                                    unsigned DefIdx, unsigned DefAlign,
-                                    const MCInstrDesc &UseMCID,
-                                    unsigned UseIdx, unsigned UseAlign) const {
+std::optional<unsigned> ARMBaseInstrInfo::getOperandLatency(
+    const InstrItineraryData *ItinData, const MCInstrDesc &DefMCID,
+    unsigned DefIdx, unsigned DefAlign, const MCInstrDesc &UseMCID,
+    unsigned UseIdx, unsigned UseAlign) const {
   unsigned DefClass = DefMCID.getSchedClass();
   unsigned UseClass = UseMCID.getSchedClass();
 
@@ -4032,7 +4026,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   // This may be a def / use of a variable_ops instruction, the operand
   // latency might be determinable dynamically. Let the target try to
   // figure it out.
-  int DefCycle = -1;
+  std::optional<unsigned> DefCycle;
   bool LdmBypass = false;
   switch (DefMCID.getOpcode()) {
   default:
@@ -4070,11 +4064,11 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     break;
   }
 
-  if (DefCycle == -1)
+  if (!DefCycle)
     // We can't seem to determine the result latency of the def, assume it's 2.
     DefCycle = 2;
 
-  int UseCycle = -1;
+  std::optional<unsigned> UseCycle;
   switch (UseMCID.getOpcode()) {
   default:
     UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
@@ -4108,21 +4102,24 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     break;
   }
 
-  if (UseCycle == -1)
+  if (!UseCycle)
     // Assume it's read in the first stage.
     UseCycle = 1;
 
-  UseCycle = DefCycle - UseCycle + 1;
+  if (UseCycle > *DefCycle + 1)
+    return std::nullopt;
+
+  UseCycle = *DefCycle - *UseCycle + 1;
   if (UseCycle > 0) {
     if (LdmBypass) {
       // It's a variable_ops instruction so we can't use DefIdx here. Just use
       // first def operand.
       if (ItinData->hasPipelineForwarding(DefClass, DefMCID.getNumOperands()-1,
                                           UseClass, UseIdx))
-        --UseCycle;
+        UseCycle = *UseCycle - 1;
     } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx,
                                                UseClass, UseIdx)) {
-      --UseCycle;
+      UseCycle = *UseCycle - 1;
     }
   }
 
@@ -4362,14 +4359,12 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
   return Adjust;
 }
 
-int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
-                                        const MachineInstr &DefMI,
-                                        unsigned DefIdx,
-                                        const MachineInstr &UseMI,
-                                        unsigned UseIdx) const {
+std::optional<unsigned> ARMBaseInstrInfo::getOperandLatency(
+    const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+    unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const {
   // No operand latency. The caller may fall back to getInstrLatency.
   if (!ItinData || ItinData->isEmpty())
-    return -1;
+    return std::nullopt;
 
   const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
   Register Reg = DefMO.getReg();
@@ -4390,7 +4385,7 @@ int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     ResolvedUseMI =
         getBundledUseMI(&getRegisterInfo(), UseMI, Reg, UseIdx, UseAdj);
     if (!ResolvedUseMI)
-      return -1;
+      return std::nullopt;
   }
 
   return getOperandLatencyImpl(
@@ -4398,7 +4393,7 @@ int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
       Reg, *ResolvedUseMI, UseIdx, ResolvedUseMI->getDesc(), UseAdj);
 }
 
-int ARMBaseInstrInfo::getOperandLatencyImpl(
+std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl(
     const InstrItineraryData *ItinData, const MachineInstr &DefMI,
     unsigned DefIdx, const MCInstrDesc &DefMCID, unsigned DefAdj,
     const MachineOperand &DefMO, unsigned Reg, const MachineInstr &UseMI,
@@ -4430,7 +4425,7 @@ int ARMBaseInstrInfo::getOperandLatencyImpl(
   }
 
   if (DefMO.isImplicit() || UseMI.getOperand(UseIdx).isImplicit())
-    return -1;
+    return std::nullopt;
 
   unsigned DefAlign = DefMI.hasOneMemOperand()
                           ? (*DefMI.memoperands_begin())->getAlign().value()
@@ -4440,25 +4435,25 @@ int ARMBaseInstrInfo::getOperandLatencyImpl(
                           : 0;
 
   // Get the itinerary's latency if possible, and handle variable_ops.
-  int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, UseMCID,
-                                  UseIdx, UseAlign);
+  std::optional<unsigned> Latency = getOperandLatency(
+      ItinData, DefMCID, DefIdx, DefAlign, UseMCID, UseIdx, UseAlign);
   // Unable to find operand latency. The caller may resort to getInstrLatency.
-  if (Latency < 0)
-    return Latency;
+  if (!Latency)
+    return std::nullopt;
 
   // Adjust for IT block position.
   int Adj = DefAdj + UseAdj;
 
   // Adjust for dynamic def-side opcode variants not captured by the itinerary.
   Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign);
-  if (Adj >= 0 || (int)Latency > -Adj) {
-    return Latency + Adj;
+  if (Adj >= 0 || (int)*Latency > -Adj) {
+    return *Latency + Adj;
   }
   // Return the itinerary latency, which may be zero but not less than zero.
   return Latency;
 }
 
-int
+std::optional<unsigned>
 ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
                                     SDNode *DefNode, unsigned DefIdx,
                                     SDNode *UseNode, unsigned UseIdx) const {
@@ -4474,10 +4469,11 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     return DefMCID.mayLoad() ? 3 : 1;
 
   if (!UseNode->isMachineOpcode()) {
-    int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
+    std::optional<unsigned> Latency =
+        ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
     int Adj = Subtarget.getPreISelOperandLatencyAdjustment();
     int Threshold = 1 + Adj;
-    return Latency <= Threshold ? 1 : Latency - Adj;
+    return !Latency || Latency <= Threshold ? 1 : *Latency - Adj;
   }
 
   const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
@@ -4489,8 +4485,10 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   unsigned UseAlign = !UseMN->memoperands_empty()
                           ? (*UseMN->memoperands_begin())->getAlign().value()
                           : 0;
-  int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
-                                  UseMCID, UseIdx, UseAlign);
+  std::optional<unsigned> Latency = getOperandLatency(
+      ItinData, DefMCID, DefIdx, DefAlign, UseMCID, UseIdx, UseAlign);
+  if (!Latency)
+    return std::nullopt;
 
   if (Latency > 1 &&
       (Subtarget.isCortexA8() || Subtarget.isLikeA9() ||
@@ -4506,7 +4504,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
       unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
       if (ShImm == 0 ||
           (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
-        --Latency;
+        Latency = *Latency - 1;
       break;
     }
     case ARM::t2LDRs:
@@ -4517,7 +4515,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
       unsigned ShAmt =
         cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
       if (ShAmt == 0 || ShAmt == 2)
-        --Latency;
+        Latency = *Latency - 1;
       break;
     }
     }
@@ -4534,9 +4532,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
       if (ShImm == 0 ||
           ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
            ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
-        Latency -= 2;
+        Latency = *Latency - 2;
       else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
-        --Latency;
+        Latency = *Latency - 1;
       break;
     }
     case ARM::t2LDRs:
@@ -4544,7 +4542,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     case ARM::t2LDRHs:
     case ARM::t2LDRSHs:
       // Thumb2 mode: lsl 0-3 only.
-      Latency -= 2;
+      Latency = *Latency - 2;
       break;
     }
   }
@@ -4710,7 +4708,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     case ARM::VLD4LNq32Pseudo_UPD:
       // If the address is not 64-bit aligned, the latencies of these
       // instructions increases by one.
-      ++Latency;
+      Latency = *Latency + 1;
       break;
     }
 
@@ -4787,8 +4785,8 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
   return Latency;
 }
 
-int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
-                                      SDNode *Node) const {
+unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           SDNode *Node) const {
   if (!Node->isMachineOpcode())
     return 1;
 
@@ -4836,8 +4834,9 @@ bool ARMBaseInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
   unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
   if (DDomain == ARMII::DomainGeneral) {
     unsigned DefClass = DefMI.getDesc().getSchedClass();
-    int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
-    return (DefCycle != -1 && DefCycle <= 2);
+    std::optional<unsigned> DefCycle =
+        ItinData->getOperandCycle(DefClass, DefIdx);
+    return DefCycle && *DefCycle <= 2U; // Unknown cycle is not low latency.
   }
   return false;
 }
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 5efcc1a0d9fc07..6aebf3b64e8d43 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -316,13 +316,15 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
   unsigned getNumMicroOps(const InstrItineraryData *ItinData,
                           const MachineInstr &MI) const override;
 
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        const MachineInstr &DefMI, unsigned DefIdx,
-                        const MachineInstr &UseMI,
-                        unsigned UseIdx) const override;
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        SDNode *DefNode, unsigned DefIdx,
-                        SDNode *UseNode, unsigned UseIdx) const override;
+  std::optional<unsigned> getOperandLatency(const InstrItineraryData *ItinData,
+                                            const MachineInstr &DefMI,
+                                            unsigned DefIdx,
+                                            const MachineInstr &UseMI,
+                                            unsigned UseIdx) const override;
+  std::optional<unsigned> getOperandLatency(const InstrItineraryData *ItinData,
+                                            SDNode *DefNode, unsigned DefIdx,
+                                            SDNode *UseNode,
+                                            unsigned UseIdx) const override;
 
   /// VFP/NEON execution domains.
   std::pair<uint16_t, uint16_t>
@@ -421,34 +423,34 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
 
   unsigned getInstBundleLength(const MachineInstr &MI) const;
 
-  int getVLDMDefCycle(const InstrItineraryData *ItinData,
-                      const MCInstrDesc &DefMCID,
-                      unsigned DefClass,
-                      unsigned DefIdx, unsigned DefAlign) const;
-  int getLDMDefCycle(const InstrItineraryData *ItinData,
-                     const MCInstrDesc &DefMCID,
-                     unsigned DefClass,
-                     unsigned DefIdx, unsigned DefAlign) const;
-  int getVSTMUseCycle(const InstrItineraryData *ItinData,
-                      const MCInstrDesc &UseMCID,
-                      unsigned UseClass,
-                      unsigned UseIdx, unsigned UseAlign) const;
-  int getSTMUseCycle(const InstrItineraryData *ItinData,
-                     const MCInstrDesc &UseMCID,
-                     unsigned UseClass,
-                     unsigned UseIdx, unsigned UseAlign) const;
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        const MCInstrDesc &DefMCID,
-                        unsigned DefIdx, unsigned DefAlign,
-                        const MCInstrDesc &UseMCID,
-                        unsigned UseIdx, unsigned UseAlign) const;
-
-  int getOperandLatencyImpl(const InstrItineraryData *ItinData,
-                            const MachineInstr &DefMI, unsigned DefIdx,
-                            const MCInstrDesc &DefMCID, unsigned DefAdj,
-                            const MachineOperand &DefMO, unsigned Reg,
-                            const MachineInstr &UseMI, unsigned UseIdx,
-                            const MCInstrDesc &UseMCID, unsigned UseAdj) const;
+  std::optional<unsigned> getVLDMDefCycle(const InstrItineraryData *ItinData,
+                                          const MCInstrDesc &DefMCID,
+                                          unsigned DefClass, unsigned DefIdx,
+                                          unsigned DefAlign) const;
+  std::optional<unsigned> getLDMDefCycle(const InstrItineraryData *ItinData,
+                                         const MCInstrDesc &DefMCID,
+                                         unsigned DefClass, unsigned DefIdx,
+                                         unsigned DefAlign) const;
+  std::optional<unsigned> getVSTMUseCycle(const InstrItineraryData *ItinData,
+                                          const MCInstrDesc &UseMCID,
+                                          unsigned UseClass, unsigned UseIdx,
+                                          unsigned UseAlign) const;
+  std::optional<unsigned> getSTMUseCycle(const InstrItineraryData *ItinData,
+                                         const MCInstrDesc &UseMCID,
+                                         unsigned UseClass, unsigned UseIdx,
+                                         unsigned UseAlign) const;
+  std::optional<unsigned> getOperandLatency(const InstrItineraryData *ItinData,
+                                            const MCInstrDesc &DefMCID,
+                                            unsigned DefIdx, unsigned DefAlign,
+                                            const MCInstrDesc &UseMCID,
+                                            unsigned UseIdx,
+                                            unsigned UseAlign) const;
+
+  std::optional<unsigned> getOperandLatencyImpl(
+      const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+      unsigned DefIdx, const MCInstrDesc &DefMCID, unsigned DefAdj,
+      const MachineOperand &DefMO, unsigned Reg, const MachineInstr &UseMI,
+      unsigned UseIdx, const MCInstrDesc &UseMCID, unsigned UseAdj) const;
 
   unsigned getPredicationCost(const MachineInstr &MI) const override;
 
@@ -456,8 +458,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
                            const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
 
-  int getInstrLatency(const InstrItineraryData *ItinData,
-                      SDNode *Node) const override;
+  unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                           SDNode *Node) const override;
 
   bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
                              const MachineRegisterInfo *MRI,
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 6f0210763bc5f3..1689b8f1e132d5 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -4295,11 +4295,9 @@ unsigned HexagonInstrInfo::getInstrTimingClassLatency(
 ///
 /// This is a raw interface to the itinerary that may be directly overriden by
 /// a target. Use computeOperandLatency to get the best estimate of latency.
-int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
-                                        const MachineInstr &DefMI,
-                                        unsigned DefIdx,
-                                        const MachineInstr &UseMI,
-                                        unsigned UseIdx) const {
+std::optional<unsigned> HexagonInstrInfo::getOperandLatency(
+    const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+    unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const {
   const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
 
   // Get DefIdx and UseIdx for super registers.
@@ -4328,9 +4326,9 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     }
   }
 
-  int Latency = TargetInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
-                                                   UseMI, UseIdx);
-  if (!Latency)
+  std::optional<unsigned> Latency = TargetInstrInfo::getOperandLatency(
+      ItinData, DefMI, DefIdx, UseMI, UseIdx);
+  if (Latency == 0)
     // We should never have 0 cycle latency between two instructions unless
     // they can be packetized together. However, this decision can't be made
     // here.
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 0bc0877f6e7067..645b57f4664df2 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -309,10 +309,11 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
   ///
   /// This is a raw interface to the itinerary that may be directly overriden by
   /// a target. Use computeOperandLatency to get the best estimate of latency.
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        const MachineInstr &DefMI, unsigned DefIdx,
-                        const MachineInstr &UseMI,
-                        unsigned UseIdx) const override;
+  std::optional<unsigned> getOperandLatency(const InstrItineraryData *ItinData,
+                                            const MachineInstr &DefMI,
+                                            unsigned DefIdx,
+                                            const MachineInstr &UseMI,
+                                            unsigned UseIdx) const override;
 
   /// Decompose the machine operand's target flags into two values - the direct
   /// target flag value and any of bit flags that are applied.
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 1c9c258df9475f..e1ad15bbc7c17a 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -467,7 +467,7 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
   // default.
   if ((DstInst->isRegSequence() || DstInst->isCopy())) {
     Register DReg = DstInst->getOperand(0).getReg();
-    int DLatency = -1;
+    std::optional<unsigned> DLatency;
     for (const auto &DDep : Dst->Succs) {
       MachineInstr *DDst = DDep.getSUnit()->getInstr();
       int UseIdx = -1;
@@ -482,21 +482,21 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
       if (UseIdx == -1)
         continue;
 
-      int Latency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst, 0,
-                                                 *DDst, UseIdx));
+      std::optional<unsigned> Latency =
+          InstrInfo.getOperandLatency(&InstrItins, *SrcInst, 0, *DDst, UseIdx);
+
       // Set DLatency for the first time.
-      DLatency = (DLatency == -1) ? Latency : DLatency;
+      if (!DLatency)
+        DLatency = Latency;
 
       // For multiple uses, if the Latency is different across uses, reset
       // DLatency.
       if (DLatency != Latency) {
-        DLatency = -1;
+        DLatency = std::nullopt;
         break;
       }
     }
-
-    DLatency = std::max(DLatency, 0);
-    Dep.setLatency((unsigned)DLatency);
+    Dep.setLatency(DLatency ? *DLatency : 0);
   }
 
   // Try to schedule uses near definitions to generate .cur.
@@ -581,15 +581,16 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
     for (unsigned OpNum = 0; OpNum < DstI->getNumOperands(); OpNum++) {
       const MachineOperand &MO = DstI->getOperand(OpNum);
       if (MO.isReg() && MO.isUse() && MO.getReg() == DepR) {
-        int Latency = (InstrInfo.getOperandLatency(&InstrItins, *SrcI,
-                                                   DefIdx, *DstI, OpNum));
+        std::optional<unsigned> Latency = InstrInfo.getOperandLatency(
+            &InstrItins, *SrcI, DefIdx, *DstI, OpNum);
 
         // For some instructions (ex: COPY), we might end up with < 0 latency
         // as they don't have any Itinerary class associated with them.
-        Latency = std::max(Latency, 0);
+        if (!Latency)
+          Latency = 0;
         bool IsArtificial = I.isArtificial();
-        Latency = updateLatency(*SrcI, *DstI, IsArtificial, Latency);
-        I.setLatency(Latency);
+        Latency = updateLatency(*SrcI, *DstI, IsArtificial, *Latency);
+        I.setLatency(*Latency);
       }
     }
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 6784049348b163..49d003db8ffc9a 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -155,22 +155,21 @@ unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
     if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
       continue;
 
-    int Cycle = ItinData->getOperandCycle(DefClass, i);
-    if (Cycle < 0)
+    std::optional<unsigned> Cycle = ItinData->getOperandCycle(DefClass, i);
+    if (!Cycle)
       continue;
 
-    Latency = std::max(Latency, (unsigned) Cycle);
+    Latency = std::max(Latency, *Cycle);
   }
 
   return Latency;
 }
 
-int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
-                                    const MachineInstr &DefMI, unsigned DefIdx,
-                                    const MachineInstr &UseMI,
-                                    unsigned UseIdx) const {
-  int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
-                                                   UseMI, UseIdx);
+std::optional<unsigned> PPCInstrInfo::getOperandLatency(
+    const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+    unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const {
+  std::optional<unsigned> Latency = PPCGenInstrInfo::getOperandLatency(
+      ItinData, DefMI, DefIdx, UseMI, UseIdx);
 
   if (!DefMI.getParent())
     return Latency;
@@ -190,7 +189,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   }
 
   if (UseMI.isBranch() && IsRegCR) {
-    if (Latency < 0)
+    if (!Latency)
       Latency = getInstrLatency(ItinData, DefMI);
 
     // On some cores, there is an additional delay between writing to a condition
@@ -210,8 +209,8 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     case PPC::DIR_PWR7:
     case PPC::DIR_PWR8:
     // FIXME: Is this needed for POWER9?
-      Latency += 2;
-      break;
+    Latency = *Latency + 2;
+    break;
     }
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 31e9859a41739a..a8dc7d6d0e37a2 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -294,13 +294,15 @@ class PPCInstrInfo : public PPCGenInstrInfo {
                            const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
 
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        const MachineInstr &DefMI, unsigned DefIdx,
-                        const MachineInstr &UseMI,
-                        unsigned UseIdx) const override;
-  int getOperandLatency(const InstrItineraryData *ItinData,
-                        SDNode *DefNode, unsigned DefIdx,
-                        SDNode *UseNode, unsigned UseIdx) const override {
+  std::optional<unsigned> getOperandLatency(const InstrItineraryData *ItinData,
+                                            const MachineInstr &DefMI,
+                                            unsigned DefIdx,
+                                            const MachineInstr &UseMI,
+                                            unsigned UseIdx) const override;
+  std::optional<unsigned> getOperandLatency(const InstrItineraryData *ItinData,
+                                            SDNode *DefNode, unsigned DefIdx,
+                                            SDNode *UseNode,
+                                            unsigned UseIdx) const override {
     return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx,
                                               UseNode, UseIdx);
   }

From 5a32014d82334c4c66c8cc7ae3ed2a489c07db07 Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" <zinenko@google.com>
Date: Fri, 1 Dec 2023 12:53:35 +0100
Subject: [PATCH 29/72] [mlir] update linalg transform ops docs

---
 .../Dialect/Linalg/TransformOps/LinalgTransformOps.td  | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index fb660c64612663..002926ff965fd1 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1910,16 +1910,20 @@ def TileUsingForallOp :
     #### Example using `num_threads`
 
     ```
-    %0 = pdl_match @match_matmul in %arg1
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+       : (!transform.any_op) -> !transform.any_op
     %3:2 = transform.structured.tile_using_forall %0 num_threads [10, 20]
+       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     ```
 
     #### Example using `tile_sizes`
 
     ```
-    %0 = pdl_match @match_matmul in %arg1
-    %sz = pdl_match @match_size_op in %arg1
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+       : (!transform.any_op) -> !transform.any_op
+    %sz = transform.structured.match ...
     %3:2 = transform.structured.tile_using_forall %0 tile_sizes [0, %sz, 20]
+       : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
     ```
   }];
 

From 69020827cf611170d0bc80879114a2427aa39960 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg@gmail.com>
Date: Fri, 1 Dec 2023 17:27:19 +0530
Subject: [PATCH 30/72] [NFC] Remove a space in CMake.rst

The rendered document is not correctly indented because of this space.
---
 llvm/docs/CMake.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 4b86eb9c01d26b..7dd3fd26022e5c 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -389,7 +389,7 @@ enabled sub-projects. Nearly all of these variable names begin with
   will limit code coverage summaries to just the listed directories. If unset,
   coverage reports will include all sources identified by the tooling.
 
- **LLVM_INDIVIDUAL_TEST_COVERAGE**: BOOL
+**LLVM_INDIVIDUAL_TEST_COVERAGE**:BOOL
   Enable individual test case coverage. When set to ON, code coverage data for
   each test case will be generated and stored in a separate directory under the
   config.test_exec_root path. This feature allows code coverage analysis of each

From 8727982bdfb84ce4adbd138c146a6b7ecaf98fdb Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Fri, 1 Dec 2023 12:00:18 +0000
Subject: [PATCH 31/72] [Driver] Add exclusive-group feature to multilib.yaml.
 (#69447)

This allows a YAML-based multilib configuration to specify explicitly
that a subset of its library directories are alternatives to each
other, i.e. at most one of that subset should be selected.

So if you have multiple sysroots each including a full set of headers
and libraries, you can mark them as members of the same mutually
exclusive group, and then you'll be sure that only one of them is
selected, even if two or more are compatible with the compile options.

This is particularly important in multilib setups including the libc++
headers, where selecting the include directories from two different
sysroots can cause an actual build failure. This occurs when including
<stdio.h>, for example: libc++'s stdio.h is included first, and will
try to use `#include_next` to fetch the underlying libc's version. But
if there are two include directories from separate multilibs, then
both of their C++ include directories will end up on the include path
first, followed by both the C directories. So the `#include_next` from
the first libc++ stdio.h will include the second libc++ stdio.h, which
will do nothing because it has the same include guard macro, and the
libc header won't ever be included at all.

If more than one of the options in an exclusive group matches the
given flags, the last one wins.

The syntax for specifying this in multilib.yaml is to define a Groups
section in which you specify your group names, and for each one,
declare it to have Type: Exclusive. (This reserves space in the syntax
for maybe adding other group types later, such as a group of mutually
_dependent_ things that you must have all or none of.) Then each
Variant record that's a member of a group has a Group: property giving
that group's name.
---
 clang/include/clang/Driver/Multilib.h         |  16 ++-
 clang/lib/Driver/Multilib.cpp                 | 108 ++++++++++++++++--
 .../baremetal-multilib-exclusive-group.yaml   |  79 +++++++++++++
 .../baremetal-multilib-group-error.yaml       |  27 +++++
 4 files changed, 218 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/Driver/baremetal-multilib-exclusive-group.yaml
 create mode 100644 clang/test/Driver/baremetal-multilib-group-error.yaml

diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h
index 1416559414f894..6a9533e6dd831f 100644
--- a/clang/include/clang/Driver/Multilib.h
+++ b/clang/include/clang/Driver/Multilib.h
@@ -39,13 +39,22 @@ class Multilib {
   std::string IncludeSuffix;
   flags_list Flags;
 
+  // Optionally, a multilib can be assigned a string tag indicating that it's
+  // part of a group of mutually exclusive possibilities. If two or more
+  // multilibs have the same non-empty value of ExclusiveGroup, then only the
+  // last matching one of them will be selected.
+  //
+  // Setting this to the empty string is a special case, indicating that the
+  // directory is not mutually exclusive with anything else.
+  std::string ExclusiveGroup;
+
 public:
   /// GCCSuffix, OSSuffix & IncludeSuffix will be appended directly to the
   /// sysroot string so they must either be empty or begin with a '/' character.
   /// This is enforced with an assert in the constructor.
   Multilib(StringRef GCCSuffix = {}, StringRef OSSuffix = {},
-           StringRef IncludeSuffix = {},
-           const flags_list &Flags = flags_list());
+           StringRef IncludeSuffix = {}, const flags_list &Flags = flags_list(),
+           StringRef ExclusiveGroup = {});
 
   /// Get the detected GCC installation path suffix for the multi-arch
   /// target variant. Always starts with a '/', unless empty
@@ -63,6 +72,9 @@ class Multilib {
   /// All elements begin with either '-' or '!'
   const flags_list &flags() const { return Flags; }
 
+  /// Get the exclusive group label.
+  const std::string &exclusiveGroup() const { return ExclusiveGroup; }
+
   LLVM_DUMP_METHOD void dump() const;
   /// print summary of the Multilib
   void print(raw_ostream &OS) const;
diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp
index 48a494d9fa38db..7681c1a3ce6756 100644
--- a/clang/lib/Driver/Multilib.cpp
+++ b/clang/lib/Driver/Multilib.cpp
@@ -9,6 +9,7 @@
 #include "clang/Driver/Multilib.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/Version.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
@@ -29,9 +30,10 @@ using namespace driver;
 using namespace llvm::sys;
 
 Multilib::Multilib(StringRef GCCSuffix, StringRef OSSuffix,
-                   StringRef IncludeSuffix, const flags_list &Flags)
+                   StringRef IncludeSuffix, const flags_list &Flags,
+                   StringRef ExclusiveGroup)
     : GCCSuffix(GCCSuffix), OSSuffix(OSSuffix), IncludeSuffix(IncludeSuffix),
-      Flags(Flags) {
+      Flags(Flags), ExclusiveGroup(ExclusiveGroup) {
   assert(GCCSuffix.empty() ||
          (StringRef(GCCSuffix).front() == '/' && GCCSuffix.size() > 1));
   assert(OSSuffix.empty() ||
@@ -96,13 +98,37 @@ bool MultilibSet::select(const Multilib::flags_list &Flags,
                          llvm::SmallVector<Multilib> &Selected) const {
   llvm::StringSet<> FlagSet(expandFlags(Flags));
   Selected.clear();
-  llvm::copy_if(Multilibs, std::back_inserter(Selected),
-                [&FlagSet](const Multilib &M) {
-                  for (const std::string &F : M.flags())
-                    if (!FlagSet.contains(F))
-                      return false;
-                  return true;
-                });
+
+  // Decide which multilibs we're going to select at all.
+  llvm::DenseSet<StringRef> ExclusiveGroupsSelected;
+  for (const Multilib &M : llvm::reverse(Multilibs)) {
+    // If this multilib doesn't match all our flags, don't select it.
+    if (!llvm::all_of(M.flags(), [&FlagSet](const std::string &F) {
+          return FlagSet.contains(F);
+        }))
+      continue;
+
+    const std::string &group = M.exclusiveGroup();
+    if (!group.empty()) {
+      // If this multilib has the same ExclusiveGroup as one we've already
+      // selected, skip it. We're iterating in reverse order, so the group
+      // member we've selected already is preferred.
+      //
+      // Otherwise, add the group name to the set of groups we've already
+      // selected a member of.
+      auto [It, Inserted] = ExclusiveGroupsSelected.insert(group);
+      if (!Inserted)
+        continue;
+    }
+
+    // Select this multilib.
+    Selected.push_back(M);
+  }
+
+  // We iterated in reverse order, so now put Selected back the right way
+  // round.
+  std::reverse(Selected.begin(), Selected.end());
+
   return !Selected.empty();
 }
 
@@ -138,10 +164,39 @@ static const VersionTuple MultilibVersionCurrent(1, 0);
 struct MultilibSerialization {
   std::string Dir;
   std::vector<std::string> Flags;
+  std::string Group;
+};
+
+enum class MultilibGroupType {
+  /*
+   * The only group type currently supported is 'Exclusive', which indicates a
+   * group of multilibs of which at most one may be selected.
+   */
+  Exclusive,
+
+  /*
+   * Future possibility: a second group type indicating a set of library
+   * directories that are mutually _dependent_ rather than mutually exclusive:
+   * if you include one you must include them all.
+   *
+   * It might also be useful to allow groups to be members of other groups, so
+   * that a mutually exclusive group could contain a mutually dependent set of
+   * library directories, or vice versa.
+   *
+   * These additional features would need changes in the implementation, but
+   * the YAML schema is set up so they can be added without requiring changes
+   * in existing users' multilib.yaml files.
+   */
+};
+
+struct MultilibGroupSerialization {
+  std::string Name;
+  MultilibGroupType Type;
 };
 
 struct MultilibSetSerialization {
   llvm::VersionTuple MultilibVersion;
+  std::vector<MultilibGroupSerialization> Groups;
   std::vector<MultilibSerialization> Multilibs;
   std::vector<MultilibSet::FlagMatcher> FlagMatchers;
 };
@@ -152,6 +207,7 @@ template <> struct llvm::yaml::MappingTraits<MultilibSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSerialization &V) {
     io.mapRequired("Dir", V.Dir);
     io.mapRequired("Flags", V.Flags);
+    io.mapOptional("Group", V.Group);
   }
   static std::string validate(IO &io, MultilibSerialization &V) {
     if (StringRef(V.Dir).starts_with("/"))
@@ -160,6 +216,19 @@ template <> struct llvm::yaml::MappingTraits<MultilibSerialization> {
   }
 };
 
+template <> struct llvm::yaml::ScalarEnumerationTraits<MultilibGroupType> {
+  static void enumeration(IO &io, MultilibGroupType &Val) {
+    io.enumCase(Val, "Exclusive", MultilibGroupType::Exclusive);
+  }
+};
+
+template <> struct llvm::yaml::MappingTraits<MultilibGroupSerialization> {
+  static void mapping(llvm::yaml::IO &io, MultilibGroupSerialization &V) {
+    io.mapRequired("Name", V.Name);
+    io.mapRequired("Type", V.Type);
+  }
+};
+
 template <> struct llvm::yaml::MappingTraits<MultilibSet::FlagMatcher> {
   static void mapping(llvm::yaml::IO &io, MultilibSet::FlagMatcher &M) {
     io.mapRequired("Match", M.Match);
@@ -180,6 +249,7 @@ template <> struct llvm::yaml::MappingTraits<MultilibSetSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSetSerialization &M) {
     io.mapRequired("MultilibVersion", M.MultilibVersion);
     io.mapRequired("Variants", M.Multilibs);
+    io.mapOptional("Groups", M.Groups);
     io.mapOptional("Mappings", M.FlagMatchers);
   }
   static std::string validate(IO &io, MultilibSetSerialization &M) {
@@ -191,11 +261,25 @@ template <> struct llvm::yaml::MappingTraits<MultilibSetSerialization> {
     if (M.MultilibVersion.getMinor() > MultilibVersionCurrent.getMinor())
       return "multilib version " + M.MultilibVersion.getAsString() +
              " is unsupported";
+    for (const MultilibSerialization &Lib : M.Multilibs) {
+      if (!Lib.Group.empty()) {
+        bool Found = false;
+        for (const MultilibGroupSerialization &Group : M.Groups)
+          if (Group.Name == Lib.Group) {
+            Found = true;
+            break;
+          }
+        if (!Found)
+          return "multilib \"" + Lib.Dir +
+                 "\" specifies undefined group name \"" + Lib.Group + "\"";
+      }
+    }
     return std::string{};
   }
 };
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization)
+LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization)
 LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher)
 
 llvm::ErrorOr<MultilibSet>
@@ -214,7 +298,11 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input,
     std::string Dir;
     if (M.Dir != ".")
       Dir = "/" + M.Dir;
-    Multilibs.emplace_back(Dir, Dir, Dir, M.Flags);
+    // We transfer M.Group straight into the ExclusiveGroup parameter for the
+    // Multilib constructor. If we later support more than one type of group,
+    // we'll have to look up the group name in MS.Groups, check its type, and
+    // decide what to do here.
+    Multilibs.emplace_back(Dir, Dir, Dir, M.Flags, M.Group);
   }
 
   return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers));
diff --git a/clang/test/Driver/baremetal-multilib-exclusive-group.yaml b/clang/test/Driver/baremetal-multilib-exclusive-group.yaml
new file mode 100644
index 00000000000000..a98549efea4f0a
--- /dev/null
+++ b/clang/test/Driver/baremetal-multilib-exclusive-group.yaml
@@ -0,0 +1,79 @@
+# UNSUPPORTED: system-windows
+
+# RUN: rm -rf %t
+
+# RUN: mkdir -p %t/baremetal_multilib/bin
+# RUN: ln -s %clang %t/baremetal_multilib/bin/clang
+
+# RUN: mkdir -p %t/baremetal_multilib/lib/clang-runtimes
+# RUN: ln -s %s %t/baremetal_multilib/lib/clang-runtimes/multilib.yaml
+
+# RUN: %t/baremetal_multilib/bin/clang -no-canonical-prefixes -x c++ %s -### -o %t.out --target=thumbv7em-none-unknown-eabi --sysroot= 2>%t.err
+
+# RUN: FileCheck -DSYSROOT=%t/baremetal_multilib %s < %t.err --check-prefix=POS
+# RUN: FileCheck -DSYSROOT=%t/baremetal_multilib %s < %t.err --check-prefix=NEG
+
+# Expected results:
+#
+# Due to the Mappings section, all seven of these library directories should
+# match the command-line flag --target=thumbv7em-none-unknown-eabi.
+#
+# The two "non_exclusive" directories, which don't have an ExclusiveGroup at
+# all, should both be selected. So should the two "own_group", each of which
+# specifies a different value of ExclusiveGroup. But the three "exclusive",
+# which have the _same_ ExclusiveGroup value, should not: the third one wins.
+# So we expect five of these seven directories to show up in the clang-cc1
+# command line, but not testdir1_exclusive or testdir2_exclusive.
+
+# POS-DAG: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/testdir1_non_exclusive/include/c++/v1"
+# POS-DAG: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/testdir2_non_exclusive/include/c++/v1"
+# POS-DAG: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/testdir3_exclusive/include/c++/v1"
+# POS-DAG: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/testdir1_own_group/include/c++/v1"
+# POS-DAG: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/testdir2_own_group/include/c++/v1"
+
+# NEG-NOT: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/testdir1_exclusive/include/c++/v1"
+# NEG-NOT: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/testdir2_exclusive/include/c++/v1"
+
+---
+MultilibVersion: 1.0
+
+Groups:
+- Name: actually_exclude_something
+  Type: Exclusive
+
+- Name: foo
+  Type: Exclusive
+
+- Name: bar
+  Type: Exclusive
+
+Variants:
+- Dir: testdir1_non_exclusive
+  Flags: [--target=thumbv7m-none-unknown-eabi]
+
+- Dir: testdir2_non_exclusive
+  Flags: [--target=thumbv7em-none-unknown-eabi]
+
+- Dir: testdir1_exclusive
+  Flags: [--target=thumbv7m-none-unknown-eabi]
+  Group: actually_exclude_something
+
+- Dir: testdir2_exclusive
+  Flags: [--target=thumbv7em-none-unknown-eabi]
+  Group: actually_exclude_something
+
+- Dir: testdir3_exclusive
+  Flags: [--target=thumbv7em-none-unknown-eabi]
+  Group: actually_exclude_something
+
+- Dir: testdir1_own_group
+  Flags: [--target=thumbv7m-none-unknown-eabi]
+  Group: foo
+
+- Dir: testdir2_own_group
+  Flags: [--target=thumbv7em-none-unknown-eabi]
+  Group: bar
+
+Mappings:
+- Match: --target=thumbv7em-none-unknown-eabi
+  Flags: [--target=thumbv7m-none-unknown-eabi]
diff --git a/clang/test/Driver/baremetal-multilib-group-error.yaml b/clang/test/Driver/baremetal-multilib-group-error.yaml
new file mode 100644
index 00000000000000..1e8f83fa50d244
--- /dev/null
+++ b/clang/test/Driver/baremetal-multilib-group-error.yaml
@@ -0,0 +1,27 @@
+# UNSUPPORTED: system-windows
+
+# RUN: rm -rf %t
+
+# RUN: mkdir -p %t/baremetal_multilib/bin
+# RUN: ln -s %clang %t/baremetal_multilib/bin/clang
+
+# RUN: mkdir -p %t/baremetal_multilib/lib/clang-runtimes
+# RUN: ln -s %s %t/baremetal_multilib/lib/clang-runtimes/multilib.yaml
+
+# RUN: %t/baremetal_multilib/bin/clang -no-canonical-prefixes -x c++ %s -### -o %t.out --target=thumbv7em-none-unknown-eabi --sysroot= 2>%t.err
+# RUN: FileCheck %s < %t.err
+
+---
+MultilibVersion: 1.0
+
+Groups:
+- Name: group1
+  Type: Nonsense
+
+Variants:
+- Dir: testdir1
+  Flags: [--target=thumbv7m-none-unknown-eabi]
+  Group: nonexistent_group_name
+
+# CHECK: error: unknown enumerated scalar
+# CHECK: error: multilib "testdir1" specifies undefined group name "nonexistent_group_name"

From a2e8207178432f0af30e8c9e3b905a3fd770d500 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Thu, 30 Nov 2023 16:50:57 +0000
Subject: [PATCH 32/72] [NFC][LLVMContext] Clean up DenseMapInfo classes used
 for APInt & APFloat.

DenseMapAPIntKeyInfo looks like a redundant definition because it
mirrors the default used by DenseMap when not specified.

Replacing DenseMapAPFloatKeyInfo with a specialisation of
DenseMapInfo allows DenseMap<T> to be more easily used when T is
an aggregate type containing an APFloat.
---
 llvm/lib/IR/LLVMContextImpl.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index b55107beba556c..6a20291344989d 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -73,9 +73,7 @@ class StringRef;
 class TypedPointerType;
 class ValueHandleBase;
 
-using DenseMapAPIntKeyInfo = DenseMapInfo<APInt>;
-
-struct DenseMapAPFloatKeyInfo {
+template <> struct DenseMapInfo<APFloat> {
   static inline APFloat getEmptyKey() { return APFloat(APFloat::Bogus(), 1); }
   static inline APFloat getTombstoneKey() {
     return APFloat(APFloat::Bogus(), 2);
@@ -1489,11 +1487,9 @@ class LLVMContextImpl {
 
   DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntZeroConstants;
   DenseMap<unsigned, std::unique_ptr<ConstantInt>> IntOneConstants;
-  DenseMap<APInt, std::unique_ptr<ConstantInt>, DenseMapAPIntKeyInfo>
-      IntConstants;
+  DenseMap<APInt, std::unique_ptr<ConstantInt>> IntConstants;
 
-  DenseMap<APFloat, std::unique_ptr<ConstantFP>, DenseMapAPFloatKeyInfo>
-      FPConstants;
+  DenseMap<APFloat, std::unique_ptr<ConstantFP>> FPConstants;
 
   FoldingSet<AttributeImpl> AttrsSet;
   FoldingSet<AttributeListImpl> AttrsLists;

From 85184b4aefbd01afd6e7be57bc6c1c404b3c13ce Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski@amd.com>
Date: Fri, 1 Dec 2023 13:33:11 +0100
Subject: [PATCH 33/72] [OpenMP] Fix libomptarget build issue (#74067)

Libomptarget cannot be built because of the recent refactoring
introduced in patch 148dec9fa43b:
[OpenMP][NFC] Separate Envar (environment variable) handling (#73994)

That patch moved the handling of environment variables out of the
libomptarget library, so the "llvm::omp::target" namespace is no longer
needed in code that handles environment variables.
---
 openmp/libomptarget/src/interface.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index a2f713459e1d0c..ee1bd4932442b1 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -454,7 +454,6 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
   // for the device operations (work/spin wait on them) or block until they are
   // completed (use device side blocking mechanism). This allows the runtime to
   // adapt itself when there are a lot of long-running target regions in-flight.
-  using namespace llvm::omp::target;
   static thread_local utils::ExponentialBackoff QueryCounter(
       Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10),
       Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5),

From 808b7d220309e279cf9c3d5762cb4c9120c0955f Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Fri, 1 Dec 2023 13:55:31 +0100
Subject: [PATCH 34/72] [libc][NFC] rename LONG_DOUBLE_IS_DOUBLE into
 LIBC_LONG_DOUBLE_IS_FLOAT64 (#73948)

---
 libc/src/__support/FPUtil/generic/sqrt.h      |  2 +-
 libc/src/__support/FPUtil/x86_64/sqrt.h       |  2 +-
 libc/src/__support/float_to_string.h          |  4 +-
 libc/src/__support/macros/properties/float.h  |  8 ++--
 libc/src/__support/str_to_float.h             |  4 +-
 .../test/src/__support/FPUtil/fpbits_test.cpp |  2 +-
 libc/test/src/__support/str_to_float_test.cpp |  2 +-
 libc/test/src/stdio/sprintf_test.cpp          | 44 +++++++++----------
 libc/test/src/stdio/sscanf_test.cpp           |  2 +-
 libc/test/src/stdlib/strtold_test.cpp         |  4 +-
 10 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h
index 63e8329b066074..9c7e6a2f361c67 100644
--- a/libc/src/__support/FPUtil/generic/sqrt.h
+++ b/libc/src/__support/FPUtil/generic/sqrt.h
@@ -43,7 +43,7 @@ LIBC_INLINE void normalize(int &exponent,
   mantissa <<= shift;
 }
 
-#ifdef LONG_DOUBLE_IS_DOUBLE
+#ifdef LIBC_LONG_DOUBLE_IS_FLOAT64
 template <>
 LIBC_INLINE void normalize<long double>(int &exponent, uint64_t &mantissa) {
   normalize<double>(exponent, mantissa);
diff --git a/libc/src/__support/FPUtil/x86_64/sqrt.h b/libc/src/__support/FPUtil/x86_64/sqrt.h
index 7edba5528d6a91..cf3eb9b2f494cd 100644
--- a/libc/src/__support/FPUtil/x86_64/sqrt.h
+++ b/libc/src/__support/FPUtil/x86_64/sqrt.h
@@ -33,7 +33,7 @@ template <> LIBC_INLINE double sqrt<double>(double x) {
   return result;
 }
 
-#ifdef LONG_DOUBLE_IS_DOUBLE
+#ifdef LIBC_LONG_DOUBLE_IS_FLOAT64
 template <> LIBC_INLINE long double sqrt<long double>(long double x) {
   long double result;
   __asm__ __volatile__("sqrtsd %x1, %x0" : "=x"(result) : "x"(x));
diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h
index eb06cd9c08af28..1bb4e5c5b9246a 100644
--- a/libc/src/__support/float_to_string.h
+++ b/libc/src/__support/float_to_string.h
@@ -602,7 +602,7 @@ class FloatToString {
   }
 };
 
-#ifndef LONG_DOUBLE_IS_DOUBLE
+#ifndef LIBC_LONG_DOUBLE_IS_FLOAT64
 // --------------------------- LONG DOUBLE FUNCTIONS ---------------------------
 
 template <>
@@ -754,7 +754,7 @@ FloatToString<long double>::get_negative_block(int block_index) {
   }
 }
 
-#endif // LONG_DOUBLE_IS_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_FLOAT64
 
 } // namespace LIBC_NAMESPACE
 
diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h
index 4bafc3777a4714..f1679fe5111369 100644
--- a/libc/src/__support/macros/properties/float.h
+++ b/libc/src/__support/macros/properties/float.h
@@ -19,11 +19,9 @@
 #include <float.h> // LDBL_MANT_DIG
 
 // 'long double' properties.
-#if (LDBL_MANT_DIG == DBL_MANT_DIG)
-// TODO: Replace with LIBC_LONG_DOUBLE_IS_DOUBLE
-#define LONG_DOUBLE_IS_DOUBLE
-#endif
-#if (LDBL_MANT_DIG == 64)
+#if (LDBL_MANT_DIG == 53)
+#define LIBC_LONG_DOUBLE_IS_FLOAT64
+#elif (LDBL_MANT_DIG == 64)
 // TODO: Replace with LIBC_LONG_DOUBLE_IS_X86_BIN80
 #define SPECIAL_X86_LONG_DOUBLE
 #elif (LDBL_MANT_DIG == 113)
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index 81ab36dbf9471f..a9232573041426 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -221,7 +221,7 @@ eisel_lemire(ExpandedFloat<T> init_num,
   return output;
 }
 
-#if !defined(LONG_DOUBLE_IS_DOUBLE)
+#if !defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
 template <>
 LIBC_INLINE cpp::optional<ExpandedFloat<long double>>
 eisel_lemire<long double>(ExpandedFloat<long double> init_num,
@@ -516,7 +516,7 @@ template <> class ClingerConsts<double> {
   static constexpr double MAX_EXACT_INT = 9007199254740991.0;
 };
 
-#if defined(LONG_DOUBLE_IS_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
 template <> class ClingerConsts<long double> {
 public:
   static constexpr long double POWERS_OF_TEN_ARRAY[] = {
diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp
index 027db8807ab226..52635cc2af0940 100644
--- a/libc/test/src/__support/FPUtil/fpbits_test.cpp
+++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp
@@ -213,7 +213,7 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) {
 }
 #else
 TEST(LlvmLibcFPBitsTest, LongDoubleType) {
-#if defined(LONG_DOUBLE_IS_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   return; // The tests for the "double" type cover for this case.
 #else
   using LongDoubleBits = FPBits<long double>;
diff --git a/libc/test/src/__support/str_to_float_test.cpp b/libc/test/src/__support/str_to_float_test.cpp
index ae729418ebe363..c2643d9a764ee6 100644
--- a/libc/test/src/__support/str_to_float_test.cpp
+++ b/libc/test/src/__support/str_to_float_test.cpp
@@ -279,7 +279,7 @@ TEST(LlvmLibcStrToFloatTest, SimpleDecimalConversionExtraTypes) {
   EXPECT_EQ(double_result.error, 0);
 }
 
-#if defined(LONG_DOUBLE_IS_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
 TEST_F(LlvmLibcStrToFloatTest, EiselLemireFloat64AsLongDouble) {
   eisel_lemire_test<long double>(123, 0, 0x1EC00000000000, 1029);
 }
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index e41579a20656b7..e2265f5efbc465 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -644,7 +644,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
 #if defined(SPECIAL_X86_LONG_DOUBLE)
   ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
-#elif defined(LONG_DOUBLE_IS_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
 #else // 128 bit long double
   ASSERT_STREQ_LEN(written, buff, "0x1.999999999999999999999999999ap-4");
@@ -653,7 +653,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
 #if defined(SPECIAL_X86_LONG_DOUBLE)
   ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
-#elif defined(LONG_DOUBLE_IS_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "inf");
 #else // 128 bit long double
   ASSERT_STREQ_LEN(written, buff, "0x1.e71b63f3ba7b580af1a52d2a7379p+3321");
@@ -662,7 +662,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
 #if defined(SPECIAL_X86_LONG_DOUBLE)
   ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
-#elif defined(LONG_DOUBLE_IS_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "0x0p+0");
 #else // 128 bit long double
   ASSERT_STREQ_LEN(written, buff, "0x1.0d152311513c28ce202627c06ec2p-3322");
@@ -768,7 +768,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
 #if defined(SPECIAL_X86_LONG_DOUBLE)
   ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
-#elif defined(LONG_DOUBLE_IS_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
 #else // 128 bit long double
   ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -777,7 +777,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0xf.fffffffffffffffp16380L);
 #if defined(SPECIAL_X86_LONG_DOUBLE)
   ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
-#elif defined(LONG_DOUBLE_IS_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "inf");
 #else // 128 bit long double
   ASSERT_STREQ_LEN(written, buff, "0x2.0p+16383");
@@ -1024,14 +1024,14 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
 
 // Some float128 systems (specifically the ones used for aarch64 buildbots)
 // don't respect signs for long double NaNs.
-#if defined(SPECIAL_X86_LONG_DOUBLE) || defined(LONG_DOUBLE_IS_DOUBLE)
+#if defined(SPECIAL_X86_LONG_DOUBLE) || defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   written = LIBC_NAMESPACE::sprintf(buff, "%LF", -ld_nan);
   ASSERT_STREQ_LEN(written, buff, "-NAN");
 #endif
 
   // Length Modifier Tests.
 
-  // TODO(michaelrj): Add tests for LONG_DOUBLE_IS_DOUBLE and 128 bit long
+  // TODO(michaelrj): Add tests for LIBC_LONG_DOUBLE_IS_FLOAT64 and 128 bit long
   // double systems.
   // TODO(michaelrj): Fix the tests to only depend on the digits the long double
   // is accurate for.
@@ -1333,7 +1333,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999999999999999999ap-4");
@@ -1342,7 +1342,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.e71b63f3ba7b580af1a52d2a7379p+3321");
@@ -1351,7 +1351,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x0p+0");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.0d152311513c28ce202627c06ec2p-3322");
@@ -1550,7 +1550,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -1559,7 +1559,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La",
   0xf.fffffffffffffffp16380L); #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x2.0p+16383");
@@ -1977,7 +1977,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999999999999999999ap-4");
@@ -1986,7 +1986,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.e71b63f3ba7b580af1a52d2a7379p+3321");
@@ -1995,7 +1995,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x0p+0");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.0d152311513c28ce202627c06ec2p-3322");
@@ -2173,7 +2173,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -2182,7 +2182,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La",
   0xf.fffffffffffffffp16380L); #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x2.0p+16383");
@@ -2616,7 +2616,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999999999999999999ap-4");
@@ -2625,7 +2625,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.e71b63f3ba7b580af1a52d2a7379p+3321");
@@ -2634,7 +2634,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x0p+0");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.0d152311513c28ce202627c06ec2p-3322");
@@ -2822,7 +2822,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
   #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -2831,7 +2831,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La",
   0xf.fffffffffffffffp16380L); #if defined(SPECIAL_X86_LONG_DOUBLE)
     ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
-  #elif defined(LONG_DOUBLE_IS_DOUBLE)
+  #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
   #else // 128 bit long double
     ASSERT_STREQ_LEN(written, buff, "0x2.0p+16383");
diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp
index ec53c08bd9d41b..db3c48cdbf7a2d 100644
--- a/libc/test/src/stdio/sscanf_test.cpp
+++ b/libc/test/src/stdio/sscanf_test.cpp
@@ -322,7 +322,7 @@ TEST(LlvmLibcSScanfTest, FloatConvLengthModifier) {
   EXPECT_EQ(ret_val, 1);
 // 1e600 may be larger than the maximum long double (if long double is double).
 // In that case both of these should be evaluated as inf.
-#ifdef LONG_DOUBLE_IS_DOUBLE
+#ifdef LIBC_LONG_DOUBLE_IS_FLOAT64
   EXPECT_FP_EQ(ld_result, d_inf);
 #else
   EXPECT_FP_EQ(ld_result, 1.0e600L);
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index 680a93188c76d1..1ddf729689ff6c 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -16,7 +16,7 @@
 #include <limits.h>
 #include <stddef.h>
 
-#if defined(LONG_DOUBLE_IS_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
 #define SELECT_CONST(val, _, __) val
 #elif defined(SPECIAL_X86_LONG_DOUBLE)
 #define SELECT_CONST(_, val, __) val
@@ -26,7 +26,7 @@
 
 class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test {
 public:
-#if defined(LONG_DOUBLE_IS_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   void run_test(const char *inputString, const ptrdiff_t expectedStrLen,
                 const uint64_t expectedRawData, const int expectedErrno = 0)
 #else

From f1d0276e4c42301155e900424ea734aca7ec97a8 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Fri, 1 Dec 2023 13:57:36 +0100
Subject: [PATCH 35/72] [libc][NFC] Rename LIBC_LONG_DOUBLE_IS_IEEE754_BIN128
 to LIBC_LONG_DOUBLE_IS_FLOAT128 (#74052)

To make it consistent with
https://github.com/llvm/llvm-project/pull/73948 and
https://github.com/llvm/llvm-project/pull/73950
---
 libc/src/__support/macros/properties/float.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h
index f1679fe5111369..bd63f334660597 100644
--- a/libc/src/__support/macros/properties/float.h
+++ b/libc/src/__support/macros/properties/float.h
@@ -25,7 +25,7 @@
 // TODO: Replace with LIBC_LONG_DOUBLE_IS_X86_BIN80
 #define SPECIAL_X86_LONG_DOUBLE
 #elif (LDBL_MANT_DIG == 113)
-#define LIBC_LONG_DOUBLE_IS_IEEE754_BIN128
+#define LIBC_LONG_DOUBLE_IS_FLOAT128
 #endif
 
 // float16 support.
@@ -69,13 +69,13 @@ using float16 = _Float16;
 using float128 = _Float128;
 #elif defined(LIBC_COMPILER_HAS_FLOAT128_EXTENSION)
 using float128 = __float128;
-#elif defined(LIBC_LONG_DOUBLE_IS_IEEE754_BIN128)
+#elif defined(LIBC_LONG_DOUBLE_IS_FLOAT128)
 using float128 = long double;
 #endif
 
 #if defined(LIBC_COMPILER_HAS_C23_FLOAT128) ||                                 \
     defined(LIBC_COMPILER_HAS_FLOAT128_EXTENSION) ||                           \
-    defined(LIBC_LONG_DOUBLE_IS_IEEE754_BIN128)
+    defined(LIBC_LONG_DOUBLE_IS_FLOAT128)
 // TODO: Replace with LIBC_HAS_FLOAT128
 #define LIBC_COMPILER_HAS_FLOAT128
 #endif

From 977af4252d1d60a1e9c546f0e4328b1a646ef635 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Fri, 1 Dec 2023 14:23:08 +0100
Subject: [PATCH 36/72] [libc][NFC] Rename SPECIAL_X86_LONG_DOUBLE to
 LIBC_LONG_DOUBLE_IS_X86_FLOAT80 (#73950)

---
 libc/src/__support/FPUtil/FPBits.h            |  2 +-
 .../__support/FPUtil/ManipulationFunctions.h  |  4 +-
 libc/src/__support/FPUtil/NormalFloat.h       |  4 +-
 libc/src/__support/FPUtil/generic/sqrt.h      |  6 +-
 .../FPUtil/generic/sqrt_80_bit_long_double.h  |  4 +-
 libc/src/__support/macros/properties/float.h  |  3 +-
 libc/src/__support/str_to_float.h             |  4 +-
 libc/test/src/__support/str_to_float_test.cpp |  2 +-
 libc/test/src/stdio/sprintf_test.cpp          | 57 ++++++++++---------
 libc/test/src/stdlib/strtold_test.cpp         |  2 +-
 10 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h
index 76a9fc6d772bf9..f5b73440de2158 100644
--- a/libc/src/__support/FPUtil/FPBits.h
+++ b/libc/src/__support/FPUtil/FPBits.h
@@ -249,7 +249,7 @@ template <typename T> struct FPBits {
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
 
-#ifdef SPECIAL_X86_LONG_DOUBLE
+#ifdef LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 #include "x86_64/LongDoubleBits.h"
 #endif
 
diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h
index 9286deee2d92c4..9d3fd075be4711 100644
--- a/libc/src/__support/FPUtil/ManipulationFunctions.h
+++ b/libc/src/__support/FPUtil/ManipulationFunctions.h
@@ -186,8 +186,8 @@ LIBC_INLINE T nextafter(T from, U to) {
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
 
-#ifdef SPECIAL_X86_LONG_DOUBLE
+#ifdef LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 #include "x86_64/NextAfterLongDouble.h"
-#endif // SPECIAL_X86_LONG_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 
 #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_MANIPULATIONFUNCTIONS_H
diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h
index afbf97cc2b6386..d59de14fb695e8 100644
--- a/libc/src/__support/FPUtil/NormalFloat.h
+++ b/libc/src/__support/FPUtil/NormalFloat.h
@@ -170,7 +170,7 @@ template <typename T> struct NormalFloat {
   }
 };
 
-#ifdef SPECIAL_X86_LONG_DOUBLE
+#ifdef LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 template <>
 LIBC_INLINE void
 NormalFloat<long double>::init_from_bits(FPBits<long double> bits) {
@@ -259,7 +259,7 @@ template <> LIBC_INLINE NormalFloat<long double>::operator long double() const {
   result.set_implicit_bit(1);
   return static_cast<long double>(result);
 }
-#endif // SPECIAL_X86_LONG_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h
index 9c7e6a2f361c67..b93fa7a35f826b 100644
--- a/libc/src/__support/FPUtil/generic/sqrt.h
+++ b/libc/src/__support/FPUtil/generic/sqrt.h
@@ -28,11 +28,11 @@ template <typename T> struct SpecialLongDouble {
   static constexpr bool VALUE = false;
 };
 
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 template <> struct SpecialLongDouble<long double> {
   static constexpr bool VALUE = true;
 };
-#endif // SPECIAL_X86_LONG_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 
 template <typename T>
 LIBC_INLINE void normalize(int &exponent,
@@ -48,7 +48,7 @@ template <>
 LIBC_INLINE void normalize<long double>(int &exponent, uint64_t &mantissa) {
   normalize<double>(exponent, mantissa);
 }
-#elif !defined(SPECIAL_X86_LONG_DOUBLE)
+#elif !defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 template <>
 LIBC_INLINE void normalize<long double>(int &exponent, UInt128 &mantissa) {
   const uint64_t hi_bits = static_cast<uint64_t>(mantissa >> 64);
diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
index 713c3389051096..a3bf7e3cabad3e 100644
--- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
+++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
@@ -34,7 +34,7 @@ LIBC_INLINE long double sqrt(long double x);
 
 // Correctly rounded SQRT for all rounding modes.
 // Shift-and-add algorithm.
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 LIBC_INLINE long double sqrt(long double x) {
   using UIntType = typename FPBits<long double>::UIntType;
   constexpr UIntType ONE = UIntType(1)
@@ -135,7 +135,7 @@ LIBC_INLINE long double sqrt(long double x) {
     return out;
   }
 }
-#endif // SPECIAL_X86_LONG_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 
 } // namespace x86
 } // namespace fputil
diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h
index bd63f334660597..bae51cbe8aee8e 100644
--- a/libc/src/__support/macros/properties/float.h
+++ b/libc/src/__support/macros/properties/float.h
@@ -22,8 +22,7 @@
 #if (LDBL_MANT_DIG == 53)
 #define LIBC_LONG_DOUBLE_IS_FLOAT64
 #elif (LDBL_MANT_DIG == 64)
-// TODO: Replace with LIBC_LONG_DOUBLE_IS_X86_BIN80
-#define SPECIAL_X86_LONG_DOUBLE
+#define LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 #elif (LDBL_MANT_DIG == 113)
 #define LIBC_LONG_DOUBLE_IS_FLOAT128
 #endif
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index a9232573041426..a872c25e2f0998 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -89,7 +89,7 @@ template <class T> LIBC_INLINE void set_implicit_bit(fputil::FPBits<T> &) {
   return;
 }
 
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 template <>
 LIBC_INLINE void
 set_implicit_bit<long double>(fputil::FPBits<long double> &result) {
@@ -529,7 +529,7 @@ template <> class ClingerConsts<long double> {
   static constexpr long double MAX_EXACT_INT =
       ClingerConsts<double>::MAX_EXACT_INT;
 };
-#elif defined(SPECIAL_X86_LONG_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 template <> class ClingerConsts<long double> {
 public:
   static constexpr long double POWERS_OF_TEN_ARRAY[] = {
diff --git a/libc/test/src/__support/str_to_float_test.cpp b/libc/test/src/__support/str_to_float_test.cpp
index c2643d9a764ee6..f9d12d95a50bee 100644
--- a/libc/test/src/__support/str_to_float_test.cpp
+++ b/libc/test/src/__support/str_to_float_test.cpp
@@ -283,7 +283,7 @@ TEST(LlvmLibcStrToFloatTest, SimpleDecimalConversionExtraTypes) {
 TEST_F(LlvmLibcStrToFloatTest, EiselLemireFloat64AsLongDouble) {
   eisel_lemire_test<long double>(123, 0, 0x1EC00000000000, 1029);
 }
-#elif defined(SPECIAL_X86_LONG_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 TEST_F(LlvmLibcStrToFloatTest, EiselLemireFloat80Simple) {
   eisel_lemire_test<long double>(123, 0, 0xf600000000000000, 16389);
   eisel_lemire_test<long double>(12345678901234568192u, 0, 0xab54a98ceb1f0c00,
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index e2265f5efbc465..344853beaf9fa7 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -642,7 +642,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   // Length Modifier Tests.
 
   written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
   ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
 #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
@@ -651,7 +651,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
 #endif
 
   written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
   ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
 #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "inf");
@@ -660,7 +660,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
 #endif
 
   written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
   ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
 #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "0x0p+0");
@@ -766,7 +766,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   ASSERT_STREQ_LEN(written, buff, "0x0p+0");
 
   written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
   ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
 #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -775,7 +775,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
 #endif
 
   written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0xf.fffffffffffffffp16380L);
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
   ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
 #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   ASSERT_STREQ_LEN(written, buff, "inf");
@@ -1024,7 +1024,8 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
 
 // Some float128 systems (specifically the ones used for aarch64 buildbots)
 // don't respect signs for long double NaNs.
-#if defined(SPECIAL_X86_LONG_DOUBLE) || defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80) ||                                \
+    defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
   written = LIBC_NAMESPACE::sprintf(buff, "%LF", -ld_nan);
   ASSERT_STREQ_LEN(written, buff, "-NAN");
 #endif
@@ -1042,7 +1043,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%.Lf", -2.5L);
   ASSERT_STREQ_LEN(written, buff, "-2");
 
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 
   written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L);
   ASSERT_STREQ_LEN(written, buff,
@@ -1327,11 +1328,11 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
       "570449525088342437216896462077260223998756027453411520977536701491759878"
       "422771447006016890777855573925295187921971811871399320142563330377888532"
       "179817332113");
-#endif // SPECIAL_X86_LONG_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 
   /*
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
@@ -1340,7 +1341,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
@@ -1349,7 +1350,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x0p+0");
@@ -1548,7 +1549,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
 
   /*
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -1557,7 +1558,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La",
-  0xf.fffffffffffffffp16380L); #if defined(SPECIAL_X86_LONG_DOUBLE)
+  0xf.fffffffffffffffp16380L); #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
@@ -1858,7 +1859,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
 
   // Length Modifier Tests.
 
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
   written = LIBC_NAMESPACE::sprintf(buff, "%.9Le", 1000000000500000000.1L);
   ASSERT_STREQ_LEN(written, buff, "1.000000001e+18");
 
@@ -1975,7 +1976,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
 */
   /*
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
@@ -1984,7 +1985,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
@@ -1993,7 +1994,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x0p+0");
@@ -2171,7 +2172,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
 
   /*
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -2180,7 +2181,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La",
-  0xf.fffffffffffffffp16380L); #if defined(SPECIAL_X86_LONG_DOUBLE)
+  0xf.fffffffffffffffp16380L); #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
@@ -2499,7 +2500,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
 
   // Length Modifier Tests.
 
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 
   written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L);
   ASSERT_STREQ_LEN(written, buff, "1.18973e+4932");
@@ -2507,7 +2508,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L);
   ASSERT_STREQ_LEN(written, buff, "0.0833333");
 
-#endif // SPECIAL_X86_LONG_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 
   // TODO: Uncomment the below tests after long double support is added
   /*
@@ -2614,7 +2615,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
 */
   /*
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 0.1L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xc.ccccccccccccccdp-7");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.999999999999ap-4");
@@ -2623,7 +2624,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e1000L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xf.38db1f9dd3dac05p+3318");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
@@ -2632,7 +2633,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%La", 1.0e-1000L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0x8.68a9188a89e1467p-3325");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x0p+0");
@@ -2806,21 +2807,21 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
   written = LIBC_NAMESPACE::sprintf(buff, "%.10g", 0x1.0p-1074);
   ASSERT_STREQ_LEN(written, buff, "4.940656458e-324");
 
-#if defined(SPECIAL_X86_LONG_DOUBLE)
+#if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 
   written = LIBC_NAMESPACE::sprintf(buff, "%.60Lg", 0xa.aaaaaaaaaaaaaabp-7L);
   ASSERT_STREQ_LEN(
       written, buff,
       "0.0833333333333333333355920878593448009041821933351457118988037");
 
-#endif // SPECIAL_X86_LONG_DOUBLE
+#endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80
 
   // Long double precision tests.
   // These are currently commented out because they require long double support
   // that isn't ready yet.
   /*
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La", 0.1L);
-  #if defined(SPECIAL_X86_LONG_DOUBLE)
+  #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0xc.dp-7");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "0x1.ap-4");
@@ -2829,7 +2830,7 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) {
   #endif
 
     written = LIBC_NAMESPACE::sprintf(buff, "%.1La",
-  0xf.fffffffffffffffp16380L); #if defined(SPECIAL_X86_LONG_DOUBLE)
+  0xf.fffffffffffffffp16380L); #if defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
     ASSERT_STREQ_LEN(written, buff, "0x1.0p+16384");
   #elif defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
     ASSERT_STREQ_LEN(written, buff, "inf");
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index 1ddf729689ff6c..37db385c959bf5 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -18,7 +18,7 @@
 
 #if defined(LIBC_LONG_DOUBLE_IS_FLOAT64)
 #define SELECT_CONST(val, _, __) val
-#elif defined(SPECIAL_X86_LONG_DOUBLE)
+#elif defined(LIBC_LONG_DOUBLE_IS_X86_FLOAT80)
 #define SELECT_CONST(_, val, __) val
 #else
 #define SELECT_CONST(_, __, val) val

From da86d4a8c956f0fcee21444eb6de9f05d39d6574 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 14:25:16 +0100
Subject: [PATCH 37/72] [ValueTracking] Reduce duplication in
 haveNoCommonBitsSet() (NFC)

Extract a function and call it with both operand orders, so that
we don't have to explicitly commute every single pattern.
---
 llvm/lib/Analysis/ValueTracking.cpp | 50 ++++++++++++++---------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d8a72c9f7b989d..8c29c242215d66 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -186,47 +186,30 @@ KnownBits llvm::computeKnownBits(const Value *V, const APInt &DemandedElts,
       SimplifyQuery(DL, DT, AC, safeCxtI(V, CxtI), UseInstrInfo));
 }
 
-bool llvm::haveNoCommonBitsSet(const WithCache<const Value *> &LHSCache,
-                               const WithCache<const Value *> &RHSCache,
-                               const SimplifyQuery &SQ) {
-  const Value *LHS = LHSCache.getValue();
-  const Value *RHS = RHSCache.getValue();
-
-  assert(LHS->getType() == RHS->getType() &&
-         "LHS and RHS should have the same type");
-  assert(LHS->getType()->isIntOrIntVectorTy() &&
-         "LHS and RHS should be integers");
+static bool haveNoCommonBitsSetSpecialCases(const Value *LHS,
+                                            const Value *RHS) {
   // Look for an inverted mask: (X & ~M) op (Y & M).
   {
     Value *M;
     if (match(LHS, m_c_And(m_Not(m_Value(M)), m_Value())) &&
         match(RHS, m_c_And(m_Specific(M), m_Value())))
       return true;
-    if (match(RHS, m_c_And(m_Not(m_Value(M)), m_Value())) &&
-        match(LHS, m_c_And(m_Specific(M), m_Value())))
-      return true;
   }
 
   // X op (Y & ~X)
-  if (match(RHS, m_c_And(m_Not(m_Specific(LHS)), m_Value())) ||
-      match(LHS, m_c_And(m_Not(m_Specific(RHS)), m_Value())))
+  if (match(RHS, m_c_And(m_Not(m_Specific(LHS)), m_Value())))
     return true;
 
   // X op ((X & Y) ^ Y) -- this is the canonical form of the previous pattern
   // for constant Y.
   Value *Y;
-  if (match(RHS,
-            m_c_Xor(m_c_And(m_Specific(LHS), m_Value(Y)), m_Deferred(Y))) ||
-      match(LHS, m_c_Xor(m_c_And(m_Specific(RHS), m_Value(Y)), m_Deferred(Y))))
+  if (match(RHS, m_c_Xor(m_c_And(m_Specific(LHS), m_Value(Y)), m_Deferred(Y))))
     return true;
 
   // Peek through extends to find a 'not' of the other side:
   // (ext Y) op ext(~Y)
-  // (ext ~Y) op ext(Y)
-  if ((match(LHS, m_ZExtOrSExt(m_Value(Y))) &&
-       match(RHS, m_ZExtOrSExt(m_Not(m_Specific(Y))))) ||
-      (match(RHS, m_ZExtOrSExt(m_Value(Y))) &&
-       match(LHS, m_ZExtOrSExt(m_Not(m_Specific(Y))))))
+  if (match(LHS, m_ZExtOrSExt(m_Value(Y))) &&
+      match(RHS, m_ZExtOrSExt(m_Not(m_Specific(Y)))))
     return true;
 
   // Look for: (A & B) op ~(A | B)
@@ -235,11 +218,26 @@ bool llvm::haveNoCommonBitsSet(const WithCache<const Value *> &LHSCache,
     if (match(LHS, m_And(m_Value(A), m_Value(B))) &&
         match(RHS, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
       return true;
-    if (match(RHS, m_And(m_Value(A), m_Value(B))) &&
-        match(LHS, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
-      return true;
   }
 
+  return false;
+}
+
+bool llvm::haveNoCommonBitsSet(const WithCache<const Value *> &LHSCache,
+                               const WithCache<const Value *> &RHSCache,
+                               const SimplifyQuery &SQ) {
+  const Value *LHS = LHSCache.getValue();
+  const Value *RHS = RHSCache.getValue();
+
+  assert(LHS->getType() == RHS->getType() &&
+         "LHS and RHS should have the same type");
+  assert(LHS->getType()->isIntOrIntVectorTy() &&
+         "LHS and RHS should be integers");
+
+  if (haveNoCommonBitsSetSpecialCases(LHS, RHS) ||
+      haveNoCommonBitsSetSpecialCases(RHS, LHS))
+    return true;
+
   return KnownBits::haveNoCommonBitsSet(LHSCache.getKnownBits(SQ),
                                         RHSCache.getKnownBits(SQ));
 }

From 6e3b2cb46ef5b9d9d28ed337491ee7da7b296616 Mon Sep 17 00:00:00 2001
From: Eleanor Bonnici <eleanor.bonnici@arm.com>
Date: Fri, 1 Dec 2023 13:54:04 +0000
Subject: [PATCH 38/72] [llvm][MC][ARM][Assembly] Emit relocations for ADRs and
 big-endian targets (#73834)

Follow-up on https://github.com/llvm/llvm-project/pull/72873/

When ADR/LDR instructions reference a label in a different section, the
offset is not known until link time; however, the assembler assumes it
can resolve them in some cases.

The previous patch addressed the issue for most LDR instructions,
focusing on little-endian targets.

This patch addresses the remaining work for ADRs and big-endian targets.
---
 .../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 15 ++++++-----
 .../ARM/MCTargetDesc/ARMELFObjectWriter.cpp   |  6 +++++
 llvm/test/MC/ARM/pcrel-adr16-relocs.s         | 26 +++++++++++++++++++
 llvm/test/MC/ARM/pcrel-adr32-relocs.s         | 24 +++++++++++++++++
 llvm/test/MC/ARM/pcrel-arm-ldr-imm8-relocs.s  |  6 +++++
 llvm/test/MC/ARM/pcrel-global.s               | 10 ++-----
 llvm/test/MC/ARM/pcrel-ldr-relocs.s           |  8 ++++--
 llvm/test/MC/ARM/pcrel-thumb-ldr2-relocs.s    |  3 +++
 llvm/test/MC/ARM/thumb1-relax-adr.s           |  1 -
 9 files changed, 82 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/MC/ARM/pcrel-adr16-relocs.s
 create mode 100644 llvm/test/MC/ARM/pcrel-adr32-relocs.s

diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index ca3b77e4a35653..41b3c6005231e8 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -88,10 +88,12 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
        IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
       {"fixup_arm_ldst_abs_12", 0, 32, 0},
       {"fixup_thumb_adr_pcrel_10", 0, 8,
-       IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-      {"fixup_arm_adr_pcrel_12", 0, 32, IsPCRelConstant},
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
       {"fixup_t2_adr_pcrel_12", 0, 32,
-       IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
       {"fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
       {"fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
       {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
@@ -133,10 +135,11 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
       // ARMFixupKinds.h.
       //
       // Name                      Offset (bits) Size (bits)     Flags
-      {"fixup_arm_ldst_pcrel_12", 0, 32, IsPCRelConstant},
+      {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
       {"fixup_t2_ldst_pcrel_12", 0, 32,
-       IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-      {"fixup_arm_pcrel_10_unscaled", 0, 32, IsPCRelConstant},
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
       {"fixup_arm_pcrel_10", 0, 32, IsPCRelConstant},
       {"fixup_t2_pcrel_10", 0, 32,
        MCFixupKindInfo::FKF_IsPCRel |
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 985097fc328105..44695a86c4e36c 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -164,6 +164,12 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
       return ELF::R_ARM_LDRS_PC_G0;
     case ARM::fixup_t2_ldst_pcrel_12:
       return ELF::R_ARM_THM_PC12;
+    case ARM::fixup_arm_adr_pcrel_12:
+      return ELF::R_ARM_ALU_PC_G0;
+    case ARM::fixup_thumb_adr_pcrel_10:
+      return ELF::R_ARM_THM_PC8;
+    case ARM::fixup_t2_adr_pcrel_12:
+      return ELF::R_ARM_THM_ALU_PREL_11_0;
     case ARM::fixup_bf_target:
       return ELF::R_ARM_THM_BF16;
     case ARM::fixup_bfc_target:
diff --git a/llvm/test/MC/ARM/pcrel-adr16-relocs.s b/llvm/test/MC/ARM/pcrel-adr16-relocs.s
new file mode 100644
index 00000000000000..adef746c3607a5
--- /dev/null
+++ b/llvm/test/MC/ARM/pcrel-adr16-relocs.s
@@ -0,0 +1,26 @@
+@ RUN: llvm-mc -filetype=obj --triple=thumbv6m-none-eabi %s -o %t
+@ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=RELOC
+@ RUN: llvm-objdump -d --triple=thumbv6m-none-eabi %t | FileCheck %s --check-prefix=ADDEND
+
+    .section .text._func1, "ax"
+
+    .balign 4
+    .global _func1
+    .type _func1, %function
+_func1:
+    adr r0, _func2
+@ RELOC: R_ARM_THM_PC8
+    bx lr
+
+// Checking the encoding only, as the disassembly is not quite correct here.
+//00000000 <_func1>:
+//       0: a0ff         	adr	r0, #1020 <_func1+0x103>
+
+// Thumb16 encoding supports only adding of the encoded immediate (not
+// subtracting, see [Arm ARM]), therefore sign change is required if the pcrel
+// offset is negative. This makes the calculation of the addend for
+// R_ARM_THM_PC8 more complex, for details see [ELF for the Arm 32-bit
+// architecture].
+
+@ ADDEND: a0ff adr
+
diff --git a/llvm/test/MC/ARM/pcrel-adr32-relocs.s b/llvm/test/MC/ARM/pcrel-adr32-relocs.s
new file mode 100644
index 00000000000000..5fd30f24630f90
--- /dev/null
+++ b/llvm/test/MC/ARM/pcrel-adr32-relocs.s
@@ -0,0 +1,24 @@
+@ RUN: llvm-mc -filetype=obj -triple=armv7 %s -o %t
+@ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=RELOC
+@ RUN: llvm-objdump -d --triple=armv7 %t | FileCheck %s --check-prefix=ADDEND
+
+@ RUN: llvm-mc -filetype=obj --triple=armebv7-unknown-unknown %s -o %t
+@ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=RELOC
+@ RUN: llvm-objdump -d --triple=armebv7-unknown-unknown %t | FileCheck %s --check-prefix=ADDEND
+
+    .section .text._func1, "ax"
+
+    .balign 4
+    .global _func1
+    .type _func1, %function
+_func1:
+    adr r0, _func2
+@ RELOC: R_ARM_ALU_PC_G0
+    .thumb
+    adr r0, _func2
+@ RELOC: R_ARM_THM_ALU_PREL_11_0
+    bx lr
+
+@ ADDEND:      sub	r0, pc, #8
+@ ADDEND-NEXT: adr.w	r0, #-4
+
diff --git a/llvm/test/MC/ARM/pcrel-arm-ldr-imm8-relocs.s b/llvm/test/MC/ARM/pcrel-arm-ldr-imm8-relocs.s
index 40453d6ef341a4..f8b166d4c24858 100644
--- a/llvm/test/MC/ARM/pcrel-arm-ldr-imm8-relocs.s
+++ b/llvm/test/MC/ARM/pcrel-arm-ldr-imm8-relocs.s
@@ -1,6 +1,9 @@
 @ RUN: llvm-mc -filetype=obj -triple=armv7 %s -o %t
 @ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=ARM
 @ RUN: llvm-objdump -d --triple=armv7 %t | FileCheck %s --check-prefix=ARM_ADDEND
+@ RUN: llvm-mc -filetype=obj --triple=armebv7-unknown-unknown %s -o %t
+@ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=ARM
+@ RUN: llvm-objdump -d --triple=armebv7-unknown-unknown %t | FileCheck %s --check-prefix=ARM_ADDEND
 
 @ ARM: R_ARM_LDRS_PC_G0
 @ ARM: R_ARM_LDRS_PC_G0
@@ -8,6 +11,7 @@
 @ ARM: R_ARM_LDRS_PC_G0
 @ ARM: R_ARM_LDRS_PC_G0
 @ ARM: R_ARM_LDRS_PC_G0
+@ ARM: R_ARM_LDRS_PC_G0
 
 // The value format is decimal in these specific cases, but it's hex for other
 // ldr instructions. These checks are valid for both formats.
@@ -18,6 +22,7 @@
 @ ARM_ADDEND: r0, [pc, #-{{16|0x10}}]
 @ ARM_ADDEND: r0, [pc, #-{{16|0x10}}]
 @ ARM_ADDEND: r0, [pc]
+@ ARM_ADDEND: r0, r1, [pc]
 
     .arm
     .section .text.bar, "ax"
@@ -31,6 +36,7 @@ bar:
     ldrh r0, just_after-8
     ldrsb r0, just_after-8
     ldrsh r0, foo+8
+    ldrd r0,r1, foo+8
     bx lr
 
     .section .data.foo, "a", %progbits
diff --git a/llvm/test/MC/ARM/pcrel-global.s b/llvm/test/MC/ARM/pcrel-global.s
index 15d46cf2063ecf..1e9e6e989356ec 100644
--- a/llvm/test/MC/ARM/pcrel-global.s
+++ b/llvm/test/MC/ARM/pcrel-global.s
@@ -7,11 +7,9 @@
 @ CHECK: There are no relocations in this file.
 
 @ DISASM-LABEL: <bar>:
-@ DISASM-NEXT:    adr.w   r0, #-4
-@ DISASM-NEXT:    adr.w   r0, #-8
-@ DISASM-NEXT:    ldr     r0, [pc, #0x0]          @ 0x14 <bar+0xc>
+@ DISASM-NEXT:    ldr     r0, [pc, #0x0]          @ 0x8 <bar+0x4>
 @ DISASM-NEXT:    add     r0, pc
-@ DISASM-NEXT:   .word   0xfffffff3
+@ DISASM-NEXT:   .word   0xfffffffb
 @@ GNU assembler creates an R_ARM_REL32 referencing bar.
 @ DISASM-NOT:    {{.}}
 
@@ -20,16 +18,12 @@
 .globl foo
 foo:
 vldr d0, foo     @ arm_pcrel_10
-adr r2, foo      @ arm_adr_pcrel_12
 
 .thumb
 .thumb_func
 .type bar, %function
 .globl bar
 bar:
-adr r0, bar      @ thumb_adr_pcrel_10
-adr.w r0, bar    @ t2_adr_pcrel_12
-
   ldr r0, .LCPI
 .LPC0_1:
   add r0, pc
diff --git a/llvm/test/MC/ARM/pcrel-ldr-relocs.s b/llvm/test/MC/ARM/pcrel-ldr-relocs.s
index 120d54ebafe087..e0f27f29949993 100644
--- a/llvm/test/MC/ARM/pcrel-ldr-relocs.s
+++ b/llvm/test/MC/ARM/pcrel-ldr-relocs.s
@@ -4,12 +4,17 @@
 @ RUN: llvm-mc -filetype=obj -triple=thumbv7 %s -o %t
 @ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=THUMB
 @ RUN: llvm-objdump -d --triple=thumbv7 %t | FileCheck %s --check-prefix=THUMB_ADDEND
+@ RUN: llvm-mc -filetype=obj -triple=armebv7 %s -o %t
+@ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=ARM
+@ RUN: llvm-objdump -d --triple=armebv7 %t | FileCheck %s --check-prefix=ARM_ADDEND
+@ RUN: llvm-mc -filetype=obj -triple=thumbebv7 %s -o %t
+@ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=THUMB
+@ RUN: llvm-objdump -d --triple=thumbebv7 %t | FileCheck %s --check-prefix=THUMB_ADDEND
 
 @ ARM: R_ARM_LDR_PC_G0
 @ ARM: R_ARM_LDR_PC_G0
 @ ARM: R_ARM_LDR_PC_G0
 @ ARM: R_ARM_LDR_PC_G0
-
 @ ARM_ADDEND: r0, [pc, #-0x8]
 @ ARM_ADDEND: r0, [pc, #-0x8]
 @ ARM_ADDEND: r0, [pc, #-0x10]
@@ -19,7 +24,6 @@
 @ THUMB: R_ARM_THM_PC12
 @ THUMB: R_ARM_THM_PC12
 @ THUMB: R_ARM_THM_PC12
-
 @ THUMB_ADDEND: r0, [pc, #-0x4]
 @ THUMB_ADDEND: r0, [pc, #-0x4]
 @ THUMB_ADDEND: r0, [pc, #-0xc]
diff --git a/llvm/test/MC/ARM/pcrel-thumb-ldr2-relocs.s b/llvm/test/MC/ARM/pcrel-thumb-ldr2-relocs.s
index 17ca72bd3f00ca..3aa371fc7d702f 100644
--- a/llvm/test/MC/ARM/pcrel-thumb-ldr2-relocs.s
+++ b/llvm/test/MC/ARM/pcrel-thumb-ldr2-relocs.s
@@ -1,6 +1,9 @@
 @ RUN: llvm-mc -filetype=obj -triple=thumbv7 %s -o %t
 @ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=THUMB
 @ RUN: llvm-objdump -d --triple=thumbv7 %t | FileCheck %s --check-prefix=THUMB_ADDEND
+@ RUN: llvm-mc -filetype=obj --triple=thumbebv7-unknown-unknown %s -o %t
+@ RUN: llvm-readelf -r %t | FileCheck %s --check-prefix=THUMB
+@ RUN: llvm-objdump -d --triple=thumbebv7-unknown-unknown %t | FileCheck %s --check-prefix=THUMB_ADDEND
 
 @ All the ldr variants produce a relocation
 @ THUMB: R_ARM_THM_PC12
diff --git a/llvm/test/MC/ARM/thumb1-relax-adr.s b/llvm/test/MC/ARM/thumb1-relax-adr.s
index fc5c7c39df5ae1..97b566f4833e63 100644
--- a/llvm/test/MC/ARM/thumb1-relax-adr.s
+++ b/llvm/test/MC/ARM/thumb1-relax-adr.s
@@ -1,6 +1,5 @@
 @ RUN: not llvm-mc -triple thumbv6m-none-macho -filetype=obj -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 @ RUN: not llvm-mc -triple thumbv7m-none-macho -filetype=obj -o /dev/null %s 2>&1  | FileCheck --check-prefix=CHECK-ERROR %s
-@ RUN: not llvm-mc -triple thumbv7m-none-eabi -filetype=obj -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 
         .global func1
 _func1:

From 6ab7662f35bb5bc1d19a7e68ec0a710bbf71c2c4 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov <serebrennikov.vladislav@gmail.com>
Date: Fri, 1 Dec 2023 17:56:27 +0400
Subject: [PATCH 39/72] [clang][NFC] Refactor expected directives in C++ DRs
 100-199 (#74061)

This patch continues the work started with ea5b1ef016d020c37f903d6c7d4f623be975dab8. See that commit and its corresponding PR for details.
---
 clang/test/CXX/drs/dr0xx.cpp |  56 ++-
 clang/test/CXX/drs/dr1xx.cpp | 681 ++++++++++++++++++++++-------------
 2 files changed, 458 insertions(+), 279 deletions(-)

diff --git a/clang/test/CXX/drs/dr0xx.cpp b/clang/test/CXX/drs/dr0xx.cpp
index e79ce6daf2655c..768da0f8e6fa77 100644
--- a/clang/test/CXX/drs/dr0xx.cpp
+++ b/clang/test/CXX/drs/dr0xx.cpp
@@ -89,12 +89,11 @@ namespace dr7 { // dr7: 3.4
   class B : virtual private A {}; // #dr7-B
   class C : public B {} c; // #dr7-C
   // expected-error@#dr7-C {{inherited virtual base class 'A' has private destructor}}
-  // expected-note@#dr7-C {{in implicit default constructor for 'dr7::C' first required here}}
-  // expected-note@#dr7-B {{declared private here}}
-
+  //   expected-note@#dr7-C {{in implicit default constructor for 'dr7::C' first required here}}
+  //   expected-note@#dr7-B {{declared private here}}
   // expected-error@#dr7-C {{inherited virtual base class 'A' has private destructor}}
-  // expected-note@#dr7-C {{in implicit destructor for 'dr7::C' first required here}}
-  // expected-note@#dr7-B {{declared private here}}
+  //   expected-note@#dr7-C {{in implicit destructor for 'dr7::C' first required here}}
+  //   expected-note@#dr7-B {{declared private here}}
   class VeryDerivedC : public B, virtual public A {} vdc;
 
   class X { ~X(); }; // #dr7-X
@@ -237,11 +236,10 @@ namespace dr16 { // dr16: 2.8
       // expected-note@#dr16-A-f-decl {{member is declared here}}
       A::f(); // #dr16-A-f-call
       // expected-error@#dr16-A-f-call {{'A' is a private member of 'dr16::A'}}
-      // expected-note@#dr16-B {{constrained by implicitly private inheritance here}}
-      // expected-note@#dr16-A {{member is declared here}}
-
+      //   expected-note@#dr16-B {{constrained by implicitly private inheritance here}}
+      //   expected-note@#dr16-A {{member is declared here}}
       // expected-error@#dr16-A-f-call {{cannot cast 'dr16::C' to its private base class 'dr16::A'}}
-      // expected-note@#dr16-B {{implicitly declared private here}}
+      //   expected-note@#dr16-B {{implicitly declared private here}}
     }
   };
 }
@@ -361,9 +359,9 @@ namespace dr26 { // dr26: yes
     // FIXME: In C++98, we diagnose this twice.
     B(const B &, B = B());
     // cxx98-14-error@-1 {{recursive evaluation of default argument}}
-    // cxx98-14-note@-2 {{default argument used here}}
+    //   cxx98-14-note@-2 {{default argument used here}}
     // cxx98-error@-3 {{recursive evaluation of default argument}}
-    // cxx98-note@-4 {{default argument used here}}
+    //   cxx98-note@-4 {{default argument used here}}
   };
   struct C {
     static C &f();
@@ -788,23 +786,20 @@ namespace dr49 { // dr49: 2.8
   A<&k> a;
   A<p> b; // #dr49-b
   // cxx98-error@#dr49-b {{non-type template argument referring to object 'p' with internal linkage is a C++11 extension}}
-  // cxx98-note@#dr49-p {{non-type template argument refers to object here}}
-
+  //   cxx98-note@#dr49-p {{non-type template argument refers to object here}}
   // cxx98-14-error@#dr49-b {{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
-  // cxx98-14-note@#dr49-A {{template parameter is declared here}}
+  //   cxx98-14-note@#dr49-A {{template parameter is declared here}}
   int *q = &k; // #dr49-q
   A<q> c; // #dr49-c
   // cxx98-error@#dr49-c {{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
-  // cxx98-note@#dr49-A {{template parameter is declared here}}
-
+  //   cxx98-note@#dr49-A {{template parameter is declared here}}
   // cxx11-14-error@#dr49-c {{non-type template argument of type 'int *' is not a constant expression}}
-  // cxx11-14-note@#dr49-c {{read of non-constexpr variable 'q' is not allowed in a constant expression}}
-  // cxx11-14-note@#dr49-q {{declared here}}
-  // cxx11-14-note@#dr49-A {{template parameter is declared here}}
-
+  //   cxx11-14-note@#dr49-c {{read of non-constexpr variable 'q' is not allowed in a constant expression}}
+  //   cxx11-14-note@#dr49-q {{declared here}}
+  //   cxx11-14-note@#dr49-A {{template parameter is declared here}}
   // since-cxx17-error@#dr49-c {{non-type template argument is not a constant expression}}
-  // since-cxx17-note@#dr49-c {{read of non-constexpr variable 'q' is not allowed in a constant expression}}
-  // since-cxx17-note@#dr49-q {{declared here}}
+  //   since-cxx17-note@#dr49-c {{read of non-constexpr variable 'q' is not allowed in a constant expression}}
+  //   since-cxx17-note@#dr49-q {{declared here}}
 }
 
 namespace dr50 { // dr50: yes
@@ -835,11 +830,10 @@ namespace dr52 { // dr52: 2.8
   int k = b.A::n; // #dr52-k
   // FIXME: This first diagnostic is very strangely worded, and seems to be bogus.
   // expected-error@#dr52-k {{'A' is a private member of 'dr52::A'}}
-  // expected-note@#dr52-B {{constrained by private inheritance here}}
-  // expected-note@#dr52-A {{member is declared here}}
-
+  //   expected-note@#dr52-B {{constrained by private inheritance here}}
+  //   expected-note@#dr52-A {{member is declared here}}
   // expected-error@#dr52-k {{cannot cast 'struct B' to its private base class 'dr52::A'}}
-  // expected-note@#dr52-B {{declared private here}}
+  //   expected-note@#dr52-B {{declared private here}}
 }
 
 namespace dr53 { // dr53: yes
@@ -1171,8 +1165,7 @@ namespace dr76 { // dr76: yes
   const volatile int n = 1;
   int arr[n]; // #dr76-vla
   // expected-error@#dr76-vla {{variable length arrays in C++ are a Clang extension}}
-  // expected-note@#dr76-vla {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}}
-
+  //   expected-note@#dr76-vla {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}}
   // expected-error@#dr76-vla {{variable length array declaration not allowed at file scope}}
 }
 
@@ -1346,8 +1339,7 @@ namespace dr92 { // dr92: 4 c++17
   // since-cxx17-note@-2 {{use 'noexcept(false)' instead}}
   void (*p)() throw(int) = &f; // #dr92-p
   // since-cxx17-error@#dr92-p {{ISO C++17 does not allow dynamic exception specifications}}
-  // since-cxx17-note@#dr92-p {{use 'noexcept(false)' instead}}
-
+  //   since-cxx17-note@#dr92-p {{use 'noexcept(false)' instead}}
   // cxx98-14-error@#dr92-p {{target exception specification is not superset of source}}
   // since-cxx17-warning@#dr92-p {{target exception specification is not superset of source}}
   void (*q)() throw(int);
@@ -1363,11 +1355,11 @@ namespace dr92 { // dr92: 4 c++17
     g(f);
     // cxx98-14-error@-1 {{target exception specification is not superset of source}}
     // since-cxx17-error@-2 {{no matching function for call to 'g'}}
-    // since-cxx17-note@#dr92-g {{candidate function not viable: no known conversion from 'void () throw(int, float)' to 'void (*)() throw()' for 1st argument}}
+    //   since-cxx17-note@#dr92-g {{candidate function not viable: no known conversion from 'void () throw(int, float)' to 'void (*)() throw()' for 1st argument}}
     g(q);
     // cxx98-14-error@-1 {{target exception specification is not superset of source}}
     // since-cxx17-error@-2 {{no matching function for call to 'g'}}
-    // since-cxx17-note@#dr92-g {{candidate function not viable: no known conversion from 'void (*)() throw(int)' to 'void (*)() throw()' for 1st argument}}
+    //   since-cxx17-note@#dr92-g {{candidate function not viable: no known conversion from 'void (*)() throw(int)' to 'void (*)() throw()' for 1st argument}}
   }
 
   // Prior to C++17, this is OK because the exception specification is not
diff --git a/clang/test/CXX/drs/dr1xx.cpp b/clang/test/CXX/drs/dr1xx.cpp
index 50236eb7c9499d..4465e7e0f1bfdb 100644
--- a/clang/test/CXX/drs/dr1xx.cpp
+++ b/clang/test/CXX/drs/dr1xx.cpp
@@ -1,30 +1,31 @@
-// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98,cxx98-11,cxx98-14,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,cxx98-11,cxx98-14,cxx98-17,cxx11-14 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,cxx98-14,cxx98-17,cxx11-14 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17,cxx98-17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace dr100 { // dr100: yes
-  template<const char (*)[4]> struct A {}; // expected-note 0-1{{declared here}}
-  template<const char (&)[4]> struct B {}; // expected-note 0-1{{declared here}}
-  template<const char *> struct C {}; // expected-note 0-1{{declared here}}
-  template<const char &> struct D {}; // expected-note 0-1{{declared here}}
-  A<&"foo"> a; // #100a
-  B<"bar"> b; // #100b
-  C<"baz"> c; // #100c
-  D<*"quux"> d; // #100d
-#if __cplusplus < 201703L
-  // expected-error@#100a {{does not refer to any declaration}}
-  // expected-error@#100b {{does not refer to any declaration}}
-  // expected-error@#100c {{does not refer to any declaration}}
-  // expected-error@#100d {{does not refer to any declaration}}
-#else
-  // expected-error@#100a {{pointer to string literal is not allowed in a template argument}}
-  // expected-error@#100b {{reference to string literal is not allowed in a template argument}}
-  // expected-error@#100c {{pointer to subobject of string literal is not allowed in a template argument}}
-  // expected-error@#100d {{reference to subobject of string literal is not allowed in a template argument}}
-#endif
+  template<const char (*)[4]> struct A {}; // #dr100-A
+  template<const char (&)[4]> struct B {}; // #dr100-B
+  template<const char *> struct C {}; // #dr100-C
+  template<const char &> struct D {}; // #dr100-D
+  A<&"foo"> a; // #dr100-a
+  // cxx98-14-error@#dr100-a {{non-type template argument does not refer to any declaration}}
+  //   cxx98-14-note@#dr100-A {{template parameter is declared here}}
+  // since-cxx17-error@#dr100-a {{pointer to string literal is not allowed in a template argument}}
+  B<"bar"> b; // #dr100-b
+  // cxx98-14-error@#dr100-b {{non-type template argument does not refer to any declaration}}
+  //   cxx98-14-note@#dr100-B {{template parameter is declared here}}
+  // since-cxx17-error@#dr100-b {{reference to string literal is not allowed in a template argument}}
+  C<"baz"> c; // #dr100-c
+  // cxx98-14-error@#dr100-c {{non-type template argument does not refer to any declaration}}
+  //   cxx98-14-note@#dr100-C {{template parameter is declared here}}
+  // since-cxx17-error@#dr100-c {{pointer to subobject of string literal is not allowed in a template argument}}
+  D<*"quux"> d; // #dr100-d
+  // cxx98-14-error@#dr100-d {{non-type template argument does not refer to any declaration}}
+  //   cxx98-14-note@#dr100-D {{template parameter is declared here}}
+  // since-cxx17-error@#dr100-d {{reference to subobject of string literal is not allowed in a template argument}}
 }
 
 namespace dr101 { // dr101: 3.5
@@ -42,13 +43,16 @@ namespace dr101 { // dr101: 3.5
 
 namespace dr102 { // dr102: yes
   namespace A {
-    template<typename T> T f(T a, T b) { return a + b; } // expected-error {{neither visible in the template definition nor found by argument-dependent lookup}}
+    template<typename T> T f(T a, T b) { return a + b; }
+    // expected-error@-1 {{call to function 'operator+' that is neither visible in the template definition nor found by argument-dependent lookup}}
+    // expected-note@#dr102-instantiation {{in instantiation of function template specialization 'dr102::A::f<dr102::B::S>' requested here}}
+    // expected-note@#dr102-operator-plus {{'operator+' should be declared prior to the call site or in namespace 'dr102::B'}}
   }
   namespace B {
     struct S {};
   }
-  B::S operator+(B::S, B::S); // expected-note {{should be declared prior to the call site or in namespace 'dr102::B'}}
-  template B::S A::f(B::S, B::S); // expected-note {{in instantiation of}}
+  B::S operator+(B::S, B::S); // #dr102-operator-plus
+  template B::S A::f(B::S, B::S); // #dr102-instantiation
 }
 
 // dr103: na
@@ -58,13 +62,17 @@ namespace dr102 { // dr102: yes
 namespace dr106 { // dr106: sup 540
   typedef int &r1;
   typedef r1 &r1;
-  typedef const r1 r1; // expected-warning {{has no effect}}
-  typedef const r1 &r1; // expected-warning {{has no effect}}
+  typedef const r1 r1;
+  // expected-warning@-1 {{'const' qualifier on reference type 'r1' (aka 'int &') has no effect}}
+  typedef const r1 &r1;
+  // expected-warning@-1 {{'const' qualifier on reference type 'r1' (aka 'int &') has no effect}}
 
   typedef const int &r2;
   typedef r2 &r2;
-  typedef const r2 r2; // expected-warning {{has no effect}}
-  typedef const r2 &r2; // expected-warning {{has no effect}}
+  typedef const r2 r2;
+  // expected-warning@-1 {{'const' qualifier on reference type 'r2' (aka 'const int &') has no effect}}
+  typedef const r2 &r2;
+  // expected-warning@-1 {{'const' qualifier on reference type 'r2' (aka 'const int &') has no effect}}
 }
 
 namespace dr107 { // dr107: yes
@@ -76,10 +84,9 @@ namespace dr108 { // dr108: 2.9
   template<typename T> struct A {
     struct B { typedef int X; };
     B::X x;
-#if __cplusplus <= 201703L
-    // expected-error@-2 {{implicit 'typename' is a C++20 extension}}
-#endif
-    struct C : B { X x; }; // expected-error {{unknown type name}}
+    // cxx98-17-error@-1 {{missing 'typename' prior to dependent type name B::X; implicit 'typename' is a C++20 extension}}
+    struct C : B { X x; };
+    // expected-error@-1 {{unknown type name 'X'}}
   };
   template<> struct A<int>::B { int X; };
 }
@@ -87,46 +94,55 @@ namespace dr108 { // dr108: 2.9
 namespace dr109 { // dr109: yes
   struct A { template<typename T> void f(T); };
   template<typename T> struct B : T {
-    using T::template f; // expected-error {{'template' keyword not permitted here}}
-    using T::template f<int>; // expected-error {{'template' keyword not permitted here}} expected-error {{using declaration cannot refer to a template specialization}}
+    using T::template f;
+    // expected-error@-1 {{'template' keyword not permitted here}}
+    using T::template f<int>;
+    // expected-error@-1 {{'template' keyword not permitted here}}
+    // expected-error@-2 {{using declaration cannot refer to a template specialization}}
     // FIXME: We shouldn't suggest using the 'template' keyword in a location where it's not valid.
-    using T::f<int>; // expected-error {{use 'template' keyword}} expected-error {{using declaration cannot refer to a template specialization}}
-    void g() { this->f<int>(123); } // expected-error {{use 'template' keyword}}
+    using T::f<int>;
+    // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}}
+    // expected-error@-2 {{using declaration cannot refer to a template specialization}}
+    void g() { this->f<int>(123); }
+    // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}}
   };
 }
 
 namespace dr111 { // dr111: dup 535
   struct A { A(); A(volatile A&, int = 0); A(A&, const char * = "foo"); };
-  struct B : A { B(); }; // expected-note +{{would lose const qualifier}} expected-note {{requires 0 arguments}}
+  struct B : A { B(); }; // #dr111-B
   const B b1;
-  B b2(b1); // expected-error {{no matching constructor}}
+  B b2(b1);
+  // expected-error@-1 {{no matching constructor for initialization of 'B'}}
+  // expected-note@#dr111-B {{candidate constructor (the implicit copy constructor) not viable: 1st argument ('const B') would lose const qualifier}}
+  // expected-note@#dr111-B {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
 }
 
 namespace dr112 { // dr112: yes
   struct T { int n; };
   typedef T Arr[1];
 
-  const T a1[1] = {};
+  const T a1[1] = {}; // #dr112-a1
   volatile T a2[1] = {};
-  const Arr a3 = {};
+  const Arr a3 = {}; // #dr112-a3
   volatile Arr a4 = {};
   template<const volatile T*> struct X {};
+  // FIXME: Test this somehow in C++11 and on.
   X<a1> x1;
+  // cxx98-error@-1 {{non-type template argument referring to object 'a1' with internal linkage is a C++11 extension}}
+  // cxx98-note@#dr112-a1 {{non-type template argument refers to object here}}
   X<a2> x2;
   X<a3> x3;
+  // cxx98-error@-1 {{non-type template argument referring to object 'a3' with internal linkage is a C++11 extension}}
+  // cxx98-note@#dr112-a3 {{non-type template argument refers to object here}}
   X<a4> x4;
-#if __cplusplus < 201103L
-  // expected-error@-5 {{internal linkage}} expected-note@-10 {{here}}
-  // expected-error@-4 {{internal linkage}} expected-note@-9 {{here}}
-#else
-  // FIXME: Test this somehow.
-#endif
 }
 
 namespace dr113 { // dr113: yes
   extern void (*p)();
   void f() {
-    no_such_function(); // expected-error {{undeclared}}
+    no_such_function();
+    // expected-error@-1 {{use of undeclared identifier 'no_such_function'}}
     p();
   }
   void g();
@@ -135,31 +151,48 @@ namespace dr113 { // dr113: yes
 
 namespace dr114 { // dr114: yes
   struct A {
-    virtual void f(int) = 0; // expected-note {{unimplemented}}
+    virtual void f(int) = 0; // #dr114-A-f
   };
   struct B : A {
     template<typename T> void f(T);
     void g() { f(0); }
-  } b; // expected-error {{abstract}}
+  } b;
+  // expected-error@-1 {{variable type 'struct B' is an abstract class}}
+  // expected-note@#dr114-A-f {{unimplemented pure virtual method 'f' in 'B'}}
 }
 
 namespace dr115 { // dr115: 3.0
-  template<typename T> int f(T); // expected-note +{{}}
-  template<typename T> int g(T); // expected-note +{{}}
-  template<typename T> int g(T, int); // expected-note +{{}}
+  template<typename T> int f(T); // #dr115-f
+  template<typename T> int g(T); // #dr115-g
+  template<typename T> int g(T, int); // #dr115-g-int
 
-  int k1 = f(&f); // expected-error {{no match}}
+  int k1 = f(&f);
+  // expected-error@-1 {{no matching function for call to 'f'}}
+  // expected-note@#dr115-f {{candidate template ignored: couldn't infer template argument 'T'}}
   int k2 = f(&f<int>);
-  int k3 = f(&g<int>); // expected-error {{no match}}
+  int k3 = f(&g<int>);
+  // expected-error@-1 {{no matching function for call to 'f'}}
+  // expected-note@#dr115-f {{candidate template ignored: couldn't infer template argument 'T'}}
 
   void h() {
-    (void)&f; // expected-error {{address of overloaded function 'f' cannot be cast to type 'void'}}
+    (void)&f;
+    // expected-error@-1 {{address of overloaded function 'f' cannot be cast to type 'void'}}
+    // expected-note@#dr115-f {{candidate function template}}
     (void)&f<int>;
-    (void)&g<int>; // expected-error {{address of overloaded function 'g' cannot be cast to type 'void'}}
-
-    &f; // expected-error {{reference to overloaded function could not be resolved}}
-    &f<int>; // expected-warning {{unused}}
-    &g<int>; // expected-error {{reference to overloaded function could not be resolved}}
+    (void)&g<int>;
+    // expected-error@-1 {{address of overloaded function 'g' cannot be cast to type 'void'}}
+    // expected-note@#dr115-g-int {{candidate function template}}
+    // expected-note@#dr115-g {{candidate function template}}
+
+    &f;
+    // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+    // expected-note@#dr115-f {{possible target for call}}
+    &f<int>;
+    // expected-warning@-1 {{expression result unused}}
+    &g<int>;
+    // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+    // expected-note@#dr115-g-int {{possible target for call}}
+    // expected-note@#dr115-g {{possible target for call}}
   }
 
   struct S {
@@ -168,18 +201,25 @@ namespace dr115 { // dr115: 3.0
     template<typename T> static int g(T, int);
   } s;
 
-  int k4 = f(&s.f); // expected-error {{non-constant pointer to member}}
+  int k4 = f(&s.f);
+  // expected-error@-1 {{cannot create a non-constant pointer to member function}}
   int k5 = f(&s.f<int>);
-  int k6 = f(&s.g<int>); // expected-error {{non-constant pointer to member}}
+  int k6 = f(&s.g<int>);
+  // expected-error@-1 {{cannot create a non-constant pointer to member function}}
 
   void i() {
-    (void)&s.f; // expected-error {{non-constant pointer to member}}
+    (void)&s.f;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
     (void)&s.f<int>;
-    (void)&s.g<int>; // expected-error {{non-constant pointer to member}}
-
-    &s.f; // expected-error {{non-constant pointer to member}}
-    &s.f<int>; // expected-warning {{unused}}
-    &s.g<int>; // expected-error {{non-constant pointer to member}}
+    (void)&s.g<int>;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
+
+    &s.f;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
+    &s.f<int>;
+    // expected-warning@-1 {{expression result unused}}
+    &s.g<int>;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
   }
 
   struct T {
@@ -188,40 +228,58 @@ namespace dr115 { // dr115: 3.0
     template<typename T> int g(T, int);
   } t;
 
-  int k7 = f(&s.f); // expected-error {{non-constant pointer to member}}
+  int k7 = f(&s.f);
+  // expected-error@-1 {{cannot create a non-constant pointer to member function}}
   int k8 = f(&s.f<int>);
-  int k9 = f(&s.g<int>); // expected-error {{non-constant pointer to member}}
+  int k9 = f(&s.g<int>);
+  // expected-error@-1 {{cannot create a non-constant pointer to member function}}
 
   void j() {
-    (void)&s.f; // expected-error {{non-constant pointer to member}}
+    (void)&s.f;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
     (void)&s.f<int>;
-    (void)&s.g<int>; // expected-error {{non-constant pointer to member}}
-
-    &s.f; // expected-error {{non-constant pointer to member}}
-    &s.f<int>; // expected-warning {{unused}}
-    &s.g<int>; // expected-error {{non-constant pointer to member}}
+    (void)&s.g<int>;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
+
+    &s.f;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
+    &s.f<int>;
+    // expected-warning@-1 {{expression result unused}}
+    &s.g<int>;
+    // expected-error@-1 {{cannot create a non-constant pointer to member function}}
   }
 
 #if __cplusplus >= 201103L
   // Special case kicks in only if a template argument list is specified.
-  template<typename T=int> void with_default(); // expected-note +{{}}
-  int k10 = f(&with_default); // expected-error {{no matching function}}
+  template<typename T=int> void with_default(); // #dr115-with-default
+  int k10 = f(&with_default);
+  // expected-error@-1 {{no matching function for call to 'f'}}
+  // expected-note@#dr115-f {{candidate template ignored: couldn't infer template argument 'T'}}
   int k11 = f(&with_default<>);
   void k() {
-    (void)&with_default; // expected-error {{overloaded function}}
+    (void)&with_default;
+    // expected-error@-1 {{address of overloaded function 'with_default' cannot be cast to type 'void'}}
+    // expected-note@#dr115-with-default {{candidate function template}}
     (void)&with_default<>;
-    &with_default; // expected-error {{overloaded function}}
-    &with_default<>; // expected-warning {{unused}}
+    &with_default;
+    // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+    // expected-note@#dr115-with-default {{possible target for call}}
+    &with_default<>;
+    // expected-warning@-1 {{expression result unused}}
   }
 #endif
 }
 
 namespace dr116 { // dr116: yes
   template<int> struct A {};
-  template<int N> void f(A<N>) {} // expected-note {{previous}}
-  template<int M> void f(A<M>) {} // expected-error {{redefinition}}
-  template<typename T> void f(A<sizeof(T)>) {} // expected-note {{previous}}
-  template<typename U> void f(A<sizeof(U)>) {} // expected-error {{redefinition}}
+  template<int N> void f(A<N>) {} // #dr116-f-N
+  template<int M> void f(A<M>) {}
+  // expected-error@-1 {{redefinition of 'f'}}
+  // expected-note@#dr116-f-N {{previous definition is here}}
+  template<typename T> void f(A<sizeof(T)>) {} // #dr116-f-T
+  template<typename U> void f(A<sizeof(U)>) {}
+  // expected-error@-1 {{redefinition of 'f'}}
+  // expected-note@#dr116-f-T {{previous definition is here}}
 }
 
 // dr117: na
@@ -235,7 +293,9 @@ namespace dr121 { // dr121: yes
   };
   template<typename T> struct Z {
     X::Y<T> x;
-    T::Y<T> y; // expected-error +{{}}
+    T::Y<T> y;
+    // expected-error@-1 {{use 'template' keyword to treat 'Y' as a dependent template name}}
+    // cxx98-17-error@-2 {{missing 'typename' prior to dependent type name T::Y; implicit 'typename' is a C++20 extension}}
   };
   Z<X> z;
 }
@@ -249,15 +309,19 @@ namespace dr122 { // dr122: yes
 // dr124: dup 201
 
 // dr125: yes
-struct dr125_A { struct dr125_B {}; }; // expected-note {{here}}
+struct dr125_A { struct dr125_B {}; }; // #dr125_B
 dr125_A::dr125_B dr125_C();
 namespace dr125_B { dr125_A dr125_C(); }
 namespace dr125 {
   struct X {
     friend dr125_A::dr125_B (::dr125_C)(); // ok
     friend dr125_A (::dr125_B::dr125_C)(); // ok
-    friend dr125_A::dr125_B::dr125_C(); // expected-error {{did you mean the constructor name 'dr125_B'?}}
-    // expected-error@-1 {{missing exception specification}}
+    friend dr125_A::dr125_B::dr125_C(); // #dr125_C
+    // expected-error@#dr125_C {{missing return type for function 'dr125_C'; did you mean the constructor name 'dr125_B'?}}
+    // cxx98-error@#dr125_C {{'dr125_B' is missing exception specification 'throw()'}}
+    //   cxx98-note@#dr125_B {{previous declaration is here}}
+    // since-cxx11-error@#dr125_C {{'dr125_B' is missing exception specification 'noexcept'}}
+    //   since-cxx11-note@#dr125_B {{previous declaration is here}}
   };
 }
 
@@ -275,7 +339,6 @@ namespace dr126 { // dr126: partial
   // So, when catching by non-const (or volatile) reference to pointer, we
   // should compare the exception type to the caught type and only accept an
   // exact match.
-#if __cplusplus <= 201402L
   struct C {};
   struct D : C {};
   struct E : private C { friend class A; friend class B; };
@@ -283,53 +346,64 @@ namespace dr126 { // dr126: partial
   struct G : C {};
   struct H : D, G {};
 
+#if __cplusplus <= 201402L
   struct A {
     virtual void cp() throw(C*);
     virtual void dp() throw(C*);
-    virtual void ep() throw(C*); // expected-note {{overridden}}
-    virtual void fp() throw(C*); // expected-note {{overridden}}
+    virtual void ep() throw(C*); // #dr126-ep
+    virtual void fp() throw(C*); // #dr126-fp
     virtual void gp() throw(C*);
-    virtual void hp() throw(C*); // expected-note {{overridden}}
+    virtual void hp() throw(C*); // #dr126-hp
 
     virtual void cr() throw(C&);
     virtual void dr() throw(C&);
-    virtual void er() throw(C&); // expected-note {{overridden}}
-    virtual void fr() throw(C&); // expected-note {{overridden}}
+    virtual void er() throw(C&); // #dr126-er
+    virtual void fr() throw(C&); // #dr126-fr
     virtual void gr() throw(C&);
-    virtual void hr() throw(C&); // expected-note {{overridden}}
+    virtual void hr() throw(C&); // #dr126-hr
 
     virtual void pv() throw(void*);
 
-#if __cplusplus >= 201103L
     virtual void np() throw(C*);
     virtual void npm() throw(int C::*);
-    virtual void nr() throw(C*&); // expected-note {{overridden}}
+    virtual void nr() throw(C*&); // #dr126-nr
     virtual void ncr() throw(C*const&);
-#endif
 
     virtual void ref1() throw(C *const&);
     virtual void ref2() throw(C *);
 
     virtual void v() throw(int);
     virtual void w() throw(const int);
-    virtual void x() throw(int*); // expected-note {{overridden}}
+    virtual void x() throw(int*); // #dr126-x
     virtual void y() throw(const int*);
-    virtual void z() throw(int); // expected-note {{overridden}}
+    virtual void z() throw(int); // #dr126-z
   };
   struct B : A {
     virtual void cp() throw(C*);
     virtual void dp() throw(D*);
-    virtual void ep() throw(E*); // expected-error {{more lax}}
-    virtual void fp() throw(F*); // expected-error {{more lax}}
+    virtual void ep() throw(E*);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-ep {{overridden virtual function is here}}
+    virtual void fp() throw(F*);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-fp {{overridden virtual function is here}}
     virtual void gp() throw(G*);
-    virtual void hp() throw(H*); // expected-error {{more lax}}
+    virtual void hp() throw(H*);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-hp {{overridden virtual function is here}}
 
     virtual void cr() throw(C&);
     virtual void dr() throw(D&);
-    virtual void er() throw(E&); // expected-error {{more lax}}
-    virtual void fr() throw(F&); // expected-error {{more lax}}
+    virtual void er() throw(E&);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-er {{overridden virtual function is here}}
+    virtual void fr() throw(F&);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-fr {{overridden virtual function is here}}
     virtual void gr() throw(G&);
-    virtual void hr() throw(H&); // expected-error {{more lax}}
+    virtual void hr() throw(H&);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-hr {{overridden virtual function is here}}
 
     virtual void pv() throw(C*);
 
@@ -337,22 +411,29 @@ namespace dr126 { // dr126: partial
     using nullptr_t = decltype(nullptr);
     virtual void np() throw(nullptr_t);
     virtual void npm() throw(nullptr_t&);
-    virtual void nr() throw(nullptr_t); // expected-error {{more lax}}
+    virtual void nr() throw(nullptr_t);
+    // cxx11-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx11-14-note@#dr126-nr {{overridden virtual function is here}}
     virtual void ncr() throw(nullptr_t);
-#endif
+#endif // __cplusplus >= 201103L
 
     virtual void ref1() throw(D *const &);
     virtual void ref2() throw(D *);
 
     virtual void v() throw(const int);
     virtual void w() throw(int);
-    virtual void x() throw(const int*); // expected-error {{more lax}}
+    virtual void x() throw(const int*);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-x {{overridden virtual function is here}}
     virtual void y() throw(int*); // ok
-    virtual void z() throw(long); // expected-error {{more lax}}
+    virtual void z() throw(long);
+    // cxx98-14-error@-1 {{exception specification of overriding function is more lax than base version}}
+    // cxx98-14-note@#dr126-z {{overridden virtual function is here}}
   };
-#else
-  void f() throw(int); // expected-error {{ISO C++17 does not allow}} expected-note {{use 'noexcept}}
-#endif
+#endif // __cplusplus <= 201402L
+  void f() throw(int);
+  // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}}
+  // since-cxx17-note@-2 {{use 'noexcept(false)' instead}}
 }
 
 namespace dr127 { // dr127: yes
@@ -360,11 +441,16 @@ namespace dr127 { // dr127: yes
   template<typename T> struct A {
     A() { throw 0; }
     void *operator new(size_t, const char * = 0);
-    void operator delete(void *, const char *) { T::error; } // expected-error 2{{no members}}
+    void operator delete(void *, const char *) { T::error; } // #dr127-delete-const-char
+    // expected-error@#dr127-delete-const-char {{type 'void' cannot be used prior to '::' because it has no members}}
+    // expected-note@#dr127-p {{in instantiation of member function 'dr127::A<void>::operator delete' requested here}}
+
+    // expected-error@#dr127-delete-const-char {{type 'int' cannot be used prior to '::' because it has no members}}
+    // expected-note@#dr127-q {{in instantiation of member function 'dr127::A<int>::operator delete' requested here}}
     void operator delete(void *) { T::error; }
   };
-  A<void> *p = new A<void>; // expected-note {{instantiat}}
-  A<int> *q = new ("") A<int>; // expected-note {{instantiat}}
+  A<void> *p = new A<void>; // #dr127-p
+  A<int> *q = new ("") A<int>; // #dr127-q
 }
 
 namespace dr128 { // dr128: yes
@@ -401,36 +487,50 @@ namespace dr135 { // dr135: yes
 }
 
 namespace dr136 { // dr136: 3.4
-  void f(int, int, int = 0); // expected-note {{previous declaration is here}}
-  void g(int, int, int); // expected-note {{previous declaration is here}}
+  void f(int, int, int = 0); // #dr136-f
+  void g(int, int, int); // #dr136-g
   struct A {
-    friend void f(int, int = 0, int); // expected-error {{friend declaration specifying a default argument must be the only declaration}}
-    friend void g(int, int, int = 0); // expected-error {{friend declaration specifying a default argument must be the only declaration}}
-    friend void h(int, int, int = 0); // expected-error {{friend declaration specifying a default argument must be a definition}}
-    friend void i(int, int, int = 0) {} // expected-note {{previous declaration is here}}
+    friend void f(int, int = 0, int);
+    // expected-error@-1 {{friend declaration specifying a default argument must be the only declaration}}
+    // expected-note@#dr136-f {{previous declaration is here}}
+    friend void g(int, int, int = 0);
+    // expected-error@-1 {{friend declaration specifying a default argument must be the only declaration}}
+    // expected-note@#dr136-g {{previous declaration is here}}
+    friend void h(int, int, int = 0);
+    // expected-error@-1 {{friend declaration specifying a default argument must be a definition}}
+    friend void i(int, int, int = 0) {} // #dr136-A-i
     friend void j(int, int, int = 0) {}
     operator int();
   };
-  void i(int, int, int); // expected-error {{friend declaration specifying a default argument must be the only declaration}}
+  void i(int, int, int);
+  // expected-error@-1 {{friend declaration specifying a default argument must be the only declaration}}
+  // expected-note@#dr136-A-i {{previous declaration is here}}
   void q() {
     j(A(), A()); // ok, has default argument
   }
-  extern "C" void k(int, int, int, int); // expected-note 2{{previous declaration is here}}
+  extern "C" void k(int, int, int, int); // #dr136-k
   namespace NSA {
   struct A {
-    friend void dr136::k(int, int, int, int = 0); // expected-error {{friend declaration specifying a default argument must be the only declaration}}
+    friend void dr136::k(int, int, int, int = 0);
+    // expected-error@-1 {{friend declaration specifying a default argument must be the only declaration}}
+    // expected-note@#dr136-k {{previous declaration is here}}
   };
   }
   namespace NSB {
   struct A {
-    friend void dr136::k(int, int, int = 0, int); // expected-error {{missing default argument on parameter}} expected-error {{must be the only declaration}}
+    friend void dr136::k(int, int, int = 0, int); // #dr136-friend-k
+    // expected-error@#dr136-friend-k {{friend declaration specifying a default argument must be the only declaration}}
+    //   expected-note@#dr136-k {{previous declaration is here}}
+    // expected-error@#dr136-friend-k {{missing default argument on parameter}}
   };
   }
   struct B {
-    void f(int); // expected-note {{previous declaration is here}}
+    void f(int); // #dr136-B-f
   };
   struct C {
-    friend void B::f(int = 0); // expected-error {{friend declaration specifying a default argument must be the only declaration}}
+    friend void B::f(int = 0);
+    // expected-error@-1 {{friend declaration specifying a default argument must be the only declaration}}
+    // expected-note@#dr136-B-f {{previous declaration is here}}
   };
 }
 
@@ -440,13 +540,18 @@ namespace dr137 { // dr137: yes
   extern volatile void *vp;
   extern const volatile void *cvp;
   int *q = static_cast<int*>(p);
-  int *qc = static_cast<int*>(cp); // expected-error {{casts away qualifiers}}
-  int *qv = static_cast<int*>(vp); // expected-error {{casts away qualifiers}}
-  int *qcv = static_cast<int*>(cvp); // expected-error {{casts away qualifiers}}
+  int *qc = static_cast<int*>(cp);
+  // expected-error@-1 {{static_cast from 'const void *' to 'int *' casts away qualifiers}}
+  int *qv = static_cast<int*>(vp);
+  // expected-error@-1 {{static_cast from 'volatile void *' to 'int *' casts away qualifiers}}
+  int *qcv = static_cast<int*>(cvp);
+  // expected-error@-1 {{static_cast from 'const volatile void *' to 'int *' casts away qualifiers}}
   const int *cq = static_cast<const int*>(p);
   const int *cqc = static_cast<const int*>(cp);
-  const int *cqv = static_cast<const int*>(vp); // expected-error {{casts away qualifiers}}
-  const int *cqcv = static_cast<const int*>(cvp); // expected-error {{casts away qualifiers}}
+  const int *cqv = static_cast<const int*>(vp);
+  // expected-error@-1 {{static_cast from 'volatile void *' to 'const int *' casts away qualifiers}}
+  const int *cqcv = static_cast<const int*>(cvp);
+  // expected-error@-1 {{static_cast from 'const volatile void *' to 'const int *' casts away qualifiers}}
   const volatile int *cvq = static_cast<const volatile int*>(p);
   const volatile int *cvqc = static_cast<const volatile int*>(cp);
   const volatile int *cvqv = static_cast<const volatile int*>(vp);
@@ -455,9 +560,11 @@ namespace dr137 { // dr137: yes
 
 namespace dr139 { // dr139: yes
   namespace example1 {
-    typedef int f; // expected-note {{previous}}
+    typedef int f; // #dr139-typedef-f
     struct A {
-      friend void f(A &); // expected-error {{different kind of symbol}}
+      friend void f(A &);
+      // expected-error@-1 {{redefinition of 'f' as different kind of symbol}}
+      // expected-note@#dr139-typedef-f {{previous definition is here}}
     };
   }
 
@@ -474,35 +581,41 @@ namespace dr139 { // dr139: yes
 }
 
 namespace dr140 { // dr140: yes
-  void f(int *const) {} // expected-note {{previous}}
-  void f(int[3]) {} // expected-error {{redefinition}}
+  void f(int *const) {} // #dr140-f-first
+  void f(int[3]) {}
+  // expected-error@-1 {{redefinition of 'f'}}
+  // expected-note@#dr140-f-first {{previous definition is here}}
   void g(const int);
   void g(int n) { n = 2; }
 }
 
 namespace dr141 { // dr141: 3.1
   template<typename T> void f();
-  template<typename T> struct S { int n; }; // expected-note {{'::dr141::S<int>::n' declared here}}
+  template<typename T> struct S { int n; }; // #dr141-S
   struct A : S<int> {
     template<typename T> void f();
-    template<typename T> struct S {};
+    template<typename T> struct S {}; // #dr141-A-S
   } a;
   struct B : S<int> {} b;
   void g() {
     a.f<int>();
-    (void)a.S<int>::n; // expected-error {{no member named 'n' in 'dr141::A::S<int>'; did you mean '::dr141::S<int>::n'?}}
-#if __cplusplus < 201103L
-    // expected-error@-2 {{ambiguous}}
-    // expected-note@-11 {{lookup from the current scope}}
-    // expected-note@-9 {{lookup in the object type}}
-#endif
-    b.f<int>(); // expected-error {{no member}} expected-error +{{}}
+    (void)a.S<int>::n; // #dr141-a
+    // cxx98-error@#dr141-a {{lookup of 'S' in member access expression is ambiguous; using member of 'struct A'}}
+    //   cxx98-note@#dr141-A-S {{lookup in the object type 'struct A' refers here}}
+    //   cxx98-note@#dr141-S {{lookup from the current scope refers here}}
+    // expected-error@#dr141-a {{no member named 'n' in 'dr141::A::S<int>'; did you mean '::dr141::S<int>::n'?}}
+    //   expected-note@#dr141-S {{'::dr141::S<int>::n' declared here}}
+    // FIXME: we issue a useful diagnostic first, then some bogus ones.
+    b.f<int>();
+    // expected-error@-1 {{no member named 'f' in 'dr141::B'}}
+    // expected-error@-2 +{{}}
     (void)b.S<int>::n;
   }
   template<typename T> struct C {
     T t;
     void g() {
-      t.f<int>(); // expected-error {{use 'template'}}
+      t.f<int>();
+      // expected-error@-1 {{use 'template' keyword to treat 'f' as a dependent template name}}
     }
     void h() {
       (void)t.S<int>::n; // ok
@@ -519,28 +632,53 @@ namespace dr141 { // dr141: 3.1
 }
 
 namespace dr142 { // dr142: 2.8
-  class B { // expected-note +{{here}}
+  class B { // #dr142-B
   public:
-    int mi; // expected-note +{{here}}
-    static int si; // expected-note +{{here}}
+    int mi; // #dr142-B-mi
+    static int si; // #dr142-B-si
   };
-  class D : private B { // expected-note +{{here}}
+  class D : private B { // #dr142-D
   };
   class DD : public D {
     void f();
   };
   void DD::f() {
-    mi = 3; // expected-error {{private member}}
-    si = 3; // expected-error {{private member}}
-    B b_old; // expected-error {{private member}}
+    mi = 3;
+    // expected-error@-1 {{'mi' is a private member of 'dr142::B'}}
+    // expected-note@#dr142-D {{constrained by private inheritance here}}
+    // expected-note@#dr142-B-mi {{member is declared here}}
+    si = 3;
+    // expected-error@-1 {{'si' is a private member of 'dr142::B'}}
+    // expected-note@#dr142-D {{constrained by private inheritance here}}
+    // expected-note@#dr142-B-si {{member is declared here}}
+    B b_old;
+    // expected-error@-1 {{'B' is a private member of 'dr142::B'}}
+    // expected-note@#dr142-D {{constrained by private inheritance here}}
+    // expected-note@#dr142-B {{member is declared here}}
     dr142::B b;
     b.mi = 3;
     b.si = 3;
-    B::si = 3; // expected-error {{private member}}
+    B::si = 3;
+    // expected-error@-1 {{'B' is a private member of 'dr142::B'}}
+    // expected-note@#dr142-D {{constrained by private inheritance here}}
+    // expected-note@#dr142-B {{member is declared here}}
     dr142::B::si = 3;
-    B *bp1_old = this; // expected-error {{private member}} expected-error {{private base class}}
-    dr142::B *bp1 = this; // expected-error {{private base class}}
-    B *bp2_old = (B*)this; // expected-error 2{{private member}}
+    B *bp1_old = this; // #dr142-bp1_old
+    // expected-error@#dr142-bp1_old {{'B' is a private member of 'dr142::B'}}
+    //   expected-note@#dr142-D {{constrained by private inheritance here}}
+    //   expected-note@#dr142-B {{member is declared here}}
+    // expected-error@#dr142-bp1_old {{cannot cast 'dr142::DD' to its private base class 'B'}}
+    //   expected-note@#dr142-D {{declared private here}}
+    dr142::B *bp1 = this;
+    // expected-error@-1 {{cannot cast 'dr142::DD' to its private base class 'dr142::B'}}
+    // expected-note@#dr142-D {{declared private here}}
+    B *bp2_old = (B*)this; // #dr142-bp2_old
+    // expected-error@#dr142-bp2_old {{'B' is a private member of 'dr142::B'}}
+    //   expected-note@#dr142-D {{constrained by private inheritance here}}
+    //   expected-note@#dr142-B {{member is declared here}}
+    // expected-error@#dr142-bp2_old {{'B' is a private member of 'dr142::B'}}
+    //   expected-note@#dr142-D {{constrained by private inheritance here}}
+    //   expected-note@#dr142-B {{member is declared here}}
     dr142::B *bp2 = (dr142::B*)this;
     bp2->mi = 3;
   }
@@ -553,19 +691,19 @@ namespace dr143 { // dr143: yes
     struct X { friend void B::f(X); };
   }
   void g(A::X x) {
-    f(x); // expected-error {{undeclared identifier 'f'}}
+    f(x);
+    // expected-error@-1 {{use of undeclared identifier 'f'}}
   }
 }
 
 namespace dr145 { // dr145: yes
   void f(bool b) {
-#if __cplusplus <= 201402L
-    ++b; // expected-warning {{deprecated}}
-    b++; // expected-warning {{deprecated}}
-#else
-    ++b; // expected-error {{increment}}
-    b++; // expected-error {{increment}}
-#endif
+    ++b;
+    // cxx98-14-warning@-1 {{incrementing expression of type bool is deprecated and incompatible with C++17}}
+    // since-cxx17-error@-2 {{ISO C++17 does not allow incrementing expression of type bool}}
+    b++;
+    // cxx98-14-warning@-1 {{incrementing expression of type bool is deprecated and incompatible with C++17}}
+    // since-cxx17-error@-2 {{ISO C++17 does not allow incrementing expression of type bool}}
   }
 }
 
@@ -576,13 +714,15 @@ namespace dr147 { // dr147: yes
     };
     // Per core issue 1435, this is ill-formed because A<int>::A<int> does not
     // name the injected-class-name. (A<int>::A does, though.)
-    template<> template<> A<int>::A<int>(int) {} // expected-error {{out-of-line constructor for 'A' cannot have template arguments}}
+    template<> template<> A<int>::A<int>(int) {}
+    // expected-error@-1 {{out-of-line constructor for 'A' cannot have template arguments}}
     template<> template<> A<float>::A(float) {}
   }
   namespace example2 {
     struct A { A(); };
     struct B : A { B(); };
-    A::A a1; // expected-error {{is a constructor}}
+    A::A a1;
+    // expected-error@-1 {{qualified reference to 'A' is a constructor name rather than a type in this context}}
     B::A a2;
   }
   namespace example3 {
@@ -590,7 +730,8 @@ namespace dr147 { // dr147: yes
       template<typename T> A(T);
       static A a;
     };
-    template<> A<int>::A<int>(A<int>::a); // expected-error {{is a constructor}}
+    template<> A<int>::A<int>(A<int>::a);
+    // expected-error@-1 {{qualified reference to 'A' is a constructor name rather than a template name in this context}}
   }
 }
 
@@ -616,24 +757,28 @@ namespace dr151 { // dr151: 3.1
 
 namespace dr152 { // dr152: yes
   struct A {
-    A(); // expected-note 0-2{{not viable}}
-    explicit A(const A&); // expected-note 1-2{{not a candidate}}
+    A(); // #dr152-A-ctor
+    explicit A(const A&); // #dr152-A-explicit-ctor
   };
   A a1 = A();
-#if __cplusplus <= 201402L
-  // expected-error@-2 {{no matching constructor}}
-#endif
+  // cxx98-14-error@-1 {{no matching constructor for initialization of 'A'}}
+  // cxx98-14-note@#dr152-A-explicit-ctor {{explicit constructor is not a candidate}}
+  // cxx98-14-note@#dr152-A-ctor {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
   A a2((A()));
 
   A &f();
-  A a3 = f(); // expected-error {{no matching constructor}}
+  A a3 = f();
+  // expected-error@-1 {{no matching constructor for initialization of 'A'}}
+  // expected-note@#dr152-A-explicit-ctor {{explicit constructor is not a candidate}}
+  // expected-note@#dr152-A-ctor {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
   A a4(f());
 }
 
 // dr153: na
 
 namespace dr154 { // dr154: yes
-  union { int a; }; // expected-error {{must be declared 'static'}}
+  union { int a; };
+  // expected-error@-1 {{anonymous unions at namespace or global scope must be declared 'static'}}
   namespace {
     union { int b; };
   }
@@ -641,7 +786,8 @@ namespace dr154 { // dr154: yes
 }
 
 namespace dr155 { // dr155: dup 632
-  struct S { int n; } s = { { 1 } }; // expected-warning {{braces around scalar initializer}}
+  struct S { int n; } s = { { 1 } };
+  // expected-warning@-1 {{braces around scalar initializer}}
 }
 
 // dr158 is in its own file.
@@ -649,7 +795,8 @@ namespace dr155 { // dr155: dup 632
 namespace dr159 { // dr159: 3.5
   namespace X { void f(); }
   void f();
-  void dr159::f() {} // expected-warning {{extra qualification}}
+  void dr159::f() {}
+  // expected-warning@-1 {{extra qualification on member 'f'}}
   void dr159::X::f() {}
 }
 
@@ -658,9 +805,9 @@ namespace dr159 { // dr159: 3.5
 namespace dr161 { // dr161: 3.1
   class A {
   protected:
-    struct B { int n; } b; // expected-note 2{{here}}
+    struct B { int n; } b; // #dr161-B
     static B bs;
-    void f(); // expected-note {{here}}
+    void f(); // #dr161-f
     static void sf();
   };
   struct C : A {};
@@ -669,13 +816,19 @@ namespace dr161 { // dr161: 3.1
       (void)b.n;
       B b1;
       C::B b2; // ok, accessible as a member of A
-      (void)&C::b; // expected-error {{protected}}
+      (void)&C::b;
+      // expected-error@-1 {{'b' is a protected member of 'dr161::A'}}
+      // expected-note@#dr161-B {{declared protected here}}
       (void)&C::bs;
-      (void)c.b; // expected-error {{protected}}
+      (void)c.b;
+      // expected-error@-1 {{'b' is a protected member of 'dr161::A'}}
+      // expected-note@#dr161-B {{declared protected here}}
       (void)c.bs;
       f();
       sf();
-      c.f(); // expected-error {{protected}}
+      c.f();
+      // expected-error@-1 {{'f' is a protected member of 'dr161::A'}}
+      // expected-note@#dr161-f {{declared protected here}}
       c.sf();
       A::f();
       D::f();
@@ -692,13 +845,17 @@ namespace dr162 { // dr162: no
     static int &f(int);
 
     void g() {
-      int &a = (&A::f)(0); // FIXME: expected-error {{could not be resolved}}
-      char &b = (&A::f)('0'); // expected-error {{could not be resolved}}
+      int &a = (&A::f)(0);
+      // FIXME: expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+      char &b = (&A::f)('0');
+      // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
     }
   };
 
-  int &c = (&A::f)(0); // FIXME: expected-error {{could not be resolved}}
-  char &d = (&A::f)('0'); // expected-error {{could not be resolved}}
+  int &c = (&A::f)(0);
+  // FIXME: expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+  char &d = (&A::f)('0');
+  // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
 }
 
 // dr163: na
@@ -729,7 +886,10 @@ namespace dr166 { // dr166: 2.9
 
   template<typename T> int f(T t) { return t.n; }
   int g(A::X);
-  template<typename T> int h(T t) { return t.n; } // expected-error {{private}}
+  template<typename T> int h(T t) { return t.n; }
+  // expected-error@-1 {{'n' is a private member of 'dr166::A::X'}}
+  // expected-note@#dr166-h-instantiation {{in instantiation of function template specialization 'dr166::h<dr166::A::X>' requested here}}
+  // expected-note@#dr166-X-n {{implicitly declared private here}}
   int i(A::X);
 
   namespace A {
@@ -738,7 +898,7 @@ namespace dr166 { // dr166: 2.9
       friend int dr166::g(X);
       friend int h(X);
       friend int i(X);
-      int n; // expected-note 2{{here}}
+      int n; // #dr166-X-n
     };
 
     int h(X x) { return x.n; }
@@ -747,8 +907,10 @@ namespace dr166 { // dr166: 2.9
 
   template int f(A::X);
   int g(A::X x) { return x.n; }
-  template int h(A::X); // expected-note {{instantiation}}
-  int i(A::X x) { return x.n; } // expected-error {{private}}
+  template int h(A::X); // #dr166-h-instantiation
+  int i(A::X x) { return x.n; }
+  // expected-error@-1 {{'n' is a private member of 'dr166::A::X'}}
+  // expected-note@#dr166-X-n {{implicitly declared private here}}
 }
 
 // dr167: sup 1012
@@ -768,23 +930,29 @@ namespace dr169 { // dr169: yes
   struct B {
     template<typename> struct C;
     template<typename> void f();
-    template<typename> static int n; // expected-error 0-1{{extension}}
+    template<typename> static int n;
+    // cxx98-11-error@-1 {{variable templates are a C++14 extension}}
   };
   struct D : A<int>, B {
     using A<int>::n;
-    using B::C<int>; // expected-error {{using declaration cannot refer to a template specialization}}
-    using B::f<int>; // expected-error {{using declaration cannot refer to a template specialization}}
-    using B::n<int>; // expected-error {{using declaration cannot refer to a template specialization}}
+    using B::C<int>;
+    // expected-error@-1 {{using declaration cannot refer to a template specialization}}
+    using B::f<int>;
+    // expected-error@-1 {{using declaration cannot refer to a template specialization}}
+    using B::n<int>;
+    // expected-error@-1 {{using declaration cannot refer to a template specialization}}
   };
 }
 
 namespace { // dr171: yes
   int dr171a;
 }
-int dr171b; // expected-note {{here}}
+int dr171b; // #dr171b-int
 namespace dr171 {
   extern "C" void dr171a();
-  extern "C" void dr171b(); // expected-error {{conflicts}}
+  extern "C" void dr171b();
+  // expected-error@-1 {{declaration of 'dr171b' with C language linkage conflicts with declaration in global scope}}
+  // expected-note@#dr171b-int {{declared in global scope here}}
 }
 
 namespace dr172 { // dr172: yes
@@ -810,12 +978,14 @@ namespace dr172 { // dr172: yes
   int check6a[sizeof(d) == sizeof(unsigned long) ? 1 : -1];
   int check6b[-d > 0 ? 1 : -1];
 
-  enum { e = (unsigned long long)-1 / 2 }; // expected-error 0-1{{extension}}
-  int check7a[sizeof(e) == sizeof(long) ? 1 : -1]; // expected-error 0-1{{extension}}
+  enum { e = (unsigned long long)-1 / 2 };
+  // cxx98-error@-1 {{'long long' is a C++11 extension}}
+  int check7a[sizeof(e) == sizeof(long) ? 1 : -1];
   int check7b[-e < 0 ? 1 : -1];
 
-  enum { f = (unsigned long long)-1 / 2 + 1 }; // expected-error 0-1{{extension}}
-  int check8a[sizeof(f) == sizeof(unsigned long) ? 1 : -1]; // expected-error 0-1{{extension}}
+  enum { f = (unsigned long long)-1 / 2 + 1 };
+  // cxx98-error@-1 {{'long long' is a C++11 extension}}
+  int check8a[sizeof(f) == sizeof(unsigned long) ? 1 : -1];
   int check8b[-f > 0 ? 1 : -1];
 }
 
@@ -828,10 +998,13 @@ namespace dr173 { // dr173: yes
 // dr174: sup 1012
 
 namespace dr175 { // dr175: 2.8
-  struct A {}; // expected-note {{here}}
-  struct B : private A {}; // expected-note {{constrained by private inheritance}}
+  struct A {}; // #dr175-A
+  struct B : private A {}; // #dr175-B
   struct C : B {
-    A a; // expected-error {{private}}
+    A a;
+    // expected-error@-1 {{'A' is a private member of 'dr175::A'}}
+    // expected-note@#dr175-B {{constrained by private inheritance here}}
+    // expected-note@#dr175-A {{member is declared here}}
     dr175::A b;
   };
 }
@@ -840,12 +1013,14 @@ namespace dr176 { // dr176: 3.1
   template<typename T> class Y;
   template<> class Y<int> {
     void f() {
-      typedef Y A; // expected-note {{here}}
-      typedef Y<char> A; // expected-error {{different types ('Y<char>' vs 'Y<int>')}}
+      typedef Y A; // #dr176-A-first
+      typedef Y<char> A;
+      // expected-error@-1 {{typedef redefinition with different types ('Y<char>' vs 'Y<int>')}}
+      // expected-note@#dr176-A-first {{previous definition is here}}
     }
   };
 
-  template<typename T> struct Base {}; // expected-note 2{{found}}
+  template<typename T> struct Base {}; // #dr176-Base
   template<typename T> struct Derived : public Base<T> {
     void f() {
       typedef typename Derived::template Base<T> A;
@@ -855,35 +1030,44 @@ namespace dr176 { // dr176: 3.1
   template struct Derived<int>;
 
   template<typename T> struct Derived2 : Base<int>, Base<char> {
-    typename Derived2::Base b; // expected-error {{found in multiple base classes}}
+    typename Derived2::Base b;
+    // expected-error@-1 {{member 'Base' found in multiple base classes of different types}}
+    // expected-note@#dr176-Base {{member type 'dr176::Base<int>' found by ambiguous name lookup}}
+    // expected-note@#dr176-Base {{member type 'dr176::Base<char>' found by ambiguous name lookup}}
     typename Derived2::Base<double> d;
   };
 
-  template<typename T> class X { // expected-note {{here}}
+  template<typename T> class X { // #dr176-X
     X *p1;
     X<T> *p2;
     X<int> *p3;
-    dr176::X *p4; // expected-error {{requires template arguments}}
+    dr176::X *p4; // #dr176-p4
+    // cxx98-14-error@#dr176-p4 {{use of class template 'dr176::X' requires template arguments}}
+    //  cxx98-14-note@#dr176-X {{template is declared here}}
+    // since-cxx17-error@#dr176-p4 {{use of class template 'X' requires template arguments; argument deduction not allowed in non-static class member}}
+    //  since-cxx17-note@#dr176-X {{template is declared here}}
   };
 }
 
 namespace dr177 { // dr177: yes
   struct B {};
   struct A {
-    A(A &); // expected-note 0-1{{not viable: expects an lvalue}}
-    A(const B &); // expected-note 0-1{{not viable: no known conversion from 'A' to}}
+    A(A &); // #dr177-A-copy-ctor
+    A(const B &); // #dr177-A-ctor-from-B
   };
   B b;
   A a = b;
-#if __cplusplus <= 201402L
-  // expected-error@-2 {{no viable constructor copying variable}}
-#endif
+  // cxx98-14-error@-1 {{no viable constructor copying variable of type 'A'}}
+  // cxx98-14-note@#dr177-A-copy-ctor {{candidate constructor not viable: expects an lvalue for 1st argument}}
+  // cxx98-14-note@#dr177-A-ctor-from-B {{candidate constructor not viable: no known conversion from 'A' to 'const B &' for 1st argument}}
 
-  struct C { C(C&); }; // expected-note {{not viable: expects an lvalue for 1st argument}}
+  struct C { C(C&); }; // #dr177-C-copy-ctor
   struct D : C {};
   struct E { operator D(); };
   E e;
-  C c = e; // expected-error {{no viable constructor copying variable of type 'D'}}
+  C c = e;
+  // expected-error@-1 {{no viable constructor copying variable of type 'D'}}
+  // expected-note@#dr177-C-copy-ctor {{candidate constructor not viable: expects an lvalue for 1st argument}}
 }
 
 namespace dr178 { // dr178: yes
@@ -901,7 +1085,8 @@ namespace dr178 { // dr178: yes
 
 namespace dr179 { // dr179: yes
   void f();
-  int n = &f - &f; // expected-error {{arithmetic on pointers to the function type 'void ()'}}
+  int n = &f - &f;
+  // expected-error@-1 {{arithmetic on pointers to the function type 'void ()'}}
 }
 
 namespace dr180 { // dr180: 2.8
@@ -916,8 +1101,10 @@ namespace dr180 { // dr180: 2.8
 
 namespace dr181 { // dr181: yes
   namespace X {
-    template <template X<class T> > struct A { }; // expected-error +{{}}
-    template <template X<class T> > void f(A<X>) { } // expected-error +{{}}
+    template <template X<class T> > struct A { };
+    // expected-error@-1 +{{}}
+    template <template X<class T> > void f(A<X>) { }
+    // expected-error@-1 +{{}}
   }
 
   namespace Y {
@@ -955,11 +1142,8 @@ namespace dr183 { // dr183: sup 382
     typedef int X;
   };
   template<> struct A<int> {
-#if __cplusplus <= 199711
-    typename B<int>::X x; // expected-error {{'typename' occurs outside of a template}}
-#else
     typename B<int>::X x;
-#endif
+    // cxx98-error@-1 {{'typename' occurs outside of a template}}
   };
 }
 
@@ -971,8 +1155,10 @@ namespace dr184 { // dr184: yes
     void g();
   };
 
-  template<template<typename TT> class T> void A<T>::f() { // expected-note {{here}}
-    T<> t; // expected-error {{too few template arguments}}
+  template<template<typename TT> class T> void A<T>::f() { // #dr184-T
+    T<> t;
+    // expected-error@-1 {{too few template arguments for template template parameter 'T'}}
+    // expected-note@#dr184-T {{template is declared here}}
   }
 
   template<template<typename TT = char> class T> void A<T>::g() {
@@ -1035,10 +1221,12 @@ namespace dr191 { // dr191: yes
 namespace dr194 { // dr194: yes
   struct A {
     A();
-    void A(); // expected-error {{constructor cannot have a return type}}
+    void A();
+    // expected-error@-1 {{constructor cannot have a return type}}
   };
   struct B {
-    void B(); // expected-error {{constructor cannot have a return type}}
+    void B();
+    // expected-error@-1 {{constructor cannot have a return type}}
     B();
   };
   struct C {
@@ -1048,8 +1236,10 @@ namespace dr194 { // dr194: yes
 
 namespace dr195 { // dr195: yes
   void f();
-  int *p = (int*)&f; // expected-error 0-1{{extension}}
-  void (*q)() = (void(*)())&p; // expected-error 0-1{{extension}}
+  int *p = (int*)&f;
+  // cxx98-error@-1 {{cast between pointer-to-function and pointer-to-object is an extension}}
+  void (*q)() = (void(*)())&p;
+  // cxx98-error@-1 {{cast between pointer-to-function and pointer-to-object is an extension}}
 }
 
 namespace dr197 { // dr197: yes
@@ -1057,8 +1247,11 @@ namespace dr197 { // dr197: yes
 
   template <class T> void g(T t) {
     char &a = f(1);
-    char &b = f(T(1)); // expected-error {{unrelated type 'int'}}
-    char &c = f(t); // expected-error {{unrelated type 'int'}}
+    char &b = f(T(1));
+    // expected-error@-1 {{non-const lvalue reference to type 'char' cannot bind to a value of unrelated type 'int'}}
+    // expected-note@#dr197-g-e-call {{in instantiation of function template specialization 'dr197::g<dr197::E>' requested here}}
+    char &c = f(t);
+    // expected-error@-1 {{non-const lvalue reference to type 'char' cannot bind to a value of unrelated type 'int'}}
   }
 
   void f(int);
@@ -1069,7 +1262,7 @@ namespace dr197 { // dr197: yes
   void h() {
     g('a');
     g(2);
-    g(e); // expected-note {{in instantiation of}}
+    g(e); // #dr197-g-e-call
   }
 }
 
@@ -1078,9 +1271,7 @@ namespace dr198 { // dr198: yes
     int n;
     struct B {
       int m[sizeof(n)];
-#if __cplusplus < 201103L
-      // expected-error@-2 {{invalid use of non-static data member}}
-#endif
+      // cxx98-error@-1 {{invalid use of non-static data member 'n'}}
       int f() { return n; }
       // expected-error@-1 {{use of non-static data member 'n' of 'A' from nested type 'B'}}
     };
@@ -1089,17 +1280,13 @@ namespace dr198 { // dr198: yes
   };
   struct A::C {
     int m[sizeof(n)];
-#if __cplusplus < 201103L
-    // expected-error@-2 {{invalid use of non-static data member}}
-#endif
+    // cxx98-error@-1 {{invalid use of non-static data member 'n'}}
     int f() { return n; }
     // expected-error@-1 {{use of non-static data member 'n' of 'A' from nested type 'C'}}
   };
   struct A::D : A {
     int m[sizeof(n)];
-#if __cplusplus < 201103L
-    // expected-error@-2 {{invalid use of non-static data member}}
-#endif
+    // cxx98-error@-1 {{invalid use of non-static data member 'n'}}
     int f() { return n; }
   };
 }

From f19571ee781de932390e8983267263f504e99e1f Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Fri, 1 Dec 2023 09:02:16 -0500
Subject: [PATCH 40/72] [libc++] Revert "Compile MSAN/TSAN failing test with
 -O1 (#73555)"

This reverts commit 61aef978d6ab1553c48bbd9bf807a277b22451c1, which
broke the CI on GCC.
---
 .../sequences/deque/deque.modifiers/insert_iter_iter.pass.cpp | 4 ----
 .../rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp        | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/libcxx/test/std/containers/sequences/deque/deque.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.modifiers/insert_iter_iter.pass.cpp
index 4a694e0dd71c86..946c2cfabf02b6 100644
--- a/libcxx/test/std/containers/sequences/deque/deque.modifiers/insert_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/deque/deque.modifiers/insert_iter_iter.pass.cpp
@@ -9,10 +9,6 @@
 // REQUIRES: long_tests
 // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
 
-// This test chokes on the sanitizers during CI runs. It appears we can address most of this by simply enabling optimizations.
-// ADDITIONAL_COMPILE_FLAGS(msan): -O1
-// ADDITIONAL_COMPILE_FLAGS(tsan): -O1
-
 // <deque>
 
 // template <class InputIterator>
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp
index 9ab8b6f4274924..d98a73d296668b 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp
@@ -8,10 +8,6 @@
 //
 // REQUIRES: long_tests
 
-// This test is super slow, in particular with msan or tsan. In order to avoid timeouts and to
-// spend less time waiting for this particular test to complete we compile with optimizations.
-// ADDITIONAL_COMPILE_FLAGS: -O1
-
 // <random>
 
 // template<class IntType = int>

From f42b7615b862bb5f77981f619f92877eb20adf54 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Fri, 1 Dec 2023 14:24:36 +0000
Subject: [PATCH 41/72] [mlir][Vector] Add fold transpose(shape_cast) ->
 shape_cast (#73951)

This folds transpose(shape_cast) into a new shape_cast, when the
transpose just permutes a unit dim from the result of the shape_cast.

Example:

```
%0 = vector.shape_cast %vec : vector<[4]xf32> to vector<[4]x1xf32>
%1 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
```

Folds to:
```
%0 = vector.shape_cast %vec : vector<[4]xf32> to vector<1x[4]xf32>
```

This is an (alternate) fix for lowering matmuls to ArmSME.
---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp   | 47 +++++++++++++++++++++-
 mlir/test/Dialect/Vector/canonicalize.mlir | 12 ++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index c462b23e1133fc..caffd344848b32 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -5548,12 +5548,57 @@ class FoldTransposeCreateMask final : public OpRewritePattern<TransposeOp> {
   }
 };
 
+/// Folds transpose(shape_cast) into a new shape_cast, when the transpose just
+/// permutes a unit dim from the result of the shape_cast.
+class FoldTransposeShapeCast : public OpRewritePattern<TransposeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TransposeOp transpOp,
+                                PatternRewriter &rewriter) const override {
+    Value transposeSrc = transpOp.getVector();
+    auto shapeCastOp = transposeSrc.getDefiningOp<vector::ShapeCastOp>();
+    if (!shapeCastOp)
+      return rewriter.notifyMatchFailure(
+          transpOp, "TransposeOp source is not ShapeCastOp");
+
+    auto sourceType = transpOp.getSourceVectorType();
+    auto resultType = transpOp.getResultVectorType();
+
+    auto filterUnitDims = [](VectorType type) {
+      return llvm::make_filter_range(
+          llvm::zip_equal(type.getShape(), type.getScalableDims()),
+          [&](auto dim) {
+            auto [size, isScalable] = dim;
+            return size != 1 || isScalable;
+          });
+    };
+
+    auto sourceWithoutUnitDims = filterUnitDims(sourceType);
+    auto resultWithoutUnitDims = filterUnitDims(resultType);
+
+    // If this transpose just permutes a unit dim, then we can fold it into the
+    // shape_cast.
+    for (auto [srcDim, resDim] :
+         llvm::zip_equal(sourceWithoutUnitDims, resultWithoutUnitDims)) {
+      if (srcDim != resDim)
+        return rewriter.notifyMatchFailure(transpOp,
+                                           "TransposeOp permutes non-unit dim");
+    }
+
+    rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(transpOp, resultType,
+                                                     shapeCastOp.getSource());
+
+    return success();
+  };
+};
+
 } // namespace
 
 void vector::TransposeOp::getCanonicalizationPatterns(
     RewritePatternSet &results, MLIRContext *context) {
   results.add<FoldTransposeCreateMask, FoldTransposedScalarBroadcast,
-              TransposeFolder, FoldTransposeSplat>(context);
+              TransposeFolder, FoldTransposeSplat, FoldTransposeShapeCast>(
+      context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 1021c73cc57d34..6bfb477ecf9728 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -67,6 +67,18 @@ func.func @create_mask_transpose_to_transposed_create_mask(
 
 // -----
 
+// CHECK-LABEL: transposed_unit_dim_shape_cast_to_shape_cast
+//  CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32>
+func.func @transposed_unit_dim_shape_cast_to_shape_cast(%vec: vector<[4]xf32>) -> vector<1x[4]xf32> {
+  //     CHECK: vector.shape_cast %[[VEC]] : vector<[4]xf32> to vector<1x[4]xf32>
+  // CHECK-NOT: vector.transpose
+  %0 = vector.shape_cast %vec : vector<[4]xf32> to vector<[4]x1xf32>
+  %1 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
+  return %1 : vector<1x[4]xf32>
+}
+
+// -----
+
 // CHECK-LABEL: extract_from_create_mask
 //  CHECK-SAME: %[[DIM0:.*]]: index, %[[DIM1:.*]]: index
 func.func @extract_from_create_mask(%dim0: index, %dim1: index) -> vector<[4]x[4]xi1> {

From e59a0cd7d80a9f1ab803c4ff7416c77e9a34ed1d Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau@arm.com>
Date: Fri, 1 Dec 2023 14:30:39 +0000
Subject: [PATCH 42/72] [AArch64][SME2] Add SME2 builtins for zero { zt0 }
 (#72274)

See https://github.com/ARM-software/acle/pull/217

Patch by: Kerry McLaughlin kerry.mclaughlin@arm.com
---
 clang/include/clang/Basic/arm_sme.td          |  5 ++++
 .../acle_sme2_zero_zt.c                       | 23 ++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  4 +++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 26 +++++++++++--------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  4 +--
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  2 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    | 11 ++++++++
 .../AArch64/sme2-intrinsics-zero-zt.ll        | 13 ++++++++++
 8 files changed, 74 insertions(+), 14 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 7aae3c832bb1fe..34dbfff6c4c85c 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -321,4 +321,9 @@ let TargetGuard = "sme2" in {
 let TargetGuard = "sme2" in {
   def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
   def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
+
+//
+// Zero ZT0
+//
+  def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>;
 }
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
new file mode 100644
index 00000000000000..31e8d6850fb289
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
@@ -0,0 +1,23 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+// CHECK-LABEL: @test_svzero_zt(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.zt(i32 0)
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z14test_svzero_ztv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.zt(i32 0)
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_zt(void) __arm_streaming_compatible __arm_shared_za {
+  svzero_zt(0);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 60a8d98f3bc0d2..3c0a07be50607b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3544,6 +3544,10 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_ldr_zt : SME_LDR_STR_ZT_Intrinsic;
   def int_aarch64_sme_str_zt : SME_LDR_STR_ZT_Intrinsic;
 
+  //
+  //  Zero ZT0
+  //
+  def int_aarch64_sme_zero_zt : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrWriteMem]>;
 }
 
 // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4379c3fde6f3c5..68fa58dea5beb1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2753,17 +2753,19 @@ AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
-MachineBasicBlock *AArch64TargetLowering::EmitZTSpillFill(MachineInstr &MI,
-                                                          MachineBasicBlock *BB,
-                                                          bool IsSpill) const {
+MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
+                                                      MachineBasicBlock *BB,
+                                                      unsigned Opcode,
+                                                      bool Op0IsDef) const {
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineInstrBuilder MIB;
-  unsigned Opc = IsSpill ? AArch64::STR_TX : AArch64::LDR_TX;
-  auto Rs = IsSpill ? RegState::Kill : RegState::Define;
-  MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
-  MIB.addReg(MI.getOperand(0).getReg(), Rs);
-  MIB.add(MI.getOperand(1)); // Base
-  MI.eraseFromParent();      // The pseudo is gone now.
+
+  MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
+            .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
+  for (unsigned I = 1; I < MI.getNumOperands(); ++I)
+    MIB.add(MI.getOperand(I));
+
+  MI.eraseFromParent(); // The pseudo is gone now.
   return BB;
 }
 
@@ -2884,11 +2886,13 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
   case AArch64::LDR_ZA_PSEUDO:
     return EmitFill(MI, BB);
   case AArch64::LDR_TX_PSEUDO:
-    return EmitZTSpillFill(MI, BB, /*IsSpill=*/false);
+    return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
   case AArch64::STR_TX_PSEUDO:
-    return EmitZTSpillFill(MI, BB, /*IsSpill=*/true);
+    return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
   case AArch64::ZERO_M_PSEUDO:
     return EmitZero(MI, BB);
+  case AArch64::ZERO_T_PSEUDO:
+    return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 009f8744b408a9..2b16d2471770d0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -623,8 +623,8 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
                                  MachineInstr &MI, MachineBasicBlock *BB,
                                  bool HasTile) const;
-  MachineBasicBlock *EmitZTSpillFill(MachineInstr &MI, MachineBasicBlock *BB,
-                                     bool IsSpill) const;
+  MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
+                                 unsigned Opcode, bool Op0IsDef) const;
   MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index fcfa5f82a3809c..84ec88d4fd49b6 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -539,7 +539,7 @@ defm SMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"smops", 0b001, int_aarch64_sme_smops
 defm UMOPA_MPPZZ_HtoS : sme2_int_mopx_tile<"umopa", 0b100, int_aarch64_sme_umopa_za32>;
 defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops_za32>;
 
-def ZERO_T : sme2_zero_zt<"zero", 0b0001>;
+defm ZERO_T : sme2_zero_zt<"zero", 0b0001>;
 
 defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, int_aarch64_sme_ldr_zt>;
 defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, int_aarch64_sme_str_zt>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index ef9c323e25bc35..c13c1b4e81faad 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -3119,6 +3119,17 @@ class sme2_zero_zt<string mnemonic, bits<4> opc>
   let Inst{3-0}  = opc;
 }
 
+multiclass sme2_zero_zt<string mnemonic, bits<4> opc> {
+  def NAME : sme2_zero_zt<mnemonic, opc>;
+  def NAME # _PSEUDO
+        : Pseudo<(outs), (ins ZTR:$ZT), []>, Sched<[]> {
+    // Translated to actual instruction in AArch64ISelLowering.cpp
+    let usesCustomInserter = 1;
+  }
+  def : Pat<(int_aarch64_sme_zero_zt (imm_to_zt untyped:$zt)),
+          (!cast<Instruction>(NAME # _PSEUDO) $zt)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SME2 lookup table load/store
 class sme2_spill_fill_vector<string mnemonic, bits<8> opc>
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll
new file mode 100644
index 00000000000000..14a4dba2466bf3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-zero-zt.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
+
+define void @zero_zt0() {
+; CHECK-LABEL: zero_zt0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zero { zt0 }
+; CHECK-NEXT:    ret
+    call void @llvm.aarch64.sme.zero.zt(i32 0)
+    ret void
+}
+
+declare void @llvm.aarch64.sme.zero.zt(i32)

From 93636581d3589b3b986c0080a82de7fc0bbd01cf Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 15:38:28 +0100
Subject: [PATCH 43/72] [InstCombiner] Make isFreeToInvert() and friends
 instance functions (NFC)

Make these instance functions so that SQ can be used inside of them.
There doesn't seem to be any strong need for them to be static.
---
 .../llvm/Transforms/InstCombine/InstCombiner.h | 12 ++++++------
 .../InstCombine/InstCombineAndOrXor.cpp        | 18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index e220d1ad4222ad..e33afd20cfb2d5 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -240,18 +240,18 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
   /// dereferenceable).
   /// If the inversion will consume instructions, `DoesConsume` will be set to
   /// true. Otherwise it will be false.
-  static Value *getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
+  Value *getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
                                       BuilderTy *Builder, bool &DoesConsume,
                                       unsigned Depth);
 
-  static Value *getFreelyInverted(Value *V, bool WillInvertAllUses,
+  Value *getFreelyInverted(Value *V, bool WillInvertAllUses,
                                   BuilderTy *Builder, bool &DoesConsume) {
     DoesConsume = false;
     return getFreelyInvertedImpl(V, WillInvertAllUses, Builder, DoesConsume,
                                  /*Depth*/ 0);
   }
 
-  static Value *getFreelyInverted(Value *V, bool WillInvertAllUses,
+  Value *getFreelyInverted(Value *V, bool WillInvertAllUses,
                                   BuilderTy *Builder) {
     bool Unused;
     return getFreelyInverted(V, WillInvertAllUses, Builder, Unused);
@@ -263,13 +263,13 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
   /// uses of V and only keep uses of ~V.
   ///
   /// See also: canFreelyInvertAllUsersOf()
-  static bool isFreeToInvert(Value *V, bool WillInvertAllUses,
+  bool isFreeToInvert(Value *V, bool WillInvertAllUses,
                              bool &DoesConsume) {
     return getFreelyInverted(V, WillInvertAllUses, /*Builder*/ nullptr,
                              DoesConsume) != nullptr;
   }
 
-  static bool isFreeToInvert(Value *V, bool WillInvertAllUses) {
+  bool isFreeToInvert(Value *V, bool WillInvertAllUses) {
     bool Unused;
     return isFreeToInvert(V, WillInvertAllUses, Unused);
   }
@@ -279,7 +279,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
   /// NOTE: for Instructions only!
   ///
   /// See also: isFreeToInvert()
-  static bool canFreelyInvertAllUsersOf(Instruction *V, Value *IgnoredUser) {
+  bool canFreelyInvertAllUsersOf(Instruction *V, Value *IgnoredUser) {
     // Look at every user of V.
     for (Use &U : V->uses()) {
       if (U.getUser() == IgnoredUser)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 1ca6085e36de3b..7379bdf93169ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1610,7 +1610,7 @@ static Instruction *reassociateFCmps(BinaryOperator &BO,
 /// (~A & ~B) == (~(A | B))
 /// (~A | ~B) == (~(A & B))
 static Instruction *matchDeMorgansLaws(BinaryOperator &I,
-                                       InstCombiner::BuilderTy &Builder) {
+                                       InstCombiner &IC) {
   const Instruction::BinaryOps Opcode = I.getOpcode();
   assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
          "Trying to match De Morgan's Laws with something other than and/or");
@@ -1623,10 +1623,10 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I,
   Value *A, *B;
   if (match(Op0, m_OneUse(m_Not(m_Value(A)))) &&
       match(Op1, m_OneUse(m_Not(m_Value(B)))) &&
-      !InstCombiner::isFreeToInvert(A, A->hasOneUse()) &&
-      !InstCombiner::isFreeToInvert(B, B->hasOneUse())) {
+      !IC.isFreeToInvert(A, A->hasOneUse()) &&
+      !IC.isFreeToInvert(B, B->hasOneUse())) {
     Value *AndOr =
-        Builder.CreateBinOp(FlippedOpcode, A, B, I.getName() + ".demorgan");
+        IC.Builder.CreateBinOp(FlippedOpcode, A, B, I.getName() + ".demorgan");
     return BinaryOperator::CreateNot(AndOr);
   }
 
@@ -1638,8 +1638,8 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I,
   Value *C;
   if (match(Op0, m_OneUse(m_c_BinOp(Opcode, m_Value(A), m_Not(m_Value(B))))) &&
       match(Op1, m_Not(m_Value(C)))) {
-    Value *FlippedBO = Builder.CreateBinOp(FlippedOpcode, B, C);
-    return BinaryOperator::Create(Opcode, A, Builder.CreateNot(FlippedBO));
+    Value *FlippedBO = IC.Builder.CreateBinOp(FlippedOpcode, B, C);
+    return BinaryOperator::Create(Opcode, A, IC.Builder.CreateNot(FlippedBO));
   }
 
   return nullptr;
@@ -2483,7 +2483,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
   if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
     return FoldedLogic;
 
-  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+  if (Instruction *DeMorgan = matchDeMorgansLaws(I, *this))
     return DeMorgan;
 
   {
@@ -3517,7 +3517,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
     return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
 
-  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+  if (Instruction *DeMorgan = matchDeMorgansLaws(I, *this))
     return DeMorgan;
 
   // Canonicalize xor to the RHS.
@@ -4124,7 +4124,7 @@ static bool canFreelyInvert(InstCombiner &IC, Value *Op,
                             Instruction *IgnoredUser) {
   auto *I = dyn_cast<Instruction>(Op);
   return I && IC.isFreeToInvert(I, /*WillInvertAllUses=*/true) &&
-         InstCombiner::canFreelyInvertAllUsersOf(I, IgnoredUser);
+         IC.canFreelyInvertAllUsersOf(I, IgnoredUser);
 }
 
 static Value *freelyInvert(InstCombinerImpl &IC, Value *Op,

From 65aab9e7222025f57c4bfc253d48c7b2ea8581da Mon Sep 17 00:00:00 2001
From: Adam Paszke <apaszke@google.com>
Date: Fri, 1 Dec 2023 15:51:48 +0100
Subject: [PATCH 44/72] =?UTF-8?q?[mlir][gpu]=20Generate=20multiple=20rank-?=
 =?UTF-8?q?specializations=20for=20tensor=20map=20cre=E2=80=A6=20(#74082)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ation

The previous code was technically incorrect in that the type indicated
that the memref only has 1 dimension, while the code below was happily
dereferencing the size array out of bounds. Now, if the compiler doesn't
get too smart about optimizations, this code *might even work*. But, if
the compiler realizes that the array has 1 element it might start doing
silly things. This generates a specialization per each supported rank,
making sure we don't do any UB.
---
 .../ExecutionEngine/CudaRuntimeWrappers.cpp   | 43 +++++++++++++++++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index b8ac9ab90a9f3b..5ec87d58cc57f8 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -423,9 +423,24 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
               elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
 }
 
+namespace {
+
+template <int rank>
+void mgpuGetMemRefDataAndShape(void *raw_descriptor, char **addr,
+                               uint64_t *globalDim) {
+  auto descriptor =
+      reinterpret_cast<StridedMemRefType<char, rank> *>(raw_descriptor);
+  *addr = descriptor->data;
+  for (int i = 0; i < rank; ++i) {
+    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[rank - i - 1]);
+  }
+}
+
+} // namespace
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
     int64_t tensorRank,                       // Dimensionality of tensor
-    StridedMemRefType<char, 1> *descriptor,   // Starting address
+    void *ranked_descriptor,                  // Ranked MemRef descriptor
     const CUtensorMapDataType tensorDataType, // Stride size (in bytes)
     CUtensorMapInterleave interleave,         // Type of interleaved layout
     CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
@@ -435,17 +450,39 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
 ) {
   CUtensorMap tensorMap;
 
-  auto *globalAddress = descriptor->data;
   uint32_t boxDim[5] = {1, 1, 1, 1, 1}, elementStrides[5] = {1, 1, 1, 1, 1};
   uint64_t globalDim[5] = {1, 1, 1, 1, 1}, globalStrides[5] = {0};
   uint32_t tensorRank32 = uint32_t(tensorRank);
 
+  char *globalAddress = nullptr;
+  switch (tensorRank) {
+  case 1:
+    mgpuGetMemRefDataAndShape<1>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 2:
+    mgpuGetMemRefDataAndShape<2>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 3:
+    mgpuGetMemRefDataAndShape<3>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 4:
+    mgpuGetMemRefDataAndShape<4>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  case 5:
+    mgpuGetMemRefDataAndShape<5>(ranked_descriptor, &globalAddress, globalDim);
+    break;
+  default:
+    fprintf(
+        stderr,
+        "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
+    return NULL;
+  }
+
   static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
                                            4, 8, 2, 4, 4, 4};
   for (int64_t r = 0; r < tensorRank; ++r) {
     elementStrides[r] = uint32_t(1);
     boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
-    globalDim[r] = static_cast<uint64_t>(descriptor->sizes[tensorRank - r - 1]);
   }
 
   globalStrides[0] = globalDim[0] * elementSizeInBytes[tensorDataType];

From dd5c5349e1f3b495789a9f67e579121da3722db6 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 15:49:58 +0100
Subject: [PATCH 45/72] [InstCombine] Add tests for invert of lshr (NFC)

---
 .../Transforms/InstCombine/free-inversion.ll  | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll
index ea5894597997bc..c16310cf09631c 100644
--- a/llvm/test/Transforms/InstCombine/free-inversion.ll
+++ b/llvm/test/Transforms/InstCombine/free-inversion.ll
@@ -6,6 +6,8 @@ declare i8 @llvm.umin.i8(i8, i8)
 declare i8 @llvm.smax.i8(i8, i8)
 declare i8 @llvm.umax.i8(i8, i8)
 
+declare void @llvm.assume(i1)
+
 declare void @use.i8(i8)
 
 define i8 @xor_1(i8 %a, i1 %c, i8 %x, i8 %y) {
@@ -492,3 +494,33 @@ define i8 @smax_both_freely_invertable_always(i8 %x, i8 %y) {
   %r = call i8 @llvm.smax.i8(i8 %xx, i8 %yy)
   ret i8 %r
 }
+
+define i8 @lshr_nneg(i8 %x, i8 %y) {
+; CHECK-LABEL: @lshr_nneg(
+; CHECK-NEXT:    [[NEG:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[NEG]])
+; CHECK-NEXT:    [[X_NOT:%.*]] = xor i8 [[X]], -1
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    [[SHR_NOT:%.*]] = xor i8 [[SHR]], -1
+; CHECK-NEXT:    ret i8 [[SHR_NOT]]
+;
+  %neg = icmp slt i8 %x, 0
+  call void @llvm.assume(i1 %neg)
+  %x.not = xor i8 %x, -1
+  %shr = lshr i8 %x.not, %y
+  %shr.not = xor i8 %shr, -1
+  ret i8 %shr.not
+}
+
+define i8 @lshr_not_nneg(i8 %x, i8 %y) {
+; CHECK-LABEL: @lshr_not_nneg(
+; CHECK-NEXT:    [[X_NOT:%.*]] = xor i8 [[X:%.*]], -1
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    [[SHR_NOT:%.*]] = xor i8 [[SHR]], -1
+; CHECK-NEXT:    ret i8 [[SHR_NOT]]
+;
+  %x.not = xor i8 %x, -1
+  %shr = lshr i8 %x.not, %y
+  %shr.not = xor i8 %shr, -1
+  ret i8 %shr.not
+}

From b92693ac6afc522ea56bede0b9805ca7c138754c Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 15:52:06 +0100
Subject: [PATCH 46/72] [InstCombine] Support inverting lshr with non-negative
 operand

If the lshr operand is non-negative, we can treat it the same
way as an ashr. Ideally we would represent this as "lshr nneg",
but for now just perform the necessary ValueTracking query.

Proof: https://alive2.llvm.org/ce/z/Ahg4ri
---
 .../Transforms/InstCombine/InstructionCombining.cpp    | 10 ++++++++++
 llvm/test/Transforms/InstCombine/free-inversion.ll     |  4 +---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 26fdef672506a6..a2fadbd6999c5b 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2181,6 +2181,16 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
     return nullptr;
   }
 
+  // Treat lshr with non-negative operand as ashr.
+  if (match(V, m_LShr(m_Value(A), m_Value(B))) &&
+      isKnownNonNegative(V, SQ.getWithInstruction(cast<Instruction>(V)),
+                         Depth)) {
+    if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+                                         DoesConsume, Depth))
+      return Builder ? Builder->CreateAShr(AV, B) : NonNull;
+    return nullptr;
+  }
+
   Value *Cond;
   // LogicOps are special in that we canonicalize them at the cost of an
   // instruction.
diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll
index c16310cf09631c..8d5b1936f95637 100644
--- a/llvm/test/Transforms/InstCombine/free-inversion.ll
+++ b/llvm/test/Transforms/InstCombine/free-inversion.ll
@@ -499,9 +499,7 @@ define i8 @lshr_nneg(i8 %x, i8 %y) {
 ; CHECK-LABEL: @lshr_nneg(
 ; CHECK-NEXT:    [[NEG:%.*]] = icmp slt i8 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NEG]])
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i8 [[X]], -1
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X_NOT]], [[Y:%.*]]
-; CHECK-NEXT:    [[SHR_NOT:%.*]] = xor i8 [[SHR]], -1
+; CHECK-NEXT:    [[SHR_NOT:%.*]] = ashr i8 [[X]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i8 [[SHR_NOT]]
 ;
   %neg = icmp slt i8 %x, 0

From fdf84cbf87198d16fe17aed0c31989ee31051d82 Mon Sep 17 00:00:00 2001
From: Quinn Dawkins <quinn.dawkins@gmail.com>
Date: Fri, 1 Dec 2023 10:01:28 -0500
Subject: [PATCH 47/72] [mlir][vector] Fix unit dim dropping pattern for masked
 writes (#74038)

This does the same as #72142 for vector.transfer_write. Previously the
pattern would silently drop the mask.
---
 .../Transforms/VectorTransferOpTransforms.cpp | 42 ++++++++++--------
 ...ctor-transfer-drop-unit-dims-patterns.mlir | 44 +++++++++++++++++++
 2 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
index d2c6ba557b9bbe..75e1abead973f0 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
@@ -260,14 +260,6 @@ void TransferOptimization::storeToLoadForwarding(vector::TransferReadOp read) {
   opToErase.push_back(read.getOperation());
 }
 
-/// Returns a copy of `shape` without unit dims.
-static SmallVector<int64_t> getReducedShape(ArrayRef<int64_t> shape) {
-  SmallVector<int64_t> reducedShape;
-  llvm::copy_if(shape, std::back_inserter(reducedShape),
-                [](int64_t dimSize) { return dimSize != 1; });
-  return reducedShape;
-}
-
 /// Converts OpFoldResults to int64_t shape without unit dims.
 static SmallVector<int64_t> getReducedShape(ArrayRef<OpFoldResult> mixedSizes) {
   SmallVector<int64_t> reducedShape;
@@ -340,7 +332,7 @@ static FailureOr<Value>
 createMaskDropNonScalableUnitDims(PatternRewriter &rewriter, Location loc,
                                   vector::CreateMaskOp op) {
   auto type = op.getType();
-  auto reducedType = trimNonScalableUnitDims(type);
+  VectorType reducedType = trimNonScalableUnitDims(type);
   if (reducedType.getRank() == type.getRank())
     return failure();
 
@@ -391,7 +383,7 @@ class TransferReadDropUnitDimsPattern
       return failure();
     // Check if the reduced vector shape matches the reduced source shape.
     // Otherwise, this case is not supported yet.
-    auto reducedVectorType = trimNonScalableUnitDims(vectorType);
+    VectorType reducedVectorType = trimNonScalableUnitDims(vectorType);
     if (reducedRank != reducedVectorType.getRank())
       return failure();
     if (llvm::any_of(transferReadOp.getIndices(), [](Value v) {
@@ -446,9 +438,7 @@ class TransferWriteDropUnitDimsPattern
     Value source = transferWriteOp.getSource();
     MemRefType sourceType = dyn_cast<MemRefType>(source.getType());
     // TODO: support tensor type.
-    if (!sourceType || !sourceType.hasStaticShape())
-      return failure();
-    if (sourceType.getNumElements() != vectorType.getNumElements())
+    if (!sourceType)
       return failure();
     // TODO: generalize this pattern, relax the requirements here.
     if (transferWriteOp.hasOutOfBoundsDim())
@@ -461,25 +451,39 @@ class TransferWriteDropUnitDimsPattern
       return failure();
     // Check if the reduced vector shape matches the reduced destination shape.
     // Otherwise, this case is not supported yet.
-    int vectorReducedRank = getReducedRank(vectorType.getShape());
-    if (reducedRank != vectorReducedRank)
+    VectorType reducedVectorType = trimNonScalableUnitDims(vectorType);
+    if (reducedRank != reducedVectorType.getRank())
       return failure();
     if (llvm::any_of(transferWriteOp.getIndices(), [](Value v) {
           return getConstantIntValue(v) != static_cast<int64_t>(0);
         }))
       return failure();
+
+    Value maskOp = transferWriteOp.getMask();
+    if (maskOp) {
+      auto createMaskOp = maskOp.getDefiningOp<vector::CreateMaskOp>();
+      if (!createMaskOp)
+        return rewriter.notifyMatchFailure(
+            transferWriteOp,
+            "unsupported mask op, only 'vector.create_mask' is "
+            "currently supported");
+      FailureOr<Value> rankReducedCreateMask =
+          createMaskDropNonScalableUnitDims(rewriter, loc, createMaskOp);
+      if (failed(rankReducedCreateMask))
+        return failure();
+      maskOp = *rankReducedCreateMask;
+    }
     Value reducedShapeSource =
         rankReducingSubviewDroppingUnitDims(rewriter, loc, source);
     Value c0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
     SmallVector<Value> zeros(reducedRank, c0);
     auto identityMap = rewriter.getMultiDimIdentityMap(reducedRank);
-    VectorType reducedVectorType = VectorType::get(
-        getReducedShape(vectorType.getShape()), vectorType.getElementType());
-
+    SmallVector<bool> inBounds(reducedVectorType.getRank(), true);
     auto shapeCast = rewriter.createOrFold<vector::ShapeCastOp>(
         loc, reducedVectorType, vector);
     rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
-        transferWriteOp, shapeCast, reducedShapeSource, zeros, identityMap);
+        transferWriteOp, Type(), shapeCast, reducedShapeSource, zeros,
+        identityMap, maskOp, rewriter.getBoolArrayAttr(inBounds));
 
     return success();
   }
diff --git a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
index 735915d4356538..d65708068862f4 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
@@ -144,6 +144,50 @@ func.func @masked_transfer_read_dynamic_rank_reducing_2(
 //       CHECK:   %[[SUBVIEW:.+]] = memref.subview %[[ARG]][0, 0, 0, 0, 0, 0] [1, %[[DIM1]], 3, 1, %[[DIM4]], 1] [1, 1, 1, 1, 1, 1] : memref<1x?x3x1x?x1xi8, {{.*}}> to memref<?x3x?xi8, {{.*}}>
 //       CHECK:   vector.transfer_read %[[SUBVIEW]][{{.*}}], %[[PAD]], %[[MASK]] {in_bounds = [true, true, true]} : memref<?x3x?xi8, {{.*}}>, vector<[1]x3x[16]xi8>
 
+func.func @masked_transfer_write_and_vector_rank_reducing(
+      %arg : memref<1x1x3x1x16x1xf32>,
+      %vec : vector<1x3x1x16x1xf32>,
+      %mask_dim1 : index,
+      %mask_dim2 : index) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %mask = vector.create_mask %c1, %mask_dim1, %c1, %mask_dim2, %c1 : vector<1x3x1x16x1xi1>
+    vector.transfer_write %vec, %arg[%c0, %c0, %c0, %c0, %c0, %c0], %mask :
+      vector<1x3x1x16x1xf32>, memref<1x1x3x1x16x1xf32>
+    return
+}
+// CHECK-LABEL: func @masked_transfer_write_and_vector_rank_reducing
+//  CHECK-SAME:     %[[ARG:.+]]: memref<1x1x3x1x16x1xf32>
+//  CHECK-SAME:     {{.*}}: vector<1x3x1x16x1xf32>,
+//  CHECK-SAME:     %[[MASKDIM1:.+]]: index,
+//  CHECK-SAME:     %[[MASKDIM2:.+]]: index
+//       CHECK:   %[[MASK:.+]] = vector.create_mask %[[MASKDIM1]], %[[MASKDIM2]] : vector<3x16xi1>
+//       CHECK:   %[[SUBVIEW:.+]] = memref.subview %[[ARG]][0, 0, 0, 0, 0, 0] [1, 1, 3, 1, 16, 1] [1, 1, 1, 1, 1, 1]
+//  CHECK-SAME:     memref<1x1x3x1x16x1xf32> to memref<3x16xf32>
+//       CHECK:   vector.transfer_write %{{.*}}, %[[SUBVIEW]]{{.*}}, %[[MASK]] {in_bounds = [true, true]} : vector<3x16xf32>, memref<3x16xf32>
+
+func.func @masked_transfer_write_dynamic_rank_reducing(
+      %arg : memref<?x1xi8, strided<[?, ?], offset: ?>>,
+      %vec : vector<[16]x1xi8>,
+      %mask_dim0 : index) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %pad = arith.constant 0 : i8
+    %mask = vector.create_mask %mask_dim0, %c1 : vector<[16]x1xi1>
+    vector.transfer_write %vec, %arg[%c0, %c0], %mask {in_bounds = [true, true]} :
+      vector<[16]x1xi8>, memref<?x1xi8, strided<[?, ?], offset: ?>>
+    return
+}
+// CHECK-LABEL: func @masked_transfer_write_dynamic_rank_reducing
+//  CHECK-SAME:     %[[ARG:.+]]: memref<?x1xi8
+//  CHECK-SAME:     %{{.*}}: vector<[16]x1xi8>,
+//  CHECK-SAME:     %[[MASK_DIM0:.+]]: index
+//       CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//       CHECK:   %[[MASK:.+]] = vector.create_mask %[[MASK_DIM0]] : vector<[16]xi1>
+//       CHECK:   %[[DIM0:.+]] = memref.dim %[[ARG]], %[[C0]] : memref<?x1xi8, strided<[?, ?], offset: ?>>
+//       CHECK:   %[[SUBVIEW:.+]] = memref.subview %[[ARG]][0, 0] [%[[DIM0]], 1] [1, 1] : memref<?x1xi8, {{.*}}> to memref<?xi8, {{.*}}>
+//       CHECK:   vector.transfer_write {{.*}}, %[[SUBVIEW]][%[[C0]]], %[[MASK]] {in_bounds = [true]} : vector<[16]xi8>, memref<?xi8, {{.*}}>
+
 /// Only masks operands of vector.create_mask are currently supported.
 func.func @unsupported_masked_transfer_read_dynamic_rank_reducing_1(
       %arg : memref<?x1xi8, strided<[?, ?], offset: ?>>,

From 8c130996c03f6c5993a0989a5c6fa95d1437995a Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 16:02:23 +0100
Subject: [PATCH 48/72] Revert "[InstCombine] Support inverting lshr with
 non-negative operand"

This reverts commit b92693ac6afc522ea56bede0b9805ca7c138754c.

I've made a silly typo in the condition. Will reapply the corrected
version.
---
 .../Transforms/InstCombine/InstructionCombining.cpp    | 10 ----------
 llvm/test/Transforms/InstCombine/free-inversion.ll     |  4 +++-
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index a2fadbd6999c5b..26fdef672506a6 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2181,16 +2181,6 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
     return nullptr;
   }
 
-  // Treat lshr with non-negative operand as ashr.
-  if (match(V, m_LShr(m_Value(A), m_Value(B))) &&
-      isKnownNonNegative(V, SQ.getWithInstruction(cast<Instruction>(V)),
-                         Depth)) {
-    if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
-                                         DoesConsume, Depth))
-      return Builder ? Builder->CreateAShr(AV, B) : NonNull;
-    return nullptr;
-  }
-
   Value *Cond;
   // LogicOps are special in that we canonicalize them at the cost of an
   // instruction.
diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll
index 8d5b1936f95637..c16310cf09631c 100644
--- a/llvm/test/Transforms/InstCombine/free-inversion.ll
+++ b/llvm/test/Transforms/InstCombine/free-inversion.ll
@@ -499,7 +499,9 @@ define i8 @lshr_nneg(i8 %x, i8 %y) {
 ; CHECK-LABEL: @lshr_nneg(
 ; CHECK-NEXT:    [[NEG:%.*]] = icmp slt i8 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NEG]])
-; CHECK-NEXT:    [[SHR_NOT:%.*]] = ashr i8 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[X_NOT:%.*]] = xor i8 [[X]], -1
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    [[SHR_NOT:%.*]] = xor i8 [[SHR]], -1
 ; CHECK-NEXT:    ret i8 [[SHR_NOT]]
 ;
   %neg = icmp slt i8 %x, 0

From 852f6be6967911de8ab6fe61b5a632366cc0804e Mon Sep 17 00:00:00 2001
From: Spenser Bauman <sbauman@mathworks.com>
Date: Fri, 1 Dec 2023 10:08:16 -0500
Subject: [PATCH 49/72] [mlir][tosa] Improve tosa-infer-shapes for ops consumed
 by non-TOSA operators (#72715)

TOSA operators consumed by non-TOSA ops generally do not have their
types inferred, as that would alter the types expected by their
consumers. This prevents type refinement on many TOSA operators when the
IR contains a mix of dialects.

This change modifies tosa-infer-shapes to update the types of all TOSA
operators during inference. When a consumer of that TOSA op is not safe
to update, a tensor.cast is inserted back to the original type. This
behavior is similar to how TOSA ops consumed by func.return are handled.

This allows for more type refinement of TOSA ops, and the additional
tensor.cast operators may be removed by later canonicalizations.
---
 .../Tosa/Transforms/TosaInferShapes.cpp       | 75 ++++++++-----------
 mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 12 +++
 2 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp
index 3cc16a91edce74..ad28c564f7dbdd 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp
@@ -183,17 +183,27 @@ void propagateShapesToTosaWhile(Operation &op) {
   }
 }
 
+// Track the old type for each operand whose type was updated
+// during inference. This information is used to introduce casts
+// back to the type expected by the operand after inference.
+struct TypeRewriteInfo {
+  OpOperand *operand;
+  Type oldType;
+};
+
 void propagateShapesInRegion(Region &region) {
   // Check whether this use case is replaceable. We define an op as
-  // being replaceable if it is used by a ReturnOp, a TosaOp, or an op with a
+  // being replaceable if it is used by a TosaOp, or an op with a
   // type-inference related interface.
+  // When a non-replaceable use is encountered, the value is wrapped in a
+  // cast back to the original type after inference.
   auto isReplaceableUser = [](Operation *user) -> bool {
-    return isa<func::ReturnOp>(user) ||
-           user->getDialect()->getNamespace() ==
+    return user->getDialect()->getNamespace() ==
                TosaDialect::getDialectNamespace() ||
            isa<InferTypeOpInterface, InferShapedTypeOpInterface>(user);
   };
 
+  llvm::SmallVector<TypeRewriteInfo> requiresUpdate;
   for (auto &block : region) {
     for (Operation &op : block) {
       if (op.getDialect()->getNamespace() != TosaDialect::getDialectNamespace())
@@ -219,9 +229,6 @@ void propagateShapesInRegion(Region &region) {
           Value result = std::get<0>(it);
           ShapedTypeComponents predictedShape = std::get<1>(it);
 
-          if (!llvm::all_of(result.getUsers(), isReplaceableUser))
-            continue;
-
           // Determine the knowledge based on the output type.
           // TODO: should also query WIP type probably
           Type resultTy = result.getType();
@@ -246,10 +253,29 @@ void propagateShapesInRegion(Region &region) {
 
           // Set new type
           result.setType(newKnowledge.getType());
+
+          // Collect all uses of the operation which require update.
+          for (auto &user : result.getUses()) {
+            if (!isReplaceableUser(user.getOwner()))
+              requiresUpdate.push_back({&user, resultTy});
+          }
         }
       }
     }
   }
+
+  // For each use whose type changed, cast the value with the new type back to
+  // the old type.
+  IRRewriter rewriter(region.getContext());
+  for (auto [operand, oldType] : requiresUpdate) {
+    rewriter.setInsertionPoint(operand->getOwner());
+
+    auto oldValue = operand->get();
+
+    auto loc = oldValue.getLoc();
+    auto castOp = rewriter.create<tensor::CastOp>(loc, oldType, oldValue);
+    operand->set(castOp);
+  }
 }
 
 /// Pass that performs shape propagation across TOSA operations. This includes
@@ -259,44 +285,7 @@ struct TosaInferShapes
 public:
   void runOnOperation() override {
     func::FuncOp func = getOperation();
-
-    IRRewriter rewriter(func.getContext());
-
     propagateShapesInRegion(func.getBody());
-
-    // Insert UnrealizedConversionCasts to guarantee ReturnOp agress with
-    // the FuncOp type.
-    func.walk([&](func::ReturnOp op) {
-      func::FuncOp parent = dyn_cast<func::FuncOp>(op->getParentOp());
-      if (!parent)
-        return;
-
-      rewriter.setInsertionPoint(op);
-      FunctionType funcTy = func.getFunctionType();
-      auto resultTys = funcTy.getResults();
-
-      bool castAdded = false;
-      SmallVector<Value> castedValues;
-      for (auto it : llvm::zip(op->getOperands(), resultTys)) {
-        auto operand = std::get<0>(it);
-        auto currentTy = operand.getType();
-        auto castTy = std::get<1>(it);
-        if (currentTy == castTy) {
-          castedValues.push_back(operand);
-          continue;
-        }
-
-        castedValues.push_back(
-            rewriter.create<tensor::CastOp>(op.getLoc(), castTy, operand)
-                .getResult());
-
-        castAdded = true;
-      }
-
-      if (castAdded) {
-        rewriter.replaceOpWithNewOp<func::ReturnOp>(op, castedValues);
-      }
-    });
   }
 };
 } // namespace
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index 7af66ae1dbc90f..f057431a841b59 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -1262,6 +1262,17 @@ func.func @test_non_tosa_consumer_extract(%arg0: tensor<4x4xf32>, %arg1: index)
 
 // -----
 
+// CHECK-LABEL: test_non_tosa_consumer_still_propagates
+func.func @test_non_tosa_consumer_still_propagates(%arg0: tensor<1x1x8xf32>, %arg1: tensor<1x8x1xf32>) -> tensor<?x?xf32> {
+  // CHECK: tosa.matmul %arg0, %arg1 : (tensor<1x1x8xf32>, tensor<1x8x1xf32>) -> tensor<1x1x1xf32>
+  %0 = tosa.matmul %arg0, %arg1 : (tensor<1x1x8xf32>, tensor<1x8x1xf32>) -> tensor<?x1x1xf32>
+  %1 = arith.constant dense<[1, 1]> : tensor<2xindex>
+  %2 = tensor.reshape %0(%1) : (tensor<?x1x1xf32>, tensor<2xindex>) -> tensor<?x?xf32>
+  return %2 : tensor<?x?xf32>
+}
+
+// -----
+
 // CHECK-LABEL: test_tosa_use_def_chain
 func.func @test_tosa_use_def_chain(%arg0: tensor<1x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>, %arg2: tensor<16xf32>) -> tensor<?x16x16x16xf32> {
   // CHECK: [[CONV:%.+]] = tosa.conv2d %arg0, %arg1, %arg2
@@ -1298,3 +1309,4 @@ func.func @test_large_constant_permutation() {
   %72 = tosa.transpose %14, %cst_26 : (tensor<?x27xi64>, tensor<2xi32>) -> tensor<?x27xi64>
   return
 }
+

From 7007919cfde7c7515c0c2cc9b7d66616225d0b17 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 16:07:18 +0100
Subject: [PATCH 50/72] [InstCombine] Add additional test for invert of lshr
 (NFC)

---
 llvm/test/Transforms/InstCombine/free-inversion.ll | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll
index c16310cf09631c..93d293df6048e3 100644
--- a/llvm/test/Transforms/InstCombine/free-inversion.ll
+++ b/llvm/test/Transforms/InstCombine/free-inversion.ll
@@ -524,3 +524,15 @@ define i8 @lshr_not_nneg(i8 %x, i8 %y) {
   %shr.not = xor i8 %shr, -1
   ret i8 %shr.not
 }
+
+define i8 @lshr_not_nneg2(i8 %x) {
+; CHECK-LABEL: @lshr_not_nneg2(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X:%.*]], 1
+; CHECK-NEXT:    [[SHR_NOT1:%.*]] = or i8 [[SHR]], -128
+; CHECK-NEXT:    ret i8 [[SHR_NOT1]]
+;
+  %x.not = xor i8 %x, -1
+  %shr = lshr i8 %x.not, 1
+  %shr.not = xor i8 %shr, -1
+  ret i8 %shr.not
+}

From faebb1b2e6891687e4f608b74205985ec78ade40 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 1 Dec 2023 15:52:06 +0100
Subject: [PATCH 51/72] Reapply [InstCombine] Support inverting lshr with
 non-negative operand

My initial patch contained a typo, resulting in the wrong value
being checked for non-negativeness.

-----

If the lshr operand is non-negative, we can treat it the same
way as an ashr. Ideally we would represent this as "lshr nneg",
but for now just perform the necessary ValueTracking query.

Proof: https://alive2.llvm.org/ce/z/Ahg4ri
---
 .../Transforms/InstCombine/InstructionCombining.cpp    | 10 ++++++++++
 llvm/test/Transforms/InstCombine/free-inversion.ll     |  4 +---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 26fdef672506a6..0f033a9adad702 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2181,6 +2181,16 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
     return nullptr;
   }
 
+  // Treat lshr with non-negative operand as ashr.
+  if (match(V, m_LShr(m_Value(A), m_Value(B))) &&
+      isKnownNonNegative(A, SQ.getWithInstruction(cast<Instruction>(V)),
+                         Depth)) {
+    if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+                                         DoesConsume, Depth))
+      return Builder ? Builder->CreateAShr(AV, B) : NonNull;
+    return nullptr;
+  }
+
   Value *Cond;
   // LogicOps are special in that we canonicalize them at the cost of an
   // instruction.
diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll
index 93d293df6048e3..851f9823cc692b 100644
--- a/llvm/test/Transforms/InstCombine/free-inversion.ll
+++ b/llvm/test/Transforms/InstCombine/free-inversion.ll
@@ -499,9 +499,7 @@ define i8 @lshr_nneg(i8 %x, i8 %y) {
 ; CHECK-LABEL: @lshr_nneg(
 ; CHECK-NEXT:    [[NEG:%.*]] = icmp slt i8 [[X:%.*]], 0
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[NEG]])
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i8 [[X]], -1
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i8 [[X_NOT]], [[Y:%.*]]
-; CHECK-NEXT:    [[SHR_NOT:%.*]] = xor i8 [[SHR]], -1
+; CHECK-NEXT:    [[SHR_NOT:%.*]] = ashr i8 [[X]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i8 [[SHR_NOT]]
 ;
   %neg = icmp slt i8 %x, 0

From 0d87e2577914a6384f4ad5952b8fa9b0d8e48da8 Mon Sep 17 00:00:00 2001
From: Spenser Bauman <sbauman@mathworks.com>
Date: Fri, 1 Dec 2023 10:16:51 -0500
Subject: [PATCH 52/72] [mlir][tosa] Improve lowering to tosa.fully_connected
 (#73049)

The current lowering of tosa.fully_connected produces a linalg.matmul
followed by a linalg.generic to add the bias. The IR looks like the
following:

    %init = tensor.empty()
    %zero = linalg.fill ins(0 : f32) outs(%init)
    %prod = linalg.matmul ins(%A, %B) outs(%zero)

    // Add the bias
    %initB = tensor.empty()
    %result = linalg.generic ins(%prod, %bias) outs(%initB) {
       // add bias and product
    }

This has two downsides:

1. The tensor.empty operations typically result in additional
allocations after bufferization.
2. There is a redundant traversal of the data to add the bias to the
matrix product.

This extra work can be avoided by leveraging the out-param of
linalg.matmul. The new IR sequence is:

    %init = tensor.empty()
    %broadcast = linalg.broadcast ins(%bias) outs(%init)
    %prod = linalg.matmul ins(%A, %B) outs(%broadcast)

In my experiments, this eliminates one loop and one allocation (post
bufferization) from the generated code.
---
 .../TosaToLinalg/TosaToLinalgNamed.cpp        | 91 +++++++++++--------
 .../TosaToLinalg/tosa-to-linalg-named.mlir    | 84 +++++++++--------
 .../Tosa/CPU/test-fully-connected.mlir        | 36 ++++++++
 3 files changed, 128 insertions(+), 83 deletions(-)
 create mode 100644 mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
index 9e374be534985e..b30651976eeb93 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
@@ -85,6 +85,49 @@ linalgIntBroadcastExtSIAdd(PatternRewriter &rewriter, Location loc, Value bias,
       .getResult(0);
 }
 
+// Broadcast `source` across the leading (outer) dimensions of `result`,
+// sign-extending (arith.extsi) each element if the element types differ.
+static mlir::Value linalgBroadcastAndMaybeExtSI(PatternRewriter &rewriter,
+                                                Location loc, Value source,
+                                                Value result) {
+  ShapedType resultTy = cast<ShapedType>(result.getType());
+  ShapedType sourceTy = cast<ShapedType>(source.getType());
+  int64_t resultRank = resultTy.getRank();
+  int64_t sourceRank = sourceTy.getRank();
+
+  // Source dimension i reads result dimension (i + resultRank - sourceRank),
+  // i.e. the source is aligned with the trailing dimensions of the result.
+  SmallVector<AffineExpr> sourceDims;
+  for (auto dim : llvm::seq<int64_t>(0, sourceRank)) {
+    auto expr = rewriter.getAffineDimExpr(dim + resultRank - sourceRank);
+    sourceDims.push_back(expr);
+  }
+
+  // Creating maps for the input and output of the broadcast-like generic op.
+  SmallVector<AffineMap, 2> indexingMaps = {
+      // Input map: index the source by the trailing result dimensions only.
+      AffineMap::get(/*dimCount=*/resultRank,
+                     /*symbolCount=*/0, sourceDims, rewriter.getContext()),
+
+      // Output indexing map.
+      rewriter.getMultiDimIdentityMap(resultRank)};
+
+  // Build the broadcast-like operation as a linalg.generic.
+  return rewriter
+      .create<linalg::GenericOp>(
+          loc, resultTy, ValueRange({source}), result, indexingMaps,
+          getNParallelLoopsAttrs(resultTy.getRank()),
+          [](OpBuilder &builder, Location loc, ValueRange args) {
+            Value biasVal = args[0];
+            Type resType = args[1].getType();
+            if (resType != biasVal.getType()) {
+              biasVal = builder.create<arith::ExtSIOp>(loc, resType, biasVal);
+            }
+            builder.create<linalg::YieldOp>(loc, biasVal);
+          })
+      .getResult(0);
+}
+
 static mlir::Value reifyConstantDim(int64_t attr,
                                     ImplicitLocOpBuilder &builder) {
   return builder.createOrFold<arith::IndexCastOp>(
@@ -618,28 +661,6 @@ class FullyConnectedConverter
 
     SmallVector<Value> filteredDims = condenseValues(dynDims);
 
-    // Creating maps for the output of MatMul and the bias
-    SmallVector<AffineMap, 4> indexingMaps;
-
-    // Broadcast the bias.
-    indexingMaps.push_back(AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0,
-                                          {rewriter.getAffineDimExpr(1)},
-                                          rewriter.getContext()));
-
-    indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank()));
-    indexingMaps.push_back(rewriter.getMultiDimIdentityMap(outputTy.getRank()));
-
-    auto emptyTensor = rewriter.create<tensor::EmptyOp>(
-        loc, outputTy.getShape(), outputTy.getElementType(), filteredDims);
-
-    // When quantized, the input elemeny type is not the same as the output
-    auto resultZeroAttr = rewriter.getZeroAttr(outputETy);
-    Value zero = rewriter.create<arith::ConstantOp>(loc, resultZeroAttr);
-    Value zeroTensor = rewriter
-                           .create<linalg::FillOp>(loc, ValueRange{zero},
-                                                   ValueRange{emptyTensor})
-                           .result();
-
     SmallVector<int64_t> permutation{1, 0};
     auto permutationAttr = rewriter.getI64TensorAttr(permutation);
     Value permutationValue =
@@ -655,26 +676,17 @@ class FullyConnectedConverter
     Value biasEmptyTensor = rewriter.create<tensor::EmptyOp>(
         loc, outputTy.getShape(), outputETy, filteredDims);
 
+    Value broadcastBias =
+        linalgBroadcastAndMaybeExtSI(rewriter, loc, bias, biasEmptyTensor);
+
     if (!op.getQuantizationInfo()) {
       Value matmul = rewriter
                          .create<linalg::MatmulOp>(
                              loc, TypeRange{op.getType()},
-                             ValueRange{input, transposedWeight}, zeroTensor)
+                             ValueRange{input, transposedWeight}, broadcastBias)
                          ->getResult(0);
 
-      Value result =
-          rewriter
-              .create<linalg::GenericOp>(
-                  loc, outputTy, ValueRange({bias, matmul}), biasEmptyTensor,
-                  indexingMaps, getNParallelLoopsAttrs(outputTy.getRank()),
-                  [&](OpBuilder &nestedBuilder, Location nestedLoc,
-                      ValueRange args) {
-                    Value added = nestedBuilder.create<arith::AddFOp>(
-                        loc, args[0], args[1]);
-                    nestedBuilder.create<linalg::YieldOp>(nestedLoc, added);
-                  })
-              .getResult(0);
-      rewriter.replaceOp(op, result);
+      rewriter.replaceOp(op, matmul);
       return success();
     }
 
@@ -688,11 +700,10 @@ class FullyConnectedConverter
             .create<linalg::QuantizedMatmulOp>(
                 loc, TypeRange{op.getType()},
                 ValueRange{input, transposedWeight, inputZp, outputZp},
-                zeroTensor)
+                broadcastBias)
             ->getResult(0);
-    Value result = linalgIntBroadcastExtSIAdd(rewriter, loc, bias, matmul,
-                                              biasEmptyTensor, indexingMaps);
-    rewriter.replaceOp(op, result);
+
+    rewriter.replaceOp(op, matmul);
     return success();
   }
 };
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index 4edc7533193280..bbdd1bad799865 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -82,22 +82,21 @@ func.func @matmul_dyn_output(%arg0: tensor<1x1x8xf32>, %arg1: tensor<1x8x1xf32>)
 
 // -----
 
-// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
-// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d1)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 
 // CHECK-LABEL: @fully_connected
 func.func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) {
-  // CHECK: [[INITT:%.+]] = tensor.empty()
-  // CHECK: [[ZERO:%.+]] = arith.constant 0
-  // CHECK: [[FILL:%.+]] = linalg.fill ins([[ZERO]]{{.*}}outs([[INITT]]
-  // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]>
-  // CHECK: [[TRANSPOSE:%.+]] = tosa.transpose %arg1, [[PERM]]
-  // CHECK: [[INITB:%.+]] = tensor.empty()
-  // CHECK: [[MATMUL:%.+]] = linalg.matmul ins(%arg0, [[TRANSPOSE]] : tensor<5x3xf32>, tensor<3x6xf32>) outs([[FILL]] : tensor<5x6xf32>) -> tensor<5x6xf32>
-  // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xf32>, tensor<5x6xf32>) outs([[INITB]] : tensor<5x6xf32>) {
-  // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %[[ARG5:[0-9a-zA-Z_]+]]: f32):
-  // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
-  // CHECK:   linalg.yield [[ADD]] : f32
+  // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
+  // CHECK: %[[TRANSPOSED:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xf32>, tensor<2xi64>) -> tensor<3x6xf32>
+  // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xf32>
+
+  // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor<5x6xf32>) {
+  // CHECK: ^bb0(%[[IN:.+]]: f32, %[[OUT:.+]]: f32):
+  // CHECK:   linalg.yield %[[IN]] : f32
+  // CHECK: } -> tensor<5x6xf32>
+
+  // CHECK: linalg.matmul ins(%arg0, %[[TRANSPOSED]] : tensor<5x3xf32>, tensor<3x6xf32>) outs(%[[BROADCAST]] : tensor<5x6xf32>) -> tensor<5x6xf32>
 
   %0 = tosa.fully_connected %arg0, %arg1, %arg2 : (tensor<5x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> tensor<5x6xf32>
   return %0 : tensor<5x6xf32>
@@ -105,48 +104,47 @@ func.func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2
 
 // -----
 
-// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
-// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d1)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 
 // CHECK-LABEL: @quantized_fully_connected
 func.func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) {
-  // CHECK: [[INITT:%.+]] = tensor.empty()
-  // CHECK: [[ZERO:%.+]] = arith.constant 0
-  // CHECK: [[FILL:%.+]] = linalg.fill ins([[ZERO]]{{.*}}outs([[INITT]]
-  // CHECK: [[PERM:%.+]] = arith.constant dense<[1, 0]>
-  // CHECK: [[TRANSPOSE:%.+]] = tosa.transpose %arg1, [[PERM]]
-  // CHECK: [[INITB:%.+]] = tensor.empty()
-  // CHECK: [[ONE:%.+]] = arith.constant 1
-  // CHECK: [[TWO:%.+]] = arith.constant 2
-  // CHECK: [[MATMUL:%.+]] = linalg.quantized_matmul ins(%arg0, [[TRANSPOSE]], [[ONE]], [[TWO]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs([[FILL]] : tensor<5x6xi32>) -> tensor<5x6xi32>
-  // CHECK: [[ADDED:%.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, [[MATMUL]] : tensor<6xi32>, tensor<5x6xi32>) outs([[INITB]]
-  // CHECK: ^bb0([[IN1:%.+]]: i32, [[IN2:%.+]]: i32, [[UNUSED:%.+]]: i32):
-  // CHECK:   [[ADD:%.+]] = arith.addi
-  // CHECK:   linalg.yield [[ADD]] : i32
+  // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
+  // CHECK: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xi8>, tensor<2xi64>) -> tensor<3x6xi8>
+  // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xi32>
+
+  // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xi32>) outs(%[[INIT]] : tensor<5x6xi32>) {
+  // CHECK: ^bb0(%[[IN:.+]]: i32, %[[OUT:.+]]: i32):
+  // CHECK:   linalg.yield %[[IN]] : i32
+  // CHECK: } -> tensor<5x6xi32>
+
+  // CHECK: %[[C1:.+]] = arith.constant 1 : i32
+  // CHECK: %[[C2:.+]] = arith.constant 2 : i32
+  // CHECK: linalg.quantized_matmul ins(%arg0, %[[TRANSPOSE]], %[[C1]], %[[C2]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs(%[[BROADCAST]] : tensor<5x6xi32>) -> tensor<5x6xi32>
+
   %0 = tosa.fully_connected %arg0, %arg1, %arg2 {quantization_info = #tosa.conv_quant<input_zp = 1, weight_zp = 2>} : (tensor<5x3xi8>, tensor<6x3xi8>, tensor<6xi32>) -> tensor<5x6xi32>
   return %0 : tensor<5x6xi32>
 }
 
 // -----
 
-// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
-// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d1)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 
 // CHECK-LABEL: @fully_connected_dyn
 func.func @fully_connected_dyn(%arg0: tensor<?x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<?x6xf32>) {
-  // CHECK: %[[C0:.+]] = arith.constant 0
-  // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]]
-  // CHECK: %[[INITT:.+]] = tensor.empty(%[[DIM]])
-  // CHECK: %[[ZERO:.+]] = arith.constant 0
-  // CHECK: %[[FILL:.+]] = linalg.fill ins(%[[ZERO]]{{.*}}outs(%[[INITT]]
-  // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]>
-  // CHECK: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERM]]
-  // CHECK: %[[INITB:.+]] = tensor.empty(%[[DIM]])
-  // CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%arg0, %[[TRANSPOSE]] : tensor<?x3xf32>, tensor<3x6xf32>) outs(%[[FILL]] : tensor<?x6xf32>) -> tensor<?x6xf32>
-  // CHECK: %[[ADDED:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%arg2, %[[MATMUL]] : tensor<6xf32>, tensor<?x6xf32>) outs(%[[INITB]] : tensor<?x6xf32>) {
-  // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %[[ARG5:[0-9a-zA-Z_]+]]: f32):
-  // CHECK:   %[[ADD:.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
-  // CHECK:   linalg.yield %[[ADD]] : f32
+  // CHECK: %[[C0:.+]] = arith.constant 0 : index
+  // CHECK: %[[DIM0:.+]] = tensor.dim %arg0, %[[C0]] : tensor<?x3xf32>
+  // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
+  // CHECK: %[[TRANSPOSED:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xf32>, tensor<2xi64>) -> tensor<3x6xf32>
+  // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM0]]) : tensor<?x6xf32>
+
+  // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor<?x6xf32>) {
+  // CHECK: ^bb0(%[[IN:.+]]: f32, %[[OUT:.+]]: f32):
+  // CHECK:   linalg.yield %[[IN]] : f32
+  // CHECK: } -> tensor<?x6xf32>
+
+  // CHECK: linalg.matmul ins(%arg0, %[[TRANSPOSED]] : tensor<?x3xf32>, tensor<3x6xf32>) outs(%[[BROADCAST]] : tensor<?x6xf32>) -> tensor<?x6xf32>
 
   %0 = tosa.fully_connected %arg0, %arg1, %arg2 : (tensor<?x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> tensor<?x6xf32>
   return %0 : tensor<?x6xf32>
diff --git a/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir b/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
new file mode 100644
index 00000000000000..bf178c826574e4
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
@@ -0,0 +1,36 @@
+// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith))" | \
+// RUN: mlir-opt -one-shot-bufferize -func-bufferize -test-lower-to-llvm | \
+// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_runner_utils \
+// RUN: | FileCheck %s
+
+func.func private @printMemrefF32(tensor<*xf32>)
+
+func.func @main() {
+  %A = arith.constant dense<[
+    [8.0, 1.0, 6.0],
+    [3.0, 5.0, 7.0],
+    [4.0, 9.0, 2.0]
+  ]> : tensor<3x3xf32>
+
+  %B = arith.constant dense<[
+    [1.0, 1.0, 1.0],
+    [1.0, 1.0, 1.0],
+    [1.0, 1.0, 1.0]
+  ]> : tensor<3x3xf32>
+
+  %C = arith.constant dense<[0.0, 1.0, 2.0]> : tensor<3xf32>
+
+  %result = tosa.fully_connected %A, %B, %C : (tensor<3x3xf32>, tensor<3x3xf32>, tensor<3xf32>) -> tensor<3x3xf32>
+
+  %result_unranked = tensor.cast %result : tensor<3x3xf32> to tensor<*xf32>
+  call @printMemrefF32(%result_unranked) : (tensor<*xf32>) -> ()
+  return
+}
+
+// CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [3, 3] strides = [3, 1] data =
+// CHECK-NEXT:      [
+// CHECK-SAME:  [15, 16, 17]
+// CHECK-NEXT:  [15, 16, 17]
+// CHECK-NEXT:  [15, 16, 17]
+// CHECK-SAME: ]

From f58fb8c209a5179f8f2e02e2a0816c9b1f1edb1b Mon Sep 17 00:00:00 2001
From: Spenser Bauman <sbauman@mathworks.com>
Date: Fri, 1 Dec 2023 10:33:14 -0500
Subject: [PATCH 53/72] [mlir][tosa] Fix lowering of tosa.conv2d (#73240)

The lowering of tosa.conv2d produces an illegal tensor.empty operation
where the number of inputs does not match the number of dynamic
dimensions in the output type.

The fix is to base the generation of tensor.dim operations off the
result type of the conv2d operation, rather than the input type. The
problem and fix are very similar to this fix

https://github.com/llvm/llvm-project/pull/72724

but for convolution.
---
 .../TosaToLinalg/TosaToLinalgNamed.cpp        |  4 ++--
 .../TosaToLinalg/tosa-to-linalg-named.mlir    | 23 +++++++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
index b30651976eeb93..0accd9d1986a1e 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
@@ -179,7 +179,7 @@ static SmallVector<Value> inferDynamicDimsForConv(
   for (uint32_t i = 0, s = inputSizeDims.size(); i < s; ++i) {
     int64_t inputDim = inputSizeDims[i];
     int64_t kernelDim = kernelSizeDims[i];
-    if (inputTy.isDynamicDim(inputDim)) {
+    if (resultTy.isDynamicDim(inputDim)) {
       auto padTop = padAttr[i * 2];
       auto padBottom = padAttr[i * 2 + 1];
       auto stride = strideAttr[i];
@@ -196,7 +196,7 @@ static SmallVector<Value> inferDynamicDimsForConv(
 
   // Get the batch/channels dimensions.
   for (int i = 0; i < inputRank; i++) {
-    if (inputTy.isDynamicDim(i) && !dynDims[i])
+    if (resultTy.isDynamicDim(i) && !dynDims[i])
       dynDims[i] = rewriter.create<tensor::DimOp>(loc, input, i);
   }
 
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index bbdd1bad799865..230001f7633b57 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -495,6 +495,29 @@ func.func @conv2d_dyn_w_h(%input: tensor<1x?x?x27xf32>, %weights: tensor<28x3x3x
 
 // -----
 
+// CHECK: [[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)>
+// CHECK: [[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+// NOTE(review): the expectation comments inside this function lack the FileCheck directive prefix, so only the two map lines above are verified.
+func.func @conv2d_dyn_output(%input: tensor<2x6x5x4xf32>, %weights: tensor<4x3x3x4xf32>, %bias: tensor<4xf32>) {
+  // %[[C0:.+]] = arith.constant 0 : index
+  // %[[DIM0:.+]] = tensor.dim %input, %[[C0]] : tensor<2x6x5x4xf32>
+  // %[[INIT_CONV:.+]] = tensor.empty(%[[DIM0]]) : tensor<?x4x3x4xf32>
+  // %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
+  // %[[FILL:.+]] = linalg.fill
+  // %[[INIT_GENERIC:.+]] = tensor.empty(%[[DIM0]]) : tensor<?x4x3x4xf32>
+
+  // %[[CONV:.+]] = linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<2x6x5x4xf32>, tensor<4x3x3x4xf32>) outs(%[[INIT_CONV]] : tensor<?x4x3x4xf32>) -> tensor<?x4x3x4xf32>
+  // linalg.generic {indexing_maps = [[[$MAP1]], [[$MAP2]], [[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %[[CONV]] : tensor<4xf32>, tensor<?x4x3x4xf32>) outs(%[[INIT_GENERIC]] : tensor<?x4x3x4xf32>) {
+  //   %[[ADD:.+]] = arith.addf
+  //   linalg.yield %[[ADD]] : f32
+  // } -> tensor<?x4x3x4xf32>
+
+  %0 = tosa.conv2d %input, %weights, %bias {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x6x5x4xf32    >, tensor<4x3x3x4xf32>, tensor<4xf32>) -> tensor<?x4x3x4xf32>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @conv2d_padded_f32
 func.func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () {
   // CHECK: %[[C0:.+]] = arith.constant 0

From 39d15a7d3bc6bca4c2ad0fc432ba757eb9b8338c Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Fri, 1 Dec 2023 07:34:22 -0800
Subject: [PATCH 54/72] [AArch64][SME] Remove implicit-def's on smstart
 (#69012)

When we lower calls, the sequence of argument copy-to-reg nodes is
glued to the smstart. In the InstrEmitter, these glued copies are turned
into implicit defs, since the actual call instruction uses those
physregs, resulting in the register allocator adding unnecessary copies
of regs that are preserved anyway. This patch removes those implicit
defs from the smstart/smstop instructions in a post-ISel hook.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 16 ++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 ++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  4 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  1 +
 .../sme-streaming-compatible-interface.ll     | 53 +++++++++++++++++++
 .../AArch64/sme-streaming-interface.ll        | 41 +++++++++++---
 6 files changed, 109 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 68fa58dea5beb1..011aedeba1eb79 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7398,6 +7398,22 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
   return ZExtBool;
 }
 
+void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                          SDNode *Node) const {
+  // Live-in physreg copies that are glued to SMSTART are applied as
+  // implicit-defs in the InstrEmitter. Strip the GPR32/GPR64 ones so the
+  // register allocator can pass call args in callee-saved regs without extra
+  // copies around these fake clobbers. Operands are scanned last-to-first.
+  if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
+      MI.getOpcode() == AArch64::MSRpstatePseudo)
+    for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
+      if (MachineOperand &MO = MI.getOperand(I);
+          MO.isReg() && MO.isImplicit() && MO.isDef() &&
+          (AArch64::GPR32RegClass.contains(MO.getReg()) ||
+           AArch64::GPR64RegClass.contains(MO.getReg())))
+        MI.removeOperand(I);
+}
+
 SDValue AArch64TargetLowering::changeStreamingMode(
     SelectionDAG &DAG, SDLoc DL, bool Enable,
     SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2b16d2471770d0..e6d62f1704726b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -999,6 +999,9 @@ class AArch64TargetLowering : public TargetLowering {
                                const SDLoc &DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
+  void AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                     SDNode *Node) const override;
+
   SDValue LowerCall(CallLoweringInfo & /*CLI*/,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 84ec88d4fd49b6..a58799116003dd 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -203,7 +203,9 @@ def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
 def MSRpstatePseudo :
   Pseudo<(outs),
            (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
-    Sched<[WriteSys]>;
+    Sched<[WriteSys]> {
+  let hasPostISelHook = 1;
+}
 
 def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
           (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index c13c1b4e81faad..408b897070af0b 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -222,6 +222,7 @@ def MSRpstatesvcrImm1
   let Inst{11-9} = pstatefield;
   let Inst{8} = imm;
   let Inst{7-5} = 0b011; // op2
+  let hasPostISelHook = 1;
 }
 
 def : InstAlias<"smstart",    (MSRpstatesvcrImm1 0b011, 0b1)>;
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 1ad6b189d6fa5c..5d0c9127d3ebb2 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -436,3 +436,56 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
   tail call void @normal_callee();
   ret void;
 }
+
+define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: call_to_non_streaming_pass_args:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset b8, -24
+; CHECK-NEXT:    .cfi_offset b9, -32
+; CHECK-NEXT:    .cfi_offset b10, -40
+; CHECK-NEXT:    .cfi_offset b11, -48
+; CHECK-NEXT:    .cfi_offset b12, -56
+; CHECK-NEXT:    .cfi_offset b13, -64
+; CHECK-NEXT:    .cfi_offset b14, -72
+; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x8, x1
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    tbz w19, #0, .LBB10_2
+; CHECK-NEXT:  // %bb.1: // %entry
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:  .LBB10_2: // %entry
+; CHECK-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    mov x0, x9
+; CHECK-NEXT:    ldp d2, d3, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x1, x8
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    tbz w19, #0, .LBB10_4
+; CHECK-NEXT:  // %bb.3: // %entry
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:  .LBB10_4: // %entry
+; CHECK-NEXT:    ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    ret
+entry:
+  call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
+  ret void
+}
+
+declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 102ed896ce7b3e..dd7d6470ad7b08 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -368,15 +368,11 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    addvl x9, sp, #2
-; CHECK-NEXT:    addvl x10, sp, #1
-; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    rdsvl x3, #1
+; CHECK-NEXT:    addvl x0, sp, #2
+; CHECK-NEXT:    addvl x1, sp, #1
+; CHECK-NEXT:    mov x2, sp
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    mov x0, x9
-; CHECK-NEXT:    mov x1, x10
-; CHECK-NEXT:    mov x2, x11
-; CHECK-NEXT:    mov x3, x8
 ; CHECK-NEXT:    bl foo
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ptrue p0.b
@@ -400,8 +396,37 @@ entry:
   ret i8 %vecext
 }
 
+define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 {
+; CHECK-LABEL: call_to_non_streaming_pass_args:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d2, d3, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    ret
+entry:
+  call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
+  ret void
+}
+
 declare i64 @llvm.aarch64.sme.cntsb()
 
 declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
+declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
 
 attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }

From 14e991740b5425680d49d75336132e793f1315e8 Mon Sep 17 00:00:00 2001
From: Daniel Grumberg <dgrumberg@apple.com>
Date: Fri, 1 Dec 2023 15:54:36 +0000
Subject: [PATCH 55/72] [clang][ExtractAPI] Ensure LocationFileChecker doesn't
 try to traverse VFS when determining file path (#74071)

As part of https://reviews.llvm.org/D154130 the logic of
LocationFileChecker changed slightly to try and get the absolute
external file path instead of the name as requested when the file was
opened which would be before VFS mappings in our usage. Ensure that we
only check against the name as requested instead of trying to generate
the external canonical file path.

rdar://115195433
---
 clang/lib/ExtractAPI/ExtractAPIConsumer.cpp   |  11 +-
 .../test/ExtractAPI/vfs_redirected_include.m  | 211 ++++++++++++++++++
 2 files changed, 219 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/ExtractAPI/vfs_redirected_include.m

diff --git a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
index 3aba3bf44547cf..fe282dfb19e8aa 100644
--- a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
+++ b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
@@ -17,6 +17,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/Basic/DiagnosticFrontend.h"
+#include "clang/Basic/FileEntry.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
@@ -167,6 +168,12 @@ std::optional<std::string> getRelativeIncludeName(const CompilerInstance &CI,
   return std::nullopt;
 }
 
+std::optional<std::string> getRelativeIncludeName(const CompilerInstance &CI,
+                                                  FileEntryRef FE,
+                                                  bool *IsQuoted = nullptr) {
+  return getRelativeIncludeName(CI, FE.getNameAsRequested(), IsQuoted);
+}
+
 struct LocationFileChecker {
   bool operator()(SourceLocation Loc) {
     // If the loc refers to a macro expansion we need to first get the file
@@ -187,11 +194,9 @@ struct LocationFileChecker {
     if (ExternalFileEntries.count(*File))
       return false;
 
-    StringRef FileName = SM.getFileManager().getCanonicalName(*File);
-
     // Try to reduce the include name the same way we tried to include it.
     bool IsQuoted = false;
-    if (auto IncludeName = getRelativeIncludeName(CI, FileName, &IsQuoted))
+    if (auto IncludeName = getRelativeIncludeName(CI, *File, &IsQuoted))
       if (llvm::any_of(KnownFiles,
                        [&IsQuoted, &IncludeName](const auto &KnownFile) {
                          return KnownFile.first.equals(*IncludeName) &&
diff --git a/clang/test/ExtractAPI/vfs_redirected_include.m b/clang/test/ExtractAPI/vfs_redirected_include.m
new file mode 100644
index 00000000000000..9ba7e1dedb601e
--- /dev/null
+++ b/clang/test/ExtractAPI/vfs_redirected_include.m
@@ -0,0 +1,211 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// Setup framework root
+// RUN: mkdir -p %t/Frameworks/MyFramework.framework/Headers
+// RUN: cp %t/MyFramework.h %t/Frameworks/MyFramework.framework/Headers/
+// RUN: cp %t/MyHeader.h %t/Frameworks/MyFramework.framework/Headers/
+
+// RUN: sed -e "s@SRCROOT@%{/t:regex_replacement}@g" \
+// RUN: %t/reference.output.json.in >> %t/reference.output.json
+
+// Create VFS overlay from framework headers to SRCROOT
+// RUN: sed -e "s@SRCROOT@%{/t:regex_replacement}@g" -e "s@DSTROOT@%{/t:regex_replacement}@g" \
+// RUN: %t/vfsoverlay.yaml.in >> %t/vfsoverlay.yaml
+
+// Input headers use paths to the framework root/DSTROOT
+// RUN: %clang_cc1 -extract-api -v --product-name=MyFramework \
+// RUN: -triple arm64-apple-macosx \
+// RUN: -iquote%t -ivfsoverlay %t/vfsoverlay.yaml -F%t/Frameworks \
+// RUN: -x objective-c-header \
+// RUN: %t/Frameworks/MyFramework.framework/Headers/MyFramework.h \
+// RUN: %t/Frameworks/MyFramework.framework/Headers/MyHeader.h \
+// RUN: %t/QuotedHeader.h \
+// RUN: -o %t/output.json 2>&1 -verify | FileCheck -allow-empty %s
+
+// Generator version is not consistent across test runs, normalize it.
+// RUN: sed -e "s@\"generator\": \".*\"@\"generator\": \"?\"@g" \
+// RUN: %t/output.json >> %t/output-normalized.json
+// RUN: diff %t/reference.output.json %t/output-normalized.json
+
+// CHECK:      <extract-api-includes>:
+// CHECK-NEXT: #import <MyFramework/MyFramework.h>
+// CHECK-NEXT: #import <MyFramework/MyHeader.h>
+// CHECK-NEXT: #import "QuotedHeader.h"
+
+//--- vfsoverlay.yaml.in
+{
+    "case-sensitive": "false",
+    "roots": [
+        {
+            "contents": [
+                {
+                    "external-contents": "SRCROOT/MyHeader.h",
+                    "name": "MyHeader.h",
+                    "type": "file"
+                }
+            ],
+            "name": "DSTROOT/Frameworks/MyFramework.framework/Headers",
+            "type": "directory"
+        }
+    ],
+    "version": 0
+}
+
+//--- MyFramework.h
+// Umbrella for MyFramework
+#import <MyFramework/MyHeader.h>
+// expected-no-diagnostics
+
+//--- MyHeader.h
+#import <OtherFramework/OtherHeader.h>
+int MyInt;
+// expected-no-diagnostics
+
+//--- QuotedHeader.h
+char MyChar;
+// expected-no-diagnostics
+
+//--- Frameworks/OtherFramework.framework/Headers/OtherHeader.h
+int OtherInt;
+// expected-no-diagnostics
+
+//--- reference.output.json.in
+{
+  "metadata": {
+    "formatVersion": {
+      "major": 0,
+      "minor": 5,
+      "patch": 3
+    },
+    "generator": "?"
+  },
+  "module": {
+    "name": "MyFramework",
+    "platform": {
+      "architecture": "arm64",
+      "operatingSystem": {
+        "minimumVersion": {
+          "major": 11,
+          "minor": 0,
+          "patch": 0
+        },
+        "name": "macosx"
+      },
+      "vendor": "apple"
+    }
+  },
+  "relationships": [],
+  "symbols": [
+    {
+      "accessLevel": "public",
+      "declarationFragments": [
+        {
+          "kind": "typeIdentifier",
+          "preciseIdentifier": "c:I",
+          "spelling": "int"
+        },
+        {
+          "kind": "text",
+          "spelling": " "
+        },
+        {
+          "kind": "identifier",
+          "spelling": "MyInt"
+        },
+        {
+          "kind": "text",
+          "spelling": ";"
+        }
+      ],
+      "identifier": {
+        "interfaceLanguage": "objective-c",
+        "precise": "c:@MyInt"
+      },
+      "kind": {
+        "displayName": "Global Variable",
+        "identifier": "objective-c.var"
+      },
+      "location": {
+        "position": {
+          "character": 4,
+          "line": 1
+        },
+        "uri": "file://SRCROOT/MyHeader.h"
+      },
+      "names": {
+        "navigator": [
+          {
+            "kind": "identifier",
+            "spelling": "MyInt"
+          }
+        ],
+        "subHeading": [
+          {
+            "kind": "identifier",
+            "spelling": "MyInt"
+          }
+        ],
+        "title": "MyInt"
+      },
+      "pathComponents": [
+        "MyInt"
+      ]
+    },
+    {
+      "accessLevel": "public",
+      "declarationFragments": [
+        {
+          "kind": "typeIdentifier",
+          "preciseIdentifier": "c:C",
+          "spelling": "char"
+        },
+        {
+          "kind": "text",
+          "spelling": " "
+        },
+        {
+          "kind": "identifier",
+          "spelling": "MyChar"
+        },
+        {
+          "kind": "text",
+          "spelling": ";"
+        }
+      ],
+      "identifier": {
+        "interfaceLanguage": "objective-c",
+        "precise": "c:@MyChar"
+      },
+      "kind": {
+        "displayName": "Global Variable",
+        "identifier": "objective-c.var"
+      },
+      "location": {
+        "position": {
+          "character": 5,
+          "line": 0
+        },
+        "uri": "file://SRCROOT/QuotedHeader.h"
+      },
+      "names": {
+        "navigator": [
+          {
+            "kind": "identifier",
+            "spelling": "MyChar"
+          }
+        ],
+        "subHeading": [
+          {
+            "kind": "identifier",
+            "spelling": "MyChar"
+          }
+        ],
+        "title": "MyChar"
+      },
+      "pathComponents": [
+        "MyChar"
+      ]
+    }
+  ]
+}

From d222fa4521531cc4ac14b8e157d231c108c003be Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <Ramkumar.Ramachandra@imgtec.com>
Date: Fri, 1 Dec 2023 16:08:41 +0000
Subject: [PATCH 56/72] TargetInstrInfo: squelch a signedness warning on MSVC
 (#74078)

Follow up on 9468de4 (TargetInstrInfo: make getOperandLatency return
optional (NFC)) to squelch a signedness warning on MSVC, reported by
Simon Pilgrim.
---
 llvm/include/llvm/MC/MCInstrItineraries.h | 2 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/MC/MCInstrItineraries.h b/llvm/include/llvm/MC/MCInstrItineraries.h
index b17c41ce3aa4a1..d1c2e788ee8105 100644
--- a/llvm/include/llvm/MC/MCInstrItineraries.h
+++ b/llvm/include/llvm/MC/MCInstrItineraries.h
@@ -220,7 +220,7 @@ class InstrItineraryData {
       return std::nullopt;
 
     UseCycle = *DefCycle - *UseCycle + 1;
-    if (UseCycle > 0 &&
+    if (UseCycle > 0u &&
         hasPipelineForwarding(DefClass, DefIdx, UseClass, UseIdx))
       // FIXME: This assumes one cycle benefit for every pipeline forwarding.
       UseCycle = *UseCycle - 1;
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 94f34b12769660..ce65c8e5245b19 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4110,7 +4110,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatency(
     return std::nullopt;
 
   UseCycle = *DefCycle - *UseCycle + 1;
-  if (UseCycle > 0) {
+  if (UseCycle > 0u) {
     if (LdmBypass) {
       // It's a variable_ops instruction so we can't use DefIdx here. Just use
       // first def operand.

From 76f78ecc789d58baa3a88b2fe2a57428f07e5362 Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr@gmail.com>
Date: Fri, 1 Dec 2023 17:18:14 +0100
Subject: [PATCH 57/72] Revert "Reland [X86] With large code model, put
 functions into .ltext with large section flag (#73037)"

This reverts commit 4bf8a688956a759b7b6b8d94f42d25c13c7af130.

This commit seems to be breaking the semantics of the
ObjectFile::isSectionText method, which breaks numba/llvmlite bindings.
---
 llvm/include/llvm/Target/TargetMachine.h      |  2 +-
 .../CodeGen/TargetLoweringObjectFileImpl.cpp  | 15 ++++++--
 llvm/lib/Target/TargetMachine.cpp             | 12 +-----
 llvm/lib/Target/X86/X86Subtarget.cpp          | 38 ++++++++++++-------
 .../X86/code-model-elf-text-sections.ll       | 25 ------------
 llvm/test/CodeGen/X86/code-model-elf.ll       |  1 -
 llvm/test/CodeGen/X86/pcsections.ll           | 22 +++++------
 .../OrcLazy/debug-objects-elf-minimal.ll      |  2 +-
 8 files changed, 51 insertions(+), 66 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/code-model-elf-text-sections.ll

diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 4830ecbe1cd634..c1d05b25ea21f8 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -239,7 +239,7 @@ class TargetMachine {
   void setCodeModel(CodeModel::Model CM) { CMModel = CM; }
 
   void setLargeDataThreshold(uint64_t LDT) { LargeDataThreshold = LDT; }
-  bool isLargeGlobalObject(const GlobalObject *GO) const;
+  bool isLargeData(const GlobalVariable *GV) const;
 
   bool isPositionIndependent() const;
 
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 143a4951c1361b..f3ba380818901c 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -616,7 +616,7 @@ static unsigned getEntrySizeForKind(SectionKind Kind) {
 /// DataSections.
 static StringRef getSectionPrefixForGlobal(SectionKind Kind, bool IsLarge) {
   if (Kind.isText())
-    return IsLarge ? ".ltext" : ".text";
+    return ".text";
   if (Kind.isReadOnly())
     return IsLarge ? ".lrodata" : ".rodata";
   if (Kind.isBSS())
@@ -650,7 +650,10 @@ getELFSectionNameForGlobal(const GlobalObject *GO, SectionKind Kind,
     Name = ".rodata.cst";
     Name += utostr(EntrySize);
   } else {
-    Name = getSectionPrefixForGlobal(Kind, TM.isLargeGlobalObject(GO));
+    bool IsLarge = false;
+    if (auto *GV = dyn_cast<GlobalVariable>(GO))
+      IsLarge = TM.isLargeData(GV);
+    Name = getSectionPrefixForGlobal(Kind, IsLarge);
   }
 
   bool HasPrefix = false;
@@ -770,8 +773,12 @@ getGlobalObjectInfo(const GlobalObject *GO, const TargetMachine &TM) {
     Group = C->getName();
     IsComdat = C->getSelectionKind() == Comdat::Any;
   }
-  if (TM.isLargeGlobalObject(GO))
-    Flags |= ELF::SHF_X86_64_LARGE;
+  if (auto *GV = dyn_cast<GlobalVariable>(GO)) {
+    if (TM.isLargeData(GV)) {
+      assert(TM.getTargetTriple().getArch() == Triple::x86_64);
+      Flags |= ELF::SHF_X86_64_LARGE;
+    }
+  }
   return {Group, IsComdat, Flags};
 }
 
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index f7096b708b39de..1cba8cf8004bfb 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -39,21 +39,13 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
 
 TargetMachine::~TargetMachine() = default;
 
-bool TargetMachine::isLargeGlobalObject(const GlobalObject *GO) const {
-  if (getTargetTriple().getArch() != Triple::x86_64)
+bool TargetMachine::isLargeData(const GlobalVariable *GV) const {
+  if (getTargetTriple().getArch() != Triple::x86_64 || GV->isThreadLocal())
     return false;
 
   if (getCodeModel() != CodeModel::Medium && getCodeModel() != CodeModel::Large)
     return false;
 
-  if (isa<Function>(GO))
-    return getCodeModel() == CodeModel::Large;
-
-  auto *GV = cast<GlobalVariable>(GO);
-
-  if (GV->isThreadLocal())
-    return false;
-
   // Allowing large metadata sections in the presence of an explicit section is
   // useful, even if GCC does not allow them. However, we should not mark
   // certain well-known prefixes as large, because it would make the whole
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 328608363a8fd7..085fdafa6b9f2c 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -83,20 +83,32 @@ X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
   if (is64Bit()) {
     // 64-bit ELF PIC local references may use GOTOFF relocations.
     if (isTargetELF()) {
-      CodeModel::Model CM = TM.getCodeModel();
-      assert(CM != CodeModel::Tiny &&
-             "Tiny codesize model not supported on X86");
-      // In the large code model, even referencing a global under the large data
-      // threshold which is considered "small", we need to use GOTOFF.
-      if (CM == CodeModel::Large)
+      switch (TM.getCodeModel()) {
+      // 64-bit small code model is simple: All rip-relative.
+      case CodeModel::Tiny:
+        llvm_unreachable("Tiny codesize model not supported on X86");
+      case CodeModel::Small:
+      case CodeModel::Kernel:
+        return X86II::MO_NO_FLAG;
+
+      // The large PIC code model uses GOTOFF.
+      case CodeModel::Large:
         return X86II::MO_GOTOFF;
-      // Large objects use GOTOFF, otherwise use RIP-rel access.
-      if (auto *GO = dyn_cast_or_null<GlobalObject>(GV))
-        return TM.isLargeGlobalObject(GO) ? X86II::MO_GOTOFF
-                                          : X86II::MO_NO_FLAG;
-      // For non-GlobalObjects, the small and medium code models treat them as
-      // accessible with a RIP-rel access.
-      return X86II::MO_NO_FLAG;
+
+      // Medium is a hybrid: RIP-rel for code and non-large data, GOTOFF for
+      // remaining DSO local data.
+      case CodeModel::Medium:
+        // Constant pool and jump table handling pass a nullptr to this
+        // function so we need to use isa_and_nonnull.
+        if (isa_and_nonnull<Function>(GV))
+          return X86II::MO_NO_FLAG; // All code is RIP-relative
+        if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV)) {
+          if (TM.isLargeData(GVar))
+            return X86II::MO_GOTOFF;
+        }
+        return X86II::MO_NO_FLAG;    // Local symbols use GOTOFF.
+      }
+      llvm_unreachable("invalid code model");
     }
 
     // Otherwise, this is either a RIP-relative reference or a 64-bit movabsq,
diff --git a/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll b/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll
deleted file mode 100644
index 016c9a4d7b8390..00000000000000
--- a/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -relocation-model=pic -filetype=obj -code-model=small -o %t
-; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=SMALL
-; RUN: llc < %s -relocation-model=pic -filetype=obj -code-model=medium -o %t
-; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=SMALL
-; RUN: llc < %s -relocation-model=pic -filetype=obj -code-model=large -o %t
-; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=LARGE
-
-; RUN: llc < %s -relocation-model=pic -filetype=obj -code-model=small -function-sections -o %t
-; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=SMALL-DS
-; RUN: llc < %s -relocation-model=pic -filetype=obj -code-model=medium -function-sections -o %t
-; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=SMALL-DS
-; RUN: llc < %s -relocation-model=pic -filetype=obj -code-model=large -function-sections -o %t
-; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=LARGE-DS
-
-; SMALL: .text {{.*}} AX {{.*}}
-; SMALL-DS: .text.func {{.*}} AX {{.*}}
-; LARGE: .ltext {{.*}} AXl {{.*}}
-; LARGE-DS: .ltext.func {{.*}} AXl {{.*}}
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64--linux"
-
-define void @func() {
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll
index 483ffd87ac696f..901a62d26f77e8 100644
--- a/llvm/test/CodeGen/X86/code-model-elf.ll
+++ b/llvm/test/CodeGen/X86/code-model-elf.ll
@@ -9,7 +9,6 @@
 ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=medium -large-data-threshold=1000 | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-SMALL-DATA-PIC
 ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-PIC
 ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC
-; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=large  -large-data-threshold=1000 | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC
 
 ; Generated from this C source:
 ;
diff --git a/llvm/test/CodeGen/X86/pcsections.ll b/llvm/test/CodeGen/X86/pcsections.ll
index 4fe70d93cf347b..00c1aba18cb43f 100644
--- a/llvm/test/CodeGen/X86/pcsections.ll
+++ b/llvm/test/CodeGen/X86/pcsections.ll
@@ -19,12 +19,12 @@ define void @empty_no_aux() !pcsections !0 {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .Lfunc_end0:
-; CHECK:       .section	section_no_aux,"awo",@progbits,.{{l?}}text
+; CHECK:       .section	section_no_aux,"awo",@progbits,.text
 ; CHECK-NEXT:  .Lpcsection_base0:
 ; DEFCM-NEXT:  .long	.Lfunc_begin0-.Lpcsection_base0
 ; LARGE-NEXT:  .quad	.Lfunc_begin0-.Lpcsection_base0
 ; CHECK-NEXT:  .long	.Lfunc_end0-.Lfunc_begin0
-; CHECK-NEXT:  .{{l?}}text
+; CHECK-NEXT:  .text
 entry:
   ret void
 }
@@ -35,7 +35,7 @@ define void @empty_aux() !pcsections !1 {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .Lfunc_end1:
-; CHECK:       .section	section_aux,"awo",@progbits,.{{l?}}text
+; CHECK:       .section	section_aux,"awo",@progbits,.text
 ; CHECK-NEXT:  .Lpcsection_base1:
 ; DEFCM-NEXT:  .long	.Lfunc_begin1-.Lpcsection_base1
 ; LARGE-NEXT:  .quad	.Lfunc_begin1-.Lpcsection_base1
@@ -43,7 +43,7 @@ define void @empty_aux() !pcsections !1 {
 ; CHECK-NEXT:  .long	10
 ; CHECK-NEXT:  .long	20
 ; CHECK-NEXT:  .long	30
-; CHECK-NEXT:  .{{l?}}text
+; CHECK-NEXT:  .text
 entry:
   ret void
 }
@@ -56,22 +56,22 @@ define i64 @multiple() !pcsections !0 {
 ; CHECK-NEXT:    movq
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .Lfunc_end2:
-; CHECK:       .section	section_no_aux,"awo",@progbits,.{{l?}}text
+; CHECK:       .section	section_no_aux,"awo",@progbits,.text
 ; CHECK-NEXT:  .Lpcsection_base2:
 ; DEFCM-NEXT:  .long	.Lfunc_begin2-.Lpcsection_base2
 ; LARGE-NEXT:  .quad	.Lfunc_begin2-.Lpcsection_base2
 ; CHECK-NEXT:  .long	.Lfunc_end2-.Lfunc_begin2
-; CHECK-NEXT:  .section	section_aux_42,"awo",@progbits,.{{l?}}text
+; CHECK-NEXT:  .section	section_aux_42,"awo",@progbits,.text
 ; CHECK-NEXT:  .Lpcsection_base3:
 ; DEFCM-NEXT:  .long	.Lpcsection0-.Lpcsection_base3
 ; LARGE-NEXT:  .quad	.Lpcsection0-.Lpcsection_base3
 ; CHECK-NEXT:  .long	42
-; CHECK-NEXT:  .section	section_aux_21264,"awo",@progbits,.{{l?}}text
+; CHECK-NEXT:  .section	section_aux_21264,"awo",@progbits,.text
 ; CHECK-NEXT:  .Lpcsection_base4:
 ; DEFCM-NEXT:  .long	.Lpcsection0-.Lpcsection_base4
 ; LARGE-NEXT:  .quad	.Lpcsection0-.Lpcsection_base4
 ; CHECK-NEXT:  .long	21264
-; CHECK-NEXT:  .{{l?}}text
+; CHECK-NEXT:  .text
 entry:
   %0 = load i64, ptr @bar, align 8, !pcsections !2
   ret i64 %0
@@ -79,7 +79,7 @@ entry:
 
 define void @multiple_uleb128() !pcsections !6 {
 ; CHECK-LABEL: multiple_uleb128:
-; CHECK:       .section	section_aux,"awo",@progbits,.{{l?}}text
+; CHECK:       .section	section_aux,"awo",@progbits,.text
 ; CHECK-NEXT:  .Lpcsection_base5:
 ; DEFCM-NEXT:  .long	.Lfunc_begin3-.Lpcsection_base5
 ; LARGE-NEXT:  .quad	.Lfunc_begin3-.Lpcsection_base5
@@ -87,13 +87,13 @@ define void @multiple_uleb128() !pcsections !6 {
 ; CHECK-NEXT:  .byte	42
 ; CHECK-NEXT:  .ascii	"\345\216&"
 ; CHECK-NEXT:  .byte	255
-; CHECK-NEXT:  .section	section_aux_21264,"awo",@progbits,.{{l?}}text
+; CHECK-NEXT:  .section	section_aux_21264,"awo",@progbits,.text
 ; CHECK-NEXT:  .Lpcsection_base6:
 ; DEFCM-NEXT:  .long	.Lfunc_begin3-.Lpcsection_base6
 ; LARGE-NEXT:  .quad	.Lfunc_begin3-.Lpcsection_base6
 ; CHECK-NEXT:  .long	.Lfunc_end3-.Lfunc_begin3
 ; CHECK-NEXT:  .long	21264
-; CHECK-NEXT:  .{{l?}}text
+; CHECK-NEXT:  .text
 entry:
   ret void
 }
diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll
index d7bc2dc117b7f7..0d5aba376080a3 100644
--- a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll
+++ b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll
@@ -44,7 +44,7 @@
 ; RUN:     --generate=__dump_jit_debug_objects %s | llvm-objdump --section-headers - | \
 ; RUN:     FileCheck --check-prefix=CHECK_LOAD_ADDR %s
 ;
-; CHECK_LOAD_ADDR-NOT: {{[0-9]*}} .ltext {{.*}} 0000000000000000 TEXT
+; CHECK_LOAD_ADDR-NOT: {{[0-9]*}} .text {{.*}} 0000000000000000 TEXT
 
 target triple = "x86_64-unknown-unknown-elf"
 

From f184147706f5430387fee99d2e94c7a3361c642b Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield@gmail.com>
Date: Fri, 1 Dec 2023 16:37:49 +0000
Subject: [PATCH 58/72] [amdgpu] Default to 1.0, instead of unspecified, for
 dynamic hsa (#74098)

The plugin checks the values of HSA_AMD_INTERFACE_VERSION_* so we now
set them to something safe in the header.
---
 .../plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h     | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 188dd2600a610c..9c59d3bf824de3 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -14,6 +14,15 @@
 
 #include "hsa.h"
 
+/* Using this header means we do not know what version library will be linked.
+   Until such point as a CMake level override is requested, default to the
+   minimum. */
+/*
+ * - 1.0 - initial version
+ */
+#define HSA_AMD_INTERFACE_VERSION_MAJOR 1
+#define HSA_AMD_INTERFACE_VERSION_MINOR 0
+
 #ifdef __cplusplus
 extern "C" {
 #endif

From f40d25151c25e257f3ebd2696e0bf133fe2a30ff Mon Sep 17 00:00:00 2001
From: cor3ntin <corentinjabot@gmail.com>
Date: Fri, 1 Dec 2023 17:44:22 +0100
Subject: [PATCH 59/72] [Clang] Implement P2308R1 - Template Parameter
 Initialization. (#73103)

https://wiki.edg.com/pub/Wg21kona2023/StrawPolls/p2308r1.html

This implements P2308R1 as a DR and resolves CWG2459, CWG2450 and
CWG2049.


Fixes #73666
Fixes #58434
Fixes #41227
Fixes #49978
Fixes #36296
---
 clang/docs/ReleaseNotes.rst                   |   3 +
 clang/include/clang/Sema/Sema.h               |   5 +
 clang/lib/Parse/ParseTemplate.cpp             |  11 +-
 clang/lib/Sema/SemaOverload.cpp               | 115 +++++++++---------
 clang/lib/Sema/SemaTemplate.cpp               |  98 ++++++++++-----
 clang/test/CXX/drs/dr20xx.cpp                 |   9 ++
 clang/test/CXX/drs/dr24xx.cpp                 |  27 ++++
 .../SemaTemplate/temp_arg_nontype_cxx20.cpp   |   7 +-
 .../SemaTemplate/temp_arg_nontype_cxx2c.cpp   | 104 ++++++++++++++++
 clang/www/cxx_dr_status.html                  |   6 +-
 clang/www/cxx_status.html                     |   2 +-
 11 files changed, 287 insertions(+), 100 deletions(-)
 create mode 100644 clang/test/CXX/drs/dr24xx.cpp
 create mode 100644 clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 43ea6a3ffd6e6c..8733bb93f5708a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -184,6 +184,9 @@ C++2c Feature Support
 
 - Implemented `P2864R2 Remove Deprecated Arithmetic Conversion on Enumerations From C++26 <https://wg21.link/P2864R2>`_.
 
+- Implemented `P2308R1 Template parameter initialization <https://wg21.link/P2308R1>`_.
+  This change is applied as a DR in all language modes.
+
 
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 6de1a098e067a3..8b2ed6f7cd8cd8 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -3933,6 +3933,11 @@ class Sema final {
                                               APValue &Value, CCEKind CCE,
                                               NamedDecl *Dest = nullptr);
 
+  ExprResult
+  EvaluateConvertedConstantExpression(Expr *E, QualType T, APValue &Value,
+                                      CCEKind CCE, bool RequireInt,
+                                      const APValue &PreNarrowingValue);
+
   /// Abstract base class used to perform a contextual implicit
   /// conversion from an expression to any type passing a filter.
   class ContextualImplicitConverter {
diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp
index f556d0e6d4f8b6..64fe4d50bba27b 100644
--- a/clang/lib/Parse/ParseTemplate.cpp
+++ b/clang/lib/Parse/ParseTemplate.cpp
@@ -1062,8 +1062,7 @@ Parser::ParseNonTypeTemplateParameter(unsigned Depth, unsigned Position) {
       ++CurTemplateDepthTracker;
       EnterExpressionEvaluationContext ConstantEvaluated(
           Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated);
-      DefaultArg =
-          Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+      DefaultArg = Actions.CorrectDelayedTyposInExpr(ParseInitializer());
       if (DefaultArg.isInvalid())
         SkipUntil(tok::comma, tok::greater, StopAtSemi | StopBeforeMatch);
     }
@@ -1582,6 +1581,8 @@ ParsedTemplateArgument Parser::ParseTemplateTemplateArgument() {
 ///         constant-expression
 ///         type-id
 ///         id-expression
+///         braced-init-list  [C++26, DR]
+///
 ParsedTemplateArgument Parser::ParseTemplateArgument() {
   // C++ [temp.arg]p2:
   //   In a template-argument, an ambiguity between a type-id and an
@@ -1619,8 +1620,12 @@ ParsedTemplateArgument Parser::ParseTemplateArgument() {
   }
 
   // Parse a non-type template argument.
+  ExprResult ExprArg;
   SourceLocation Loc = Tok.getLocation();
-  ExprResult ExprArg = ParseConstantExpressionInExprEvalContext(MaybeTypeCast);
+  if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace))
+    ExprArg = ParseBraceInitializer();
+  else
+    ExprArg = ParseConstantExpressionInExprEvalContext(MaybeTypeCast);
   if (ExprArg.isInvalid() || !ExprArg.get()) {
     return ParsedTemplateArgument();
   }
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 3a3e9234469d39..5026e1d603e5ee 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6123,61 +6123,6 @@ static ExprResult BuildConvertedConstantExpression(Sema &S, Expr *From,
   return Result;
 }
 
-/// EvaluateConvertedConstantExpression - Evaluate an Expression
-/// That is a converted constant expression
-/// (which was built with BuildConvertedConstantExpression)
-static ExprResult EvaluateConvertedConstantExpression(
-    Sema &S, Expr *E, QualType T, APValue &Value, Sema::CCEKind CCE,
-    bool RequireInt, const APValue &PreNarrowingValue) {
-  ExprResult Result = E;
-  // Check the expression is a constant expression.
-  SmallVector<PartialDiagnosticAt, 8> Notes;
-  Expr::EvalResult Eval;
-  Eval.Diag = &Notes;
-
-  ConstantExprKind Kind;
-  if (CCE == Sema::CCEK_TemplateArg && T->isRecordType())
-    Kind = ConstantExprKind::ClassTemplateArgument;
-  else if (CCE == Sema::CCEK_TemplateArg)
-    Kind = ConstantExprKind::NonClassTemplateArgument;
-  else
-    Kind = ConstantExprKind::Normal;
-
-  if (!E->EvaluateAsConstantExpr(Eval, S.Context, Kind) ||
-      (RequireInt && !Eval.Val.isInt())) {
-    // The expression can't be folded, so we can't keep it at this position in
-    // the AST.
-    Result = ExprError();
-  } else {
-    Value = Eval.Val;
-
-    if (Notes.empty()) {
-      // It's a constant expression.
-      Expr *E = ConstantExpr::Create(S.Context, Result.get(), Value);
-      if (!PreNarrowingValue.isAbsent())
-        Value = std::move(PreNarrowingValue);
-      return E;
-    }
-  }
-
-  // It's not a constant expression. Produce an appropriate diagnostic.
-  if (Notes.size() == 1 &&
-      Notes[0].second.getDiagID() == diag::note_invalid_subexpr_in_const_expr) {
-    S.Diag(Notes[0].first, diag::err_expr_not_cce) << CCE;
-  } else if (!Notes.empty() && Notes[0].second.getDiagID() ==
-                                   diag::note_constexpr_invalid_template_arg) {
-    Notes[0].second.setDiagID(diag::err_constexpr_invalid_template_arg);
-    for (unsigned I = 0; I < Notes.size(); ++I)
-      S.Diag(Notes[I].first, Notes[I].second);
-  } else {
-    S.Diag(E->getBeginLoc(), diag::err_expr_not_cce)
-        << CCE << E->getSourceRange();
-    for (unsigned I = 0; I < Notes.size(); ++I)
-      S.Diag(Notes[I].first, Notes[I].second);
-  }
-  return ExprError();
-}
-
 /// CheckConvertedConstantExpression - Check that the expression From is a
 /// converted constant expression of type T, perform the conversion and produce
 /// the converted expression, per C++11 [expr.const]p3.
@@ -6194,8 +6139,8 @@ static ExprResult CheckConvertedConstantExpression(Sema &S, Expr *From,
     Value = APValue();
     return Result;
   }
-  return EvaluateConvertedConstantExpression(S, Result.get(), T, Value, CCE,
-                                             RequireInt, PreNarrowingValue);
+  return S.EvaluateConvertedConstantExpression(Result.get(), T, Value, CCE,
+                                               RequireInt, PreNarrowingValue);
 }
 
 ExprResult Sema::BuildConvertedConstantExpression(Expr *From, QualType T,
@@ -6226,6 +6171,62 @@ ExprResult Sema::CheckConvertedConstantExpression(Expr *From, QualType T,
   return R;
 }
 
+/// EvaluateConvertedConstantExpression - Evaluate an expression
+/// that is a converted constant expression
+/// (which was built with BuildConvertedConstantExpression).
+ExprResult
+Sema::EvaluateConvertedConstantExpression(Expr *E, QualType T, APValue &Value,
+                                          Sema::CCEKind CCE, bool RequireInt,
+                                          const APValue &PreNarrowingValue) {
+
+  ExprResult Result = E;
+  // Check the expression is a constant expression.
+  SmallVector<PartialDiagnosticAt, 8> Notes;
+  Expr::EvalResult Eval;
+  Eval.Diag = &Notes;
+
+  ConstantExprKind Kind;
+  if (CCE == Sema::CCEK_TemplateArg && T->isRecordType())
+    Kind = ConstantExprKind::ClassTemplateArgument;
+  else if (CCE == Sema::CCEK_TemplateArg)
+    Kind = ConstantExprKind::NonClassTemplateArgument;
+  else
+    Kind = ConstantExprKind::Normal;
+
+  if (!E->EvaluateAsConstantExpr(Eval, Context, Kind) ||
+      (RequireInt && !Eval.Val.isInt())) {
+    // The expression can't be folded, so we can't keep it at this position in
+    // the AST.
+    Result = ExprError();
+  } else {
+    Value = Eval.Val;
+
+    if (Notes.empty()) {
+      // It's a constant expression.
+      Expr *E = ConstantExpr::Create(Context, Result.get(), Value);
+      if (!PreNarrowingValue.isAbsent())
+        Value = std::move(PreNarrowingValue);
+      return E;
+    }
+  }
+
+  // It's not a constant expression. Produce an appropriate diagnostic.
+  if (Notes.size() == 1 &&
+      Notes[0].second.getDiagID() == diag::note_invalid_subexpr_in_const_expr) {
+    Diag(Notes[0].first, diag::err_expr_not_cce) << CCE;
+  } else if (!Notes.empty() && Notes[0].second.getDiagID() ==
+                                   diag::note_constexpr_invalid_template_arg) {
+    Notes[0].second.setDiagID(diag::err_constexpr_invalid_template_arg);
+    for (unsigned I = 0; I < Notes.size(); ++I)
+      Diag(Notes[I].first, Notes[I].second);
+  } else {
+    Diag(E->getBeginLoc(), diag::err_expr_not_cce)
+        << CCE << E->getSourceRange();
+    for (unsigned I = 0; I < Notes.size(); ++I)
+      Diag(Notes[I].first, Notes[I].second);
+  }
+  return ExprError();
+}
 
 /// dropPointerConversions - If the given standard conversion sequence
 /// involves any pointer conversions, remove them.  This may change
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 34d7b8c731e907..09bbf14d39af5a 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -7313,49 +7313,74 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
     return E;
   }
 
+  QualType CanonParamType = Context.getCanonicalType(ParamType);
+  // Avoid making a copy when initializing a template parameter of class type
+  // from a template parameter object of the same type. This is going beyond
+  // the standard, but is required for soundness: in
+  //   template<A a> struct X { X *p; X<a> *q; };
+  // ... we need p and q to have the same type.
+  //
+  // Similarly, don't inject a call to a copy constructor when initializing
+  // from a template parameter of the same type.
+  Expr *InnerArg = Arg->IgnoreParenImpCasts();
+  if (ParamType->isRecordType() && isa<DeclRefExpr>(InnerArg) &&
+      Context.hasSameUnqualifiedType(ParamType, InnerArg->getType())) {
+    NamedDecl *ND = cast<DeclRefExpr>(InnerArg)->getDecl();
+    if (auto *TPO = dyn_cast<TemplateParamObjectDecl>(ND)) {
+
+      SugaredConverted = TemplateArgument(TPO, ParamType);
+      CanonicalConverted =
+          TemplateArgument(TPO->getCanonicalDecl(), CanonParamType);
+      return Arg;
+    }
+    if (isa<NonTypeTemplateParmDecl>(ND)) {
+      SugaredConverted = TemplateArgument(Arg);
+      CanonicalConverted =
+          Context.getCanonicalTemplateArgument(SugaredConverted);
+      return Arg;
+    }
+  }
+
   // The initialization of the parameter from the argument is
   // a constant-evaluated context.
   EnterExpressionEvaluationContext ConstantEvaluated(
       *this, Sema::ExpressionEvaluationContext::ConstantEvaluated);
 
-  if (getLangOpts().CPlusPlus17) {
-    QualType CanonParamType = Context.getCanonicalType(ParamType);
-
-    // Avoid making a copy when initializing a template parameter of class type
-    // from a template parameter object of the same type. This is going beyond
-    // the standard, but is required for soundness: in
-    //   template<A a> struct X { X *p; X<a> *q; };
-    // ... we need p and q to have the same type.
-    //
-    // Similarly, don't inject a call to a copy constructor when initializing
-    // from a template parameter of the same type.
-    Expr *InnerArg = Arg->IgnoreParenImpCasts();
-    if (ParamType->isRecordType() && isa<DeclRefExpr>(InnerArg) &&
-        Context.hasSameUnqualifiedType(ParamType, InnerArg->getType())) {
-      NamedDecl *ND = cast<DeclRefExpr>(InnerArg)->getDecl();
-      if (auto *TPO = dyn_cast<TemplateParamObjectDecl>(ND)) {
-
-        SugaredConverted = TemplateArgument(TPO, ParamType);
-        CanonicalConverted =
-            TemplateArgument(TPO->getCanonicalDecl(), CanonParamType);
-        return Arg;
-      }
-      if (isa<NonTypeTemplateParmDecl>(ND)) {
-        SugaredConverted = TemplateArgument(Arg);
-        CanonicalConverted =
-            Context.getCanonicalTemplateArgument(SugaredConverted);
-        return Arg;
-      }
-    }
+  bool IsConvertedConstantExpression = true;
+  if (isa<InitListExpr>(Arg) || ParamType->isRecordType()) {
+    InitializationKind Kind = InitializationKind::CreateForInit(
+        Arg->getBeginLoc(), /*DirectInit=*/false, Arg);
+    Expr *Inits[1] = {Arg};
+    InitializedEntity Entity =
+        InitializedEntity::InitializeTemplateParameter(ParamType, Param);
+    InitializationSequence InitSeq(*this, Entity, Kind, Inits);
+    ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Inits);
+    if (Result.isInvalid() || !Result.get())
+      return ExprError();
+    Result = ActOnConstantExpression(Result.get());
+    if (Result.isInvalid() || !Result.get())
+      return ExprError();
+    Arg = ActOnFinishFullExpr(Result.get(), Arg->getBeginLoc(),
+                              /*DiscardedValue=*/false,
+                              /*IsConstexpr=*/true, /*IsTemplateArgument=*/true)
+              .get();
+    IsConvertedConstantExpression = false;
+  }
 
+  if (getLangOpts().CPlusPlus17) {
     // C++17 [temp.arg.nontype]p1:
     //   A template-argument for a non-type template parameter shall be
     //   a converted constant expression of the type of the template-parameter.
     APValue Value;
-    ExprResult ArgResult = CheckConvertedConstantExpression(
-        Arg, ParamType, Value, CCEK_TemplateArg, Param);
-    if (ArgResult.isInvalid())
-      return ExprError();
+    ExprResult ArgResult;
+    if (IsConvertedConstantExpression) {
+      ArgResult = BuildConvertedConstantExpression(Arg, ParamType,
+                                                   CCEK_TemplateArg, Param);
+      if (ArgResult.isInvalid())
+        return ExprError();
+    } else {
+      ArgResult = Arg;
+    }
 
     // For a value-dependent argument, CheckConvertedConstantExpression is
     // permitted (and expected) to be unable to determine a value.
@@ -7366,6 +7391,13 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
       return ArgResult;
     }
 
+    APValue PreNarrowingValue;
+    ArgResult = EvaluateConvertedConstantExpression(
+        ArgResult.get(), ParamType, Value, CCEK_TemplateArg, /*RequireInt=*/
+        false, PreNarrowingValue);
+    if (ArgResult.isInvalid())
+      return ExprError();
+
     // Convert the APValue to a TemplateArgument.
     switch (Value.getKind()) {
     case APValue::None:
diff --git a/clang/test/CXX/drs/dr20xx.cpp b/clang/test/CXX/drs/dr20xx.cpp
index dd60af14bb6b71..4f81b0b413d4bd 100644
--- a/clang/test/CXX/drs/dr20xx.cpp
+++ b/clang/test/CXX/drs/dr20xx.cpp
@@ -61,6 +61,15 @@ namespace dr2026 { // dr2026: 11
   }
 }
 
+namespace dr2049 { // dr2049: 18 drafting
+#if __cplusplus > 202002L
+template <int* x = {}> struct X {};
+X<> a;
+X<nullptr> b;
+static_assert(__is_same(decltype(a), decltype(b)));
+#endif
+}
+
 namespace dr2061 { // dr2061: yes
 #if __cplusplus >= 201103L
   namespace A {
diff --git a/clang/test/CXX/drs/dr24xx.cpp b/clang/test/CXX/drs/dr24xx.cpp
new file mode 100644
index 00000000000000..3fd8539be53d81
--- /dev/null
+++ b/clang/test/CXX/drs/dr24xx.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -std=c++20 %s -verify
+// RUN: %clang_cc1 -std=c++23 %s -verify
+// expected-no-diagnostics
+
+namespace dr2450 { // dr2450: 18 drafting
+#if __cplusplus > 202002L
+struct S {int a;};
+template <S s>
+void f(){}
+
+void test() {
+f<{0}>();
+f<{.a= 0}>();
+}
+
+#endif
+}
+
+namespace dr2459 { // dr2459: 18 drafting
+#if __cplusplus > 202002L
+struct A {
+  constexpr A(float) {}
+};
+template<A> struct X {};
+X<1> x;
+#endif
+}
diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp
index 9f9aad604d5f11..792dc78464b2a8 100644
--- a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp
+++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp
@@ -62,12 +62,13 @@ namespace ClassNTTP {
   template<A a> constexpr int f() { return a.y; }
   static_assert(f<A{1,2}>() == 2);
 
-  template<A a> int id;
+  template<A a> int id; // #ClassNTTP1
   constexpr A a = {1, 2};
   static_assert(&id<A{1,2}> == &id<a>);
   static_assert(&id<A{1,3}> != &id<a>);
 
   int k = id<1>; // expected-error {{no viable conversion from 'int' to 'A'}}
+                 // expected-note@#ClassNTTP1 {{passing argument to parameter 'a' here}}
 
   struct B {
     constexpr B() {}
@@ -90,8 +91,8 @@ namespace ConvertedConstant {
     constexpr A(float) {}
   };
   template <A> struct X {};
-  void f(X<1.0f>) {} // OK, user-defined conversion
-  void f(X<2>) {} // expected-error {{conversion from 'int' to 'A' is not allowed in a converted constant expression}}
+  void f(X<1.0f>) {}
+  void g(X<2>) {}
 }
 
 namespace CopyCounting {
diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
new file mode 100644
index 00000000000000..9fb6b440b6b2af
--- /dev/null
+++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++20 -Wconversion -verify %s
+
+struct Test {
+    int a = 0;
+    int b = 42;
+};
+
+template <Test t>
+struct A {
+    static constexpr auto a = t.a;
+    static constexpr auto b = t.b;
+};
+
+template <auto N>
+struct Auto {};
+
+template <typename T, T elem>
+struct Explicit{};
+
+struct L {};
+struct M {};
+
+struct Constructor {
+    Constructor(L) {}; // expected-note {{here}}
+    constexpr Constructor(M){};
+};
+
+template < Test = {} >
+struct DefaultParam1{};
+
+template < Test = {1, 2} >
+struct DefaultParam2{};
+
+template < Test = {. b = 5} >
+struct DefaultParam3{};
+
+void test() {
+    static_assert(A<{}>::a == 0);
+    static_assert(A<{}>::b == 42);
+    static_assert(A<{.a = 3}>::a == 3);
+    static_assert(A<{.b = 4}>::b == 4);
+
+    Auto<{0}> a; // expected-error {{cannot deduce type of initializer list}}
+
+    int notconst = 0; // expected-note {{declared here}}
+    A<{notconst}> _; // expected-error {{non-type template argument is not a constant expression}} \
+                     // expected-note  {{read of non-const variable 'notconst' is not allowed in a constant expression}}
+
+
+    Explicit<Constructor, {L{}}> err; // expected-error {{non-type template argument is not a constant expression}} \
+                                      // expected-note {{non-constexpr constructor 'Constructor' cannot be used in a constant expression}}
+    Explicit<Constructor, {M{}}> ok;
+
+
+    DefaultParam1<> d1;
+    DefaultParam2<> d2;
+    DefaultParam3<> d3;
+}
+
+template<auto n> struct B { /* ... */ };
+template<int i> struct C { /* ... */ };
+C<{ 42 }> c1;  // expected-warning {{braces around scalar initializer}}
+
+struct J1 {
+  J1 *self=this;
+};
+B<J1{}> j1;  // expected-error {{pointer to temporary object is not allowed in a template argument}}
+
+struct J2 {
+  J2 *self=this;
+  constexpr J2() {}
+  constexpr J2(const J2&) {}
+};
+B<J2{}> j2;  // expected-error {{pointer to temporary object is not allowed in a template argument}}
+
+
+namespace GH58434 {
+
+template<int>
+void f();
+
+void test() {
+  f<{42}>();
+}
+
+}
+
+namespace GH73666 {
+
+template<class T, int I>
+struct A {
+    T x[I];
+};
+
+template< class T, class... U >
+A( T, U... ) -> A<T, 1 + sizeof...(U)>;
+
+template<A a> void foo() { }
+
+void bar() {
+    foo<{1}>();
+}
+
+}
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 141b2aa515ad9a..c9bfe4c6caa835 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -12101,7 +12101,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2049.html">2049</a></td>
     <td>drafting</td>
     <td>List initializer in non-type template default argument</td>
-    <td align="center">Not resolved</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr id="2050">
     <td><a href="https://cplusplus.github.io/CWG/issues/2050.html">2050</a></td>
@@ -14507,7 +14507,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2450.html">2450</a></td>
     <td>drafting</td>
     <td><I>braced-init-list</I> as a <I>template-argument</I></td>
-    <td align="center">Not resolved</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr id="2451">
     <td><a href="https://cplusplus.github.io/CWG/issues/2451.html">2451</a></td>
@@ -14561,7 +14561,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/2459.html">2459</a></td>
     <td>drafting</td>
     <td>Template parameter initialization</td>
-    <td align="center">Not resolved</td>
+    <td class="unreleased" align="center">Clang 18</td>
   </tr>
   <tr id="2460">
     <td><a href="https://cplusplus.github.io/CWG/issues/2460.html">2460</a></td>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index 9fb6b0cda4da50..197726f3aa3eee 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -151,7 +151,7 @@ <h2 id="cxx26">C++2c implementation status</h2>
  <tr>
   <td>Template parameter initialization</td>
   <td><a href="https://wg21.link/P2308R1">P2308R1</a> (<a href="#dr">DR</a>)</td>
-  <td class="none" align="center">No</td>
+  <td class="unreleased" align="center">Clang 18</td>
  </tr>
  <tr>
   <td>Pack Indexing</td>

From 6cd7500ae690b412b7a350c70b27bd8e0839d643 Mon Sep 17 00:00:00 2001
From: Youngsuk Kim <joseph942010@gmail.com>
Date: Fri, 1 Dec 2023 11:53:43 -0500
Subject: [PATCH 60/72] [llvm][IR] Remove method IRBuilderBase::getInt8PtrTy
 (#74096)

Users should migrate to IRBuilderBase::getPtrTy.
---
 llvm/include/llvm/IR/IRBuilder.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 2cd9a665e9e5d0..8863ca8eba47ef 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -564,11 +564,6 @@ class IRBuilderBase {
     return PointerType::get(Context, AddrSpace);
   }
 
-  /// Fetch the type representing a pointer to an 8-bit integer value.
-  PointerType *getInt8PtrTy(unsigned AddrSpace = 0) {
-    return getPtrTy(AddrSpace);
-  }
-
   /// Fetch the type of an integer with size at least as big as that of a
   /// pointer in the given address space.
   IntegerType *getIntPtrTy(const DataLayout &DL, unsigned AddrSpace = 0) {

From abaeaf382304e5fe30fe05afd09cec3f7191e484 Mon Sep 17 00:00:00 2001
From: Shraiysh <Shraiysh.Vaishay@amd.com>
Date: Fri, 1 Dec 2023 10:59:01 -0600
Subject: [PATCH 61/72] [OpenMP][flang] Adding more tests for commonblock with
 target map (#71146)

This patch addresses the concern about multiple devices and also adds
more tests for `map(to:)`, `map(from:)` and named common blocks.
---
 .../fortran/target_map_common_block.f90       | 55 +++++++++++++++++--
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90 b/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90
index cc7ce50e661e19..e782ef8a670a71 100644
--- a/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90
+++ b/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90
@@ -12,9 +12,13 @@
 ! Testing simple variables in common block.
 program main
   call check_device
-  call commonblock_simple_with_implicit_type
+  call commonblock_simple_with_implicit_type_var
   call commonblock_simple_with_integer
   call commonblock_simple_with_real
+  ! map(to:) and map(from:) are covered together by commonblock_simple_to_from.
+  call commonblock_simple_to_from
+  call set_commonblock_named
+  call use_commonblock_named
 end program main
 
 !-----
@@ -26,14 +30,17 @@ subroutine check_device
   !$omp target map(tofrom:devices)
     devices(2) = omp_get_device_num()
   !$omp end target
+  print *, omp_get_num_devices()
+  !CHECK: [[ND:[0-9]+]]
+  print *, omp_get_default_device()
+  !CHECK: [[DD:[0-9]+]]
+  !CHECK: devices: [[ND]] [[DD]]
   print *, "devices: ", devices
 end subroutine check_device
 
-!CHECK: devices: 1 0
-
 !-----
 
-subroutine commonblock_simple_with_implicit_type
+subroutine commonblock_simple_with_implicit_type_var
   use omp_lib
   common var1
   var1 = 10
@@ -80,3 +87,43 @@ subroutine commonblock_simple_with_real
 
 ! CHECK: var3 before target = 12.5
 ! CHECK: var3 after target = 14.5
+
+! -----
+
+subroutine commonblock_simple_to_from
+  use omp_lib
+  integer :: var4, tmp
+  common var4
+  var4 = 10
+  tmp = 20
+  !$omp target map(to:var4) map(from:tmp)
+    tmp = var4
+    var4 = 20
+  !$omp end target
+  print *, "var4 after target = ", var4
+  print *, "tmp after target = ", tmp
+end subroutine
+
+! CHECK: var4 after target = 10
+! CHECK: tmp after target = 10
+
+! -----
+
+subroutine set_commonblock_named
+  integer :: var6
+  common /my_common_block/ var6
+  var6 = 20
+end subroutine
+
+subroutine use_commonblock_named
+  integer :: var6
+  common /my_common_block/ var6
+  print *, "var6 before target = ", var6
+  !$omp target map(tofrom: var6)
+    var6 = 30
+  !$omp end target
+  print *, "var6 after target = ", var6
+end subroutine
+
+! CHECK: var6 before target = 20
+! CHECK: var6 after target = 30

From 7832a8582a42e5be7e313e88efabcdc981be6dec Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Fri, 1 Dec 2023 09:02:38 -0800
Subject: [PATCH 62/72] [mlgo] Fix test post PR #73899

Opcode value change.
---
 llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
index 7d5c02aaeddb66..1104bdfb004a06 100644
--- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
+++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
@@ -26,7 +26,7 @@
 ; Also, the first eviction problem is significantly less than 300 instructions. Check
 ; that there is a zero value.
 ; Note: we're regex-ing some of the opcodes to avoid test flakyness.
-; CHECK: instructions: 19,{{([0-9]{4})}},12{{([0-9]{2})}},13{{([0-9]{2})}},{{.*}},0,
+; CHECK: instructions: 19,{{([0-9]{4})}},13{{([0-9]{2})}},13{{([0-9]{2})}},{{.*}},0,
 ; Only the candidate virtreg and the 10th LR are included in this problem. Make
 ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s.
 ; There's a limit to how many repetitions can be matched.

From ea4eb691f4955e3b784ebf9bc94a47186838c6f2 Mon Sep 17 00:00:00 2001
From: Radu Salavat <radu.salavat@arm.com>
Date: Mon, 13 Nov 2023 17:49:06 +0000
Subject: [PATCH 63/72] [Flang][Clang] Add support for frame pointers in Flang

---
 clang/include/clang/Driver/Options.td         |  13 +-
 clang/lib/Driver/ToolChains/Clang.cpp         | 133 -----------------
 clang/lib/Driver/ToolChains/CommonArgs.cpp    | 139 ++++++++++++++++++
 clang/lib/Driver/ToolChains/CommonArgs.h      |   4 +
 clang/lib/Driver/ToolChains/Flang.cpp         |  19 +++
 flang/test/Driver/driver-help-hidden.f90      |   1 +
 flang/test/Driver/driver-help.f90             |   2 +
 .../test/Driver/frame-pointer-forwarding.f90  |   9 ++
 flang/test/Driver/frontend-forwarding.f90     |   2 +
 9 files changed, 184 insertions(+), 138 deletions(-)
 create mode 100644 flang/test/Driver/frame-pointer-forwarding.f90

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index fae70e61375de7..19d04e82aed4d6 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3172,7 +3172,8 @@ def fno_ms_compatibility : Flag<["-"], "fno-ms-compatibility">, Group<f_Group>,
 def fno_objc_legacy_dispatch : Flag<["-"], "fno-objc-legacy-dispatch">, Group<f_Group>;
 def fno_objc_weak : Flag<["-"], "fno-objc-weak">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option]>;
-def fno_omit_frame_pointer : Flag<["-"], "fno-omit-frame-pointer">, Group<f_Group>;
+def fno_omit_frame_pointer : Flag<["-"], "fno-omit-frame-pointer">, Group<f_Group>,
+  Visibility<[ClangOption, FlangOption]>;
 defm operator_names : BoolFOption<"operator-names",
   LangOpts<"CXXOperatorNames">, Default<cplusplus.KeyPath>,
   NegFlag<SetFalse, [], [ClangOption, CC1Option],
@@ -3298,6 +3299,7 @@ defm objc_avoid_heapify_local_blocks : BoolFOption<"objc-avoid-heapify-local-blo
   BothFlags<[], [CC1Option], " to avoid heapifying local blocks">>;
 
 def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group<f_Group>,
+  Visibility<[ClangOption, FlangOption]>,
   HelpText<"Omit the frame pointer from functions that don't need it. "
   "Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. "
   "On many targets, -O1 and higher omit the frame pointer by default. "
@@ -6786,10 +6788,6 @@ def new_struct_path_tbaa : Flag<["-"], "new-struct-path-tbaa">,
 def mdebug_pass : Separate<["-"], "mdebug-pass">,
   HelpText<"Enable additional debug output">,
   MarshallingInfoString<CodeGenOpts<"DebugPass">>;
-def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">,
-  HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,none">,
-  NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "None"]>,
-  MarshallingInfoEnum<CodeGenOpts<"FramePointer">, "None">;
 def mabi_EQ_ieeelongdouble : Flag<["-"], "mabi=ieeelongdouble">,
   HelpText<"Use IEEE 754 quadruple-precision for long double">,
   MarshallingInfoFlag<LangOpts<"PPCIEEELongDouble">>;
@@ -7400,6 +7398,11 @@ def pic_is_pie : Flag<["-"], "pic-is-pie">,
   HelpText<"File is for a position independent executable">,
   MarshallingInfoFlag<LangOpts<"PIE">>;
 
+def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">,
+  HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,none">,
+  NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "None"]>,
+  MarshallingInfoEnum<CodeGenOpts<"FramePointer">, "None">;
+
 
 def dependent_lib : Joined<["--"], "dependent-lib=">,
   HelpText<"Add dependent library">,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b1ce95be38f88d..f02f7c841b91f0 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -409,139 +409,6 @@ static bool ShouldEnableAutolink(const ArgList &Args, const ToolChain &TC,
                       Default);
 }
 
-static bool mustUseNonLeafFramePointerForTarget(const llvm::Triple &Triple) {
-  switch (Triple.getArch()){
-  default:
-    return false;
-  case llvm::Triple::arm:
-  case llvm::Triple::thumb:
-    // ARM Darwin targets require a frame pointer to be always present to aid
-    // offline debugging via backtraces.
-    return Triple.isOSDarwin();
-  }
-}
-
-static bool useFramePointerForTargetByDefault(const ArgList &Args,
-                                              const llvm::Triple &Triple) {
-  if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry))
-    return true;
-
-  if (Triple.isAndroid()) {
-    switch (Triple.getArch()) {
-    case llvm::Triple::aarch64:
-    case llvm::Triple::arm:
-    case llvm::Triple::armeb:
-    case llvm::Triple::thumb:
-    case llvm::Triple::thumbeb:
-    case llvm::Triple::riscv64:
-      return true;
-    default:
-      break;
-    }
-  }
-
-  switch (Triple.getArch()) {
-  case llvm::Triple::xcore:
-  case llvm::Triple::wasm32:
-  case llvm::Triple::wasm64:
-  case llvm::Triple::msp430:
-    // XCore never wants frame pointers, regardless of OS.
-    // WebAssembly never wants frame pointers.
-    return false;
-  case llvm::Triple::ppc:
-  case llvm::Triple::ppcle:
-  case llvm::Triple::ppc64:
-  case llvm::Triple::ppc64le:
-  case llvm::Triple::riscv32:
-  case llvm::Triple::riscv64:
-  case llvm::Triple::sparc:
-  case llvm::Triple::sparcel:
-  case llvm::Triple::sparcv9:
-  case llvm::Triple::amdgcn:
-  case llvm::Triple::r600:
-  case llvm::Triple::csky:
-  case llvm::Triple::loongarch32:
-  case llvm::Triple::loongarch64:
-    return !areOptimizationsEnabled(Args);
-  default:
-    break;
-  }
-
-  if (Triple.isOSFuchsia() || Triple.isOSNetBSD()) {
-    return !areOptimizationsEnabled(Args);
-  }
-
-  if (Triple.isOSLinux() || Triple.isOSHurd()) {
-    switch (Triple.getArch()) {
-    // Don't use a frame pointer on linux if optimizing for certain targets.
-    case llvm::Triple::arm:
-    case llvm::Triple::armeb:
-    case llvm::Triple::thumb:
-    case llvm::Triple::thumbeb:
-    case llvm::Triple::mips64:
-    case llvm::Triple::mips64el:
-    case llvm::Triple::mips:
-    case llvm::Triple::mipsel:
-    case llvm::Triple::systemz:
-    case llvm::Triple::x86:
-    case llvm::Triple::x86_64:
-      return !areOptimizationsEnabled(Args);
-    default:
-      return true;
-    }
-  }
-
-  if (Triple.isOSWindows()) {
-    switch (Triple.getArch()) {
-    case llvm::Triple::x86:
-      return !areOptimizationsEnabled(Args);
-    case llvm::Triple::x86_64:
-      return Triple.isOSBinFormatMachO();
-    case llvm::Triple::arm:
-    case llvm::Triple::thumb:
-      // Windows on ARM builds with FPO disabled to aid fast stack walking
-      return true;
-    default:
-      // All other supported Windows ISAs use xdata unwind information, so frame
-      // pointers are not generally useful.
-      return false;
-    }
-  }
-
-  return true;
-}
-
-static CodeGenOptions::FramePointerKind
-getFramePointerKind(const ArgList &Args, const llvm::Triple &Triple) {
-  // We have 4 states:
-  //
-  //  00) leaf retained, non-leaf retained
-  //  01) leaf retained, non-leaf omitted (this is invalid)
-  //  10) leaf omitted, non-leaf retained
-  //      (what -momit-leaf-frame-pointer was designed for)
-  //  11) leaf omitted, non-leaf omitted
-  //
-  //  "omit" options taking precedence over "no-omit" options is the only way
-  //  to make 3 valid states representable
-  Arg *A = Args.getLastArg(options::OPT_fomit_frame_pointer,
-                           options::OPT_fno_omit_frame_pointer);
-  bool OmitFP = A && A->getOption().matches(options::OPT_fomit_frame_pointer);
-  bool NoOmitFP =
-      A && A->getOption().matches(options::OPT_fno_omit_frame_pointer);
-  bool OmitLeafFP =
-      Args.hasFlag(options::OPT_momit_leaf_frame_pointer,
-                   options::OPT_mno_omit_leaf_frame_pointer,
-                   Triple.isAArch64() || Triple.isPS() || Triple.isVE() ||
-                   (Triple.isAndroid() && Triple.isRISCV64()));
-  if (NoOmitFP || mustUseNonLeafFramePointerForTarget(Triple) ||
-      (!OmitFP && useFramePointerForTargetByDefault(Args, Triple))) {
-    if (OmitLeafFP)
-      return CodeGenOptions::FramePointerKind::NonLeaf;
-    return CodeGenOptions::FramePointerKind::All;
-  }
-  return CodeGenOptions::FramePointerKind::None;
-}
-
 /// Add a CC1 option to specify the debug compilation directory.
 static const char *addDebugCompDirArg(const ArgList &Args,
                                       ArgStringList &CmdArgs,
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index efb8b71f24a932..0ae8e2dce32e94 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -24,6 +24,7 @@
 #include "MSP430.h"
 #include "Solaris.h"
 #include "clang/Basic/CharInfo.h"
+#include "clang/Basic/CodeGenOptions.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/ObjCRuntime.h"
 #include "clang/Basic/Version.h"
@@ -71,6 +72,144 @@ using namespace clang::driver::tools;
 using namespace clang;
 using namespace llvm::opt;
 
+static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args,
+                                              const llvm::Triple &Triple) {
+  if (Args.hasArg(clang::driver::options::OPT_pg) &&
+      !Args.hasArg(clang::driver::options::OPT_mfentry))
+    return true;
+
+  if (Triple.isAndroid()) {
+    switch (Triple.getArch()) {
+    case llvm::Triple::aarch64:
+    case llvm::Triple::arm:
+    case llvm::Triple::armeb:
+    case llvm::Triple::thumb:
+    case llvm::Triple::thumbeb:
+    case llvm::Triple::riscv64:
+      return true;
+    default:
+      break;
+    }
+  }
+
+  switch (Triple.getArch()) {
+  case llvm::Triple::xcore:
+  case llvm::Triple::wasm32:
+  case llvm::Triple::wasm64:
+  case llvm::Triple::msp430:
+    // XCore never wants frame pointers, regardless of OS.
+    // WebAssembly never wants frame pointers; MSP430 also defaults to none.
+    return false;
+  case llvm::Triple::ppc:
+  case llvm::Triple::ppcle:
+  case llvm::Triple::ppc64:
+  case llvm::Triple::ppc64le:
+  case llvm::Triple::riscv32:
+  case llvm::Triple::riscv64:
+  case llvm::Triple::sparc:
+  case llvm::Triple::sparcel:
+  case llvm::Triple::sparcv9:
+  case llvm::Triple::amdgcn:
+  case llvm::Triple::r600:
+  case llvm::Triple::csky:
+  case llvm::Triple::loongarch32:
+  case llvm::Triple::loongarch64:
+    return !clang::driver::tools::areOptimizationsEnabled(Args);
+  default:
+    break;
+  }
+
+  if (Triple.isOSFuchsia() || Triple.isOSNetBSD()) {
+    return !clang::driver::tools::areOptimizationsEnabled(Args);
+  }
+
+  if (Triple.isOSLinux() || Triple.isOSHurd()) {
+    switch (Triple.getArch()) {
+    // Don't use a frame pointer on linux if optimizing for certain targets.
+    case llvm::Triple::arm:
+    case llvm::Triple::armeb:
+    case llvm::Triple::thumb:
+    case llvm::Triple::thumbeb:
+    case llvm::Triple::mips64:
+    case llvm::Triple::mips64el:
+    case llvm::Triple::mips:
+    case llvm::Triple::mipsel:
+    case llvm::Triple::systemz:
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+      return !clang::driver::tools::areOptimizationsEnabled(Args);
+    default:
+      return true;
+    }
+  }
+
+  if (Triple.isOSWindows()) {
+    switch (Triple.getArch()) {
+    case llvm::Triple::x86:
+      return !clang::driver::tools::areOptimizationsEnabled(Args);
+    case llvm::Triple::x86_64:
+      return Triple.isOSBinFormatMachO();
+    case llvm::Triple::arm:
+    case llvm::Triple::thumb:
+      // Windows on ARM builds with FPO disabled to aid fast stack walking
+      return true;
+    default:
+      // All other supported Windows ISAs use xdata unwind information, so frame
+      // pointers are not generally useful.
+      return false;
+    }
+  }
+
+  return true;
+}
+
+static bool mustUseNonLeafFramePointerForTarget(const llvm::Triple &Triple) {
+  switch (Triple.getArch()) {
+  default:
+    return false;
+  case llvm::Triple::arm:
+  case llvm::Triple::thumb:
+    // ARM Darwin targets require a frame pointer to be always present to aid
+    // offline debugging via backtraces.
+    return Triple.isOSDarwin();
+  }
+}
+
+clang::CodeGenOptions::FramePointerKind
+getFramePointerKind(const llvm::opt::ArgList &Args,
+                    const llvm::Triple &Triple) {
+  // We have 4 states:
+  //
+  //  00) leaf retained, non-leaf retained
+  //  01) leaf retained, non-leaf omitted (this is invalid)
+  //  10) leaf omitted, non-leaf retained
+  //      (what -momit-leaf-frame-pointer was designed for)
+  //  11) leaf omitted, non-leaf omitted
+  //
+  //  "omit" options taking precedence over "no-omit" options is the only way
+  //  to make 3 valid states representable
+  llvm::opt::Arg *A =
+      Args.getLastArg(clang::driver::options::OPT_fomit_frame_pointer,
+                      clang::driver::options::OPT_fno_omit_frame_pointer);
+
+  bool OmitFP = A && A->getOption().matches(
+                         clang::driver::options::OPT_fomit_frame_pointer);
+  bool NoOmitFP = A && A->getOption().matches(
+                           clang::driver::options::OPT_fno_omit_frame_pointer);
+  bool OmitLeafFP =
+      Args.hasFlag(clang::driver::options::OPT_momit_leaf_frame_pointer,
+                   clang::driver::options::OPT_mno_omit_leaf_frame_pointer,
+                   Triple.isAArch64() || Triple.isPS() || Triple.isVE() ||
+                       (Triple.isAndroid() && Triple.isRISCV64()));
+  if (NoOmitFP || mustUseNonLeafFramePointerForTarget(Triple) ||
+      (!OmitFP && useFramePointerForTargetByDefault(Args, Triple))) {
+    if (OmitLeafFP)
+      return clang::CodeGenOptions::FramePointerKind::NonLeaf;
+    return clang::CodeGenOptions::FramePointerKind::All;
+  }
+  return clang::CodeGenOptions::FramePointerKind::None;
+}
+
 static void renderRpassOptions(const ArgList &Args, ArgStringList &CmdArgs,
                                const StringRef PluginOptPrefix) {
   if (const Arg *A = Args.getLastArg(options::OPT_Rpass_EQ))
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index 0a0951c5386e60..25d68345a9f9eb 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_COMMONARGS_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_COMMONARGS_H
 
+#include "clang/Basic/CodeGenOptions.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Multilib.h"
@@ -215,4 +216,7 @@ void addOpenMPDeviceRTL(const Driver &D, const llvm::opt::ArgList &DriverArgs,
 } // end namespace driver
 } // end namespace clang
 
+clang::CodeGenOptions::FramePointerKind
+getFramePointerKind(const llvm::opt::ArgList &Args, const llvm::Triple &Triple);
+
 #endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_COMMONARGS_H
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 033051b95288a4..98b337e60e4ffd 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -9,6 +9,7 @@
 #include "Flang.h"
 #include "CommonArgs.h"
 
+#include "clang/Basic/CodeGenOptions.h"
 #include "clang/Driver/Options.h"
 #include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Support/FileSystem.h"
@@ -674,6 +675,24 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
   // Forward -Xflang arguments to -fc1
   Args.AddAllArgValues(CmdArgs, options::OPT_Xflang);
 
+  CodeGenOptions::FramePointerKind FPKeepKind =
+      getFramePointerKind(Args, Triple);
+
+  const char *FPKeepKindStr = nullptr;
+  switch (FPKeepKind) {
+  case CodeGenOptions::FramePointerKind::None:
+    FPKeepKindStr = "-mframe-pointer=none";
+    break;
+  case CodeGenOptions::FramePointerKind::NonLeaf:
+    FPKeepKindStr = "-mframe-pointer=non-leaf";
+    break;
+  case CodeGenOptions::FramePointerKind::All:
+    FPKeepKindStr = "-mframe-pointer=all";
+    break;
+  }
+  assert(FPKeepKindStr && "unknown FramePointerKind");
+  CmdArgs.push_back(FPKeepKindStr);
+
   // Forward -mllvm options to the LLVM option parser. In practice, this means
   // forwarding to `-fc1` as that's where the LLVM parser is run.
   for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
index 4ea89776416a6b..f420f1ef3290f6 100644
--- a/flang/test/Driver/driver-help-hidden.f90
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -74,6 +74,7 @@
 ! CHECK-NEXT: -fno-stack-arrays       Allocate array temporaries on the heap (default)
 ! CHECK-NEXT: -fno-version-loops-for-stride
 ! CHECK-NEXT:                         Do not create unit-strided loops (default)
+! CHECK-NEXT: -fomit-frame-pointer    Omit the frame pointer from functions that don't need it. Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. On many targets, -O1 and higher omit the frame pointer by default. -m[no-]omit-leaf-frame-pointer takes precedence for leaf functions
 ! CHECK-NEXT: -fopenacc               Enable OpenACC
 ! CHECK-NEXT: -fopenmp-assume-no-nested-parallelism
 ! CHECK-NEXT:                         Assert no nested parallel regions in the GPU
diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90
index 6fb306d3196fba..23197e8d489086 100644
--- a/flang/test/Driver/driver-help.f90
+++ b/flang/test/Driver/driver-help.f90
@@ -64,6 +64,7 @@
 ! HELP-NEXT: -fno-stack-arrays       Allocate array temporaries on the heap (default)
 ! HELP-NEXT: -fno-version-loops-for-stride
 ! HELP-NEXT:                         Do not create unit-strided loops (default)
+! HELP-NEXT: -fomit-frame-pointer    Omit the frame pointer from functions that don't need it. Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. On many targets, -O1 and higher omit the frame pointer by default. -m[no-]omit-leaf-frame-pointer takes precedence for leaf functions
 ! HELP-NEXT: -fopenacc               Enable OpenACC
 ! HELP-NEXT: -fopenmp-target-debug   Enable debugging in the OpenMP offloading device RTL
 ! HELP-NEXT: -fopenmp-targets=<value>
@@ -238,6 +239,7 @@
 ! HELP-FC1-NEXT:                         Specify code object ABI version. Defaults to 4. (AMDGPU only)
 ! HELP-FC1-NEXT: -menable-no-infs        Allow optimization to assume there are no infinities.
 ! HELP-FC1-NEXT: -menable-no-nans        Allow optimization to assume there are no NaNs.
+! HELP-FC1-NEXT: -mframe-pointer=<value> Specify which frame pointers to retain.
 ! HELP-FC1-NEXT: -mllvm <value>          Additional arguments to forward to LLVM's option processing
 ! HELP-FC1-NEXT: -mmlir <value>          Additional arguments to forward to MLIR's option processing
 ! HELP-FC1-NEXT: -module-dir <dir>       Put MODULE files in <dir>
diff --git a/flang/test/Driver/frame-pointer-forwarding.f90 b/flang/test/Driver/frame-pointer-forwarding.f90
new file mode 100644
index 00000000000000..fd615987f82f4c
--- /dev/null
+++ b/flang/test/Driver/frame-pointer-forwarding.f90
@@ -0,0 +1,9 @@
+! Test that flang-new forwards -fno-omit-frame-pointer and -fomit-frame-pointer to the Flang frontend
+! RUN: %flang -fno-omit-frame-pointer --target=x86-none-none -fsyntax-only -### %s -o %t 2>&1  | FileCheck %s
+! CHECK: "-mframe-pointer=all"
+
+! RUN: %flang -fno-omit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1  | FileCheck %s --check-prefix=CHECK-NONLEAFFP
+! CHECK-NONLEAFFP: "-mframe-pointer=non-leaf"
+
+! RUN: %flang -fomit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1  | FileCheck %s --check-prefix=CHECK-NONEFP
+! CHECK-NONEFP: "-mframe-pointer=none"
diff --git a/flang/test/Driver/frontend-forwarding.f90 b/flang/test/Driver/frontend-forwarding.f90
index 20455791c9ff4d..8e9c9b78c3c10a 100644
--- a/flang/test/Driver/frontend-forwarding.f90
+++ b/flang/test/Driver/frontend-forwarding.f90
@@ -14,6 +14,7 @@
 ! RUN:     -fno-signed-zeros \
 ! RUN:     -fassociative-math \
 ! RUN:     -freciprocal-math \
+! RUN:     -fomit-frame-pointer \
 ! RUN:     -fpass-plugin=Bye%pluginext \
 ! RUN:     -fversion-loops-for-stride \
 ! RUN:     -flang-experimental-polymorphism \
@@ -60,5 +61,6 @@
 ! CHECK: "-Reverything"
 ! CHECK: "-Rno-everything"
 ! CHECK: "-Rpass=inline"
+! CHECK: "-mframe-pointer=none"
 ! CHECK: "-mllvm" "-print-before-all"
 ! CHECK: "-save-temps=obj"

From bc802407d16f4aa0df9f32610e3b25b6a791c085 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Fri, 1 Dec 2023 17:39:48 +0000
Subject: [PATCH 64/72] [mlir][sve][nfc] Merge the integration tests for
 linalg.matmul (#74059)

At the moment the logic to tile and vectorize `linalg.matmul` is
duplicated in multiple test files:
  * matmul.mlir
  * matmul_mixed_ty.mlir

Instead, this patch uses `transform.foreach` to apply the same sequence
to multiple functions within the same test file (e.g. `matmul_f32` and
`matmul_mixed_ty` as defined in the original files). This allows us to
merge relevant test files.
---
 .../Dialect/Linalg/CPU/ArmSVE/matmul.mlir     | 82 +++++++++++++++---
 .../Linalg/CPU/ArmSVE/matmul_mixed_ty.mlir    | 83 -------------------
 2 files changed, 69 insertions(+), 96 deletions(-)
 delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul_mixed_ty.mlir

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
index d771d32d548bbe..17393412badf35 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -8,7 +8,10 @@
 
 // RUN: %{compile}
 
-// RUN: %{run} | FileCheck %s
+// RUN: %{run} | FileCheck %s --check-prefix=F32
+
+// REDEFINE: %{entry_point} = matmul_mixed_ty
+// RUN: %{run} | FileCheck %s --check-prefix=MIXED
 
 func.func @matmul_f32() {
   // Matrix dimensions
@@ -32,37 +35,75 @@ func.func @matmul_f32() {
   %C_out = linalg.matmul ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>) outs(%C_in: tensor<?x?xf32>) -> tensor<?x?xf32>
 
   // Print and verify the output
-  // CHECK-LABEL: SVE: START OF TEST OUTPUT
+  // F32-LABEL: SVE: START OF TEST OUTPUT
   vector.print str "SVE: START OF TEST OUTPUT"
 
-  // CHECK-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [5, 15] strides = [15, 1] data =
-  // CHECK-COUNT-5: [29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788]
+  // F32-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [5, 15] strides = [15, 1] data =
+  // F32-COUNT-5: [29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788, 29.5788]
   %xf = tensor.cast %C_out : tensor<?x?xf32> to tensor<*xf32>
   call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
 
-  // CHECK-NEXT: SVE: END OF TEST OUTPUT
+  // F32-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT"
+
+  return
+}
+
+func.func @matmul_mixed_ty() {
+  // Matrix dimensions
+  %K = arith.constant 3 : index
+  %M = arith.constant 5 : index
+  %N = arith.constant 15 : index
+  %c0_i8 = arith.constant 0 : i8
+  %c0_i32 = arith.constant 0 : i32
+
+  // Allocate the matrices
+  %A_alloc = bufferization.alloc_tensor(%M, %K) : tensor<?x?xi8>
+  %B_alloc = bufferization.alloc_tensor(%K, %N) : tensor<?x?xi8>
+  %C_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xi32>
+
+  // Initialise the matrices
+  %pi = arith.constant  123 : i8
+  %A = linalg.fill ins(%pi : i8) outs(%A_alloc : tensor<?x?xi8>) -> tensor<?x?xi8>
+  %B = linalg.fill ins(%pi : i8) outs(%B_alloc : tensor<?x?xi8>) -> tensor<?x?xi8>
+  %C_in = linalg.fill ins(%c0_i32 : i32) outs(%C_alloc : tensor<?x?xi32>) -> tensor<?x?xi32>
+
+  // Matmul
+  %C_out = linalg.matmul ins(%A, %B: tensor<?x?xi8>, tensor<?x?xi8>) outs(%C_in: tensor<?x?xi32>) -> tensor<?x?xi32>
+
+  // Print and verify the output
+  // MIXED-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT"
+
+  // MIXED-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [5, 15] strides = [15, 1] data =
+  // MIXED-COUNT-5: [45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387]
+  %xf = tensor.cast %C_out : tensor<?x?xi32> to tensor<*xi32>
+  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
+
+  // MIXED-NEXT: SVE: END OF TEST OUTPUT
   vector.print str "SVE: END OF TEST OUTPUT"
 
   return
 }
 
 module attributes {transform.with_named_sequence} {
-transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
-    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module
-      : (!transform.any_op) -> !transform.any_op
+  // A sequence that will tile and vectorise a Matmul Op
+  transform.named_sequence @tile_and_vectorize_matmul(%func
+    : !transform.op<"func.func"> {transform.readonly}) {
+
+    // Step 0: Get a handle to the matmul Op
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %func
+      : (!transform.op<"func.func">) -> !transform.any_op
 
     // Step 1: Tile
-    %module_with_tiled_loops, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1]
+    %tiled_matmul, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.print %tiled_matmul {name = "matmul lal"}: !transform.any_op
 
     // Step 2: Vectorize
-    %tiled_matmul = transform.structured.match ops{["linalg.matmul"]} in %module_with_tiled_loops
-      : (!transform.any_op) -> !transform.any_op
     transform.structured.vectorize %tiled_matmul vector_sizes [2, [4], 1] : !transform.any_op
 
     // Step 3: Lower vector.multi_reduction to vector.contract (+ some helpful patterns)
-    %func = transform.structured.match ops{["func.func"]} in %module
-      : (!transform.any_op) -> !transform.op<"func.func">
     transform.apply_patterns to %func {
       transform.apply_patterns.vector.reduction_to_contract
       transform.apply_patterns.vector.transfer_permutation_patterns
@@ -77,6 +118,21 @@ transform.named_sequence @__transform_main(%module: !transform.any_op {transform
 
     transform.yield
   }
+
+  // A sequence that goes over all functions in this module and applies
+  // "tile_and_vectorize_matmul"
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+    %funcs = transform.structured.match ops{["func.func"]} in %module
+        : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.foreach %funcs : !transform.op<"func.func"> {
+      ^bb2(%func : !transform.op<"func.func">):
+        transform.include @tile_and_vectorize_matmul failures(propagate)
+        (%func) : (!transform.op<"func.func">) -> ()
+    }
+    transform.yield
+  }
 }
 
 func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+func.func private @printMemrefI32(%ptr : tensor<*xi32>)
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul_mixed_ty.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul_mixed_ty.mlir
deleted file mode 100644
index f4f2d87b4d0b42..00000000000000
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul_mixed_ty.mlir
+++ /dev/null
@@ -1,83 +0,0 @@
-// DEFINE: %{compile} =  mlir-opt %s \
-// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
-// DEFINE:    -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
-// DEFINE:    -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
-// DEFINE: %{entry_point} = matmul_mixed_ty
-// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
-// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
-
-// RUN: %{compile}
-
-// RUN: %{run} | FileCheck %s
-
-func.func @matmul_mixed_ty() {
-  // Matrix dimensions
-  %K = arith.constant 3 : index
-  %M = arith.constant 5 : index
-  %N = arith.constant 15 : index
-  %c0_i8 = arith.constant 0 : i8
-  %c0_i32 = arith.constant 0 : i32
-
-  // Allocate the matrices
-  %A_alloc = bufferization.alloc_tensor(%M, %K) : tensor<?x?xi8>
-  %B_alloc = bufferization.alloc_tensor(%K, %N) : tensor<?x?xi8>
-  %C_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xi32>
-
-  // Initialise the matrices
-  %pi = arith.constant  123 : i8
-  %A = linalg.fill ins(%pi : i8) outs(%A_alloc : tensor<?x?xi8>) -> tensor<?x?xi8>
-  %B = linalg.fill ins(%pi : i8) outs(%B_alloc : tensor<?x?xi8>) -> tensor<?x?xi8>
-  %C_in = linalg.fill ins(%c0_i32 : i32) outs(%C_alloc : tensor<?x?xi32>) -> tensor<?x?xi32>
-
-  // Matmul
-  %C_out = linalg.matmul ins(%A, %B: tensor<?x?xi8>, tensor<?x?xi8>) outs(%C_in: tensor<?x?xi32>) -> tensor<?x?xi32>
-
-  // Print and verify the output
-  // CHECK-LABEL: SVE: START OF TEST OUTPUT
-  vector.print str "SVE: START OF TEST OUTPUT"
-
-  // CHECK-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [5, 15] strides = [15, 1] data =
-  // CHECK-COUNT-5: [45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387,   45387]
-  %xf = tensor.cast %C_out : tensor<?x?xi32> to tensor<*xi32>
-  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
-
-  // CHECK-NEXT: SVE: END OF TEST OUTPUT
-  vector.print str "SVE: END OF TEST OUTPUT"
-
-  return
-}
-
-module attributes {transform.with_named_sequence} {
-transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
-    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module
-      : (!transform.any_op) -> !transform.any_op
-
-    // Step 1: Tile
-    %module_with_tiled_loops, %loops:3 = transform.structured.tile_using_for %matmul [2, [4], 1]
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 2: Vectorize
-    %tiled_matmul = transform.structured.match ops{["linalg.matmul"]} in %module_with_tiled_loops
-      : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %tiled_matmul vector_sizes [2, [4], 1] : !transform.any_op
-
-    // Step 3: Lower vector.multi_reduction to vector.contract (+ some helpful patterns)
-    %func = transform.structured.match ops{["func.func"]} in %module
-      : (!transform.any_op) -> !transform.op<"func.func">
-    transform.apply_patterns to %func {
-      transform.apply_patterns.vector.reduction_to_contract
-      transform.apply_patterns.vector.transfer_permutation_patterns
-      transform.apply_patterns.vector.lower_masked_transfers
-    } : !transform.op<"func.func">
-
-    // Step 4: Lower vector.contract to vector.fma
-    transform.apply_patterns to %func {
-      transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
-      transform.apply_patterns.vector.lower_outerproduct
-    } : !transform.op<"func.func">
-
-    transform.yield
-  }
-}
-
-func.func private @printMemrefI32(%ptr : tensor<*xi32>)

From 76a9ea1321b1365713bbf6afafbd18cc5d7a9381 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Fri, 1 Dec 2023 09:04:14 -0800
Subject: [PATCH 65/72] [BOLT][utils] Remove heatmap mode detection from
 wrapper script

Heatmap mode has been moved to a separate tool. Drop the support in
llvm-bolt-wrapper.
---
 bolt/utils/llvm-bolt-wrapper.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bolt/utils/llvm-bolt-wrapper.py b/bolt/utils/llvm-bolt-wrapper.py
index cd48204026fb77..b9d6fad825e786 100755
--- a/bolt/utils/llvm-bolt-wrapper.py
+++ b/bolt/utils/llvm-bolt-wrapper.py
@@ -112,7 +112,6 @@ def run_cmd(cmd, out_f, cfg):
 def run_bolt(bolt_path, bolt_args, out_f, cfg):
     p2b = os.path.basename(sys.argv[0]) == "perf2bolt"  # perf2bolt mode
     bd = os.path.basename(sys.argv[0]) == "llvm-boltdiff"  # boltdiff mode
-    hm = sys.argv[1] == "heatmap"  # heatmap mode
     cmd = ["/usr/bin/time", "-f", "%e %M", bolt_path] + bolt_args
     if p2b:
         # -ignore-build-id can occur at most once, hence remove it from cmd
@@ -121,7 +120,7 @@ def run_bolt(bolt_path, bolt_args, out_f, cfg):
         cmd += PERF2BOLT_MODE
     elif bd:
         cmd += BOLTDIFF_MODE
-    elif not cfg.NO_MINIMIZE and not hm:
+    elif not cfg.NO_MINIMIZE:
         cmd += MINIMIZE_DIFFS
     return run_cmd(cmd, out_f, cfg)
 

From 9584f5834499e6093797d4a28fde209f927ea556 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Fri, 1 Dec 2023 09:57:02 -0800
Subject: [PATCH 66/72] [BOLT][utils] Bump default time threshold to 2s in
 nfc-stat-parser

---
 bolt/utils/nfc-stat-parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bolt/utils/nfc-stat-parser.py b/bolt/utils/nfc-stat-parser.py
index 72a21e0e24a1a7..4a0d677ec962b3 100755
--- a/bolt/utils/nfc-stat-parser.py
+++ b/bolt/utils/nfc-stat-parser.py
@@ -19,7 +19,7 @@ def main():
     )
     parser.add_argument(
         "--check_longer_than",
-        default=1,
+        default=2,
         type=float,
         help="Only warn on tests longer than X seconds for at least one side",
     )

From f866fde59854fd12dcc067388e4a60b218a0f818 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 1 Dec 2023 10:24:26 -0800
Subject: [PATCH 67/72] [RISCV][GISel] Lower G_FCONSTANT to constant pool load
 without F or D. (#73034)

I used an IR test because it was easier than constructing a different MIR
test for each type of addressing.
---
 .../RISCV/GISel/RISCVInstructionSelector.cpp  |   4 +-
 .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp |   7 +-
 .../CodeGen/RISCV/GlobalISel/constantpool.ll  | 122 ++++++++++++++++++
 3 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index c926a380a27396..140dd58fdc5b9c 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -540,6 +540,7 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
                       GV->hasExternalWeakLinkage());
   }
   case TargetOpcode::G_JUMP_TABLE:
+  case TargetOpcode::G_CONSTANT_POOL:
     return selectAddr(MI, MIB, MRI);
   case TargetOpcode::G_BRCOND: {
     Register LHS, RHS;
@@ -875,7 +876,8 @@ bool RISCVInstructionSelector::selectAddr(MachineInstr &MI,
                                           bool IsLocal,
                                           bool IsExternWeak) const {
   assert((MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE ||
-          MI.getOpcode() == TargetOpcode::G_JUMP_TABLE) &&
+          MI.getOpcode() == TargetOpcode::G_JUMP_TABLE ||
+          MI.getOpcode() == TargetOpcode::G_CONSTANT_POOL) &&
          "Unexpected opcode");
 
   const MachineOperand &DispMO = MI.getOperand(1);
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 153bac34986ec1..97222ce452d8cf 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -195,7 +195,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
       .widenScalarToNextPow2(0)
       .clampScalar(0, sXLen, sXLen);
 
-  getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE}).legalFor({p0});
+  getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE, G_CONSTANT_POOL})
+      .legalFor({p0});
 
   if (ST.hasStdExtM() || ST.hasStdExtZmmul()) {
     getActionDefinitionsBuilder(G_MUL)
@@ -283,7 +284,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
   getActionDefinitionsBuilder(G_IS_FPCLASS)
       .customIf(all(typeIs(0, s1), typeIsScalarFPArith(1, ST)));
 
-  getActionDefinitionsBuilder(G_FCONSTANT).legalIf(typeIsScalarFPArith(0, ST));
+  getActionDefinitionsBuilder(G_FCONSTANT)
+      .legalIf(typeIsScalarFPArith(0, ST))
+      .lowerFor({s32, s64});
 
   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
       .legalIf(all(typeInSet(0, {s32, sXLen}), typeIsScalarFPArith(1, ST)))
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll
new file mode 100644
index 00000000000000..1eeeb60c2eb405
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constantpool.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv32 -global-isel -code-model=small \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefix=RV32-SMALL
+; RUN: llc < %s -mtriple=riscv32 -global-isel -code-model=medium \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefix=RV32-MEDIUM
+; RUN: llc < %s -mtriple=riscv32 -global-isel -relocation-model=pic \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefix=RV32-PIC
+; RUN: llc < %s -mtriple=riscv64 -global-isel -code-model=small \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefix=RV64-SMALL
+; RUN: llc < %s -mtriple=riscv64 -global-isel -code-model=medium \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefix=RV64-MEDIUM
+; RUN: llc < %s -mtriple=riscv64 -global-isel -relocation-model=pic \
+; RUN:   -verify-machineinstrs | FileCheck %s --check-prefix=RV64-PIC
+
+define void @constpool_f32(ptr %p) {
+; RV32-SMALL-LABEL: constpool_f32:
+; RV32-SMALL:       # %bb.0:
+; RV32-SMALL-NEXT:    lui a1, %hi(.LCPI0_0)
+; RV32-SMALL-NEXT:    lw a1, %lo(.LCPI0_0)(a1)
+; RV32-SMALL-NEXT:    sw a1, 0(a0)
+; RV32-SMALL-NEXT:    ret
+;
+; RV32-MEDIUM-LABEL: constpool_f32:
+; RV32-MEDIUM:       # %bb.0:
+; RV32-MEDIUM-NEXT:  .Lpcrel_hi0:
+; RV32-MEDIUM-NEXT:    auipc a1, %pcrel_hi(.LCPI0_0)
+; RV32-MEDIUM-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi0)(a1)
+; RV32-MEDIUM-NEXT:    sw a1, 0(a0)
+; RV32-MEDIUM-NEXT:    ret
+;
+; RV32-PIC-LABEL: constpool_f32:
+; RV32-PIC:       # %bb.0:
+; RV32-PIC-NEXT:  .Lpcrel_hi0:
+; RV32-PIC-NEXT:    auipc a1, %pcrel_hi(.LCPI0_0)
+; RV32-PIC-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi0)(a1)
+; RV32-PIC-NEXT:    sw a1, 0(a0)
+; RV32-PIC-NEXT:    ret
+;
+; RV64-SMALL-LABEL: constpool_f32:
+; RV64-SMALL:       # %bb.0:
+; RV64-SMALL-NEXT:    lui a1, %hi(.LCPI0_0)
+; RV64-SMALL-NEXT:    lw a1, %lo(.LCPI0_0)(a1)
+; RV64-SMALL-NEXT:    sw a1, 0(a0)
+; RV64-SMALL-NEXT:    ret
+;
+; RV64-MEDIUM-LABEL: constpool_f32:
+; RV64-MEDIUM:       # %bb.0:
+; RV64-MEDIUM-NEXT:  .Lpcrel_hi0:
+; RV64-MEDIUM-NEXT:    auipc a1, %pcrel_hi(.LCPI0_0)
+; RV64-MEDIUM-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi0)(a1)
+; RV64-MEDIUM-NEXT:    sw a1, 0(a0)
+; RV64-MEDIUM-NEXT:    ret
+;
+; RV64-PIC-LABEL: constpool_f32:
+; RV64-PIC:       # %bb.0:
+; RV64-PIC-NEXT:  .Lpcrel_hi0:
+; RV64-PIC-NEXT:    auipc a1, %pcrel_hi(.LCPI0_0)
+; RV64-PIC-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi0)(a1)
+; RV64-PIC-NEXT:    sw a1, 0(a0)
+; RV64-PIC-NEXT:    ret
+  store float 1.0, ptr %p
+  ret void
+}
+
+define void @constpool_f64(ptr %p) {
+; RV32-SMALL-LABEL: constpool_f64:
+; RV32-SMALL:       # %bb.0:
+; RV32-SMALL-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32-SMALL-NEXT:    addi a1, a1, %lo(.LCPI1_0)
+; RV32-SMALL-NEXT:    lw a2, 0(a1)
+; RV32-SMALL-NEXT:    lw a1, 4(a1)
+; RV32-SMALL-NEXT:    sw a2, 0(a0)
+; RV32-SMALL-NEXT:    sw a1, 4(a0)
+; RV32-SMALL-NEXT:    ret
+;
+; RV32-MEDIUM-LABEL: constpool_f64:
+; RV32-MEDIUM:       # %bb.0:
+; RV32-MEDIUM-NEXT:  .Lpcrel_hi1:
+; RV32-MEDIUM-NEXT:    auipc a1, %pcrel_hi(.LCPI1_0)
+; RV32-MEDIUM-NEXT:    addi a1, a1, %pcrel_lo(.Lpcrel_hi1)
+; RV32-MEDIUM-NEXT:    lw a2, 0(a1)
+; RV32-MEDIUM-NEXT:    lw a1, 4(a1)
+; RV32-MEDIUM-NEXT:    sw a2, 0(a0)
+; RV32-MEDIUM-NEXT:    sw a1, 4(a0)
+; RV32-MEDIUM-NEXT:    ret
+;
+; RV32-PIC-LABEL: constpool_f64:
+; RV32-PIC:       # %bb.0:
+; RV32-PIC-NEXT:  .Lpcrel_hi1:
+; RV32-PIC-NEXT:    auipc a1, %pcrel_hi(.LCPI1_0)
+; RV32-PIC-NEXT:    addi a1, a1, %pcrel_lo(.Lpcrel_hi1)
+; RV32-PIC-NEXT:    lw a2, 0(a1)
+; RV32-PIC-NEXT:    lw a1, 4(a1)
+; RV32-PIC-NEXT:    sw a2, 0(a0)
+; RV32-PIC-NEXT:    sw a1, 4(a0)
+; RV32-PIC-NEXT:    ret
+;
+; RV64-SMALL-LABEL: constpool_f64:
+; RV64-SMALL:       # %bb.0:
+; RV64-SMALL-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV64-SMALL-NEXT:    ld a1, %lo(.LCPI1_0)(a1)
+; RV64-SMALL-NEXT:    sd a1, 0(a0)
+; RV64-SMALL-NEXT:    ret
+;
+; RV64-MEDIUM-LABEL: constpool_f64:
+; RV64-MEDIUM:       # %bb.0:
+; RV64-MEDIUM-NEXT:  .Lpcrel_hi1:
+; RV64-MEDIUM-NEXT:    auipc a1, %pcrel_hi(.LCPI1_0)
+; RV64-MEDIUM-NEXT:    ld a1, %pcrel_lo(.Lpcrel_hi1)(a1)
+; RV64-MEDIUM-NEXT:    sd a1, 0(a0)
+; RV64-MEDIUM-NEXT:    ret
+;
+; RV64-PIC-LABEL: constpool_f64:
+; RV64-PIC:       # %bb.0:
+; RV64-PIC-NEXT:  .Lpcrel_hi1:
+; RV64-PIC-NEXT:    auipc a1, %pcrel_hi(.LCPI1_0)
+; RV64-PIC-NEXT:    ld a1, %pcrel_lo(.Lpcrel_hi1)(a1)
+; RV64-PIC-NEXT:    sd a1, 0(a0)
+; RV64-PIC-NEXT:    ret
+  store double 1.0, ptr %p
+  ret void
+}

From 3693f44fffc0622760979dd5e2143797662913fe Mon Sep 17 00:00:00 2001
From: Caslyn Tonelli <6718161+Caslyn@users.noreply.github.com>
Date: Fri, 1 Dec 2023 10:30:18 -0800
Subject: [PATCH 68/72] [libc] Exclude Fuchsia from float128 detection (#73985)

Following from https://github.com/llvm/llvm-project/pull/73372:

Fuchsia targets currently don't support `float128`. Add detection for
`LIBC_TARGET_OS_IS_FUCHSIA`, and exclude this OS from setting
`LIBC_COMPILER_HAS_FLOAT128_EXTENSION`.
---
 libc/src/__support/macros/properties/float.h | 3 ++-
 libc/src/__support/macros/properties/os.h    | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h
index bae51cbe8aee8e..756579024cad8b 100644
--- a/libc/src/__support/macros/properties/float.h
+++ b/libc/src/__support/macros/properties/float.h
@@ -60,7 +60,8 @@ using float16 = _Float16;
 #define LIBC_COMPILER_HAS_C23_FLOAT128
 #endif
 #if (defined(LIBC_COMPILER_CLANG_VER) && (LIBC_COMPILER_CLANG_VER >= 500)) &&  \
-    (defined(LIBC_TARGET_ARCH_IS_X86_64))
+    (defined(LIBC_TARGET_ARCH_IS_X86_64) &&                                    \
+     !defined(LIBC_TARGET_OS_IS_FUCHSIA))
 #define LIBC_COMPILER_HAS_FLOAT128_EXTENSION
 #endif
 
diff --git a/libc/src/__support/macros/properties/os.h b/libc/src/__support/macros/properties/os.h
index 92e68b3e6612a8..1c8fd5721ce679 100644
--- a/libc/src/__support/macros/properties/os.h
+++ b/libc/src/__support/macros/properties/os.h
@@ -37,4 +37,8 @@
 #endif
 #endif
 
+#if defined(__Fuchsia__)
+#define LIBC_TARGET_OS_IS_FUCHSIA
+#endif
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_OS_H

From 8bea804923a1b028e86b177caccb3258708ca01c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Fri, 1 Dec 2023 12:31:34 -0600
Subject: [PATCH 69/72] [libc] Move the pointer to pin off the stack to the
 heap (#74118)

Summary:
Pinning a pointer to stack memory may be problematic. Allocate the
storage via the OS allocator instead, as the documentation suggests.

For some reason, if you attempt to free this pointer after the memory
region has been unlocked, it will return an invalid pointer.
---
 libc/utils/gpu/loader/amdgpu/Loader.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index a9a687656efcb3..61b86bcf9c7f8b 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -471,10 +471,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   // Pin some memory we can use to obtain the address of the rpc client.
-  void *rpc_client_storage = nullptr;
+  void *rpc_client_storage = malloc(sizeof(void *));
   void *rpc_client_host = nullptr;
   if (hsa_status_t err =
-          hsa_amd_memory_lock(&rpc_client_storage, sizeof(void *),
+          hsa_amd_memory_lock(rpc_client_storage, sizeof(void *),
                               /*agents=*/nullptr, 0, &rpc_client_host))
     handle_error(err);
 
@@ -501,6 +501,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
   if (hsa_status_t err = hsa_amd_memory_unlock(rpc_client_host))
     handle_error(err);
+  free(rpc_client_storage);
 
   // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
   // If the clock_freq symbol is missing, no work to do.

From 6c5e967f5d19ccefe6cf7700ac3998b80d19f202 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 1 Dec 2023 18:31:50 +0000
Subject: [PATCH 70/72] Fix MSVC signed/unsigned mismatch warning. NFC.

---
 llvm/lib/CodeGen/TargetInstrInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index fbb7c81fa1f86f..b88bb9a8a0c418 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1463,7 +1463,7 @@ bool TargetInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
   unsigned DefClass = DefMI.getDesc().getSchedClass();
   std::optional<unsigned> DefCycle =
       ItinData->getOperandCycle(DefClass, DefIdx);
-  return DefCycle <= 1;
+  return DefCycle <= 1U;
 }
 
 bool TargetInstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {

From 625e1ecb7e80c1da4ea50e5b1ad632f08b71d127 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 1 Dec 2023 18:32:26 +0000
Subject: [PATCH 71/72] Fix MSVC signed/unsigned mismatch warning. NFC.

---
 llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 4d6d350c46f5af..c9e2745f00c958 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -661,7 +661,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
     OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
   std::optional<unsigned> Latency =
       TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
-  if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
+  if (Latency > 1U && Use->getOpcode() == ISD::CopyToReg &&
       !BB->succ_empty()) {
     unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
     if (Register::isVirtualRegister(Reg))

From ff0d8a9a6c2a08e206e77db318a18210678f54a4 Mon Sep 17 00:00:00 2001
From: chrulski-intel <christopher.m.chrulski@intel.com>
Date: Fri, 1 Dec 2023 11:36:25 -0700
Subject: [PATCH 72/72] Report pass name when -llvm-verify-each reports
 breakage (#71447)

Update the reported string to include the name of the last pass run when
the verifier runs after each pass.
---
 llvm/lib/Passes/PassBuilder.cpp              | 26 ++++++++++++++++++++
 llvm/lib/Passes/PassRegistry.def             |  2 ++
 llvm/lib/Passes/StandardInstrumentations.cpp |  8 ++++--
 llvm/test/Other/trigger-verifier-error.ll    | 18 ++++++++++++++
 4 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Other/trigger-verifier-error.ll

diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index dad7a74693cbcc..da1bc6c5a8f7b9 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -408,6 +408,32 @@ class TriggerCrashPass : public PassInfoMixin<TriggerCrashPass> {
   static StringRef name() { return "TriggerCrashPass"; }
 };
 
+// A pass for testing message reporting of -verify-each failures.
+// DO NOT USE THIS EXCEPT FOR TESTING!
+class TriggerVerifierErrorPass
+    : public PassInfoMixin<TriggerVerifierErrorPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &) {
+    // Intentionally break the Module by creating an alias without setting the
+    // aliasee.
+    auto *PtrTy = llvm::PointerType::getUnqual(M.getContext());
+    GlobalAlias::create(PtrTy, PtrTy->getAddressSpace(),
+                        GlobalValue::LinkageTypes::InternalLinkage,
+                        "__bad_alias", nullptr, &M);
+    return PreservedAnalyses::none();
+  }
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
+    // Intentionally break the Function by inserting a terminator
+    // instruction in the middle of a basic block.
+    BasicBlock &BB = F.getEntryBlock();
+    new UnreachableInst(F.getContext(), BB.getTerminator());
+    return PreservedAnalyses::none();
+  }
+
+  static StringRef name() { return "TriggerVerifierErrorPass"; }
+};
+
 } // namespace
 
 PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO,
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index e23863a235a160..6f3c408f56f5a3 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -133,6 +133,7 @@ MODULE_PASS("strip-nondebug", StripNonDebugSymbolsPass())
 MODULE_PASS("strip-nonlinetable-debuginfo", StripNonLineTableDebugInfoPass())
 MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation())
 MODULE_PASS("trigger-crash", TriggerCrashPass())
+MODULE_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
 MODULE_PASS("verify", VerifierPass())
 MODULE_PASS("view-callgraph", CallGraphViewerPass())
@@ -404,6 +405,7 @@ FUNCTION_PASS("structurizecfg", StructurizeCFGPass())
 FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("tlshoist", TLSVariableHoistPass())
 FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
+FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())  
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
 FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 8cae1235487f13..df445c2dd78b77 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -1405,7 +1405,9 @@ void VerifyInstrumentation::registerCallbacks(
             dbgs() << "Verifying function " << F->getName() << "\n";
 
           if (verifyFunction(*F, &errs()))
-            report_fatal_error("Broken function found, compilation aborted!");
+            report_fatal_error(formatv("Broken function found after pass "
+                                       "\"{0}\", compilation aborted!",
+                                       P));
         } else {
           const Module **MPtr = llvm::any_cast<const Module *>(&IR);
           const Module *M = MPtr ? *MPtr : nullptr;
@@ -1420,7 +1422,9 @@ void VerifyInstrumentation::registerCallbacks(
               dbgs() << "Verifying module " << M->getName() << "\n";
 
             if (verifyModule(*M, &errs()))
-              report_fatal_error("Broken module found, compilation aborted!");
+              report_fatal_error(formatv("Broken module found after pass "
+                                         "\"{0}\", compilation aborted!",
+                                         P));
           }
         }
       });
diff --git a/llvm/test/Other/trigger-verifier-error.ll b/llvm/test/Other/trigger-verifier-error.ll
new file mode 100644
index 00000000000000..692758cd0eb504
--- /dev/null
+++ b/llvm/test/Other/trigger-verifier-error.ll
@@ -0,0 +1,18 @@
+; A test that the option -verify-each reports the last pass run
+; when a failure occurs.
+
+; RUN: not --crash opt -disable-output -debug-pass-manager -verify-each -passes="module(trigger-verifier-error)" %s 2>&1 | FileCheck %s --check-prefix=CHECK_MODULE
+; RUN: not --crash opt -disable-output -debug-pass-manager -verify-each -passes="function(trigger-verifier-error)" %s 2>&1 | FileCheck %s --check-prefix=CHECK_FUNCTION
+
+; CHECK_MODULE: Running pass: TriggerVerifierErrorPass on [module]
+; CHECK_MODULE: Broken module found after pass "TriggerVerifierErrorPass", compilation aborted!
+
+; CHECK_FUNCTION: Running pass: TriggerVerifierErrorPass on main
+; CHECK_FUNCTION: Broken function found after pass "TriggerVerifierErrorPass", compilation aborted!
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  ret i32 0
+}