From ef5120c189617e82674f4da9f55ab7e413dbee57 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 11 Jun 2019 16:42:42 +0000 Subject: [PATCH 01/17] [Path] Set FD to -1 in moved-from TempFile When moving a temp file, explicitly set the file descriptor to -1 so we can never accidentally close the moved-from TempFile. Differential revision: https://reviews.llvm.org/D63087 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363083 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Path.cpp | 1 + unittests/Support/Path.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 5312e1df3b6a..c49260125dba 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -1125,6 +1125,7 @@ TempFile &TempFile::operator=(TempFile &&Other) { TmpName = std::move(Other.TmpName); FD = Other.FD; Other.Done = true; + Other.FD = -1; return *this; } diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp index 4eee8e923b73..f0e11b4e3f62 100644 --- a/unittests/Support/Path.cpp +++ b/unittests/Support/Path.cpp @@ -578,6 +578,7 @@ TEST_F(FileSystemTest, TempFileKeepDiscard) { auto TempFileOrError = fs::TempFile::create(TestDirectory + "/test-%%%%"); ASSERT_TRUE((bool)TempFileOrError); fs::TempFile File = std::move(*TempFileOrError); + ASSERT_EQ(-1, TempFileOrError->FD); ASSERT_FALSE((bool)File.keep(TestDirectory + "/keep")); ASSERT_FALSE((bool)File.discard()); ASSERT_TRUE(fs::exists(TestDirectory + "/keep")); @@ -589,6 +590,7 @@ TEST_F(FileSystemTest, TempFileDiscardDiscard) { auto TempFileOrError = fs::TempFile::create(TestDirectory + "/test-%%%%"); ASSERT_TRUE((bool)TempFileOrError); fs::TempFile File = std::move(*TempFileOrError); + ASSERT_EQ(-1, TempFileOrError->FD); ASSERT_FALSE((bool)File.discard()); ASSERT_FALSE((bool)File.discard()); ASSERT_FALSE(fs::exists(TestDirectory + "/keep")); From 31d8a65082264a366e9775654f58a0cecb8e8614 Mon Sep 17 00:00:00 2001 From: Cameron McInally Date: Tue, 11 Jun 2019 17:05:36 +0000 Subject: [PATCH 02/17] [NFC][CodeGen] Add unary fneg tests to X86/fma-fneg-combine.ll git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363084 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/fma-fneg-combine.ll | 434 +++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll index d583c54a086b..210fd41d00cb 100644 --- a/test/CodeGen/X86/fma-fneg-combine.ll +++ b/test/CodeGen/X86/fma-fneg-combine.ll @@ -16,6 +16,17 @@ entry: ret <16 x float> %0 } +define <16 x float> @test1_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test1_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 +; CHECK-NEXT: retq +entry: + %neg.i = fneg <16 x float> %c + %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %neg.i, i32 4) #2 + ret <16 x float> %0 +} + declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) @@ -32,6 +43,17 @@ entry: ret <16 x float> %sub.i } +define <16 x float> @test2_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test2_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213ps 
{{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 +; CHECK-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i32 4) #2 + %neg.i = fneg <16 x float> %0 + ret <16 x float> %neg.i +} + define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test3: ; CHECK: # %bb.0: # %entry @@ -43,6 +65,17 @@ entry: ret <16 x float> %sub.i } +define <16 x float> @test3_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test3_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 +; CHECK-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2 + %neg.i = fneg <16 x float> %0 + ret <16 x float> %neg.i +} + define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test4: ; CHECK: # %bb.0: # %entry @@ -54,6 +87,17 @@ entry: ret <16 x float> %sub.i } +define <16 x float> @test4_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test4_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 +; CHECK-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2 + %neg.i = fneg <16 x float> %0 + ret <16 x float> %neg.i +} + define <16 x float> @test5(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry @@ -65,6 +109,17 @@ entry: ret <16 x float> %0 } +define <16 x float> @test5_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test5_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq +entry: + %neg.i = fneg <16 x float> %c + %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %neg.i, i32 10) #2 + ret <16 x float> %0 +} + define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test6: ; CHECK: # %bb.0: # %entry @@ -76,6 +131,16 @@ entry: ret <16 x float> %sub.i } +define <16 x float> @test6_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test6_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 10) #2 + %neg.i = fneg <16 x float> %0 + ret <16 x float> %neg.i +} define <8 x float> @test7(<8 x float> %a, <8 x float> %b, <8 x float> %c) { ; CHECK-LABEL: test7: @@ -88,6 +153,17 @@ entry: ret <8 x float> %sub.i } +define <8 x float> @test7_unary_fneg(<8 x float> %a, <8 x float> %b, <8 x float> %c) { +; CHECK-LABEL: test7_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; CHECK-NEXT: retq +entry: + %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2 + %neg.i = fneg <8 x float> %0 + ret <8 x float> %neg.i +} + define <8 x float> @test8(<8 x float> %a, <8 x float> %b, <8 x float> %c) { ; CHECK-LABEL: test8: ; CHECK: # %bb.0: # %entry @@ -99,6 +175,17 @@ entry: ret <8 x float> %0 } +define <8 x float> @test8_unary_fneg(<8 x float> %a, <8 x float> %b, <8 x float> %c) { +; 
CHECK-LABEL: test8_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-NEXT: retq +entry: + %neg.c = fneg <8 x float> %c + %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %neg.c) #2 + ret <8 x float> %0 +} + declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) @@ -113,6 +200,17 @@ entry: ret <8 x double> %sub.i } +define <8 x double> @test9_unary_fneg(<8 x double> %a, <8 x double> %b, <8 x double> %c) { +; CHECK-LABEL: test9_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 +; CHECK-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4) #2 + %neg.i = fneg <8 x double> %0 + ret <8 x double> %neg.i +} + declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32) define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) { @@ -127,6 +225,18 @@ entry: ret <2 x double> %sub.i } +define <2 x double> @test10_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: test10_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2 + %neg.i = fneg <2 x double> %0 + ret <2 x double> %neg.i +} + declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8, i32) define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { @@ -154,6 +264,31 @@ entry: ret <4 x float> %0 } +define <4 x float> @test11_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test11_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm3 +; SKX-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1} +; SKX-NEXT: vmovaps %xmm3, %xmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test11_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; KNL-NEXT: vxorps %xmm3, %xmm2, %xmm3 +; KNL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1} +; KNL-NEXT: vmovaps %xmm3, %xmm0 +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %neg.i, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { @@ -174,6 +309,24 @@ entry: ret <4 x float> %0 } +define <4 x float> @test11b_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test11b_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; SKX-NEXT: retq +; +; KNL-LABEL: test11b_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw 
%edi, %k1 +; KNL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %neg.i, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <8 x double> @test12(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { @@ -198,6 +351,28 @@ entry: ret <8 x double> %sub.i } +define <8 x double> @test12_unary_fneg(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; SKX-LABEL: test12_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 +; SKX-NEXT: vxorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test12_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 +; KNL-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; KNL-NEXT: retq +entry: + %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4) #2 + %bc = bitcast i8 %mask to <8 x i1> + %sel = select <8 x i1> %bc, <8 x double> %0, <8 x double> %a + %neg.i = fneg <8 x double> %sel + ret <8 x double> %neg.i +} + define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; SKX-LABEL: test13: ; SKX: # %bb.0: # %entry @@ -223,6 +398,31 @@ entry: ret <2 x double> %0 } +define <2 x double> @test13_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; SKX-LABEL: test13_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm3 +; SKX-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vmovsd %xmm1, %xmm3, %xmm3 {%k1} +; SKX-NEXT: vmovapd %xmm3, %xmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test13_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm3 +; KNL-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2 +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vmovsd %xmm1, %xmm3, %xmm3 {%k1} +; KNL-NEXT: vmovapd %xmm3, %xmm0 +; KNL-NEXT: retq + +entry: + %neg.i = fneg <2 x double> %a + %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %neg.i, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4) + ret <2 x double> %0 +} + define <16 x float> @test14(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; SKX-LABEL: test14: ; SKX: # %bb.0: # %entry @@ -243,6 +443,26 @@ entry: ret <16 x float> %sub.i } +define <16 x float> @test14_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; SKX-LABEL: test14_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmsub132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test14_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmsub132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; KNL-NEXT: retq +entry: + %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 10) #2 + %neg.i = fneg <16 x float> %0 + ret <16 x float> %neg.i +} + define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; SKX-LABEL: test15: ; SKX: # %bb.0: # 
%entry @@ -273,6 +493,36 @@ entry: ret <16 x float> %sel2 } +define <16 x float> @test15_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; SKX-LABEL: test15_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm3 +; SKX-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 +; SKX-NEXT: vmovaps %zmm1, %zmm3 {%k1} +; SKX-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1} +; SKX-NEXT: vmovaps %zmm3, %zmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test15_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm3 +; KNL-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 +; KNL-NEXT: vmovaps %zmm1, %zmm3 {%k1} +; KNL-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1} +; KNL-NEXT: vmovaps %zmm3, %zmm0 +; KNL-NEXT: retq +entry: + %bc = bitcast i16 %mask to <16 x i1> + %neg.i = fneg <16 x float> %a + %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %neg.i, <16 x float> %b, <16 x float> %c, i32 10) + %sel = select <16 x i1> %bc, <16 x float> %0, <16 x float> %neg.i + %1 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sel, <16 x float> %neg.i, <16 x float> %c, i32 9) + %sel2 = select <16 x i1> %bc, <16 x float> %1, <16 x float> %sel + ret <16 x float> %sel2 +} + define <16 x float> @test16(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { ; SKX-LABEL: test16: ; SKX: # %bb.0: @@ -291,6 +541,26 @@ define <16 x float> @test16(<16 x float> %a, <16 x float> %b, <16 x float> %c, i %sel = select <16 x i1> %bc, <16 x float> %res, <16 x float> %a ret <16 x float> %sel } + +define <16 x float> @test16_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; SKX-LABEL: test16_unary_fneg: +; SKX: # %bb.0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfmsubadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test16_unary_fneg: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfmsubadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: retq + %neg.i = fneg <16 x float> %c + %res = call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %neg.i, i32 9) + %bc = bitcast i16 %mask to <16 x i1> + %sel = select <16 x i1> %bc, <16 x float> %res, <16 x float> %a + ret <16 x float> %sel +} + declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { @@ -311,6 +581,26 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i %sel = select <8 x i1> %bc, <8 x double> %res, <8 x double> %a ret <8 x double> %sel } + +define <8 x double> @test17_unary_fneg(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; SKX-LABEL: test17_unary_fneg: +; SKX: # %bb.0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 +; SKX-NEXT: retq +; +; KNL-LABEL: test17_unary_fneg: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 +; KNL-NEXT: retq + %neg.i = fneg <8 x double> %c + %res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %neg.i, i32 4) + %bc = bitcast i8 %mask to <8 x i1> + %sel = select <8 x i1> %bc, <8 x double> %res, <8 x double> %a + ret <8 x double> %sel +} + declare <8 x double> 
@llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { @@ -331,6 +621,24 @@ entry: ret <4 x float> %0 } +define <4 x float> @test18_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test18_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; SKX-NEXT: retq +; +; KNL-LABEL: test18_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %b + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %neg.i, <4 x float> %c, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { ; SKX-LABEL: test19: ; SKX: # %bb.0: # %entry @@ -350,6 +658,25 @@ entry: ret <4 x float> %0 } +define <4 x float> @test19_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test19_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; SKX-NEXT: retq +; +; KNL-LABEL: test19_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %b + %neg.i.2 = fneg <4 x float> %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %neg.i, <4 x float> %neg.i.2, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { ; SKX-LABEL: test20: ; SKX: # %bb.0: # %entry @@ -370,6 +697,26 @@ entry: ret <4 x float> %0 } +define <4 x float> @test20_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test20_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; SKX-NEXT: vmovaps %xmm2, %xmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test20_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; KNL-NEXT: vmovaps %xmm2, %xmm0 +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %b + %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %neg.i, <4 x float> %c, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { ; SKX-LABEL: test21: ; SKX: # %bb.0: # %entry @@ -388,6 +735,24 @@ entry: ret <4 x float> %0 } +define <4 x float> @test21_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test21_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test21_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %b + %0 = tail call <4 x float> 
@llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %neg.i, <4 x float> %c, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { ; SKX-LABEL: test22: ; SKX: # %bb.0: # %entry @@ -407,6 +772,25 @@ entry: ret <4 x float> %0 } +define <4 x float> @test22_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test22_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test22_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %b + %neg.i.2 = fneg <4 x float> %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %neg.i, <4 x float> %neg.i.2, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + define <4 x float> @test23(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { ; SKX-LABEL: test23: ; SKX: # %bb.0: # %entry @@ -427,6 +811,26 @@ entry: ret <4 x float> %0 } +define <4 x float> @test23_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test23_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; SKX-NEXT: vmovaps %xmm2, %xmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test23_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; KNL-NEXT: vmovaps %xmm2, %xmm0 +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %b + %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %neg.i, <4 x float> %c, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + define <4 x float> @test24(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { ; SKX-LABEL: test24: ; SKX: # %bb.0: # %entry @@ -445,6 +849,24 @@ entry: ret <4 x float> %0 } +define <4 x float> @test24_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test24_unary_fneg: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test24_unary_fneg: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %neg.i = fneg <4 x float> %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %neg.i, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + define <16 x float> @test25(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test25: ; CHECK: # %bb.0: # %entry @@ -456,3 +878,15 @@ entry: %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %sub.i, <16 x float> %sub.i.2, i32 8) #2 ret <16 x float> %0 } + +define <16 x float> @test25_unary_fneg(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test25_unary_fneg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq +entry: + %neg.i = fneg <16 x float> %b + %neg.i.2 = fneg <16 x float> %c + %0 = tail call <16 x float> 
@llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %neg.i, <16 x float> %neg.i.2, i32 8) #2 + ret <16 x float> %0 +} From 8a6498e04762ed8261d7449ff09dcdc155a244cd Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Tue, 11 Jun 2019 17:40:39 +0000 Subject: [PATCH 03/17] [PowerPC] Enable MachinePipeliner for P9 with -ppc-enable-pipeliner Implement necessary target hooks to enable MachinePipeliner for P9 only. The pass is off by default, can be enabled with -ppc-enable-pipeliner for P9. Differential Revision: https://reviews.llvm.org/D62164 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363085 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachinePipeliner.h | 6 +- include/llvm/CodeGen/TargetInstrInfo.h | 5 +- include/llvm/CodeGen/TargetSubtargetInfo.h | 3 + lib/CodeGen/MachinePipeliner.cpp | 16 +++-- lib/Target/Hexagon/HexagonInstrInfo.cpp | 10 +-- lib/Target/Hexagon/HexagonInstrInfo.h | 2 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 74 ++++++++++++++++++++ lib/Target/PowerPC/PPCInstrInfo.h | 28 ++++++++ lib/Target/PowerPC/PPCSubtarget.cpp | 13 +++- lib/Target/PowerPC/PPCSubtarget.h | 8 ++- lib/Target/PowerPC/PPCTargetMachine.cpp | 3 + test/CodeGen/PowerPC/sms-simple.ll | 78 ++++++++++++++++++++++ 12 files changed, 227 insertions(+), 19 deletions(-) create mode 100644 test/CodeGen/PowerPC/sms-simple.ll diff --git a/include/llvm/CodeGen/MachinePipeliner.h b/include/llvm/CodeGen/MachinePipeliner.h index d40becbb227f..03ca53072685 100644 --- a/include/llvm/CodeGen/MachinePipeliner.h +++ b/include/llvm/CodeGen/MachinePipeliner.h @@ -318,9 +318,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { MBBVectorTy &EpilogBBs); void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, SMSchedule &Schedule); - void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, - MBBVectorTy &EpilogBBs, SMSchedule &Schedule, - ValueMapTy *VRMap); + void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, + MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, + SMSchedule &Schedule, ValueMapTy *VRMap); bool computeDelta(MachineInstr &MI, unsigned &Delta); void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, unsigned Num); diff --git a/include/llvm/CodeGen/TargetInstrInfo.h b/include/llvm/CodeGen/TargetInstrInfo.h index ca16340ff378..20c4934c7d5e 100644 --- a/include/llvm/CodeGen/TargetInstrInfo.h +++ b/include/llvm/CodeGen/TargetInstrInfo.h @@ -670,8 +670,9 @@ class TargetInstrInfo : public MCInstrInfo { /// is finished. Return the value/register of the new loop count. We need /// this function when peeling off one or more iterations of a loop. This /// function assumes the nth iteration is peeled first. - virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar, - MachineInstr &Cmp, + virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, + MachineBasicBlock &PreHeader, + MachineInstr *IndVar, MachineInstr &Cmp, SmallVectorImpl &Cond, SmallVectorImpl &PrevInsts, unsigned Iter, unsigned MaxIter) const { diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h index 4c6f1163469b..9057b2d87c39 100644 --- a/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -193,6 +193,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// for preRA scheduling with the source level scheduler. 
virtual bool enableMachineSchedDefaultSched() const { return true; } + /// True if the subtarget should run MachinePipeliner + virtual bool enableMachinePipeliner() const { return true; }; + /// True if the subtarget should enable joining global copies. /// /// By default this is enabled if the machine scheduler is enabled, but diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 639d124804c9..81917abae087 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -187,6 +187,9 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { !EnableSWPOptSize.getPosition()) return false; + if (!mf.getSubtarget().enableMachinePipeliner()) + return false; + // Cannot pipeline loops without instruction itineraries if we are using // DFA for the pipeliner. if (mf.getSubtarget().useDFAforSMS() && @@ -2026,6 +2029,10 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) { InstrMapTy InstrMap; SmallVector PrologBBs; + + MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader(); + assert(PreheaderBB != nullptr && + "Need to add code to handle loops w/o preheader"); // Generate the prolog instructions that set up the pipeline. generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs); MF.insert(BB->getIterator(), KernelBB); @@ -2082,7 +2089,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) { removeDeadInstructions(KernelBB, EpilogBBs); // Add branches between prolog and epilog blocks. - addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap); + addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap); // Remove the original loop since it's no longer referenced. for (auto &I : *BB) @@ -2767,7 +2774,8 @@ static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) { /// Create branches from each prolog basic block to the appropriate epilog /// block. These edges are needed if the loop ends before reaching the /// kernel. -void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, +void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB, + MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, SMSchedule &Schedule, ValueMapTy *VRMap) { @@ -2794,8 +2802,8 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs, // Check if the LOOP0 has already been removed. If so, then there is no need // to reduce the trip count. if (LC != 0) - LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j, - MaxIter); + LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond, + PrevInsts, j, MaxIter); // Record the value of the first trip count, which is used to determine if // branches and blocks can be removed for constant trip counts. diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index d525baa03014..a156de5ba128 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -697,11 +697,11 @@ bool HexagonInstrInfo::analyzeLoop(MachineLoop &L, /// Generate code to reduce the loop iteration by one and check if the loop is /// finished. Return the value/register of the new loop count. this function /// assumes the nth iteration is peeled first. 
-unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB, - MachineInstr *IndVar, MachineInstr &Cmp, - SmallVectorImpl &Cond, - SmallVectorImpl &PrevInsts, - unsigned Iter, unsigned MaxIter) const { +unsigned HexagonInstrInfo::reduceLoopCount( + MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, + MachineInstr &Cmp, SmallVectorImpl &Cond, + SmallVectorImpl &PrevInsts, unsigned Iter, + unsigned MaxIter) const { // We expect a hardware loop currently. This means that IndVar is set // to null, and the compare is the ENDLOOP instruction. assert((!IndVar) && isEndLoopN(Cmp.getOpcode()) diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 6c4dbfedb31d..e0a999d0f4c4 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -139,7 +139,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { /// is finished. Return the value/register of the new loop count. We need /// this function when peeling off one or more iterations of a loop. This /// function assumes the nth iteration is peeled first. - unsigned reduceLoopCount(MachineBasicBlock &MBB, + unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, MachineInstr &Cmp, SmallVectorImpl &Cond, SmallVectorImpl &PrevInsts, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 0799c4281e31..7aede06b73d6 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3922,3 +3922,77 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, } return false; } + +bool PPCInstrInfo::isBDNZ(unsigned Opcode) const { + return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ)); +} + +bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, + MachineInstr *&CmpInst) const { + MachineBasicBlock *LoopEnd = L.getBottomBlock(); + MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator(); + // We really "analyze" only CTR loops right now. + if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) { + IndVarInst = nullptr; + CmpInst = &*I; + return false; + } + return true; +} + +MachineInstr * +PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const { + + unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop); + + // The loop set-up instruction should be in preheader + for (auto &I : PreHeader.instrs()) + if (I.getOpcode() == LOOPi) + return &I; + return nullptr; +} + +unsigned PPCInstrInfo::reduceLoopCount( + MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar, + MachineInstr &Cmp, SmallVectorImpl &Cond, + SmallVectorImpl &PrevInsts, unsigned Iter, + unsigned MaxIter) const { + // We expect a hardware loop currently. This means that IndVar is set + // to null, and the compare is the ENDLOOP instruction. + assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop"); + MachineFunction *MF = MBB.getParent(); + DebugLoc DL = Cmp.getDebugLoc(); + MachineInstr *Loop = findLoopInstr(PreHeader); + if (!Loop) + return 0; + unsigned LoopCountReg = Loop->getOperand(0).getReg(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg); + + if (!LoopCount) + return 0; + // If the loop trip count is a compile-time value, then just change the + // value. 
+ if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) { + int64_t Offset = LoopCount->getOperand(1).getImm(); + if (Offset <= 1) { + LoopCount->eraseFromParent(); + Loop->eraseFromParent(); + return 0; + } + LoopCount->getOperand(1).setImm(Offset - 1); + return Offset - 1; + } + + // The loop trip count is a run-time value. + // We need to subtract one from the trip count, + // and insert branch later to check if we're done with the loop. + + // Since BDZ/BDZ8 that we will insert will also decrease the ctr by 1, + // so we don't need to generate any thing here. + Cond.push_back(MachineOperand::CreateImm(0)); + Cond.push_back(MachineOperand::CreateReg( + Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true)); + return LoopCountReg; +} + diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index da34e70989db..78b04c2c1ca6 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -457,6 +457,34 @@ class PPCInstrInfo : public PPCGenInstrInfo { } return Reg; } + + /// Check \p Opcode is BDNZ (Decrement CTR and branch if it is still nonzero). + bool isBDNZ(unsigned Opcode) const; + + /// Find the hardware loop instruction used to set-up the specified loop. + /// On PPC, we have two instructions used to set-up the hardware loop + /// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8) + /// instructions to indicate the end of a loop. + MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const; + + /// Analyze the loop code to find the loop induction variable and compare used + /// to compute the number of iterations. Currently, we analyze loop that are + /// controlled using hardware loops. In this case, the induction variable + /// instruction is null. For all other cases, this function returns true, + /// which means we're unable to analyze it. \p IndVarInst and \p CmpInst will + /// return new values when we can analyze the readonly loop \p L, otherwise, + /// nothing got changed + bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst, + MachineInstr *&CmpInst) const override; + /// Generate code to reduce the loop iteration by one and check if the loop + /// is finished. Return the value/register of the new loop count. We need + /// this function when peeling off one or more iterations of a loop. This + /// function assumes the last iteration is peeled first. 
+ unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, + MachineInstr *IndVar, MachineInstr &Cmp, + SmallVectorImpl &Cond, + SmallVectorImpl &PrevInsts, + unsigned Iter, unsigned MaxIter) const override; }; } diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index e3bc305be7a2..0d2786e84432 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -39,6 +39,11 @@ static cl::opt QPXStackUnaligned("qpx-stack-unaligned", cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), cl::Hidden); +static cl::opt + EnableMachinePipeliner("ppc-enable-pipeliner", + cl::desc("Enable Machine Pipeliner for PPC"), + cl::init(false), cl::Hidden); + PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { initializeEnvironment(); @@ -181,10 +186,14 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const { return false; } -bool PPCSubtarget::enableMachineScheduler() const { - return true; +bool PPCSubtarget::enableMachineScheduler() const { return true; } + +bool PPCSubtarget::enableMachinePipeliner() const { + return (DarwinDirective == PPC::DIR_PWR9) && EnableMachinePipeliner; } +bool PPCSubtarget::useDFAforSMS() const { return false; } + // This overrides the PostRAScheduler bit in the SchedModel for each CPU. bool PPCSubtarget::enablePostRAScheduler() const { return true; } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index e1f4a9680da7..a59cbd60618e 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -322,9 +322,13 @@ class PPCSubtarget : public PPCGenSubtargetInfo { /// but may expand the ISEL instruction later. bool enableEarlyIfConversion() const override { return true; } - // Scheduling customization. + /// Scheduling customization. bool enableMachineScheduler() const override; - // This overrides the PostRAScheduler bit in the SchedModel for each CPU. + /// Pipeliner customization. + bool enableMachinePipeliner() const override; + /// Machine Pipeliner customization + bool useDFAforSMS() const override; + /// This overrides the PostRAScheduler bit in the SchedModel for each CPU. 
bool enablePostRAScheduler() const override; AntiDepBreakMode getAntiDepBreakMode() const override; void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index da1121bac9c7..fcaf7d6e3ee7 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -488,6 +488,9 @@ void PPCPassConfig::addPreRegAlloc() { } if (EnableExtraTOCRegDeps) addPass(createPPCTOCRegDepsPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(&MachinePipelinerID); } void PPCPassConfig::addPreSched2() { diff --git a/test/CodeGen/PowerPC/sms-simple.ll b/test/CodeGen/PowerPC/sms-simple.ll new file mode 100644 index 000000000000..6b1f0e453e8f --- /dev/null +++ b/test/CodeGen/PowerPC/sms-simple.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \ +; RUN: | FileCheck %s + +@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4 +@y = common dso_local global [1024 x i32] zeroinitializer, align 4 + +; Function Attrs: norecurse nounwind +define dso_local i32* @foo() local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis r5, r2, x@toc@ha +; CHECK-NEXT: addi r5, r5, x@toc@l +; CHECK-NEXT: addis r6, r2, y@toc@ha +; CHECK-NEXT: li r7, 340 +; CHECK-NEXT: addi r3, r6, y@toc@l +; CHECK-NEXT: lwz r6, y@toc@l(r6) +; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: addi r5, r5, -8 +; CHECK-NEXT: lwzu r7, 12(r5) +; CHECK-NEXT: maddld r6, r7, r7, r6 +; CHECK-NEXT: lwz r7, 4(r5) +; CHECK-NEXT: addi r4, r3, -8 +; CHECK-NEXT: stwu r6, 12(r4) +; CHECK-NEXT: maddld r6, r7, r7, r6 +; CHECK-NEXT: lwz r7, 8(r5) +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %for.body +; CHECK: maddld r7, r7, r7, r6 +; CHECK-NEXT: lwzu r8, 12(r5) +; CHECK-NEXT: maddld r8, r8, r8, r7 +; CHECK-NEXT: stw r6, 4(r4) +; CHECK-NEXT: lwz r6, 4(r5) +; CHECK-NEXT: maddld r6, r6, r6, r8 +; CHECK-NEXT: stw r7, 8(r4) +; CHECK-NEXT: lwz r7, 8(r5) +; CHECK-NEXT: stwu r8, 12(r4) +; CHECK-NEXT: bdnz .LBB0_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: maddld r5, r7, r7, r6 +; CHECK-NEXT: stw r6, 4(r4) +; CHECK-NEXT: stw r5, 8(r4) +; CHECK-NEXT: blr +entry: + %.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0) + +for.body: ; preds = %for.body, %entry + %0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ] + %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ] + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %1 + %add = add nsw i32 %mul, %0 + %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next + %2 = load i32, i32* 
%arrayidx2.1, align 4 + %mul.1 = mul nsw i32 %2, %2 + %add.1 = add nsw i32 %mul.1, %add + %arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next + store i32 %add.1, i32* %arrayidx6.1, align 4 + %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1 + %3 = load i32, i32* %arrayidx2.2, align 4 + %mul.2 = mul nsw i32 %3, %3 + %add.2 = add nsw i32 %mul.2, %add.1 + %arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1 + store i32 %add.2, i32* %arrayidx6.2, align 4 + %indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3 + %exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024 + br i1 %exitcond.2, label %for.cond.cleanup, label %for.body +} From af0dbee15920ea1a992c6cb1194904bf9184f81a Mon Sep 17 00:00:00 2001 From: Amy Huang Date: Tue, 11 Jun 2019 18:02:39 +0000 Subject: [PATCH 04/17] Deduplicate S_CONSTANTs in LLD. Summary: Deduplicate S_CONSTANTS when linking, if they have the same value. Reviewers: rnk Subscribers: hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D63151 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363089 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp index de453ee4b849..8ed5b8b44c59 100644 --- a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp @@ -30,7 +30,7 @@ using namespace llvm::pdb; using namespace llvm::codeview; struct llvm::pdb::GSIHashStreamBuilder { - struct UdtDenseMapInfo { + struct SymbolDenseMapInfo { static inline CVSymbol getEmptyKey() { static CVSymbol Empty; return Empty; @@ -50,7 +50,7 @@ struct llvm::pdb::GSIHashStreamBuilder { std::vector Records; uint32_t StreamIndex; - llvm::DenseSet UdtHashes; + llvm::DenseSet SymbolHashes; std::vector HashRecords; std::array HashBitmap; std::vector HashBuckets; @@ -66,8 +66,8 @@ struct llvm::pdb::GSIHashStreamBuilder { CodeViewContainer::Pdb)); } void addSymbol(const CVSymbol &Symbol) { - if (Symbol.kind() == S_UDT) { - auto Iter = UdtHashes.insert(Symbol); + if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { + auto Iter = SymbolHashes.insert(Symbol); if (!Iter.second) return; } From 49ae59819e7a6a7470d75e38cf7f22f967a10db2 Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Tue, 11 Jun 2019 18:27:49 +0000 Subject: [PATCH 05/17] Only passes that preserve MemorySSA must mark it as preserved. Summary: The method `getLoopPassPreservedAnalyses` should not mark MemorySSA as preserved, because it's being called in a lot of passes that do not preserve MemorySSA. Instead, mark the MemorySSA analysis as preserved by each pass that does preserve it. These changes only affect the new pass mananger. 
Reviewers: chandlerc Subscribers: mehdi_amini, jlebar, Prazek, george.burgess.iv, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62536 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363091 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/LoopAnalysisManager.cpp | 2 -- lib/Transforms/Scalar/LICM.cpp | 2 ++ lib/Transforms/Scalar/LoopInstSimplify.cpp | 2 ++ lib/Transforms/Scalar/LoopRotation.cpp | 5 ++++- lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 5 ++++- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 6 +++++- unittests/Transforms/Scalar/LoopPassManagerTest.cpp | 12 ++++++++++-- 7 files changed, 27 insertions(+), 7 deletions(-) diff --git a/lib/Analysis/LoopAnalysisManager.cpp b/lib/Analysis/LoopAnalysisManager.cpp index d0cfb3e73421..a10a87ce113b 100644 --- a/lib/Analysis/LoopAnalysisManager.cpp +++ b/lib/Analysis/LoopAnalysisManager.cpp @@ -141,8 +141,6 @@ PreservedAnalyses llvm::getLoopPassPreservedAnalyses() { PA.preserve(); PA.preserve(); PA.preserve(); - if (EnableMSSALoopDependency) - PA.preserve(); // FIXME: What we really want to do here is preserve an AA category, but that // concept doesn't exist yet. PA.preserve(); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index f039692e201b..938ee80e2794 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -294,6 +294,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, PA.preserve(); PA.preserve(); + if (EnableMSSALoopDependency) + PA.preserve(); return PA; } diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 6df32b0129c1..31191b52895c 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -233,6 +233,8 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, auto PA = getLoopPassPreservedAnalyses(); PA.preserveSet(); + if (EnableMSSALoopDependency) + PA.preserve(); return PA; } diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index f9d784049981..e009947690af 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -54,7 +54,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, if (AR.MSSA && VerifyMemorySSA) AR.MSSA->verifyMemorySSA(); - return getLoopPassPreservedAnalyses(); + auto PA = getLoopPassPreservedAnalyses(); + if (EnableMSSALoopDependency) + PA.preserve(); + return PA; } namespace { diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 9fa4afb8217a..c650abb412d9 100644 --- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -701,7 +701,10 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, if (DeleteCurrentLoop) LPMU.markLoopAsDeleted(L, "loop-simplifycfg"); - return getLoopPassPreservedAnalyses(); + auto PA = getLoopPassPreservedAnalyses(); + if (EnableMSSALoopDependency) + PA.preserve(); + return PA; } namespace { diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 06d65b55890a..9fba159495bc 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2861,7 +2861,11 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, // Historically this pass has had issues with the dominator tree so verify it // in asserts 
builds. assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); - return getLoopPassPreservedAnalyses(); + + auto PA = getLoopPassPreservedAnalyses(); + if (EnableMSSALoopDependency) + PA.preserve(); + return PA; } namespace { diff --git a/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/unittests/Transforms/Scalar/LoopPassManagerTest.cpp index 0743cba852da..5eb8101a3df9 100644 --- a/unittests/Transforms/Scalar/LoopPassManagerTest.cpp +++ b/unittests/Transforms/Scalar/LoopPassManagerTest.cpp @@ -396,11 +396,13 @@ TEST_F(LoopPassManagerTest, FunctionPassInvalidationOfLoopAnalyses) { // No need to re-run if we require again from a fresh loop pass manager. FPM.addPass(createFunctionToLoopPassAdaptor( RequireAnalysisLoopPass())); - // For 'f', preserve most things but not the specific loop analyses. + auto PA = getLoopPassPreservedAnalyses(); + if (EnableMSSALoopDependency) + PA.preserve(); EXPECT_CALL(MFPHandle, run(HasName("f"), _)) .InSequence(FSequence) - .WillOnce(Return(getLoopPassPreservedAnalyses())); + .WillOnce(Return(PA)); EXPECT_CALL(MLAHandle, invalidate(HasName("loop.0.0"), _, _)) .InSequence(FSequence) .WillOnce(DoDefault()); @@ -475,6 +477,8 @@ TEST_F(LoopPassManagerTest, ModulePassInvalidationOfLoopAnalyses) { EXPECT_CALL(MMPHandle, run(_, _)).WillOnce(InvokeWithoutArgs([] { auto PA = getLoopPassPreservedAnalyses(); PA.preserve(); + if (EnableMSSALoopDependency) + PA.preserve(); return PA; })); // All the loop analyses from both functions get invalidated before we @@ -803,6 +807,8 @@ TEST_F(LoopPassManagerTest, IndirectOuterPassInvalidation) { // the fact that they were preserved. EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { auto PA = getLoopPassPreservedAnalyses(); + if (EnableMSSALoopDependency) + PA.preserve(); PA.preserveSet>(); return PA; })); @@ -824,6 +830,8 @@ TEST_F(LoopPassManagerTest, IndirectOuterPassInvalidation) { // Which means that no extra invalidation occurs and cached values are used. 
EXPECT_CALL(MFPHandle, run(HasName("g"), _)).WillOnce(InvokeWithoutArgs([] { auto PA = getLoopPassPreservedAnalyses(); + if (EnableMSSALoopDependency) + PA.preserve(); PA.preserveSet>(); return PA; })); From 80956de3737b3efbbb36afa4734bd8b3d76031a5 Mon Sep 17 00:00:00 2001 From: Cameron McInally Date: Tue, 11 Jun 2019 18:55:13 +0000 Subject: [PATCH 06/17] [NFC][CodeGen] Add unary FNeg tests to X86/combine-fcopysign.ll X86/dag-fmf-cse.ll X86/fast-isel-fneg.ll X86/fdiv.ll git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363093 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/combine-fcopysign.ll | 38 +++++++++++++++ test/CodeGen/X86/dag-fmf-cse.ll | 26 +++++++++++ test/CodeGen/X86/fast-isel-fneg.ll | 51 +++++++++++++++++++++ test/CodeGen/X86/fdiv.ll | 66 +++++++++++++++++++++++++++ 4 files changed, 181 insertions(+) diff --git a/test/CodeGen/X86/combine-fcopysign.ll b/test/CodeGen/X86/combine-fcopysign.ll index 05c546c411f5..9cbd59aa2422 100644 --- a/test/CodeGen/X86/combine-fcopysign.ll +++ b/test/CodeGen/X86/combine-fcopysign.ll @@ -102,6 +102,23 @@ define <4 x float> @combine_vec_fcopysign_fneg_fabs_sgn(<4 x float> %x, <4 x flo ret <4 x float> %3 } +define <4 x float> @combine_vec_fcopysign_unary_fneg_fabs_sgn(<4 x float> %x, <4 x float> %y) { +; SSE-LABEL: combine_vec_fcopysign_unary_fneg_fabs_sgn: +; SSE: # %bb.0: +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_fcopysign_unary_fneg_fabs_sgn: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y) + %2 = fneg <4 x float> %1 + %3 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %2) + ret <4 x float> %3 +} + ; copysign(fabs(x), y) -> copysign(x, y) define <4 x float> @combine_vec_fcopysign_fabs_mag(<4 x float> %x, <4 x float> %y) { ; SSE-LABEL: combine_vec_fcopysign_fabs_mag: @@ -146,6 +163,27 @@ define <4 x float> @combine_vec_fcopysign_fneg_mag(<4 x float> %x, <4 x float> % ret <4 x float> %2 } +define <4 x float> @combine_vec_fcopysign_unary_fneg_mag(<4 x float> %x, <4 x float> %y) { +; SSE-LABEL: combine_vec_fcopysign_unary_fneg_mag: +; SSE: # %bb.0: +; SSE-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_fcopysign_unary_fneg_mag: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fneg <4 x float> %x + %2 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %1, <4 x float> %y) + ret <4 x float> %2 +} + ; copysign(copysign(x,z), y) -> copysign(x, y) define <4 x float> @combine_vec_fcopysign_fcopysign_mag(<4 x float> %x, <4 x float> %y, <4 x float> %z) { ; SSE-LABEL: combine_vec_fcopysign_fcopysign_mag: diff --git a/test/CodeGen/X86/dag-fmf-cse.ll b/test/CodeGen/X86/dag-fmf-cse.ll index 609ccdc36739..f1ea3f158be5 100644 --- a/test/CodeGen/X86/dag-fmf-cse.ll +++ b/test/CodeGen/X86/dag-fmf-cse.ll @@ -19,6 +19,19 @@ define float @fmf_should_not_break_cse(float %a, float %b) { ret float %abx2 } +define float @fmf_should_not_break_cse_unary_fneg(float %a, float %b) { +; CHECK-LABEL: fmf_should_not_break_cse_unary_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; 
CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %mul1 = fmul fast float %a, %b + %nega = fneg fast float %a + %mul2 = fmul fast float %nega, %b + %abx2 = fsub fast float %mul1, %mul2 + ret float %abx2 +} + define <4 x float> @fmf_should_not_break_cse_vector(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fmf_should_not_break_cse_vector: ; CHECK: # %bb.0: @@ -31,3 +44,16 @@ define <4 x float> @fmf_should_not_break_cse_vector(<4 x float> %a, <4 x float> %abx2 = fsub fast <4 x float> %mul1, %mul2 ret <4 x float> %abx2 } + +define <4 x float> @fmf_should_not_break_cse_vector_unary_fneg(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmf_should_not_break_cse_vector_unary_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %mul1 = fmul fast <4 x float> %a, %b + %nega = fneg fast <4 x float> %a + %mul2 = fmul fast <4 x float> %nega, %b + %abx2 = fsub fast <4 x float> %mul1, %mul2 + ret <4 x float> %abx2 +} diff --git a/test/CodeGen/X86/fast-isel-fneg.ll b/test/CodeGen/X86/fast-isel-fneg.ll index beb454ece26b..7e6772dc8dd7 100644 --- a/test/CodeGen/X86/fast-isel-fneg.ll +++ b/test/CodeGen/X86/fast-isel-fneg.ll @@ -74,6 +74,31 @@ define void @goo(double* %x, double* %y) nounwind { ret void } +define void @goo_unary_fneg(double* %x, double* %y) nounwind { +; CHECK-LABEL: goo_unary_fneg: +; CHECK: ## %bb.0: +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: movabsq $-9223372036854775808, %rcx ## imm = 0x8000000000000000 +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: movq %xmm0, (%rsi) +; CHECK-NEXT: retq +; +; SSE2-LABEL: goo_unary_fneg: +; SSE2: # %bb.0: +; SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: xorps {{\.LCPI.*}}, %xmm0 +; SSE2-NEXT: movsd %xmm0, (%eax) +; SSE2-NEXT: retl + %a = load double, double* %x + %b = fneg double %a + store double %b, double* %y + ret void +} + define void @loo(float* %x, float* %y) nounwind { ; CHECK-LABEL: loo: ; CHECK: ## %bb.0: @@ -100,6 +125,32 @@ define void @loo(float* %x, float* %y) nounwind { ret void } +define void @loo_unary_fneg(float* %x, float* %y) nounwind { +; CHECK-LABEL: loo_unary_fneg: +; CHECK: ## %bb.0: +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorl $2147483648, %eax ## imm = 0x80000000 +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movd %xmm0, (%rsi) +; CHECK-NEXT: retq +; +; SSE2-LABEL: loo_unary_fneg: +; SSE2: # %bb.0: +; SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: xorl $2147483648, %ecx # imm = 0x80000000 +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %xmm0, (%eax) +; SSE2-NEXT: retl + %a = load float, float* %x + %b = fneg float %a + store float %b, float* %y + ret void +} + define double @too(double %x) nounwind { ; CHECK-LABEL: too: ; CHECK: ## %bb.0: diff --git a/test/CodeGen/X86/fdiv.ll b/test/CodeGen/X86/fdiv.ll index 259cd91cca52..26670be29396 100644 --- a/test/CodeGen/X86/fdiv.ll +++ b/test/CodeGen/X86/fdiv.ll @@ -65,6 +65,39 @@ define float @double_negative(float %x, float %y) #0 { ret float %div } +define float @double_negative_unary_fneg_x_unary_fneg_y(float %x, float %y) #0 { +; CHECK-LABEL: double_negative_unary_fneg_x_unary_fneg_y: +; CHECK: # %bb.0: +; CHECK-NEXT: 
divss %xmm1, %xmm0 +; CHECK-NEXT: retq + %neg1 = fneg float %x + %neg2 = fneg float %y + %div = fdiv float %neg1, %neg2 + ret float %div +} + +define float @double_negative_unary_fneg_x(float %x, float %y) #0 { +; CHECK-LABEL: double_negative_unary_fneg_x: +; CHECK: # %bb.0: +; CHECK-NEXT: divss %xmm1, %xmm0 +; CHECK-NEXT: retq + %neg1 = fneg float %x + %neg2 = fsub float -0.0, %y + %div = fdiv float %neg1, %neg2 + ret float %div +} + +define float @double_negative_unary_fneg_y(float %x, float %y) #0 { +; CHECK-LABEL: double_negative_unary_fneg_y: +; CHECK: # %bb.0: +; CHECK-NEXT: divss %xmm1, %xmm0 +; CHECK-NEXT: retq + %neg1 = fsub float -0.0, %x + %neg2 = fneg float %y + %div = fdiv float %neg1, %neg2 + ret float %div +} + define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 { ; CHECK-LABEL: double_negative_vector: ; CHECK: # %bb.0: @@ -76,5 +109,38 @@ define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 { ret <4 x float> %div } +define <4 x float> @double_negative_vector_unary_fneg_x_unary_fneg_y(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: double_negative_vector_unary_fneg_x_unary_fneg_y: +; CHECK: # %bb.0: +; CHECK-NEXT: divps %xmm1, %xmm0 +; CHECK-NEXT: retq + %neg1 = fneg <4 x float> %x + %neg2 = fneg <4 x float> %y + %div = fdiv <4 x float> %neg1, %neg2 + ret <4 x float> %div +} + +define <4 x float> @double_negative_vector_unary_fneg_x(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: double_negative_vector_unary_fneg_x: +; CHECK: # %bb.0: +; CHECK-NEXT: divps %xmm1, %xmm0 +; CHECK-NEXT: retq + %neg1 = fneg <4 x float> %x + %neg2 = fsub <4 x float> , %y + %div = fdiv <4 x float> %neg1, %neg2 + ret <4 x float> %div +} + +define <4 x float> @double_negative_vector_unary_fneg_y(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: double_negative_vector_unary_fneg_y: +; CHECK: # %bb.0: +; CHECK-NEXT: divps %xmm1, %xmm0 +; CHECK-NEXT: retq + %neg1 = fsub <4 x float> , %x + %neg2 = fneg <4 x float> %y + %div = fdiv <4 x float> %neg1, %neg2 + ret <4 x float> %div +} + attributes #0 = { "unsafe-fp-math"="false" } From 68e99ab9740a6abe59119ae1f9ecdc0ec36eb2f0 Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Tue, 11 Jun 2019 19:09:34 +0000 Subject: [PATCH 07/17] [MemorySSA] When applying updates, clean unnecessary Phis. Summary: After applying a set of insert updates, there may be trivial Phis left over. Clean them up. Reviewers: george.burgess.iv Subscribers: jlebar, Prazek, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D63033 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363094 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/MemorySSAUpdater.cpp | 5 +- .../loop_rotate_remove_trivial_phis.ll | 78 +++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp index b0f7dc69e5a5..9b82808681ec 100644 --- a/lib/Analysis/MemorySSAUpdater.cpp +++ b/lib/Analysis/MemorySSAUpdater.cpp @@ -860,13 +860,14 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, SmallVector BlocksToProcess; SmallVector BlocksWithDefsToReplace; + SmallVector InsertedPhis; // First create MemoryPhis in all blocks that don't have one. Create in the // order found in Updates, not in PredMap, to get deterministic numbering. 
for (auto &Edge : Updates) { BasicBlock *BB = Edge.getTo(); if (PredMap.count(BB) && !MSSA->getMemoryAccess(BB)) - MSSA->createMemoryPhi(BB); + InsertedPhis.push_back(MSSA->createMemoryPhi(BB)); } // Now we'll fill in the MemoryPhis with the right incoming values. @@ -967,6 +968,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, IDFPhi->setIncomingValue(I, GetLastDef(IDFPhi->getIncomingBlock(I))); } else { IDFPhi = MSSA->createMemoryPhi(BBIDF); + InsertedPhis.push_back(IDFPhi); for (auto &Pair : children({GD, BBIDF})) { BasicBlock *Pi = Pair.second; IDFPhi->addIncoming(GetLastDef(Pi), Pi); @@ -1009,6 +1011,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, } } } + tryRemoveTrivialPhis(InsertedPhis); } // Move What before Where in the MemorySSA IR. diff --git a/test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll b/test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll new file mode 100644 index 000000000000..e0352dfa403c --- /dev/null +++ b/test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll @@ -0,0 +1,78 @@ +; RUN: opt -loop-rotate -print-memoryssa -disable-output -enable-mssa-loop-dependency -verify-memoryssa %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +declare double @sqrt(double) + +; CHECK-LABEL: @f +define internal fastcc double @f(i32* %n_, double* %dx) align 32 { +entry: +; CHECK: entry: +; CHECK: MemoryUse(liveOnEntry) +; CHECK-NOT: 7 = MemoryPhi +; CHECK-NOT: 6 = MemoryPhi + %v0 = load i32, i32* %n_, align 4 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %xmax.0 = phi double [ undef, %entry ], [ %xmax.1, %for.body ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %i.0, %v0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds double, double* %dx, i64 %idxprom + %v1 = load double, double* %arrayidx, align 8 + %cmp1 = fcmp ueq double %v1, 0.000000e+00 + %xmax.1 = select i1 %cmp1, double %xmax.0, double %v1 + %inc = add nuw nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %xmax.0.lcssa = phi double [ %xmax.0, %for.cond ] + %cmp2 = fcmp oeq double %xmax.0.lcssa, 0.000000e+00 + br i1 %cmp2, label %cleanup, label %if.end4 + +if.end4: ; preds = %for.end + %div = fdiv double 1.000000e+00, %xmax.0.lcssa + %cmp61 = icmp slt i32 0, %v0 + br i1 %cmp61, label %for.body7.lr.ph, label %for.end15 + +for.body7.lr.ph: ; preds = %if.end4 + br label %for.body7 + +; CHECK: for.body7: +; CHECK: 3 = MemoryPhi({for.body7.lr.ph,liveOnEntry},{for.body7,1}) +for.body7: ; preds = %for.body7.lr.ph, %for.body7 + %i.13 = phi i32 [ 0, %for.body7.lr.ph ], [ %inc14, %for.body7 ] + %sum.02 = phi x86_fp80 [ undef, %for.body7.lr.ph ], [ %add, %for.body7 ] + %idxprom9 = zext i32 %i.13 to i64 + %arrayidx10 = getelementptr inbounds double, double* %dx, i64 %idxprom9 + %v3 = load double, double* %arrayidx10, align 8 + %mul11 = fmul double %div, %v3 + %v2 = call double @sqrt(double %v3) + %mul12 = fmul double %mul11, %v2 + %conv = fpext double %mul12 to x86_fp80 + %add = fadd x86_fp80 %sum.02, %conv + %inc14 = add nuw nsw i32 %i.13, 1 + %cmp6 = icmp slt i32 %inc14, %v0 + br i1 %cmp6, label %for.body7, label %for.cond5.for.end15_crit_edge + +for.cond5.for.end15_crit_edge: ; preds = %for.body7 + %split = phi x86_fp80 [ %add, %for.body7 ] + br label %for.end15 + +for.end15: ; preds = %for.cond5.for.end15_crit_edge, 
%if.end4 + %sum.0.lcssa = phi x86_fp80 [ %split, %for.cond5.for.end15_crit_edge ], [ undef, %if.end4 ] + %conv16 = fptrunc x86_fp80 %sum.0.lcssa to double + %call = call double @sqrt(double %conv16) + %mul17 = fmul double %call, 0.000000e+00 + br label %cleanup + +cleanup: ; preds = %for.end15, %for.end + %retval.0 = phi double [ undef, %for.end ], [ %mul17, %for.end15 ] + ret double %retval.0 +} From dae5d38e983792c73073d8a4e6ade2a040391f3b Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Tue, 11 Jun 2019 19:58:06 +0000 Subject: [PATCH 08/17] [GlobalISel] Add a G_JUMP_TABLE opcode. This opcode generates a pointer to the address of the jump table specified by the source operand, which is a jump table index. It will be used in conjunction with an upcoming G_BRJT opcode to support jump table codegen with GlobalISel. Differential Revision: https://reviews.llvm.org/D63111 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363096 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../CodeGen/GlobalISel/MachineIRBuilder.h | 8 ++++++ include/llvm/Support/TargetOpcodes.def | 5 +++- include/llvm/Target/GenericOpcodes.td | 6 +++++ lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 6 +++++ lib/CodeGen/MachineVerifier.cpp | 8 ++++++ test/MachineVerifier/test_g_jump_table.mir | 26 +++++++++++++++++++ 6 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 test/MachineVerifier/test_g_jump_table.mir diff --git a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 1b1d7017c4b3..d7ca0ed87a7f 100644 --- a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1359,6 +1359,14 @@ class MachineIRBuilder { return buildInstr(TargetOpcode::G_UMAX, {Dst}, {Src0, Src1}); } + /// Build and insert \p Res = G_JUMP_TABLE \p JTI + /// + /// G_JUMP_TABLE sets \p Res to the address of the jump table specified by + /// the jump table index \p JTI. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildJumpTable(const LLT PtrTy, unsigned JTI); + virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, ArrayRef SrcOps, Optional Flags = None); diff --git a/include/llvm/Support/TargetOpcodes.def b/include/llvm/Support/TargetOpcodes.def index 479c5acb9fda..b91da0b2b70b 100644 --- a/include/llvm/Support/TargetOpcodes.def +++ b/include/llvm/Support/TargetOpcodes.def @@ -566,12 +566,15 @@ HANDLE_TARGET_OPCODE(G_ADDRSPACE_CAST) /// Generic block address HANDLE_TARGET_OPCODE(G_BLOCK_ADDR) +/// Generic jump table address +HANDLE_TARGET_OPCODE(G_JUMP_TABLE) + // TODO: Add more generic opcodes as we move along. /// Marker for the end of the generic opcode. /// This is used to check if an opcode is in the range of the /// generic opcodes. -HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_BLOCK_ADDR) +HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_JUMP_TABLE) /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific post-isel opcode values start here. 
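A rough usage sketch of the new helper (not part of the patch): assuming a target IRTranslator that already holds a MachineIRBuilder named `MIRBuilder` and a jump-table index `JTI` (both names illustrative), the call could look roughly like this:

    // Illustrative only: materialize the address of jump table JTI into a
    // pointer-typed virtual register via the new G_JUMP_TABLE opcode.
    LLT PtrTy = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64);
    MachineInstrBuilder JTAddr = MIRBuilder.buildJumpTable(PtrTy, JTI);
    // Per the commit message, an upcoming G_BRJT opcode would then consume
    // this address to perform the actual jump-table branch.

The verifier change below enforces the two constraints this sketch relies on: the destination must have a pointer type and the source operand must be a jump table index.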
diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td index 75f3b5fe746d..a907bd4b5a86 100644 --- a/include/llvm/Target/GenericOpcodes.td +++ b/include/llvm/Target/GenericOpcodes.td @@ -169,6 +169,12 @@ def G_BLOCK_ADDR : GenericInstruction { let hasSideEffects = 0; } +def G_JUMP_TABLE : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins unknown:$jti); + let hasSideEffects = 0; +} + //------------------------------------------------------------------------------ // Binary ops. //------------------------------------------------------------------------------ diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index d58a46298155..f6eac3956168 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -179,6 +179,12 @@ MachineInstrBuilder MachineIRBuilder::buildGlobalValue(unsigned Res, .addGlobalAddress(GV); } +MachineInstrBuilder MachineIRBuilder::buildJumpTable(const LLT PtrTy, + unsigned JTI) { + return buildInstr(TargetOpcode::G_JUMP_TABLE, {PtrTy}, {}) + .addJumpTableIndex(JTI); +} + void MachineIRBuilder::validateBinaryOp(const LLT &Res, const LLT &Op0, const LLT &Op1) { assert((Res.isScalar() || Res.isVector()) && "invalid operand type"); diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index b1964d68111d..78072a482dde 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -1312,6 +1312,14 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } + case TargetOpcode::G_JUMP_TABLE: { + if (!MI->getOperand(1).isJTI()) + report("G_JUMP_TABLE source operand must be a jump table index", MI); + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + if (!DstTy.isPointer()) + report("G_JUMP_TABLE dest operand must have a pointer type", MI); + break; + } default: break; } diff --git a/test/MachineVerifier/test_g_jump_table.mir b/test/MachineVerifier/test_g_jump_table.mir new file mode 100644 index 000000000000..406edcbea5c2 --- /dev/null +++ b/test/MachineVerifier/test_g_jump_table.mir @@ -0,0 +1,26 @@ +# RUN: not llc -march=aarch64 -o /dev/null -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s +# REQUIRES: global-isel, aarch64-registered-target + +--- +name: test_jump_table +legalized: true +tracksRegLiveness: true +jumpTable: + kind: block-address + entries: + - id: 0 + blocks: [ '%bb.0' ] +liveins: +body: | + bb.0: + + ; CHECK: Bad machine code: Too few operands + %0:_(s32) = G_JUMP_TABLE + + ; CHECK: G_JUMP_TABLE source operand must be a jump table index + %2:_(s32) = G_JUMP_TABLE %0 + + ; CHECK: G_JUMP_TABLE dest operand must have a pointer type + %3:_(s32) = G_JUMP_TABLE %jump-table.0 + +... From 69f0756aa54bf9cb4422c305f2e77f606b315f15 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Tue, 11 Jun 2019 21:13:01 +0000 Subject: [PATCH 09/17] [docs] Add "GNU binutils Replacements" section to command guide Summary: This splits out a section in the command guide for llvm tools that can be used as replacements for GNU tools. For pages that didn't exist, I added stub pages that can be individually filled in by followup patches. Tested by running `ninja docs-llvm-html` and inspecting locally. 
Reviewers: jhenderson, MaskRay, grimar, alexshap Reviewed By: jhenderson, MaskRay, grimar Subscribers: smeenai, arphaman, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D63014 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363100 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CommandGuide/index.rst | 22 ++++++++++++++++++---- docs/CommandGuide/llvm-cxxfilt.md | 10 ++++++++++ docs/CommandGuide/llvm-objcopy.md | 16 ++++++++++++++++ docs/CommandGuide/llvm-ranlib.md | 17 +++++++++++++++++ docs/CommandGuide/llvm-readelf.md | 28 ++++++++++++++++++++++++++++ docs/CommandGuide/llvm-size.md | 10 ++++++++++ docs/CommandGuide/llvm-strings.md | 10 ++++++++++ docs/CommandGuide/llvm-strip.md | 16 ++++++++++++++++ 8 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 docs/CommandGuide/llvm-cxxfilt.md create mode 100644 docs/CommandGuide/llvm-objcopy.md create mode 100644 docs/CommandGuide/llvm-ranlib.md create mode 100644 docs/CommandGuide/llvm-readelf.md create mode 100644 docs/CommandGuide/llvm-size.md create mode 100644 docs/CommandGuide/llvm-strings.md create mode 100644 docs/CommandGuide/llvm-strip.md diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst index b461757d9977..52ad316459d4 100644 --- a/docs/CommandGuide/index.rst +++ b/docs/CommandGuide/index.rst @@ -20,11 +20,8 @@ Basic Commands llc lli llvm-link - llvm-ar llvm-lib llvm-lipo - llvm-nm - llvm-objdump llvm-config llvm-cxxmap llvm-diff @@ -32,12 +29,29 @@ Basic Commands llvm-profdata llvm-stress llvm-symbolizer - llvm-addr2line llvm-dwarfdump dsymutil llvm-mca llvm-readobj +GNU binutils replacements +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. toctree:: + :maxdepth: 1 + + llvm-addr2line + llvm-ar + llvm-cxxfilt + llvm-nm + llvm-objcopy + llvm-objdump + llvm-ranlib + llvm-readelf + llvm-size + llvm-strings + llvm-strip + Debugging Tools ~~~~~~~~~~~~~~~ diff --git a/docs/CommandGuide/llvm-cxxfilt.md b/docs/CommandGuide/llvm-cxxfilt.md new file mode 100644 index 000000000000..e687cd271844 --- /dev/null +++ b/docs/CommandGuide/llvm-cxxfilt.md @@ -0,0 +1,10 @@ +# llvm-cxxfilt - a symbol demangler + +## SYNOPSIS + +**llvm-cxxfilt** [*options*] + +## DESCRIPTION + +**llvm-cxxfilt** is symbol demangler that can be used as a replacement for the +GNU **c++filt** tool. diff --git a/docs/CommandGuide/llvm-objcopy.md b/docs/CommandGuide/llvm-objcopy.md new file mode 100644 index 000000000000..3b79fb7dff07 --- /dev/null +++ b/docs/CommandGuide/llvm-objcopy.md @@ -0,0 +1,16 @@ +# llvm-objcopy - object copying tool + +## SYNOPSIS + +**llvm-objcopy** [*options*] + +## DESCRIPTION + +**llvm-objcopy** is a tool to copy and manipulate objects. + +The tool is still in active development, but in most scenarios it works as a +drop-in replacement for GNU's **objcopy**. + +## SEE ALSO + +[llvm-strip](llvm-strip.html) diff --git a/docs/CommandGuide/llvm-ranlib.md b/docs/CommandGuide/llvm-ranlib.md new file mode 100644 index 000000000000..4377364b57d7 --- /dev/null +++ b/docs/CommandGuide/llvm-ranlib.md @@ -0,0 +1,17 @@ +# llvm-ranlib - generates an archive index + +## SYNOPSIS + +**llvm-ranlib** [*options*] + +## DESCRIPTION + +**llvm-ranlib** is an alias for the [llvm-ar](llvm-ar.html) tool that generates +an index for an archive. It can be used as a replacement for GNU's **ranlib** +tool. + +Running **llvm-ranlib** is equivalent to running **llvm-ar s**. + +## SEE ALSO + +Refer to [llvm-ar](llvm-ar.html) for additional information. 
diff --git a/docs/CommandGuide/llvm-readelf.md b/docs/CommandGuide/llvm-readelf.md new file mode 100644 index 000000000000..a03844da1a2c --- /dev/null +++ b/docs/CommandGuide/llvm-readelf.md @@ -0,0 +1,28 @@ +# llvm-readelf - a drop-in replacement for readelf + +## SYNOPSIS + +**llvm-readelf** [*options*] + +## DESCRIPTION + +**llvm-readelf** is an alias for the [llvm-readobj](llvm-readobj.html) tool with +a command-line interface and output style more closely resembling GNU +**readelf**. + +Here are some of those differences: + +* Uses `--elf-output-style=GNU` by default. + +* Allows single-letter grouped flags (e.g. `llvm-readelf -SW` is the same as + `llvm-readelf -S -W`). + +* Allows use of `-s` as an alias for `--symbols` (versus `--section-headers` in + **llvm-readobj**) for GNU **readelf** compatibility. + +* Prevents use of `-sr`, `-sd`, `-st` and `-dt` **llvm-readobj** aliases, to + avoid conflicting with standard GNU **readelf** grouped flags. + +## SEE ALSO + +Refer to [llvm-readobj](llvm-readobj.html) for additional information. diff --git a/docs/CommandGuide/llvm-size.md b/docs/CommandGuide/llvm-size.md new file mode 100644 index 000000000000..3952708f3661 --- /dev/null +++ b/docs/CommandGuide/llvm-size.md @@ -0,0 +1,10 @@ +# llvm-size - print segment sizes + +## SYNOPSIS + +**llvm-size** [*options*] + +## DESCRIPTION + +**llvm-size** is a tool that prints segment sizes in object files. The goal is +to make it a drop-in replacement for GNU's **size**. diff --git a/docs/CommandGuide/llvm-strings.md b/docs/CommandGuide/llvm-strings.md new file mode 100644 index 000000000000..b5871c4c3191 --- /dev/null +++ b/docs/CommandGuide/llvm-strings.md @@ -0,0 +1,10 @@ +# llvm-strings - print strings + +## SYNOPSIS + +**llvm-strings** [*options*] + +## DESCRIPTION + +**llvm-strings** is a tool that prints strings in object files. The goal is to +make it a drop-in replacement for GNU's **strings**. diff --git a/docs/CommandGuide/llvm-strip.md b/docs/CommandGuide/llvm-strip.md new file mode 100644 index 000000000000..dd6e8593483f --- /dev/null +++ b/docs/CommandGuide/llvm-strip.md @@ -0,0 +1,16 @@ +# llvm-strip - object stripping tool + +## SYNOPSIS + +**llvm-strip** [*options*] + +## DESCRIPTION + +**llvm-strip** is a tool to strip sections and symbols from object files. + +The tool is still in active development, but in most scenarios it works as a +drop-in replacement for GNU's **strip**. + +## SEE ALSO + +[llvm-objcopy](llvm-objcopy.html) From 1a2e461f0bba859510911e8908cef79a7df2500d Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Tue, 11 Jun 2019 22:09:33 +0000 Subject: [PATCH 10/17] [PowerPC][NFC]Remove sms-simple.ll test temporarily. Looks like a MachinePipeliner algorithm problem found by sanitizer-x86_64-linux-fast. I will backout this test first while investigating the problem to unblock buildbot.
==49637==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x614000002e08 at pc 0x000004364350 bp 0x7ffe228a3bd0 sp 0x7ffe228a3bc8 READ of size 4 at 0x614000002e08 thread T0 #0 0x436434f in llvm::SwingSchedulerDAG::checkValidNodeOrder(llvm::SmallVector const&) const /b/sanitizer-x86_64-linux-fast/build/llvm/lib/CodeGen/MachinePipeliner.cpp:3736:11 #1 0x4342cd0 in llvm::SwingSchedulerDAG::schedule() /b/sanitizer-x86_64-linux-fast/build/llvm/lib/CodeGen/MachinePipeliner.cpp:486:3 #2 0x434042d in llvm::MachinePipeliner::swingModuloScheduler(llvm::MachineLoop&) /b/sanitizer-x86_64-linux-fast/build/llvm/lib/CodeGen/MachinePipeliner.cpp:385:7 #3 0x433eb90 in llvm::MachinePipeliner::runOnMachineFunction(llvm::MachineFunction&) /b/sanitizer-x86_64-linux-fast/build/llvm/lib/CodeGen/MachinePipeliner.cpp:207:5 #4 0x428b7ea in llvm::MachineFunctionPass::runOnFunction(llvm::Function&) /b/sanitizer-x86_64-linux-fast/build/llvm/lib/CodeGen/MachineFunctionPass.cpp:73:13 #5 0x4d1a913 in llvm::FPPassManager::runOnFunction(llvm::Function&) /b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1648:27 #6 0x4d1b192 in llvm::FPPassManager::runOnModule(llvm::Module&) /b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1685:16 #7 0x4d1c06d in runOnModule /b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1752:27 #8 0x4d1c06d in llvm::legacy::PassManagerImpl::run(llvm::Module&) /b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1865 #9 0xa48ca3 in compileModule(char**, llvm::LLVMContext&) /b/sanitizer-x86_64-linux-fast/build/llvm/tools/llc/llc.cpp:611:8 #10 0xa4270f in main /b/sanitizer-x86_64-linux-fast/build/llvm/tools/llc/llc.cpp:365:22 #11 0x7fec902572e0 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x202e0) #12 0x971b69 in _start (/b/sanitizer-x86_64-linux-fast/build/llvm_build_asan/bin/llc+0x971b69) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363105 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/PowerPC/sms-simple.ll | 78 ------------------------------ 1 file changed, 78 deletions(-) delete mode 100644 test/CodeGen/PowerPC/sms-simple.ll diff --git a/test/CodeGen/PowerPC/sms-simple.ll b/test/CodeGen/PowerPC/sms-simple.ll deleted file mode 100644 index 6b1f0e453e8f..000000000000 --- a/test/CodeGen/PowerPC/sms-simple.ll +++ /dev/null @@ -1,78 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \ -; RUN: -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \ -; RUN: | FileCheck %s - -@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4 -@y = common dso_local global [1024 x i32] zeroinitializer, align 4 - -; Function Attrs: norecurse nounwind -define dso_local i32* @foo() local_unnamed_addr #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis r5, r2, x@toc@ha -; CHECK-NEXT: addi r5, r5, x@toc@l -; CHECK-NEXT: addis r6, r2, y@toc@ha -; CHECK-NEXT: li r7, 340 -; CHECK-NEXT: addi r3, r6, y@toc@l -; CHECK-NEXT: lwz r6, y@toc@l(r6) -; CHECK-NEXT: mtctr r7 -; CHECK-NEXT: addi r5, r5, -8 -; CHECK-NEXT: lwzu r7, 12(r5) -; CHECK-NEXT: maddld r6, r7, r7, r6 -; CHECK-NEXT: lwz r7, 4(r5) -; CHECK-NEXT: addi r4, r3, -8 -; CHECK-NEXT: stwu r6, 12(r4) -; CHECK-NEXT: maddld r6, r7, r7, r6 -; CHECK-NEXT: lwz r7, 8(r5) -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_1: # 
%for.body -; CHECK: maddld r7, r7, r7, r6 -; CHECK-NEXT: lwzu r8, 12(r5) -; CHECK-NEXT: maddld r8, r8, r8, r7 -; CHECK-NEXT: stw r6, 4(r4) -; CHECK-NEXT: lwz r6, 4(r5) -; CHECK-NEXT: maddld r6, r6, r6, r8 -; CHECK-NEXT: stw r7, 8(r4) -; CHECK-NEXT: lwz r7, 8(r5) -; CHECK-NEXT: stwu r8, 12(r4) -; CHECK-NEXT: bdnz .LBB0_1 -; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: maddld r5, r7, r7, r6 -; CHECK-NEXT: stw r6, 4(r4) -; CHECK-NEXT: stw r5, 8(r4) -; CHECK-NEXT: blr -entry: - %.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4 - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0) - -for.body: ; preds = %for.body, %entry - %0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ] - %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ] - %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv - %1 = load i32, i32* %arrayidx2, align 4 - %mul = mul nsw i32 %1, %1 - %add = add nsw i32 %mul, %0 - %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv - store i32 %add, i32* %arrayidx6, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next - %2 = load i32, i32* %arrayidx2.1, align 4 - %mul.1 = mul nsw i32 %2, %2 - %add.1 = add nsw i32 %mul.1, %add - %arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next - store i32 %add.1, i32* %arrayidx6.1, align 4 - %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 - %arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1 - %3 = load i32, i32* %arrayidx2.2, align 4 - %mul.2 = mul nsw i32 %3, %3 - %add.2 = add nsw i32 %mul.2, %add.1 - %arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1 - store i32 %add.2, i32* %arrayidx6.2, align 4 - %indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3 - %exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024 - br i1 %exitcond.2, label %for.cond.cleanup, label %for.body -} From baa325e1de31e4be5b0a99ea19c8305d339c722a Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 11 Jun 2019 22:25:18 +0000 Subject: [PATCH 11/17] [Analysis] add isSplatValue() for vectors in IR We have the related getSplatValue() already in IR (see code just above the proposed addition). But sometimes we only need to know that the value is a splat rather than capture the splatted scalar value. Also, we have an isSplatValue() function already in SDAG. 
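As a rough illustration of the intended use (a hypothetical client, not part of this change), code that only needs the splat property and never the scalar itself could be written like this:

    #include "llvm/Analysis/VectorUtils.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Hypothetical client: returns true when both operands of a vector binary
    // operator are splats, without recovering the splatted scalar value.
    static bool bothOperandsSplat(const BinaryOperator &BO) {
      return isSplatValue(BO.getOperand(0)) && isSplatValue(BO.getOperand(1));
    }
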
Motivation - recent bugs that would potentially benefit from improved splat analysis in IR: https://bugs.llvm.org/show_bug.cgi?id=37428 https://bugs.llvm.org/show_bug.cgi?id=42174 Differential Revision: https://reviews.llvm.org/D63138 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363106 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/VectorUtils.h | 6 + lib/Analysis/VectorUtils.cpp | 39 +++++ unittests/Analysis/VectorUtilsTest.cpp | 190 +++++++++++++++++++++++++ 3 files changed, 235 insertions(+) diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index f79bdc17502d..ac3588594377 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -78,6 +78,12 @@ Value *findScalarElement(Value *V, unsigned EltNo); /// a sequence of instructions that broadcast a single value into a vector. const Value *getSplatValue(const Value *V); +/// Return true if the input value is known to be a vector with all identical +/// elements (potentially including undefined elements). +/// This may be more powerful than the related getSplatValue() because it is +/// not limited by finding a scalar source value to a splatted vector. +bool isSplatValue(const Value *V, unsigned Depth = 0); + /// Compute a map of integer instructions to their minimum legal type /// size. /// diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index ea00f0a7dd3d..8040af12d7c9 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -321,6 +321,45 @@ const llvm::Value *llvm::getSplatValue(const Value *V) { return nullptr; } +// This setting is based on its counterpart in value tracking, but it could be +// adjusted if needed. +const unsigned MaxDepth = 6; + +bool llvm::isSplatValue(const Value *V, unsigned Depth) { + assert(Depth <= MaxDepth && "Limit Search Depth"); + + if (isa(V->getType())) { + if (isa(V)) + return true; + // FIXME: Constant splat analysis does not allow undef elements. + if (auto *C = dyn_cast(V)) + return C->getSplatValue() != nullptr; + } + + // FIXME: Constant splat analysis does not allow undef elements. + Constant *Mask; + if (match(V, m_ShuffleVector(m_Value(), m_Value(), m_Constant(Mask)))) + return Mask->getSplatValue() != nullptr; + + // The remaining tests are all recursive, so bail out if we hit the limit. + if (Depth++ == MaxDepth) + return false; + + // If both operands of a binop are splats, the result is a splat. + Value *X, *Y, *Z; + if (match(V, m_BinOp(m_Value(X), m_Value(Y)))) + return isSplatValue(X, Depth) && isSplatValue(Y, Depth); + + // If all operands of a select are splats, the result is a splat. + if (match(V, m_Select(m_Value(X), m_Value(Y), m_Value(Z)))) + return isSplatValue(X, Depth) && isSplatValue(Y, Depth) && + isSplatValue(Z, Depth); + + // TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops). 
+ + return false; +} + MapVector llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, const TargetTransformInfo *TTI) { diff --git a/unittests/Analysis/VectorUtilsTest.cpp b/unittests/Analysis/VectorUtilsTest.cpp index 8ff6744bf780..a33fdb503bb4 100644 --- a/unittests/Analysis/VectorUtilsTest.cpp +++ b/unittests/Analysis/VectorUtilsTest.cpp @@ -11,8 +11,10 @@ #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NoFolder.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/KnownBits.h" @@ -56,8 +58,196 @@ class VectorUtilsTest : public testing::Test { Instruction *A; }; +struct BasicTest : public testing::Test { + LLVMContext Ctx; + std::unique_ptr M; + Function *F; + BasicBlock *BB; + IRBuilder IRB; + + BasicTest() + : M(new Module("VectorUtils", Ctx)), + F(Function::Create( + FunctionType::get(Type::getVoidTy(Ctx), /* IsVarArg */ false), + Function::ExternalLinkage, "f", M.get())), + BB(BasicBlock::Create(Ctx, "entry", F)), IRB(BB) {} +}; + + } // namespace +TEST_F(BasicTest, isSplat) { + Value *UndefVec = UndefValue::get(VectorType::get(IRB.getInt8Ty(), 4)); + EXPECT_TRUE(isSplatValue(UndefVec)); + + Constant *UndefScalar = UndefValue::get(IRB.getInt8Ty()); + EXPECT_FALSE(isSplatValue(UndefScalar)); + + Constant *ScalarC = IRB.getInt8(42); + EXPECT_FALSE(isSplatValue(ScalarC)); + + Constant *OtherScalarC = IRB.getInt8(-42); + Constant *NonSplatC = ConstantVector::get({ScalarC, OtherScalarC}); + EXPECT_FALSE(isSplatValue(NonSplatC)); + + Value *SplatC = IRB.CreateVectorSplat(5, ScalarC); + EXPECT_TRUE(isSplatValue(SplatC)); + + // FIXME: Constant splat analysis does not allow undef elements. + Constant *SplatWithUndefC = ConstantVector::get({ScalarC, UndefScalar}); + EXPECT_FALSE(isSplatValue(SplatWithUndefC)); +} + +TEST_F(VectorUtilsTest, isSplatValue_00) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %A = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> zeroinitializer\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_TRUE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_11) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %A = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_TRUE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_01) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %A = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_FALSE(isSplatValue(A)); +} + +// FIXME: Constant (mask) splat analysis does not allow undef elements. 
+ +TEST_F(VectorUtilsTest, isSplatValue_0u) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %A = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_FALSE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Binop) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " %v1 = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " %A = udiv <2 x i8> %v0, %v1\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_TRUE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Binop_ConstantOp0) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %v1 = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " %A = ashr <2 x i8> , %v1\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_TRUE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Binop_Not_Op0) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " %v1 = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " %A = add <2 x i8> %v0, %v1\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_FALSE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Binop_Not_Op1) { + parseAssembly( + "define <2 x i8> @test(<2 x i8> %x) {\n" + " %v0 = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " %v1 = shufflevector <2 x i8> %x, <2 x i8> undef, <2 x i32> \n" + " %A = shl <2 x i8> %v0, %v1\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_FALSE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Select) { + parseAssembly( + "define <2 x i8> @test(<2 x i1> %x, <2 x i8> %y, <2 x i8> %z) {\n" + " %v0 = shufflevector <2 x i1> %x, <2 x i1> undef, <2 x i32> \n" + " %v1 = shufflevector <2 x i8> %y, <2 x i8> undef, <2 x i32> \n" + " %v2 = shufflevector <2 x i8> %z, <2 x i8> undef, <2 x i32> \n" + " %A = select <2 x i1> %v0, <2 x i8> %v1, <2 x i8> %v2\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_TRUE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Select_ConstantOp) { + parseAssembly( + "define <2 x i8> @test(<2 x i1> %x, <2 x i8> %y, <2 x i8> %z) {\n" + " %v0 = shufflevector <2 x i1> %x, <2 x i1> undef, <2 x i32> \n" + " %v2 = shufflevector <2 x i8> %z, <2 x i8> undef, <2 x i32> \n" + " %A = select <2 x i1> %v0, <2 x i8> , <2 x i8> %v2\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_TRUE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Select_NotCond) { + parseAssembly( + "define <2 x i8> @test(<2 x i1> %x, <2 x i8> %y, <2 x i8> %z) {\n" + " %v1 = shufflevector <2 x i8> %y, <2 x i8> undef, <2 x i32> \n" + " %v2 = shufflevector <2 x i8> %z, <2 x i8> undef, <2 x i32> \n" + " %A = select <2 x i1> %x, <2 x i8> %v1, <2 x i8> %v2\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_FALSE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Select_NotOp1) { + parseAssembly( + "define <2 x i8> @test(<2 x i1> %x, <2 x i8> %y, <2 x i8> %z) {\n" + " %v0 = shufflevector <2 x i1> %x, <2 x i1> undef, <2 x i32> \n" + " %v2 = shufflevector <2 x i8> %z, <2 x i8> undef, <2 x i32> \n" + " %A = select <2 x i1> %v0, <2 x i8> %y, <2 x i8> %v2\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_FALSE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_Select_NotOp2) { + parseAssembly( + "define <2 x i8> @test(<2 x i1> %x, <2 x i8> %y, <2 x i8> %z) {\n" + " %v0 = shufflevector <2 x i1> %x, <2 x i1> undef, <2 x i32> \n" + " %v1 = shufflevector <2 x i8> %y, <2 x i8> undef, <2 x i32> \n" + " %A = 
select <2 x i1> %v0, <2 x i8> %v1, <2 x i8> %z\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_FALSE(isSplatValue(A)); +} + +TEST_F(VectorUtilsTest, isSplatValue_SelectBinop) { + parseAssembly( + "define <2 x i8> @test(<2 x i1> %x, <2 x i8> %y, <2 x i8> %z) {\n" + " %v0 = shufflevector <2 x i1> %x, <2 x i1> undef, <2 x i32> \n" + " %v1 = shufflevector <2 x i8> %y, <2 x i8> undef, <2 x i32> \n" + " %v2 = shufflevector <2 x i8> %z, <2 x i8> undef, <2 x i32> \n" + " %bo = xor <2 x i8> %v1, %v2\n" + " %A = select <2 x i1> %v0, <2 x i8> %bo, <2 x i8> %v2\n" + " ret <2 x i8> %A\n" + "}\n"); + EXPECT_TRUE(isSplatValue(A)); +} + TEST_F(VectorUtilsTest, getSplatValueElt0) { parseAssembly( "define <2 x i8> @test(i8 %x) {\n" From 28bea3dbfef348e53cf48f921e96a35b642b3950 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 11 Jun 2019 22:43:25 +0000 Subject: [PATCH 12/17] Generalize icmp matching in IndVars' eliminateTrunc We were only matching RHS being a loop invariant value, not the inverse. Since there's nothing which appears to canonicalize loop invariant values to RHS, this means we missed cases. Differential Revision: https://reviews.llvm.org/D63112 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363108 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/SimplifyIndVar.cpp | 29 ++--- .../IndVarSimplify/eliminate-trunc.ll | 104 ++++++++++++++++++ 2 files changed, 119 insertions(+), 14 deletions(-) diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 9a4ebd3255d7..f1048d98f00b 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -521,20 +521,19 @@ bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) { if (isa(U) && !DT->isReachableFromEntry(cast(U)->getParent())) continue; - if (ICmpInst *ICI = dyn_cast(U)) { - if (ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) { - assert(L->contains(ICI->getParent()) && "LCSSA form broken?"); - // If we cannot get rid of trunc, bail. - if (ICI->isSigned() && !DoesSExtCollapse) - return false; - if (ICI->isUnsigned() && !DoesZExtCollapse) - return false; - // For equality, either signed or unsigned works. - ICmpUsers.push_back(ICI); - } else - return false; - } else + ICmpInst *ICI = dyn_cast(U); + if (!ICI) return false; + assert(L->contains(ICI->getParent()) && "LCSSA form broken?"); + if (!(ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) && + !(ICI->getOperand(1) == TI && L->isLoopInvariant(ICI->getOperand(0)))) return false; + // If we cannot get rid of trunc, bail. + if (ICI->isSigned() && !DoesSExtCollapse) + return false; + if (ICI->isUnsigned() && !DoesZExtCollapse) + return false; + // For equality, either signed or unsigned works. + ICmpUsers.push_back(ICI); } auto CanUseZExt = [&](ICmpInst *ICI) { @@ -557,7 +556,8 @@ bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) { }; // Replace all comparisons against trunc with comparisons against IV. for (auto *ICI : ICmpUsers) { - auto *Op1 = ICI->getOperand(1); + bool IsSwapped = L->isLoopInvariant(ICI->getOperand(0)); + auto *Op1 = IsSwapped ? ICI->getOperand(0) : ICI->getOperand(1); Instruction *Ext = nullptr; // For signed/unsigned predicate, replace the old comparison with comparison // of immediate IV against sext/zext of the invariant argument. If we can @@ -566,6 +566,7 @@ bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) { // TODO: If we see a signed comparison which can be turned into unsigned, // we can do it here for canonicalization purposes. 
ICmpInst::Predicate Pred = ICI->getPredicate(); + if (IsSwapped) Pred = ICmpInst::getSwappedPredicate(Pred); if (CanUseZExt(ICI)) { assert(DoesZExtCollapse && "Unprofitable zext?"); Ext = new ZExtInst(Op1, IVTy, "zext", ICI); diff --git a/test/Transforms/IndVarSimplify/eliminate-trunc.ll b/test/Transforms/IndVarSimplify/eliminate-trunc.ll index 7e0971f9f31d..f152c2368594 100644 --- a/test/Transforms/IndVarSimplify/eliminate-trunc.ll +++ b/test/Transforms/IndVarSimplify/eliminate-trunc.ll @@ -561,4 +561,108 @@ exit: ret void } +define void @test_13a(i32 %n) { +; +; CHECK-LABEL: @test_13a( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 1024 to i64 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ne i64 [[IV]], [[ZEXT]] +; CHECK-NEXT: br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add nsw nuw i64 %iv, 2 + %narrow.iv = trunc i64 %iv to i32 + %cmp = icmp ne i32 1024, %narrow.iv + br i1 %cmp, label %loop, label %exit +exit: + ret void +} + +define void @test_13b(i32 %n) { +; +; CHECK-LABEL: @test_13b( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 1024 to i64 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[IV]], [[ZEXT]] +; CHECK-NEXT: br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add nsw nuw i64 %iv, 2 + %narrow.iv = trunc i64 %iv to i32 + %cmp = icmp ugt i32 1024, %narrow.iv + br i1 %cmp, label %loop, label %exit +exit: + ret void +} + +define void @test_13c(i32 %n) { +; +; CHECK-LABEL: @test_13c( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 1024 to i64 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[IV]], [[ZEXT]] +; CHECK-NEXT: br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add nsw nuw i64 %iv, 2 + %narrow.iv = trunc i64 %iv to i32 + %cmp = icmp sgt i32 1024, %narrow.iv + br i1 %cmp, label %loop, label %exit +exit: + ret void +} + +define void @test_13d(i32 %n) { +; +; CHECK-LABEL: @test_13d( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 1024 to i64 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -20, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 [[IV]], [[SEXT]] +; CHECK-NEXT: br i1 [[TMP0]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop +loop: + %iv = phi i64 [ -20, %entry ], [ %iv.next, %loop ] + %iv.next = add nsw i64 %iv, 2 + %narrow.iv = trunc i64 %iv to i32 + %cmp = icmp sgt i32 1024, %narrow.iv + br i1 %cmp, label %loop, label %exit +exit: + ret void +} + 
!0 = !{i32 0, i32 1000} From e5bf57d8b8e9a29ef24b75172b1dcccda87ecfaa Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 11 Jun 2019 23:21:24 +0000 Subject: [PATCH 13/17] Fix a bug in getSCEVAtScope w.r.t. non-canonical loops The issue is that if we have a loop with multiple predecessors outside the loop, the code was expecting to merge them and only return if equal, but instead returned the first one seen. I have no idea if this actually tripped anywhere. I noticed it by accident when reading the code and have no idea how to go about constructing a test case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363112 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/ScalarEvolution.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index a7b7fa262383..dc2d32856cc4 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -8126,9 +8126,9 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { break; } } - if (!MultipleInitValues && InitValue) - return getSCEV(InitValue); } + if (!MultipleInitValues && InitValue) + return getSCEV(InitValue); } // Okay, we know how many times the containing loop executes. If // this is a constant evolving PHI node, get the final value at From b24a3c2da1f89b9d68caabe5dee2a19bd6b45344 Mon Sep 17 00:00:00 2001 From: Cameron McInally Date: Wed, 12 Jun 2019 00:18:54 +0000 Subject: [PATCH 14/17] [NFC][CodeGen] Add unary FNeg tests to X86/avx512vl-intrinsics-fast-isel.ll X86/combine-fabs.ll X86/avx512vl-intrinsics-fast-isel.ll is only partially complete. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363114 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/avx512vl-intrinsics-fast-isel.ll | 669 ++++++++++++++++++ test/CodeGen/X86/combine-fabs.ll | 32 + 2 files changed, 701 insertions(+) diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index 4f1affb94e61..af0529fe41ba 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -4282,6 +4282,28 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_mask_fmsub_pd_unary_fneg(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { +; X86-LABEL: test_mm_mask_fmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_fmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__C + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %neg.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A + ret <2 x double> %2 +} + define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm_mask3_fmadd_pd: ; X86: # %bb.0: # %entry @@ -4329,6 +4351,30 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_mask3_fnmadd_pd_unary_fneg(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm_mask3_fnmadd_pd_unary_fneg: +; X86: # %bb.0: 
# %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; X86-NEXT: vmovapd %xmm2, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask3_fnmadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; X64-NEXT: vmovapd %xmm2, %xmm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__A + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %neg.i, <2 x double> %__B, <2 x double> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C + ret <2 x double> %2 +} + define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { ; X86-LABEL: test_mm_maskz_fmadd_pd: ; X86: # %bb.0: # %entry @@ -4372,6 +4418,28 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_maskz_fmsub_pd_unary_fneg(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { +; X86-LABEL: test_mm_maskz_fmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_maskz_fmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__C + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %neg.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer + ret <2 x double> %2 +} + define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { ; X86-LABEL: test_mm_maskz_fnmadd_pd: ; X86: # %bb.0: # %entry @@ -4394,6 +4462,28 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_maskz_fnmadd_pd_unary_fneg(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { +; X86-LABEL: test_mm_maskz_fnmadd_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_maskz_fnmadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__A + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %neg.i, <2 x double> %__B, <2 x double> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer + ret <2 x double> %2 +} + define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { ; X86-LABEL: test_mm_maskz_fnmsub_pd: ; X86: # %bb.0: # %entry @@ -4417,6 +4507,29 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_maskz_fnmsub_pd_unary_fneg(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { +; X86-LABEL: test_mm_maskz_fnmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: 
kmovw %eax, %k1 +; X86-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_maskz_fnmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__A + %neg1.i = fneg <2 x double> %__C + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %neg.i, <2 x double> %__B, <2 x double> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer + ret <2 x double> %2 +} + define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { ; X86-LABEL: test_mm256_mask_fmadd_pd: ; X86: # %bb.0: # %entry @@ -4460,6 +4573,28 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_mask_fmsub_pd_unary_fneg(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { +; X86-LABEL: test_mm256_mask_fmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask_fmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__C + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %neg.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A + ret <4 x double> %2 +} + define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm256_mask3_fmadd_pd: ; X86: # %bb.0: # %entry @@ -4507,6 +4642,30 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_mask3_fnmadd_pd_unary_fneg(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm256_mask3_fnmadd_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; X86-NEXT: vmovapd %ymm2, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask3_fnmadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; X64-NEXT: vmovapd %ymm2, %ymm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__A + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %neg.i, <4 x double> %__B, <4 x double> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C + ret <4 x double> %2 +} + define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { ; X86-LABEL: test_mm256_maskz_fmadd_pd: ; X86: # %bb.0: # %entry @@ -4550,6 +4709,28 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_maskz_fmsub_pd_unary_fneg(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { +; X86-LABEL: test_mm256_maskz_fmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw 
%eax, %k1 +; X86-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_maskz_fmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__C + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %neg.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer + ret <4 x double> %2 +} + define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { ; X86-LABEL: test_mm256_maskz_fnmadd_pd: ; X86: # %bb.0: # %entry @@ -4572,6 +4753,28 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_maskz_fnmadd_pd_unary_fneg(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { +; X86-LABEL: test_mm256_maskz_fnmadd_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_maskz_fnmadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__A + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %neg.i, <4 x double> %__B, <4 x double> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer + ret <4 x double> %2 +} + define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { ; X86-LABEL: test_mm256_maskz_fnmsub_pd: ; X86: # %bb.0: # %entry @@ -4595,6 +4798,29 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_maskz_fnmsub_pd_unary_fneg(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { +; X86-LABEL: test_mm256_maskz_fnmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_maskz_fnmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__A + %neg1.i = fneg <4 x double> %__C + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %neg.i, <4 x double> %__B, <4 x double> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer + ret <4 x double> %2 +} + define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { ; X86-LABEL: test_mm_mask_fmadd_ps: ; X86: # %bb.0: # %entry @@ -4638,6 +4864,28 @@ entry: ret <4 x float> %2 } +define <4 x float> @test_mm_mask_fmsub_ps_unary_fneg(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { +; X86-LABEL: test_mm_mask_fmsub_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) 
- xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_fmsub_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__C + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %neg.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A + ret <4 x float> %2 +} + define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm_mask3_fmadd_ps: ; X86: # %bb.0: # %entry @@ -4685,6 +4933,30 @@ entry: ret <4 x float> %2 } +define <4 x float> @test_mm_mask3_fnmadd_ps_unary_fneg(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm_mask3_fnmadd_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; X86-NEXT: vmovaps %xmm2, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask3_fnmadd_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; X64-NEXT: vmovaps %xmm2, %xmm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__A + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %neg.i, <4 x float> %__B, <4 x float> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C + ret <4 x float> %2 +} + define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { ; X86-LABEL: test_mm_maskz_fmadd_ps: ; X86: # %bb.0: # %entry @@ -4728,6 +5000,28 @@ entry: ret <4 x float> %2 } +define <4 x float> @test_mm_maskz_fmsub_ps_unary_fneg(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { +; X86-LABEL: test_mm_maskz_fmsub_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_maskz_fmsub_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__C + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %neg.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} + define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { ; X86-LABEL: test_mm_maskz_fnmadd_ps: ; X86: # %bb.0: # %entry @@ -4750,6 +5044,28 @@ entry: ret <4 x float> %2 } +define <4 x float> @test_mm_maskz_fnmadd_ps_unary_fneg(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { +; X86-LABEL: test_mm_maskz_fnmadd_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_maskz_fnmadd_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 
+; X64-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__A + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %neg.i, <4 x float> %__B, <4 x float> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} + define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { ; X86-LABEL: test_mm_maskz_fnmsub_ps: ; X86: # %bb.0: # %entry @@ -4773,6 +5089,29 @@ entry: ret <4 x float> %2 } +define <4 x float> @test_mm_maskz_fnmsub_ps_unary_fneg(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { +; X86-LABEL: test_mm_maskz_fnmsub_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_maskz_fnmsub_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__A + %neg1.i = fneg <4 x float> %__C + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %neg.i, <4 x float> %__B, <4 x float> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} + define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { ; X86-LABEL: test_mm256_mask_fmadd_ps: ; X86: # %bb.0: # %entry @@ -4814,6 +5153,27 @@ entry: ret <8 x float> %2 } +define <8 x float> @test_mm256_mask_fmsub_ps_unary_fneg(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { +; X86-LABEL: test_mm256_mask_fmsub_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask_fmsub_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <8 x float> %__C + %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %neg.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A + ret <8 x float> %2 +} + define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm256_mask3_fmadd_ps: ; X86: # %bb.0: # %entry @@ -5547,6 +5907,32 @@ entry: ret <2 x double> %4 } +define <2 x double> @test_mm_mask3_fmsubadd_pd_unary_fneg(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm_mask3_fmsubadd_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 +; X86-NEXT: vmovapd %xmm2, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask3_fmsubadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 +; X64-NEXT: vmovapd %xmm2, %xmm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__C 
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %neg.i) #9 + %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 + %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> + %3 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C + ret <2 x double> %4 +} + define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm256_mask3_fmsubadd_pd: ; X86: # %bb.0: # %entry @@ -5573,6 +5959,32 @@ entry: ret <4 x double> %4 } +define <4 x double> @test_mm256_mask3_fmsubadd_pd_unary_fneg(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm256_mask3_fmsubadd_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 +; X86-NEXT: vmovapd %ymm2, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask3_fmsubadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 +; X64-NEXT: vmovapd %ymm2, %ymm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__C + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %neg.i) #9 + %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 + %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> + %3 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C + ret <4 x double> %4 +} + define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm_mask3_fmsubadd_ps: ; X86: # %bb.0: # %entry @@ -5599,6 +6011,32 @@ entry: ret <4 x float> %4 } +define <4 x float> @test_mm_mask3_fmsubadd_ps_unary_fneg(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm_mask3_fmsubadd_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 +; X86-NEXT: vmovaps %xmm2, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask3_fmsubadd_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 +; X64-NEXT: vmovaps %xmm2, %xmm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__C + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %neg.i) #9 + %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 + %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> + %3 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C + ret <4 x float> %4 +} + define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm256_mask3_fmsubadd_ps: ; X86: # %bb.0: # %entry @@ -5624,6 +6062,31 @@ entry: ret <8 x float> %4 } +define <8 x float> 
@test_mm256_mask3_fmsubadd_ps_unary_fneg(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm256_mask3_fmsubadd_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 +; X86-NEXT: vmovaps %ymm2, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask3_fmsubadd_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 +; X64-NEXT: vmovaps %ymm2, %ymm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <8 x float> %__C + %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %neg.i) #9 + %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 + %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> + %3 = bitcast i8 %__U to <8 x i1> + %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C + ret <8 x float> %4 +} + define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { ; X86-LABEL: test_mm_mask_fnmadd_pd: ; X86: # %bb.0: # %entry @@ -5646,6 +6109,28 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_mask_fnmadd_pd_unary_fneg(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { +; X86-LABEL: test_mm_mask_fnmadd_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_fnmadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__B + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %neg.i, <2 x double> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A + ret <2 x double> %2 +} + define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { ; X86-LABEL: test_mm256_mask_fnmadd_pd: ; X86: # %bb.0: # %entry @@ -5668,6 +6153,28 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_mask_fnmadd_pd_unary_fneg(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { +; X86-LABEL: test_mm256_mask_fnmadd_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask_fnmadd_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__B + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %neg.i, <4 x double> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A + ret <4 x double> %2 +} + define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { ; X86-LABEL: test_mm_mask_fnmadd_ps: ; X86: # %bb.0: # %entry @@ -5690,6 +6197,28 @@ entry: ret <4 x float> %2 } 
+define <4 x float> @test_mm_mask_fnmadd_ps_unary_fneg(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { +; X86-LABEL: test_mm_mask_fnmadd_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_fnmadd_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__B + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %neg.i, <4 x float> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A + ret <4 x float> %2 +} + define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { ; X86-LABEL: test_mm256_mask_fnmadd_ps: ; X86: # %bb.0: # %entry @@ -5711,6 +6240,27 @@ entry: ret <8 x float> %2 } +define <8 x float> @test_mm256_mask_fnmadd_ps_unary_fneg(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { +; X86-LABEL: test_mm256_mask_fnmadd_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask_fnmadd_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <8 x float> %__B + %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %neg.i, <8 x float> %__C) #9 + %1 = bitcast i8 %__U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A + ret <8 x float> %2 +} + define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { ; X86-LABEL: test_mm_mask_fnmsub_pd: ; X86: # %bb.0: # %entry @@ -5734,6 +6284,29 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_mask_fnmsub_pd_unary_fneg(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { +; X86-LABEL: test_mm_mask_fnmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_fnmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__B + %neg1.i = fneg <2 x double> %__C + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %neg.i, <2 x double> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A + ret <2 x double> %2 +} + define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm_mask3_fnmsub_pd: ; X86: # %bb.0: # %entry @@ -5759,6 +6332,31 @@ entry: ret <2 x double> %2 } +define <2 x double> @test_mm_mask3_fnmsub_pd_unary_fneg(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm_mask3_fnmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; 
X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 +; X86-NEXT: vmovapd %xmm2, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask3_fnmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 +; X64-NEXT: vmovapd %xmm2, %xmm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <2 x double> %__B + %neg1.i = fneg <2 x double> %__C + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %neg.i, <2 x double> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C + ret <2 x double> %2 +} + define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { ; X86-LABEL: test_mm256_mask_fnmsub_pd: ; X86: # %bb.0: # %entry @@ -5782,6 +6380,29 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_mask_fnmsub_pd_unary_fneg(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { +; X86-LABEL: test_mm256_mask_fnmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask_fnmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__B + %neg1.i = fneg <4 x double> %__C + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %neg.i, <4 x double> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A + ret <4 x double> %2 +} + define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm256_mask3_fnmsub_pd: ; X86: # %bb.0: # %entry @@ -5807,6 +6428,31 @@ entry: ret <4 x double> %2 } +define <4 x double> @test_mm256_mask3_fnmsub_pd_unary_fneg(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { +; X86-LABEL: test_mm256_mask3_fnmsub_pd_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 +; X86-NEXT: vmovapd %ymm2, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask3_fnmsub_pd_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 +; X64-NEXT: vmovapd %ymm2, %ymm0 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x double> %__B + %neg1.i = fneg <4 x double> %__C + %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %neg.i, <4 x double> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C + ret <4 x double> %2 +} + define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { ; X86-LABEL: test_mm_mask_fnmsub_ps: ; X86: # %bb.0: # %entry @@ -5830,6 +6476,29 @@ entry: ret <4 x float> %2 } +define <4 x float> @test_mm_mask_fnmsub_ps_unary_fneg(<4 x float> %__A, i8 zeroext %__U, 
<4 x float> %__B, <4 x float> %__C) { +; X86-LABEL: test_mm_mask_fnmsub_ps_unary_fneg: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_fnmsub_ps_unary_fneg: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 +; X64-NEXT: retq +entry: + %neg.i = fneg <4 x float> %__B + %neg1.i = fneg <4 x float> %__C + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %neg.i, <4 x float> %neg1.i) #9 + %1 = bitcast i8 %__U to <8 x i1> + %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> + %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A + ret <4 x float> %2 +} + define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { ; X86-LABEL: test_mm_mask3_fnmsub_ps: ; X86: # %bb.0: # %entry diff --git a/test/CodeGen/X86/combine-fabs.ll b/test/CodeGen/X86/combine-fabs.ll index b779c589cf9d..67b695ebd018 100644 --- a/test/CodeGen/X86/combine-fabs.ll +++ b/test/CodeGen/X86/combine-fabs.ll @@ -85,6 +85,22 @@ define float @combine_fabs_fneg(float %a) { ret float %2 } +define float @combine_fabs_unary_fneg(float %a) { +; SSE-LABEL: combine_fabs_unary_fneg: +; SSE: # %bb.0: +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_fabs_unary_fneg: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fneg float %a + %2 = call float @llvm.fabs.f32(float %1) + ret float %2 +} + define <4 x float> @combine_vec_fabs_fneg(<4 x float> %a) { ; SSE-LABEL: combine_vec_fabs_fneg: ; SSE: # %bb.0: @@ -101,6 +117,22 @@ define <4 x float> @combine_vec_fabs_fneg(<4 x float> %a) { ret <4 x float> %2 } +define <4 x float> @combine_vec_fabs_unary_fneg(<4 x float> %a) { +; SSE-LABEL: combine_vec_fabs_unary_fneg: +; SSE: # %bb.0: +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vec_fabs_unary_fneg: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fneg <4 x float> %a + %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %1) + ret <4 x float> %2 +} + ; fabs(fcopysign(x, y)) -> fabs(x) define float @combine_fabs_fcopysign(float %a, float %b) { ; SSE-LABEL: combine_fabs_fcopysign: From 64492a44634c5f7cde655a778ae792c9583655ae Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Wed, 12 Jun 2019 02:45:27 +0000 Subject: [PATCH 15/17] [PowerPC][NFC] Added test for sext/shl combination after isel. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363118 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../combine-sext-and-shl-after-isel.ll | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll diff --git a/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll b/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll new file mode 100644 index 000000000000..1e6648928b98 --- /dev/null +++ b/test/CodeGen/PowerPC/combine-sext-and-shl-after-isel.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names -verify-machineinstrs -O2 < %s | FileCheck %s +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names -verify-machineinstrs -O2 < %s | FileCheck %s \ +; RUN: --check-prefix=CHECK-BE +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names -verify-machineinstrs -O2 < %s | FileCheck %s \ +; RUN: --check-prefix=CHECK-P9 +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names -verify-machineinstrs -O2 < %s | FileCheck %s \ +; RUN: --check-prefix=CHECK-P9-BE +define dso_local i32 @poc(i32* %base, i32 %index, i1 %flag, i32 %default) { +; CHECK-LABEL: poc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andi. r5, r5, 1 +; CHECK-NEXT: bc 4, gt, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %true +; CHECK-NEXT: extsw r4, r4 +; CHECK-NEXT: sldi r4, r4, 2 +; CHECK-NEXT: lwzx r3, r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB0_2: # %false +; CHECK-NEXT: mr r3, r6 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: poc: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: andi. r5, r5, 1 +; CHECK-BE-NEXT: bc 4, gt, .LBB0_2 +; CHECK-BE-NEXT: # %bb.1: # %true +; CHECK-BE-NEXT: extsw r4, r4 +; CHECK-BE-NEXT: sldi r4, r4, 2 +; CHECK-BE-NEXT: lwzx r3, r3, r4 +; CHECK-BE-NEXT: blr +; CHECK-BE-NEXT: .LBB0_2: # %false +; CHECK-BE-NEXT: mr r3, r6 +; CHECK-BE-NEXT: blr +; +; CHECK-P9-LABEL: poc: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: andi. r5, r5, 1 +; CHECK-P9-NEXT: bc 4, gt, .LBB0_2 +; CHECK-P9-NEXT: # %bb.1: # %true +; CHECK-P9-NEXT: extsw r4, r4 +; CHECK-P9-NEXT: sldi r4, r4, 2 +; CHECK-P9-NEXT: lwzx r3, r3, r4 +; CHECK-P9-NEXT: blr +; CHECK-P9-NEXT: .LBB0_2: # %false +; CHECK-P9-NEXT: mr r3, r6 +; CHECK-P9-NEXT: blr +; +; CHECK-P9-BE-LABEL: poc: +; CHECK-P9-BE: # %bb.0: # %entry +; CHECK-P9-BE-NEXT: andi. r5, r5, 1 +; CHECK-P9-BE-NEXT: bc 4, gt, .LBB0_2 +; CHECK-P9-BE-NEXT: # %bb.1: # %true +; CHECK-P9-BE-NEXT: extsw r4, r4 +; CHECK-P9-BE-NEXT: sldi r4, r4, 2 +; CHECK-P9-BE-NEXT: lwzx r3, r3, r4 +; CHECK-P9-BE-NEXT: blr +; CHECK-P9-BE-NEXT: .LBB0_2: # %false +; CHECK-P9-BE-NEXT: mr r3, r6 +; CHECK-P9-BE-NEXT: blr +entry: + %iconv = sext i32 %index to i64 + br i1 %flag, label %true, label %false + +true: + %ptr = getelementptr inbounds i32, i32* %base, i64 %iconv + %value = load i32, i32* %ptr, align 4 + ret i32 %value + +false: + ret i32 %default +} From 72b5654efd8826a7c034a7660ef69ff63b7e299a Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Wed, 12 Jun 2019 02:58:04 +0000 Subject: [PATCH 16/17] [NFC] Correct comments in RegisterCoalescer. 
Differential Revision: https://reviews.llvm.org/D63124 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363119 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/RegisterCoalescer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 1f0046ab164e..f747b5d682f8 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -970,7 +970,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, /// For copy B = A in BB2, if A is defined by A = B in BB0 which is a /// predecessor of BB2, and if B is not redefined on the way from A = B -/// in BB2 to B = A in BB2, B = A in BB2 is partially redundant if the +/// in BB0 to B = A in BB2, B = A in BB2 is partially redundant if the /// execution goes through the path from BB0 to BB2. We may move B = A /// to the predecessor without such reversed copy. /// So we will transform the program from: @@ -2018,19 +2018,19 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { if (CP.isFlipped()) { // Physreg is copied into vreg // %y = COPY %physreg_x - // ... //< no other def of %x here + // ... //< no other def of %physreg_x here // use %y // => // ... - // use %x + // use %physreg_x CopyMI = MRI->getVRegDef(SrcReg); } else { // VReg is copied into physreg: // %y = def - // ... //< no other def or use of %y here - // %y = COPY %physreg_x + // ... //< no other def or use of %physreg_x here + // %physreg_x = COPY %y // => - // %y = def + // %physreg_x = def // ... if (!MRI->hasOneNonDBGUse(SrcReg)) { LLVM_DEBUG(dbgs() << "\t\tMultiple vreg uses!\n"); From 0b65399dd62fe1079a7e5ebb42ee837dabfa20d9 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Wed, 12 Jun 2019 03:04:22 +0000 Subject: [PATCH 17/17] [RISCV] Add CFI directives for RISCV prologue/epilog. In order to generate correct debug frame information, it needs to generate CFI information in prologue and epilog. 
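
For example, for a function that saves ra/s0 and sets up a frame
pointer, the RV32 output is now annotated along these lines (condensed
from the new frame-info.ll test added below; exact save offsets depend
on the frame layout):

  addi sp, sp, -16
  .cfi_def_cfa_offset 16
  sw ra, 12(sp)
  sw s0, 8(sp)
  .cfi_offset ra, -4
  .cfi_offset s0, -8
  addi s0, sp, 16
  .cfi_def_cfa s0, 0
  ...
  lw s0, 8(sp)
  .cfi_def_cfa sp, 16
  lw ra, 12(sp)
  .cfi_restore ra
  .cfi_restore s0
  addi sp, sp, 16
  .cfi_def_cfa_offset 0
  ret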
Differential Revision: https://reviews.llvm.org/D61773 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363120 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp | 1 + .../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 8 +- lib/Target/RISCV/RISCVFrameLowering.cpp | 73 ++++++++++++++++++- test/CodeGen/RISCV/frame-info.ll | 66 +++++++++++++++++ 4 files changed, 144 insertions(+), 4 deletions(-) create mode 100644 test/CodeGen/RISCV/frame-info.ll diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp index 51106cd4e64b..983629692883 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp @@ -21,6 +21,7 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) { CommentString = "#"; AlignmentIsInBytes = false; SupportsDebugInformation = true; + ExceptionsType = ExceptionHandling::DwarfCFI; Data16bitsDirective = "\t.half\t"; Data32bitsDirective = "\t.word\t"; } diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index d920fd203d85..bc45262ab2de 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -50,7 +50,13 @@ static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) { static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT) { - return new RISCVMCAsmInfo(TT); + MCAsmInfo *MAI = new RISCVMCAsmInfo(TT); + + unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0); + MAI->addInitialFrameState(Inst); + + return MAI; } static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT, diff --git a/lib/Target/RISCV/RISCVFrameLowering.cpp b/lib/Target/RISCV/RISCVFrameLowering.cpp index 6e0703c7b021..32c3b9684d2c 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCDwarf.h" using namespace llvm; @@ -96,6 +97,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); auto *RVFI = MF.getInfo(); + const RISCVRegisterInfo *RI = STI.getRegisterInfo(); + const RISCVInstrInfo *TII = STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); unsigned FPReg = getFPReg(STI); @@ -119,6 +122,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Allocate space on the stack if necessary. adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); + // Emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + // The frame pointer is callee-saved, and code has been generated for us to // save it to the stack. We need to skip over the storing of callee-saved // registers as the frame pointer must be modified after it has been saved @@ -128,10 +137,28 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, const std::vector &CSI = MFI.getCalleeSavedInfo(); std::advance(MBBI, CSI.size()); + // Iterate over list of callee-saved registers and emit .cfi_offset + // directives. 
+ for (const auto &Entry : CSI) { + int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx()); + unsigned Reg = Entry.getReg(); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, RI->getDwarfRegNum(Reg, true), Offset)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + // Generate new FP. - if (hasFP(MF)) + if (hasFP(MF)) { adjustReg(MBB, MBBI, DL, FPReg, SPReg, StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup); + + // Emit ".cfi_def_cfa $fp, 0" + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + nullptr, RI->getDwarfRegNum(FPReg, true), 0)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -141,6 +168,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); auto *RVFI = MF.getInfo(); DebugLoc DL = MBBI->getDebugLoc(); + const RISCVInstrInfo *TII = STI.getInstrInfo(); unsigned FPReg = getFPReg(STI); unsigned SPReg = getSPReg(STI); @@ -150,19 +178,58 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto LastFrameDestroy = std::prev(MBBI, MFI.getCalleeSavedInfo().size()); uint64_t StackSize = MFI.getStackSize(); + uint64_t FPOffset = StackSize - RVFI->getVarArgsSaveSize(); // Restore the stack pointer using the value of the frame pointer. Only // necessary if the stack pointer was modified, meaning the stack size is // unknown. if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) { assert(hasFP(MF) && "frame pointer should not have been eliminated"); - adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, - -StackSize + RVFI->getVarArgsSaveSize(), + adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -FPOffset, MachineInstr::FrameDestroy); } + if (hasFP(MF)) { + // To find the instruction restoring FP from stack. + for (auto &I = LastFrameDestroy; I != MBBI; ++I) { + if (I->mayLoad() && I->getOperand(0).isReg()) { + unsigned DestReg = I->getOperand(0).getReg(); + if (DestReg == FPReg) { + // If there is frame pointer, after restoring $fp registers, we + // need adjust CFA to ($sp - FPOffset). + // Emit ".cfi_def_cfa $sp, -FPOffset" + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + nullptr, RI->getDwarfRegNum(SPReg, true), -FPOffset)); + BuildMI(MBB, std::next(I), DL, + TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + break; + } + } + } + } + + // Add CFI directives for callee-saved registers. + const std::vector &CSI = MFI.getCalleeSavedInfo(); + // Iterate over list of callee-saved registers and emit .cfi_restore + // directives. 
+ for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( + nullptr, RI->getDwarfRegNum(Reg, true))); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + // Deallocate stack adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); + + // After restoring $sp, we need to adjust CFA to $(sp + 0) + // Emit ".cfi_def_cfa_offset 0" + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git a/test/CodeGen/RISCV/frame-info.ll b/test/CodeGen/RISCV/frame-info.ll new file mode 100644 index 000000000000..9022fc395628 --- /dev/null +++ b/test/CodeGen/RISCV/frame-info.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 < %s | FileCheck -check-prefix=RV32 %s +; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefix=RV64 %s + +define void @foo(i32 signext %size) { +; RV32-LABEL: foo: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) +; RV32-NEXT: sw s0, 8(sp) +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: addi s0, sp, 16 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: addi a0, a0, 15 +; RV32-NEXT: andi a0, a0, -16 +; RV32-NEXT: sub a0, sp, a0 +; RV32-NEXT: mv sp, a0 +; RV32-NEXT: call bar +; RV32-NEXT: addi sp, s0, -16 +; RV32-NEXT: lw s0, 8(sp) +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: lw ra, 12(sp) +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: foo: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd ra, 8(sp) +; RV64-NEXT: sd s0, 0(sp) +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 16 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: addi a1, zero, 1 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: addi a1, a1, -16 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 15 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: sub a0, sp, a0 +; RV64-NEXT: mv sp, a0 +; RV64-NEXT: call bar +; RV64-NEXT: addi sp, s0, -16 +; RV64-NEXT: ld s0, 0(sp) +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: ld ra, 8(sp) +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +entry: + %0 = alloca i8, i32 %size, align 16 + call void @bar(i8* nonnull %0) #2 + ret void +} + +declare void @bar(i8*)
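
To replay the new CFI test by hand outside of lit, the RUN lines above
can be invoked directly (a sketch, assuming llc and FileCheck from this
build are on PATH and the working directory is the llvm source root):

  llc -mtriple=riscv32 < test/CodeGen/RISCV/frame-info.ll \
    | FileCheck -check-prefix=RV32 test/CodeGen/RISCV/frame-info.ll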