From c76086ed45e6c83a73cb2d95b1c4744b50695987 Mon Sep 17 00:00:00 2001
From: Joe Evans <joeev@amazon.com>
Date: Thu, 12 Nov 2020 19:47:48 -0800
Subject: [PATCH 1/5] Don't use the std:: namespace for pow(), since it is
 built into the CUDA math library, and cast the second argument so an
 acceptable overload is found.

---
 src/operator/contrib/multi_lamb.cu | 4 ++--
 src/operator/contrib/multi_lans.cu | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu
index 6415bfbda015..02e8ff524feb 100644
--- a/src/operator/contrib/multi_lamb.cu
+++ b/src/operator/contrib/multi_lamb.cu
@@ -51,9 +51,9 @@ __global__ void KernelStep1(const MultiLAMBKernelParam<DType, MPDType> kernel_pa
   MPDType biascorrection1, biascorrection2;
   if (bias_correction) {
     biascorrection1 = 1.0 -
-                      static_cast<MPDType>(std::pow(beta1, kernel_params.step_count[tensor_id]));
+                      static_cast<MPDType>(pow(beta1, (float)kernel_params.step_count[tensor_id]));
     biascorrection2 = 1.0 -
-                      static_cast<MPDType>(std::pow(beta2, kernel_params.step_count[tensor_id]));
+                      static_cast<MPDType>(pow(beta2, (float)kernel_params.step_count[tensor_id]));
   } else {
     biascorrection1 = static_cast<MPDType>(1.0);
     biascorrection2 = static_cast<MPDType>(1.0);
diff --git a/src/operator/contrib/multi_lans.cu b/src/operator/contrib/multi_lans.cu
index 64de72116514..1665206a0874 100644
--- a/src/operator/contrib/multi_lans.cu
+++ b/src/operator/contrib/multi_lans.cu
@@ -54,9 +54,9 @@ __global__ void KernelStep1(const MultiLANSKernelParam<DType, MPDType> kernel_pa
   MPDType biascorrection1, biascorrection2;
 
   biascorrection1 = 1.0 -
-                    static_cast<MPDType>(std::pow(beta1, kernel_params.step_count[tensor_id]));
+                    static_cast<MPDType>(pow(beta1, (float)kernel_params.step_count[tensor_id]));
   biascorrection2 = 1.0 -
-                    static_cast<MPDType>(std::pow(beta2, kernel_params.step_count[tensor_id]));
+                    static_cast<MPDType>(pow(beta2, (float)kernel_params.step_count[tensor_id]));
 
   MPDType r_weight[ILP_LAMB];
   MPDType r_grad[ILP_LAMB];

From 8d6b7520000a6a91d09bef8b003f858fcb71232d Mon Sep 17 00:00:00 2001
From: Joe Evans <joeev@amazon.com>
Date: Thu, 12 Nov 2020 22:39:12 -0800
Subject: [PATCH 2/5] Properly cast exponent.

---
 src/operator/contrib/multi_lamb.cu | 4 ++--
 src/operator/contrib/multi_lans.cu | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu
index 02e8ff524feb..fd1c0c5d50e0 100644
--- a/src/operator/contrib/multi_lamb.cu
+++ b/src/operator/contrib/multi_lamb.cu
@@ -51,9 +51,9 @@ __global__ void KernelStep1(const MultiLAMBKernelParam<DType, MPDType> kernel_pa
   MPDType biascorrection1, biascorrection2;
   if (bias_correction) {
     biascorrection1 = 1.0 -
-                      static_cast<MPDType>(pow(beta1, (float)kernel_params.step_count[tensor_id]));
+                      static_cast<MPDType>(pow(beta1, static_cast<float>(kernel_params.step_count[tensor_id])));
     biascorrection2 = 1.0 -
-                      static_cast<MPDType>(pow(beta2, (float)kernel_params.step_count[tensor_id]));
+                      static_cast<MPDType>(pow(beta2, static_cast<float>(kernel_params.step_count[tensor_id])));
   } else {
     biascorrection1 = static_cast<MPDType>(1.0);
     biascorrection2 = static_cast<MPDType>(1.0);
diff --git a/src/operator/contrib/multi_lans.cu b/src/operator/contrib/multi_lans.cu
index 1665206a0874..8950a48d8e4e 100644
--- a/src/operator/contrib/multi_lans.cu
+++ b/src/operator/contrib/multi_lans.cu
@@ -54,9 +54,9 @@ __global__ void KernelStep1(const MultiLANSKernelParam<DType, MPDType> kernel_pa
   MPDType biascorrection1, biascorrection2;
 
   biascorrection1 = 1.0 -
-                    static_cast<MPDType>(pow(beta1, (float)kernel_params.step_count[tensor_id]));
+                    static_cast<MPDType>(pow(beta1, static_cast<float>(kernel_params.step_count[tensor_id])));
   biascorrection2 = 1.0 -
-                    static_cast<MPDType>(pow(beta2, (float)kernel_params.step_count[tensor_id]));
+                    static_cast<MPDType>(pow(beta2, static_cast<float>(kernel_params.step_count[tensor_id])));
 
   MPDType r_weight[ILP_LAMB];
   MPDType r_grad[ILP_LAMB];

From c816e7ab2bce360efa259e81b9a764d911786a01 Mon Sep 17 00:00:00 2001
From: Joe Evans <joeev@amazon.com>
Date: Fri, 13 Nov 2020 13:37:23 -0800
Subject: [PATCH 3/5] Remove thrust library override and use default from cuda
 11.0.

---
 ci/build_windows.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/ci/build_windows.py b/ci/build_windows.py
index f6626d6629af..f18492298cc9 100755
--- a/ci/build_windows.py
+++ b/ci/build_windows.py
@@ -151,20 +151,6 @@ def windows_build(args):
     mxnet_root = get_mxnet_root()
     logging.info("Found MXNet root: {}".format(mxnet_root))
 
-    if 'GPU' in args.flavour:
-        # Get Thrust version to be shipped in Cuda 11, due to flakyness of
-        # older Thrust versions with MSVC 19 compiler
-        with remember_cwd():
-            tmpdirname = tempfile.mkdtemp()
-            os.chdir(tmpdirname)
-            r = requests.get('https://github.com/thrust/thrust/archive/1.9.8.zip', allow_redirects=True)
-            with open('thrust.zip', 'wb') as f:
-                f.write(r.content)
-            with zipfile.ZipFile('thrust.zip', 'r') as zip_ref:
-                zip_ref.extractall('.')
-            thrust_path = os.path.join(tmpdirname, "thrust-1.9.8")
-
-
     # cuda thrust / CUB + VS 2019 is flaky: try multiple times if fail
     MAXIMUM_TRY = 5
     build_try = 0
@@ -178,8 +164,7 @@ def windows_build(args):
             os.chdir(path)
             env = os.environ.copy()
             if 'GPU' in args.flavour:
-                env["CXXFLAGS"] = '/FS /MD /O2 /Ob2 /I {}'.format(thrust_path)
-                env["CUDAFLAGS"] = '-I {}'.format(thrust_path)
+                env["CXXFLAGS"] = '/FS /MD /O2 /Ob2'
             cmd = "\"{}\" && cmake -GNinja {} {}".format(args.vcvars,
                                                          CMAKE_FLAGS[args.flavour],
                                                          mxnet_root)

From faccbfdc92b94365dee21b5bd14f0e8edb53ef1d Mon Sep 17 00:00:00 2001
From: Joe Evans <joeev@amazon.com>
Date: Fri, 13 Nov 2020 13:52:07 -0800
Subject: [PATCH 4/5] Fix lint.

---
 src/operator/contrib/multi_lamb.cu | 8 ++++++--
 src/operator/contrib/multi_lans.cu | 6 ++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu
index fd1c0c5d50e0..6de28a851b37 100644
--- a/src/operator/contrib/multi_lamb.cu
+++ b/src/operator/contrib/multi_lamb.cu
@@ -51,9 +51,13 @@ __global__ void KernelStep1(const MultiLAMBKernelParam<DType, MPDType> kernel_pa
   MPDType biascorrection1, biascorrection2;
   if (bias_correction) {
     biascorrection1 = 1.0 -
-                      static_cast<MPDType>(pow(beta1, static_cast<float>(kernel_params.step_count[tensor_id])));
+                      static_cast<MPDType>(pow(beta1,
+                                           static_cast<float>(kernel_params.step_count[tensor_id])
+                                          ));
     biascorrection2 = 1.0 -
-                      static_cast<MPDType>(pow(beta2, static_cast<float>(kernel_params.step_count[tensor_id])));
+                      static_cast<MPDType>(pow(beta2,
+                                           static_cast<float>(kernel_params.step_count[tensor_id])
+                                          ));
   } else {
     biascorrection1 = static_cast<MPDType>(1.0);
     biascorrection2 = static_cast<MPDType>(1.0);
diff --git a/src/operator/contrib/multi_lans.cu b/src/operator/contrib/multi_lans.cu
index 8950a48d8e4e..239332b2ee3a 100644
--- a/src/operator/contrib/multi_lans.cu
+++ b/src/operator/contrib/multi_lans.cu
@@ -54,9 +54,11 @@ __global__ void KernelStep1(const MultiLANSKernelParam<DType, MPDType> kernel_pa
   MPDType biascorrection1, biascorrection2;
 
   biascorrection1 = 1.0 -
-                    static_cast<MPDType>(pow(beta1, static_cast<float>(kernel_params.step_count[tensor_id])));
+                    static_cast<MPDType>(pow(beta1,
+                                         static_cast<float>(kernel_params.step_count[tensor_id])));
   biascorrection2 = 1.0 -
-                    static_cast<MPDType>(pow(beta2, static_cast<float>(kernel_params.step_count[tensor_id])));
+                    static_cast<MPDType>(pow(beta2,
+                                         static_cast<float>(kernel_params.step_count[tensor_id])));
 
   MPDType r_weight[ILP_LAMB];
   MPDType r_grad[ILP_LAMB];

From 7a8ce083d09bd3a9413a2de778ebe3eba4f319c5 Mon Sep 17 00:00:00 2001
From: Joe Evans <joeev@amazon.com>
Date: Fri, 13 Nov 2020 14:20:21 -0800
Subject: [PATCH 5/5] Fix lint.

---
 src/operator/contrib/multi_lamb.cu | 12 ++++--------
 src/operator/contrib/multi_lans.cu | 10 ++++------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/operator/contrib/multi_lamb.cu b/src/operator/contrib/multi_lamb.cu
index 6de28a851b37..0a55b89bc501 100644
--- a/src/operator/contrib/multi_lamb.cu
+++ b/src/operator/contrib/multi_lamb.cu
@@ -50,14 +50,10 @@ __global__ void KernelStep1(const MultiLAMBKernelParam<DType, MPDType> kernel_pa
 
   MPDType biascorrection1, biascorrection2;
   if (bias_correction) {
-    biascorrection1 = 1.0 -
-                      static_cast<MPDType>(pow(beta1,
-                                           static_cast<float>(kernel_params.step_count[tensor_id])
-                                          ));
-    biascorrection2 = 1.0 -
-                      static_cast<MPDType>(pow(beta2,
-                                           static_cast<float>(kernel_params.step_count[tensor_id])
-                                          ));
+    biascorrection1 = 1.0 - static_cast<MPDType>(
+                      pow(beta1, static_cast<float>(kernel_params.step_count[tensor_id])));
+    biascorrection2 = 1.0 - static_cast<MPDType>(
+                      pow(beta2, static_cast<float>(kernel_params.step_count[tensor_id])));
   } else {
     biascorrection1 = static_cast<MPDType>(1.0);
     biascorrection2 = static_cast<MPDType>(1.0);
diff --git a/src/operator/contrib/multi_lans.cu b/src/operator/contrib/multi_lans.cu
index 239332b2ee3a..2a7acb6bcaa9 100644
--- a/src/operator/contrib/multi_lans.cu
+++ b/src/operator/contrib/multi_lans.cu
@@ -53,12 +53,10 @@ __global__ void KernelStep1(const MultiLANSKernelParam<DType, MPDType> kernel_pa
 
   MPDType biascorrection1, biascorrection2;
 
-  biascorrection1 = 1.0 -
-                    static_cast<MPDType>(pow(beta1,
-                                         static_cast<float>(kernel_params.step_count[tensor_id])));
-  biascorrection2 = 1.0 -
-                    static_cast<MPDType>(pow(beta2,
-                                         static_cast<float>(kernel_params.step_count[tensor_id])));
+  biascorrection1 = 1.0 - static_cast<MPDType>(
+                    pow(beta1, static_cast<float>(kernel_params.step_count[tensor_id])));
+  biascorrection2 = 1.0 - static_cast<MPDType>(
+                    pow(beta2, static_cast<float>(kernel_params.step_count[tensor_id])));
 
   MPDType r_weight[ILP_LAMB];
   MPDType r_grad[ILP_LAMB];