From 09a2fcefbc3df358b5e6aff34beff497a63fb5e0 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 24 Sep 2019 22:57:55 -0700 Subject: [PATCH 01/63] Separate Input and Output type for unaryOp --- cpp/src_prims/linalg/unary_op.h | 80 ++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/cpp/src_prims/linalg/unary_op.h b/cpp/src_prims/linalg/unary_op.h index 24860e6fae..bf06ed76ec 100644 --- a/cpp/src_prims/linalg/unary_op.h +++ b/cpp/src_prims/linalg/unary_op.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,42 +16,47 @@ #pragma once +#include #include "cuda_utils.h" #include "vectorized.h" namespace MLCommon { namespace LinAlg { -template -__global__ void unaryOpKernel(math_t *out, const math_t *in, IdxType len, +template +__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, Lambda op) { - typedef TxN_t VecType; - VecType a; + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= VecType::Ratio; + idx *= InVecType::Ratio; if (idx >= len) return; a.load(in, idx); #pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) { - a.val.data[i] = op(a.val.data[i]); + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); } - a.store(out, idx); + b.store(out, idx); } -template -void unaryOpImpl(math_t *out, const math_t *in, IdxType len, Lambda op, +template +void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = ceildiv(veclen_ ? len / veclen_ : len, (IdxType)TPB); - unaryOpKernel + const IdxType nblks = ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + unaryOpKernel <<>>(out, in, len, op); CUDA_CHECK(cudaPeekAtLastError()); } /** * @brief perform element-wise unary operation in the input array - * @tparam math_t data-type upon which the math operation will be performed + * @tparam InType input data-type * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads-per-block in the final kernel launched * @param out the output array @@ -59,36 +64,41 @@ void unaryOpImpl(math_t *out, const math_t *in, IdxType len, Lambda op, * @param len number of elements in the input array * @param op the device-lambda * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val);` */ -template -void unaryOp(math_t *out, const math_t *in, IdxType len, Lambda op, +template +void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, cudaStream_t stream) { if (len <= 0) return; //silently skip in case of 0 length input - size_t bytes = len * sizeof(math_t); + constexpr auto minSize = + sizeof(InType) < sizeof(OutType) ? 
sizeof(InType) : sizeof(OutType); + size_t bytes = len * minSize; uint64_t inAddr = uint64_t(in); uint64_t outAddr = uint64_t(out); - if (16 / sizeof(math_t) && bytes % 16 == 0 && inAddr % 16 == 0 && + if (16 / minSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (8 / sizeof(math_t) && bytes % 8 == 0 && inAddr % 8 == 0 && + unaryOpImpl( + out, in, len, op, stream); + } else if (8 / minSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (4 / sizeof(math_t) && bytes % 4 == 0 && inAddr % 4 == 0 && + unaryOpImpl( + out, in, len, op, stream); + } else if (4 / minSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (2 / sizeof(math_t) && bytes % 2 == 0 && inAddr % 2 == 0 && + unaryOpImpl( + out, in, len, op, stream); + } else if (2 / minSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (1 / sizeof(math_t)) { - unaryOpImpl(out, in, len, - op, stream); + unaryOpImpl( + out, in, len, op, stream); + } else if (1 / minSize) { + unaryOpImpl( + out, in, len, op, stream); } else { - unaryOpImpl(out, in, len, op, stream); + unaryOpImpl(out, in, len, op, + stream); } } From 8bfd35c29d7426921060b37a5d35e3252f1b1746 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 24 Sep 2019 23:01:05 -0700 Subject: [PATCH 02/63] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01f4dd36b0..eed5c246ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - PR #1076: Paying off some UMAP / Spectral tech debt. 
- PR #1086: Ensure RegressorMixin scorer uses device arrays - PR #1114: K-means: Exposing useful params, removing unused params, proxying params in Dask +- PR #1142: prims: expose separate InType and OutType for unaryOp and binaryOp ## Bug Fixes From 832227d283e9d8ebb90143feb1cbb602a6a8be7a Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Wed, 25 Sep 2019 01:28:54 -0700 Subject: [PATCH 03/63] unit-tests to verify different input and output types for unaryOp --- cpp/test/prims/unary_op.cu | 33 ++++++++++++++++++++++++--------- cpp/test/prims/unary_op.h | 23 ++++++++++++----------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/cpp/test/prims/unary_op.cu b/cpp/test/prims/unary_op.cu index 62d925fbc7..ebf7d955c9 100644 --- a/cpp/test/prims/unary_op.cu +++ b/cpp/test/prims/unary_op.cu @@ -26,18 +26,22 @@ namespace LinAlg { // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template -void unaryOpLaunch(T *out, const T *in, T scalar, IdxType len, +template +void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, [scalar] __device__(T in) { return in * scalar; }, stream); + auto op = [scalar] __device__(InType in) { + return static_cast(in * scalar); + }; + unaryOp(out, in, len, op, stream); } -template -class UnaryOpTest : public ::testing::TestWithParam> { +template +class UnaryOpTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam< + UnaryOpInputs>::GetParam(); Random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -61,8 +65,9 @@ class UnaryOpTest : public ::testing::TestWithParam> { } protected: - UnaryOpInputs params; - T *in, *out_ref, *out; + UnaryOpInputs params; + InType *in; + OutType *out_ref, *out; }; const std::vector> 
inputsf_i32 = { @@ -85,6 +90,16 @@ TEST_P(UnaryOpTestF_i64, Result) { INSTANTIATE_TEST_CASE_P(UnaryOpTests, UnaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i32_D; +TEST_P(UnaryOpTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(UnaryOpTests, UnaryOpTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); + const std::vector> inputsd_i32 = { {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; typedef UnaryOpTest UnaryOpTestD_i32; diff --git a/cpp/test/prims/unary_op.h b/cpp/test/prims/unary_op.h index bc08a56d59..24e5e2ec1c 100644 --- a/cpp/test/prims/unary_op.h +++ b/cpp/test/prims/unary_op.h @@ -22,35 +22,36 @@ namespace MLCommon { namespace LinAlg { -template -__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, +template +__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, IdxType len) { IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { - out[idx] = scalar * in[idx]; + out[idx] = static_cast(scalar * in[idx]); } } -template -void naiveScale(Type *out, const Type *in, Type scalar, int len, +template +void naiveScale(OutType *out, const InType *in, InType scalar, int len, cudaStream_t stream) { static const int TPB = 64; int nblks = ceildiv(len, TPB); - naiveScaleKernel<<>>(out, in, scalar, len); + naiveScaleKernel + <<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } -template +template struct UnaryOpInputs { - T tolerance; + InType tolerance; IdxType len; - T scalar; + InType scalar; unsigned long long int seed; }; -template +template ::std::ostream &operator<<(::std::ostream &os, - const UnaryOpInputs &dims) { + const UnaryOpInputs &d) { return os; } From 6e97fb626e8b5eddf0b9e845c5b58ffb56b37be6 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Wed, 25 Sep 2019 01:39:20 
-0700 Subject: [PATCH 04/63] fixed the max size comparison bug in unaryOp --- cpp/src_prims/linalg/unary_op.h | 27 +++++++++++++-------------- cpp/test/prims/unary_op.cu | 2 +- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/cpp/src_prims/linalg/unary_op.h b/cpp/src_prims/linalg/unary_op.h index bf06ed76ec..35b1fdd9cd 100644 --- a/cpp/src_prims/linalg/unary_op.h +++ b/cpp/src_prims/linalg/unary_op.h @@ -16,7 +16,6 @@ #pragma once -#include #include "cuda_utils.h" #include "vectorized.h" @@ -72,29 +71,29 @@ template = sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; uint64_t inAddr = uint64_t(in); uint64_t outAddr = uint64_t(out); - if (16 / minSize && bytes % 16 == 0 && inAddr % 16 == 0 && + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (8 / minSize && bytes % 8 == 0 && inAddr % 8 == 0 && + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (4 / minSize && bytes % 4 == 0 && inAddr % 4 == 0 && + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (2 / minSize && bytes % 2 == 0 && inAddr % 2 == 0 && + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (1 / minSize) { - unaryOpImpl( + } else if (1 / maxSize) { + unaryOpImpl( out, in, len, op, stream); } else { unaryOpImpl(out, in, len, op, diff --git a/cpp/test/prims/unary_op.cu b/cpp/test/prims/unary_op.cu index ebf7d955c9..e91399c75a 100644 --- a/cpp/test/prims/unary_op.cu +++ b/cpp/test/prims/unary_op.cu @@ -52,7 +52,7 @@ class UnaryOpTest allocate(in, len); allocate(out_ref, len); allocate(out, len); - r.uniform(in, len, T(-1.0), T(1.0), stream); + r.uniform(in, len, 
InType(-1.0), InType(1.0), stream); naiveScale(out_ref, in, scalar, len, stream); unaryOpLaunch(out, in, scalar, len, stream); CUDA_CHECK(cudaStreamDestroy(stream)); From dda78c1a232b295ebd4d2dbfb52ee52facfcdb38 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 26 Sep 2019 22:05:26 -0700 Subject: [PATCH 05/63] Enable binaryOp to also support different input and output types --- cpp/src_prims/linalg/binary_op.h | 68 +++++++++++++++++--------------- cpp/test/prims/binary_op.cu | 28 +++++++++---- cpp/test/prims/binary_op.h | 22 +++++------ cpp/test/prims/unary_op.cu | 2 - 4 files changed, 68 insertions(+), 52 deletions(-) diff --git a/cpp/src_prims/linalg/binary_op.h b/cpp/src_prims/linalg/binary_op.h index 9f799e076c..3a60519d1c 100644 --- a/cpp/src_prims/linalg/binary_op.h +++ b/cpp/src_prims/linalg/binary_op.h @@ -22,37 +22,41 @@ namespace MLCommon { namespace LinAlg { -template -__global__ void binaryOpKernel(math_t *out, const math_t *in1, - const math_t *in2, IdxType len, Lambda op) { - typedef TxN_t VecType; - VecType a, b; +template +__global__ void binaryOpKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len, Lambda op) { + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= VecType::Ratio; + idx *= InVecType::Ratio; if (idx >= len) return; a.load(in1, idx); b.load(in2, idx); #pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) { - a.val.data[i] = op(a.val.data[i], b.val.data[i]); + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); } - a.store(out, idx); + c.store(out, idx); } -template -void binaryOpImpl(math_t *out, const math_t *in1, const math_t *in2, +template +void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); - binaryOpKernel + const IdxType nblks = ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + binaryOpKernel <<>>(out, in1, in2, len, op); CUDA_CHECK(cudaPeekAtLastError()); } /** * @brief perform element-wise binary operation on the input arrays - * @tparam math_t data-type upon which the math operation will be performed + * @tparam InType input data-type * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads-per-block in the final kernel launched * @param out the output array @@ -62,29 +66,31 @@ void binaryOpImpl(math_t *out, const math_t *in1, const math_t *in2, * @param op the device-lambda * @param stream cuda stream where to launch work */ -template -void binaryOp(math_t *out, const math_t *in1, const math_t *in2, IdxType len, +template +void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, Lambda op, cudaStream_t stream) { - size_t bytes = len * sizeof(math_t); - if (16 / sizeof(math_t) && bytes % 16 == 0) { - binaryOpImpl( + constexpr auto maxSize = + sizeof(InType) > sizeof(OutType) ? 
sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + if (16 / maxSize && bytes % 16 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (8 / sizeof(math_t) && bytes % 8 == 0) { - binaryOpImpl( + } else if (8 / maxSize && bytes % 8 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (4 / sizeof(math_t) && bytes % 4 == 0) { - binaryOpImpl( + } else if (4 / maxSize && bytes % 4 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (2 / sizeof(math_t) && bytes % 2 == 0) { - binaryOpImpl( + } else if (2 / maxSize && bytes % 2 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (1 / sizeof(math_t)) { - binaryOpImpl( + } else if (1 / maxSize) { + binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, op, - stream); + binaryOpImpl(out, in1, in2, len, + op, stream); } } diff --git a/cpp/test/prims/binary_op.cu b/cpp/test/prims/binary_op.cu index 917df565d6..d92bf2fe6e 100644 --- a/cpp/test/prims/binary_op.cu +++ b/cpp/test/prims/binary_op.cu @@ -26,19 +26,20 @@ namespace LinAlg { // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template -void binaryOpLaunch(T *out, const T *in1, const T *in2, IdxType len, - cudaStream_t stream) { +template +void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { binaryOp( out, in1, in2, len, [] __device__(T a, T b) { return a + b; }, stream); } -template +template class BinaryOpTest - : public ::testing::TestWithParam> { + : public ::testing::TestWithParam> { protected: void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam< + BinaryOpInputs>::GetParam(); Random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -62,8 +63,9 @@ class BinaryOpTest } protected: - BinaryOpInputs params; - T 
*in1, *in2, *out_ref, *out; + BinaryOpInputs params; + InType *in1, *in2; + OutType *out_ref, *out; }; const std::vector> inputsf_i32 = { @@ -86,6 +88,16 @@ TEST_P(BinaryOpTestF_i64, Result) { INSTANTIATE_TEST_CASE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i32_D; +TEST_P(BinaryOpTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(BinaryOpTests, BinaryOpTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); + const std::vector> inputsd_i32 = { {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i32; diff --git a/cpp/test/prims/binary_op.h b/cpp/test/prims/binary_op.h index 30d0b3f42b..b1d25d81e2 100644 --- a/cpp/test/prims/binary_op.h +++ b/cpp/test/prims/binary_op.h @@ -22,33 +22,33 @@ namespace MLCommon { namespace LinAlg { -template -__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, - IdxType len) { +template +__global__ void naiveAddKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len) { IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { - out[idx] = in1[idx] + in2[idx]; + out[idx] = static_cast(in1[idx] + in2[idx]); } } -template -void naiveAdd(Type *out, const Type *in1, const Type *in2, IdxType len) { +template +void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { static const IdxType TPB = 64; IdxType nblks = ceildiv(len, TPB); - naiveAddKernel<<>>(out, in1, in2, len); + naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } -template +template struct BinaryOpInputs { - T tolerance; + InType tolerance; IdxType len; unsigned long long int seed; }; -template +template ::std::ostream &operator<<(::std::ostream &os, - const BinaryOpInputs &dims) { + const BinaryOpInputs &d) { return 
os; } diff --git a/cpp/test/prims/unary_op.cu b/cpp/test/prims/unary_op.cu index e91399c75a..ca9d0d25c9 100644 --- a/cpp/test/prims/unary_op.cu +++ b/cpp/test/prims/unary_op.cu @@ -45,10 +45,8 @@ class UnaryOpTest Random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - auto len = params.len; auto scalar = params.scalar; - allocate(in, len); allocate(out_ref, len); allocate(out, len); From 14f466726432ec00485a497e0c6e1bb3de672fdc Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 26 Sep 2019 22:07:56 -0700 Subject: [PATCH 06/63] fixed minor typos in the previous commit --- cpp/test/prims/binary_op.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/test/prims/binary_op.cu b/cpp/test/prims/binary_op.cu index d92bf2fe6e..c0bac8b438 100644 --- a/cpp/test/prims/binary_op.cu +++ b/cpp/test/prims/binary_op.cu @@ -30,7 +30,8 @@ template void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(T a, T b) { return a + b; }, stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, + stream); } template @@ -48,8 +49,8 @@ class BinaryOpTest allocate(in2, len); allocate(out_ref, len); allocate(out, len); - r.uniform(in1, len, T(-1.0), T(1.0), stream); - r.uniform(in2, len, T(-1.0), T(1.0), stream); + r.uniform(in1, len, InType(-1.0), InType(1.0), stream); + r.uniform(in2, len, InType(-1.0), InType(1.0), stream); naiveAdd(out_ref, in1, in2, len); binaryOpLaunch(out, in1, in2, len, stream); CUDA_CHECK(cudaStreamDestroy(stream)); @@ -89,7 +90,7 @@ INSTANTIATE_TEST_CASE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; + {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32_D; TEST_P(BinaryOpTestF_i32_D, Result) { ASSERT_TRUE(devArrMatch(out_ref, out, params.len, From 
927b119a4a709a9c4a71cf8f7c683a8634877093 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 26 Sep 2019 22:10:07 -0700 Subject: [PATCH 07/63] updated doxygen for binaryOp to document the lambda signature --- cpp/src_prims/linalg/binary_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src_prims/linalg/binary_op.h b/cpp/src_prims/linalg/binary_op.h index 3a60519d1c..a7ad986b7a 100644 --- a/cpp/src_prims/linalg/binary_op.h +++ b/cpp/src_prims/linalg/binary_op.h @@ -65,6 +65,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, * @param len number of elements in the input array * @param op the device-lambda * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` */ template From a2ad67f49e51b50a9e9d89b5bb21fa01ac6b85d5 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Fri, 27 Sep 2019 13:36:21 -0400 Subject: [PATCH 08/63] Initial commit for seed bug in dask RF. 
--- cpp/src/randomforest/randomforest.cu | 15 ++++++---- cpp/src/randomforest/randomforest.hpp | 13 ++++++--- cpp/src/randomforest/randomforest_impl.cuh | 29 ++++++++++--------- cpp/src/randomforest/randomforest_impl.h | 2 +- cpp/test/sg/rf_test.cu | 4 +-- cpp/test/sg/rf_treelite_test.cu | 2 +- .../cuml/ensemble/randomforestclassifier.pyx | 7 ++++- .../cuml/ensemble/randomforestregressor.pyx | 6 +++- 8 files changed, 48 insertions(+), 30 deletions(-) diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 59c8ed9c5c..fa82017d4a 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -150,10 +150,11 @@ void postprocess_labels(int n_rows, std::vector& labels, * @param[in] cfg_n_streams: No of parallel CUDA for training forest */ void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, - float cfg_rows_sample, int cfg_n_streams) { + float cfg_rows_sample, int cfg_seed, int cfg_n_streams) { params.n_trees = cfg_n_trees; params.bootstrap = cfg_bootstrap; params.rows_sample = cfg_rows_sample; + params.seed = cfg_seed; params.n_streams = min(cfg_n_streams, omp_get_max_threads()); if (params.n_streams == cfg_n_streams) { std::cout << "Warning! 
Max setting Max streams to max openmp threads " @@ -173,11 +174,12 @@ void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, * @param[in] cfg_tree_params: tree parameters */ void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, - float cfg_rows_sample, int cfg_n_streams, + float cfg_rows_sample, int cfg_seed, int cfg_n_streams, DecisionTree::DecisionTreeParams cfg_tree_params) { params.n_trees = cfg_n_trees; params.bootstrap = cfg_bootstrap; params.rows_sample = cfg_rows_sample; + params.seed = cfg_seed; params.n_streams = min(cfg_n_streams, omp_get_max_threads()); if (cfg_n_trees < params.n_streams) params.n_streams = cfg_n_trees; set_tree_params(params.tree_params); // use input tree params @@ -462,15 +464,16 @@ RF_metrics score(const cumlHandle& user_handle, RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features, int n_bins, int split_algo, int min_rows_per_node, bool bootstrap_features, bool bootstrap, int n_trees, - float rows_sample, CRITERION split_criterion, - bool quantile_per_tree, int cfg_n_streams) { + float rows_sample, int seed, + CRITERION split_criterion, bool quantile_per_tree, + int cfg_n_streams) { DecisionTree::DecisionTreeParams tree_params; DecisionTree::set_tree_params( tree_params, max_depth, max_leaves, max_features, n_bins, split_algo, min_rows_per_node, bootstrap_features, split_criterion, quantile_per_tree); RF_params rf_params; - set_all_rf_params(rf_params, n_trees, bootstrap, rows_sample, cfg_n_streams, - tree_params); + set_all_rf_params(rf_params, n_trees, bootstrap, rows_sample, seed, + cfg_n_streams, tree_params); return rf_params; } diff --git a/cpp/src/randomforest/randomforest.hpp b/cpp/src/randomforest/randomforest.hpp index 75f78e6340..277aee5172 100644 --- a/cpp/src/randomforest/randomforest.hpp +++ b/cpp/src/randomforest/randomforest.hpp @@ -65,6 +65,10 @@ struct RF_params { /** * Decision tree training hyper parameter struct. 
*/ + /** + * random seed + */ + int seed; /** * Number of concurrent GPU streams for parallel tree building. * Each stream is independently managed by CPU thread. @@ -76,9 +80,9 @@ struct RF_params { void set_rf_params(RF_params& params, int cfg_n_trees = 1, bool cfg_bootstrap = true, float cfg_rows_sample = 1.0f, - int cfg_n_streams = 8); + int cfg_seed = -1, int cfg_n_streams = 8); void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, - float cfg_rows_sample, int cfg_n_streams, + float cfg_rows_sample, int cfg_seed, int cfg_n_streams, DecisionTree::DecisionTreeParams cfg_tree_params); void validity_check(const RF_params rf_params); void print(const RF_params rf_params); @@ -154,8 +158,9 @@ RF_metrics score(const cumlHandle& user_handle, RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features, int n_bins, int split_algo, int min_rows_per_node, bool bootstrap_features, bool bootstrap, int n_trees, - float rows_sample, CRITERION split_criterion, - bool quantile_per_tree, int cfg_n_streams); + float rows_sample, int seed, + CRITERION split_criterion, bool quantile_per_tree, + int cfg_n_streams); // ----------------------------- Regression ----------------------------------- // diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh index a1de148a10..60096c2e12 100644 --- a/cpp/src/randomforest/randomforest_impl.cuh +++ b/cpp/src/randomforest/randomforest_impl.cuh @@ -70,9 +70,12 @@ void random_uniformInt(int treeid, unsigned int* data, int len, int n_rows, template void rf::prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - const int num_sms, const cudaStream_t stream, + int seed, const int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator) { - srand(tree_id * 1000); + int rs = tree_id * 1000; + if (seed != -1) rs = seed * 1000; + + srand(rs * 1000); if (rf_params.bootstrap) { random_uniformInt(tree_id, 
selected_rows, n_sampled_rows, n_rows, num_sms, stream); @@ -221,10 +224,10 @@ void rfClassifier::fit(const cumlHandle& user_handle, const T* input, unsigned int* rowids; rowids = selected_rows[stream_id]->data(); - this->prepare_fit_per_tree(i, n_rows, n_sampled_rows, rowids, - tempmem[stream_id]->num_sms, - tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + this->prepare_fit_per_tree( + i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), + tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, + handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. @@ -236,8 +239,7 @@ void rfClassifier::fit(const cumlHandle& user_handle, const T* input, */ DecisionTree::TreeMetaDataNode* tree_ptr = &(forest->trees[i]); tree_ptr->treeid = i; - trees[i].fit(handle.getDeviceAllocator(), - handle.getHostAllocator(), + trees[i].fit(handle.getDeviceAllocator(), handle.getHostAllocator(), tempmem[stream_id]->stream, input, n_cols, n_rows, labels, rowids, n_sampled_rows, n_unique_labels, tree_ptr, this->rf_params.tree_params, tempmem[stream_id]); @@ -485,10 +487,10 @@ void rfRegressor::fit(const cumlHandle& user_handle, const T* input, for (int i = 0; i < this->rf_params.n_trees; i++) { int stream_id = omp_get_thread_num(); unsigned int* rowids = selected_rows[stream_id]->data(); - this->prepare_fit_per_tree(i, n_rows, n_sampled_rows, rowids, - tempmem[stream_id]->num_sms, - tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + this->prepare_fit_per_tree( + i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), + tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, + handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. 
@@ -499,8 +501,7 @@ void rfRegressor::fit(const cumlHandle& user_handle, const T* input, */ DecisionTree::TreeMetaDataNode* tree_ptr = &(forest->trees[i]); tree_ptr->treeid = i; - trees[i].fit(handle.getDeviceAllocator(), - handle.getHostAllocator(), + trees[i].fit(handle.getDeviceAllocator(), handle.getHostAllocator(), tempmem[stream_id]->stream, input, n_cols, n_rows, labels, rowids, n_sampled_rows, tree_ptr, this->rf_params.tree_params, tempmem[stream_id]); diff --git a/cpp/src/randomforest/randomforest_impl.h b/cpp/src/randomforest/randomforest_impl.h index 3369c82ae1..cafa675838 100644 --- a/cpp/src/randomforest/randomforest_impl.h +++ b/cpp/src/randomforest/randomforest_impl.h @@ -30,7 +30,7 @@ class rf { virtual ~rf() = default; void prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - int num_sms, const cudaStream_t stream, + int seed, int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator); void error_checking(const T* input, L* predictions, int n_rows, int n_cols, diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 512c53c2af..115cbd8e97 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -61,7 +61,7 @@ class RfClassifierTest : public ::testing::TestWithParam> { params.split_criterion, false); RF_params rf_params; set_all_rf_params(rf_params, params.n_trees, params.bootstrap, - params.rows_sample, params.n_streams, tree_params); + params.rows_sample, -1, params.n_streams, tree_params); //print(rf_params); //-------------------------------------------------------- @@ -161,7 +161,7 @@ class RfRegressorTest : public ::testing::TestWithParam> { params.split_criterion, false); RF_params rf_params; set_all_rf_params(rf_params, params.n_trees, params.bootstrap, - params.rows_sample, params.n_streams, tree_params); + params.rows_sample, -1, params.n_streams, tree_params); //print(rf_params); //-------------------------------------------------------- diff --git 
a/cpp/test/sg/rf_treelite_test.cu b/cpp/test/sg/rf_treelite_test.cu index d772b0c83e..279db9588e 100644 --- a/cpp/test/sg/rf_treelite_test.cu +++ b/cpp/test/sg/rf_treelite_test.cu @@ -181,7 +181,7 @@ class RfTreeliteTestCommon : public ::testing::TestWithParam> { params.min_rows_per_node, params.bootstrap_features, params.split_criterion, false); set_all_rf_params(rf_params, params.n_trees, params.bootstrap, - params.rows_sample, params.n_streams, tree_params); + params.rows_sample, -1, params.n_streams, tree_params); // print(rf_params); handle.reset(new cumlHandle(rf_params.n_streams)); diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 04f848a92d..8dc308bd9d 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -81,6 +81,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": int n_trees bool bootstrap float rows_sample + int seed pass cdef cppclass RandomForestMetaData[T, L]: @@ -181,6 +182,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": bool, int, float, + int, CRITERION, bool, int) except + @@ -302,7 +304,8 @@ class RandomForestClassifier(Base): min_samples_leaf=None, min_weight_fraction_leaf=None, max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, n_jobs=None, - random_state=None, warm_start=None, class_weight=None): + random_state=None, warm_start=None, class_weight=None, + int seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, @@ -353,6 +356,7 @@ class RandomForestClassifier(Base): self.quantile_per_tree = quantile_per_tree self.n_cols = None self.n_streams = n_streams + self.seed = seed cdef RandomForestMetaData[float, int] *rf_forest = \ new RandomForestMetaData[float, int]() @@ -497,6 +501,7 @@ class RandomForestClassifier(Base): self.bootstrap, self.n_estimators, self.rows_sample, + self.seed, 
self.split_criterion, self.quantile_per_tree, self.n_streams) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index e6727a47a8..f86ff332ad 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -80,6 +80,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": int n_trees bool bootstrap float rows_sample + int seed pass cdef cppclass RandomForestMetaData[T, L]: @@ -162,6 +163,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": bool, int, float, + int, CRITERION, bool, int) except + @@ -286,7 +288,7 @@ class RandomForestRegressor(Base): max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None): + quantile_per_tree=False, criterion=None, int seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, @@ -337,6 +339,7 @@ class RandomForestRegressor(Base): self.accuracy_metric = accuracy_metric self.quantile_per_tree = quantile_per_tree self.n_streams = n_streams + self.seed = seed cdef RandomForestMetaData[float, float] *rf_forest = \ new RandomForestMetaData[float, float]() @@ -461,6 +464,7 @@ class RandomForestRegressor(Base): self.bootstrap, self.n_estimators, self.rows_sample, + self.seed, self.split_criterion, self.quantile_per_tree, self.n_streams) From ebf4233249755361142572129e35d66c186f761c Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Fri, 27 Sep 2019 13:39:24 -0400 Subject: [PATCH 09/63] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8bc4a6279..85a841dd55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ - PR #1106: Pinning Distributed version to match Dask for consistent CI results - PR #1116: TSNE CUDA 10.1 
Bug Fixes - PR #1132: DBSCAN Batching Bug Fix +- PR #1162: DASK RF random seed bug fix # cuML 0.9.0 (21 Aug 2019) From 72e631fc7128e804d97510a8d0fde3ed279f242f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 14:16:20 -0400 Subject: [PATCH 10/63] Exposing both single and double index precision, now that we have full control over the types used in dbscan Found what might be a nasty bug in the vertex degree computation. --- cpp/examples/dbscan/dbscan_example.cpp | 6 ++--- cpp/src/datasets/make_blobs.cu | 23 +++++++++++++++- cpp/src/datasets/make_blobs.hpp | 13 +++++++++- cpp/src/dbscan/dbscan.cu | 28 +++++++++++++++----- cpp/src/dbscan/dbscan.hpp | 8 ++++++ cpp/src/dbscan/dbscan_api.cpp | 8 +++--- cpp/src/dbscan/dbscan_api.h | 8 +++--- cpp/src/dbscan/runner.h | 8 +++--- cpp/src/dbscan/vertexdeg/algo.h | 16 ++++++------ cpp/src/metrics/metrics.cu | 9 +++++++ cpp/src/metrics/metrics.hpp | 5 ++++ cpp/test/sg/dbscan_test.cu | 36 +++++++++++++------------- python/cuml/cluster/dbscan.pyx | 26 +++++++++---------- 13 files changed, 133 insertions(+), 61 deletions(-) diff --git a/cpp/examples/dbscan/dbscan_example.cpp b/cpp/examples/dbscan/dbscan_example.cpp index 8e5b33069e..21f2ab0aae 100644 --- a/cpp/examples/dbscan/dbscan_example.cpp +++ b/cpp/examples/dbscan/dbscan_example.cpp @@ -241,10 +241,10 @@ int main(int argc, char* argv[]) { cumlHandle.setStream(stream); std::vector h_labels(nRows); - long* d_labels = nullptr; + int* d_labels = nullptr; float* d_inputData = nullptr; - CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(long))); + CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(int))); CUDA_RT_CALL(cudaMalloc(&d_inputData, nRows * nCols * sizeof(float))); CUDA_RT_CALL(cudaMemcpyAsync(d_inputData, h_inputData.data(), nRows * nCols * sizeof(float), @@ -259,7 +259,7 @@ int main(int argc, char* argv[]) { ML::dbscanFit(cumlHandle, d_inputData, nRows, nCols, eps, minPts, d_labels, max_bytes_per_batch, false); - 
CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(long), + CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_RT_CALL(cudaStreamSynchronize(stream)); diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu index 703594b982..3b476cd5ca 100644 --- a/cpp/src/datasets/make_blobs.cu +++ b/cpp/src/datasets/make_blobs.cu @@ -45,5 +45,26 @@ void make_blobs(const cumlHandle& handle, double* out, long* labels, shuffle, center_box_min, center_box_max, seed); } +void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows, + int n_cols, int n_clusters, const float* centers, + const float* cluster_std, const float cluster_std_scalar, + bool shuffle, float center_box_min, float center_box_max, + uint64_t seed) { + MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters, + handle.getDeviceAllocator(), handle.getStream(), + centers, cluster_std, cluster_std_scalar, + shuffle, center_box_min, center_box_max, seed); +} + +void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows, + int n_cols, int n_clusters, const double* centers, + const double* cluster_std, const double cluster_std_scalar, + bool shuffle, double center_box_min, double center_box_max, + uint64_t seed) { + MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters, + handle.getDeviceAllocator(), handle.getStream(), + centers, cluster_std, cluster_std_scalar, + shuffle, center_box_min, center_box_max, seed); +} } // namespace Datasets -} // end namespace ML +} // namespace ML diff --git a/cpp/src/datasets/make_blobs.hpp b/cpp/src/datasets/make_blobs.hpp index ce08d92062..645f075e38 100644 --- a/cpp/src/datasets/make_blobs.hpp +++ b/cpp/src/datasets/make_blobs.hpp @@ -63,7 +63,18 @@ void make_blobs(const cumlHandle& handle, double* out, long* labels, const double cluster_std_scalar = 1.f, bool shuffle = true, double center_box_min = 10.f, double 
center_box_max = 10.f, uint64_t seed = 0ULL); -/** @} */ + +void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows, + int n_cols, int n_clusters, const float* centers, + const float* cluster_std, const float cluster_std_scalar, + bool shuffle, float center_box_min, float center_box_max, + uint64_t seed); + +void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows, + int n_cols, int n_clusters, const double* centers, + const double* cluster_std, const double cluster_std_scalar, + bool shuffle, double center_box_min, double center_box_max, + uint64_t seed); } // namespace Datasets } // namespace ML diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu index 74419967d3..f94d622584 100644 --- a/cpp/src/dbscan/dbscan.cu +++ b/cpp/src/dbscan/dbscan.cu @@ -27,20 +27,36 @@ using namespace Dbscan; // @todo // In the below 2 calls, the Index type has been hard-coded to `int64_t` // We should pick the right Index type based on the input dimensions. 
+void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, size_t max_bytes_per_batch, + bool verbose) { + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); +} + +void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, size_t max_bytes_per_batch, + bool verbose) { + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); +} + void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, float eps, int min_pts, long *labels, size_t max_bytes_per_batch, bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } void dbscanFit(const cumlHandle &handle, double *input, long n_rows, long n_cols, double eps, int min_pts, long *labels, size_t max_bytes_per_batch, bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } }; // end namespace ML diff --git a/cpp/src/dbscan/dbscan.hpp b/cpp/src/dbscan/dbscan.hpp index abe03ab280..8f1e5ae4c8 100644 --- a/cpp/src/dbscan/dbscan.hpp +++ b/cpp/src/dbscan/dbscan.hpp @@ -35,12 +35,20 @@ namespace ML { * @param[in] verbose: print useful information as algorithm executes * @{ */ +void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, + size_t max_bytes_per_batch = 0, bool verbose = false); +void dbscanFit(const cumlHandle &handle, 
double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, + size_t max_bytes_per_batch = 0, bool verbose = false); + void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, float eps, int min_pts, long *labels, size_t max_bytes_per_batch = 0, bool verbose = false); void dbscanFit(const cumlHandle &handle, double *input, long n_rows, long n_cols, double eps, int min_pts, long *labels, size_t max_bytes_per_batch = 0, bool verbose = false); + /** @} */ } // namespace ML diff --git a/cpp/src/dbscan/dbscan_api.cpp b/cpp/src/dbscan/dbscan_api.cpp index 0c0272ab3d..a70a1a9bb9 100644 --- a/cpp/src/dbscan/dbscan_api.cpp +++ b/cpp/src/dbscan/dbscan_api.cpp @@ -18,8 +18,8 @@ #include "common/cumlHandle.hpp" #include "dbscan.hpp" -cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows, - long n_cols, float eps, int min_pts, long *labels, +cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows, + int n_cols, float eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose) { cumlError_t status; ML::cumlHandle *handle_ptr; @@ -42,8 +42,8 @@ cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows, return status; } -cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows, + int n_cols, double eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose) { cumlError_t status; ML::cumlHandle *handle_ptr; diff --git a/cpp/src/dbscan/dbscan_api.h b/cpp/src/dbscan/dbscan_api.h index 5563d01330..67112b131e 100644 --- a/cpp/src/dbscan/dbscan_api.h +++ b/cpp/src/dbscan/dbscan_api.h @@ -39,11 +39,11 @@ extern "C" { * @return CUML_SUCCESS on success and other corresponding flags upon any failures. 
* @{ */ -cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows, - long n_cols, float eps, int min_pts, long *labels, +cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows, + int n_cols, float eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose); -cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows, + int n_cols, double eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose); /** @} */ diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index df8faabb12..c8160677ac 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -88,6 +88,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, adjSize + corePtsSize + 2 * xaSize + mSize + vdSize + exScanSize; return size; } + // partition the temporary workspace needed for different stages of dbscan Index_ adjlen = 0; Index_ curradjlen = 0; @@ -120,7 +121,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, if (nPoints <= 0) continue; if (verbose) - std::cout << "- Iteration " << i + 1 << " / " << nBatches + std::cout << "- Iteration " << i + 1 << " " << nBatches << ". Batch size is " << nPoints << " samples." 
<< std::endl; if (verbose) std::cout << "Computing vertex degrees" << std::endl; @@ -130,6 +131,9 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, CUDA_CHECK(cudaStreamSynchronize(stream)); ML::POP_RANGE(); + if (verbose) + std::cout << "Computing adjacency graph of size " << curradjlen + << std::endl; // Running AdjGraph ML::PUSH_RANGE("Trace::Dbscan::AdjGraph"); if (curradjlen > adjlen || adj_graph.data() == NULL) { @@ -137,8 +141,6 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, adj_graph.resize(adjlen, stream); } - if (verbose) std::cout << "Computing adjacency graph" << std::endl; - AdjGraph::run(handle, adj, vd, adj_graph.data(), adjlen, ex_scan, N, minPts, core_pts, algoAdj, nPoints, stream); diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 6bc5398bd0..3f8fdaca51 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -41,7 +41,7 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, "index_t should be 4 or 8 bytes"); index_t m = data.N; - index_t n = min(data.N - startVertexId, batchSize); + index_t n = batchSize; index_t k = data.D; index_t *vd = data.vd; @@ -60,17 +60,17 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, if (workspaceSize != 0) workspace.resize(workspaceSize, stream); - auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { + auto fused_op = [vd, n, batchSize] __device__(index_t global_c_idx, + bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = global_c_idx - (n * (global_c_idx / n)); + index_t batch_vertex = fmod(global_c_idx, n); if (sizeof(index_t) == 4) { - atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); - atomicAdd((int *)(vd + n), (int)in_neigh); + atomicAdd((unsigned int *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned int *)(vd + n), in_neigh); } else if (sizeof(index_t) == 8) { - atomicAdd((unsigned long long *)(vd + 
batch_vertex), - (unsigned long long)in_neigh); - atomicAdd((unsigned long long *)(vd + n), (unsigned long long)in_neigh); + atomicAdd((unsigned long long *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned long long *)(vd + n), in_neigh); } }; diff --git a/cpp/src/metrics/metrics.cu b/cpp/src/metrics/metrics.cu index 0a6b51ffbc..79007262cb 100644 --- a/cpp/src/metrics/metrics.cu +++ b/cpp/src/metrics/metrics.cu @@ -59,6 +59,15 @@ double adjustedRandIndex(const cumlHandle &handle, const long *y, handle.getDeviceAllocator(), handle.getStream()); } +double adjustedRandIndex(const cumlHandle &handle, const int *y, + const int *y_hat, const int n, + const int lower_class_range, + const int upper_class_range) { + return MLCommon::Metrics::computeAdjustedRandIndex( + y, y_hat, n, lower_class_range, upper_class_range, + handle.getDeviceAllocator(), handle.getStream()); +} + double klDivergence(const cumlHandle &handle, const double *y, const double *y_hat, int n) { return MLCommon::Metrics::klDivergence( diff --git a/cpp/src/metrics/metrics.hpp b/cpp/src/metrics/metrics.hpp index aa4736b3ac..881d5a9aab 100644 --- a/cpp/src/metrics/metrics.hpp +++ b/cpp/src/metrics/metrics.hpp @@ -111,6 +111,11 @@ double adjustedRandIndex(const cumlHandle &handle, const long *y, const long lower_class_range, const long upper_class_range); +double adjustedRandIndex(const cumlHandle &handle, const int *y, + const int *y_hat, const int n, + const int lower_class_range, + const int upper_class_range); + /** * Calculates the "Kullback-Leibler Divergence" * diff --git a/cpp/test/sg/dbscan_test.cu b/cpp/test/sg/dbscan_test.cu index 075f02fbe7..9518b466d2 100644 --- a/cpp/test/sg/dbscan_test.cu +++ b/cpp/test/sg/dbscan_test.cu @@ -42,9 +42,9 @@ using namespace std; template struct DbscanInputs { - long n_row; - long n_col; - long n_centers; + int n_row; + int n_col; + int n_centers; T cluster_std; T eps; int min_pts; @@ -67,8 +67,8 @@ class DbscanTest : public ::testing::TestWithParam> { 
device_buffer out(handle.getDeviceAllocator(), handle.getStream(), params.n_row * params.n_col); - device_buffer l(handle.getDeviceAllocator(), handle.getStream(), - params.n_row); + device_buffer l(handle.getDeviceAllocator(), handle.getStream(), + params.n_row); make_blobs(handle, out.data(), l.data(), params.n_row, params.n_col, params.n_centers, nullptr, nullptr, params.cluster_std, true, @@ -108,26 +108,26 @@ class DbscanTest : public ::testing::TestWithParam> { protected: DbscanInputs params; - long *labels, *labels_ref; + int *labels, *labels_ref; double score; }; const std::vector> inputsf2 = { - {50000l, 16l, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {500l, 16l, 5l, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000l, 1000l, 10l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {50000l, 16l, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000l, 10000l, 10l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000l, 100l, 5000l, 0.01, 2, 2, (size_t)13e8, 1234ULL}}; + {50000, 16, 5, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {50000, 16, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e8, 1234ULL}}; const std::vector> inputsd2 = { - {50000l, 16l, 5l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {500l, 16l, 5l, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000l, 1000l, 10l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {100l, 10000l, 10l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000l, 10000l, 10l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000l, 100l, 5000l, 0.01, 2, 2, (size_t)13e9, 1234ULL}}; + {50000, 16, 5, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {100, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e9, 1234ULL}}; typedef 
DbscanTest DbscanTestF; TEST_P(DbscanTestF, Result) { ASSERT_TRUE(score == 1.0); } diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 1cb1660119..fa582d7bc2 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -38,21 +38,21 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, float *input, - long n_rows, - long n_cols, + int n_rows, + int n_cols, float eps, int min_pts, - long *labels, + int *labels, size_t max_bytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, double *input, - long n_rows, - long n_cols, + int n_rows, + int n_cols, double eps, int min_pts, - long *labels, + int *labels, size_t max_bytes_per_batch, bool verbose) except + @@ -186,27 +186,27 @@ class DBSCAN(Base): check_dtype=[np.float32, np.float64]) cdef cumlHandle* handle_ = self.handle.getHandle() - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int64)) + self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) if self.dtype == np.float32: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following From a9889c7e1efc9b3d90cbbc6f9bba8018f91f112d Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 14:19:48 -0400 Subject: [PATCH 11/63] More fixes --- cpp/src/dbscan/dbscan.cu | 3 --- cpp/src_prims/sparse/csr.h | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu index f94d622584..055c234847 100644 --- a/cpp/src/dbscan/dbscan.cu +++ b/cpp/src/dbscan/dbscan.cu @@ -24,9 +24,6 @@ namespace ML { using namespace Dbscan; -// @todo -// In the below 2 calls, the Index type has been hard-coded to `int64_t` -// We should pick the right Index type based on the input dimensions. void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, float eps, int min_pts, int *labels, size_t max_bytes_per_batch, bool verbose) { diff --git a/cpp/src_prims/sparse/csr.h b/cpp/src_prims/sparse/csr.h index 790ad2eac5..03cbd8a0e8 100644 --- a/cpp/src_prims/sparse/csr.h +++ b/cpp/src_prims/sparse/csr.h @@ -708,10 +708,9 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, cj = labels[j_ind]; if (ci < cj) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + j_ind), (int)ci); + atomicMin((unsigned int *)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((unsigned long long int *)(labels + j_ind), - (unsigned long long int)ci); + atomicMin((unsigned long long int *)(labels + j_ind), ci); xa[j_ind] = true; m[0] = true; } else if (ci > cj) { @@ -721,10 +720,10 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + startVertexId + tid), (int)ci); + atomicMin((unsigned int *)(labels + startVertexId + tid), ci); else if (sizeof(Index_) == 8) atomicMin((unsigned long long int *)(labels + startVertexId + tid), - (unsigned long long int)ci); + ci); xa[startVertexId + tid] = true; m[0] = true; @@ -742,7 +741,7 @@ __global__ void weak_cc_init_label_kernel(Index_ *labels, Index_ startVertexId, Index_ tid = threadIdx.x + blockIdx.x * TPB_X; if 
(tid < batchSize) { if (filter_op(tid) && labels[tid + startVertexId] == MAX_LABEL) - labels[startVertexId + tid] = Index_(startVertexId + tid + 1); + labels[startVertexId + tid] = startVertexId + tid + 1; } } From 503753a8d7bbe4ad20a21f77387a69b0ae26c1d1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 14:24:26 -0400 Subject: [PATCH 12/63] Couple single and double precision across floating point and int variants for now --- python/cuml/cluster/dbscan.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index fa582d7bc2..2633dbae7f 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -48,11 +48,11 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, double *input, - int n_rows, - int n_cols, + long n_rows, + long n_cols, double eps, int min_pts, - int *labels, + long *labels, size_t max_bytes_per_batch, bool verbose) except + @@ -202,11 +202,11 @@ class DBSCAN(Base): else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following From 2273d2ab9fac36ff4b68b0a454c5f85f940abcd4 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 16:20:57 -0400 Subject: [PATCH 13/63] Still getting to the bottom of unfortunate single precision instability --- cpp/src/dbscan/runner.h | 2 +- cpp/src/dbscan/vertexdeg/algo.h | 16 ++++++++-------- cpp/src_prims/distance/distance.h | 2 +- python/cuml/test/test_dbscan.py | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index c8160677ac..1c9ddd3f69 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -124,7 +124,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, std::cout << "- Iteration " << i + 1 << " " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; - if (verbose) std::cout << "Computing vertex degrees" << std::endl; + if (verbose) std::cout << "Compting vertex degrees" << std::endl; VertexDeg::run(handle, adj, vd, x, eps, N, D, algoVd, startVertexId, nPoints, stream); MLCommon::updateHost(&curradjlen, vd + nPoints, 1, stream); diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 3f8fdaca51..6bc5398bd0 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -41,7 +41,7 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, "index_t should be 4 or 8 bytes"); index_t m = data.N; - index_t n = batchSize; + index_t n = min(data.N - startVertexId, batchSize); index_t k = data.D; index_t *vd = data.vd; @@ -60,17 +60,17 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, if (workspaceSize != 0) workspace.resize(workspaceSize, stream); - auto fused_op = [vd, n, batchSize] __device__(index_t global_c_idx, - bool in_neigh) { + auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = fmod(global_c_idx, n); + index_t batch_vertex = global_c_idx - (n * (global_c_idx / n)); if (sizeof(index_t) == 4) { - atomicAdd((unsigned int *)(vd + 
batch_vertex), in_neigh); - atomicAdd((unsigned int *)(vd + n), in_neigh); + atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); + atomicAdd((int *)(vd + n), (int)in_neigh); } else if (sizeof(index_t) == 8) { - atomicAdd((unsigned long long *)(vd + batch_vertex), in_neigh); - atomicAdd((unsigned long long *)(vd + n), in_neigh); + atomicAdd((unsigned long long *)(vd + batch_vertex), + (unsigned long long)in_neigh); + atomicAdd((unsigned long long *)(vd + n), (unsigned long long)in_neigh); } }; diff --git a/cpp/src_prims/distance/distance.h b/cpp/src_prims/distance/distance.h index 19f9d2c77a..97bdbdc7af 100644 --- a/cpp/src_prims/distance/distance.h +++ b/cpp/src_prims/distance/distance.h @@ -326,7 +326,7 @@ void pairwiseDistance(const Type *x, const Type *y, Type *dist, Index_ m, * the epsilon neighborhood. */ template + typename Index_ = long, typename OutputTile_ = OutputTile_8x128x128> size_t epsilon_neighborhood(const T *a, const T *b, bool *adj, Index_ m, Index_ n, Index_ k, T eps, void *workspace, size_t worksize, cudaStream_t stream, diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 795805465f..8768e6161a 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -73,8 +73,8 @@ def test_dbscan(datatype, input_type, use_handle, if nrows < 500000: skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute") sk_labels = skdbscan.fit_predict(X) - for i in range(X.shape[0]): - assert cu_labels[i] == sk_labels[i] + score = adjusted_rand_score(cu_labels, sk_labels) + assert score == 1 @pytest.mark.parametrize("name", [ From 9057d94fb61c1df24a9a27121c9bc2ed01a788b3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 16:27:07 -0400 Subject: [PATCH 14/63] Computing proper vertex degrees now --- cpp/src/dbscan/runner.h | 2 +- cpp/src/dbscan/vertexdeg/algo.h | 3 ++- python/cuml/cluster/dbscan.pyx | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 1c9ddd3f69..350f6b7a7d 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -149,7 +149,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); - if (verbose) std::cout << "Computing connected components" << std::endl; + if (verbose) std::cout << "Compuing connected components" << std::endl; MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 6bc5398bd0..06502ba6e0 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -62,7 +62,8 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = global_c_idx - (n * (global_c_idx / n)); + index_t batch_vertex = + fmod(global_c_idx, n); //global_c_idx - (n * (global_c_idx / n)); if (sizeof(index_t) == 4) { atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 2633dbae7f..1a7d4dfd11 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -186,10 +186,11 @@ class DBSCAN(Base): check_dtype=[np.float32, np.float64]) cdef cumlHandle* handle_ = self.handle.getHandle() - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) - cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) + cdef uintptr_t labels_ptr if self.dtype == np.float32: + self.labels_ = cudf.Series(zeros(n_rows, 
dtype=np.int32)) + labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, n_rows, @@ -200,6 +201,8 @@ class DBSCAN(Base): self.max_bytes_per_batch, self.verbose) else: + self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int64)) + labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, n_rows, From 1797923bf894916e90c47d76d893cfe91b6a1044 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 16:42:40 -0400 Subject: [PATCH 15/63] Light cleanup --- cpp/src/dbscan/runner.h | 14 +++++++++----- cpp/src/dbscan/vertexdeg/algo.h | 3 +-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 350f6b7a7d..077c47cada 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -121,10 +121,10 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, if (nPoints <= 0) continue; if (verbose) - std::cout << "- Iteration " << i + 1 << " " << nBatches + std::cout << "- Iteration " << i + 1 << " out of " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; - if (verbose) std::cout << "Compting vertex degrees" << std::endl; + if (verbose) std::cout << "--> Computing vertex degrees" << std::endl; VertexDeg::run(handle, adj, vd, x, eps, N, D, algoVd, startVertexId, nPoints, stream); MLCommon::updateHost(&curradjlen, vd + nPoints, 1, stream); @@ -132,8 +132,8 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::POP_RANGE(); if (verbose) - std::cout << "Computing adjacency graph of size " << curradjlen - << std::endl; + std::cout << "--> Computing adjacency graph of size " << curradjlen + << " samples." 
<< std::endl; // Running AdjGraph ML::PUSH_RANGE("Trace::Dbscan::AdjGraph"); if (curradjlen > adjlen || adj_graph.data() == NULL) { @@ -149,13 +149,15 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); - if (verbose) std::cout << "Compuing connected components" << std::endl; + if (verbose) std::cout << "--> Compuing connected components" << std::endl; MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, &state, stream, [core_pts] __device__(Index_ tid) { return core_pts[tid]; }); ML::POP_RANGE(); + + if (verbose) std::cout << " " << std::endl; } ML::PUSH_RANGE("Trace::Dbscan::FinalRelabel"); @@ -165,6 +167,8 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, relabelForSkl<<>>(labels, N, MAX_LABEL); CUDA_CHECK(cudaPeekAtLastError()); ML::POP_RANGE(); + + if (verbose) std::cout << "Done." << std::endl; return (size_t)0; } } // namespace Dbscan diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 06502ba6e0..abefb26d0a 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -62,8 +62,7 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = - fmod(global_c_idx, n); //global_c_idx - (n * (global_c_idx / n)); + index_t batch_vertex = fmod(global_c_idx, n); if (sizeof(index_t) == 4) { atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); From e8e3d2661cc653655c8bbbd62cc0beaa91901545 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 16:43:05 -0400 Subject: [PATCH 16/63] Fixing typos --- cpp/src/dbscan/runner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 077c47cada..8bd97a008b 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -121,7 +121,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, if (nPoints <= 0) continue; if (verbose) - std::cout << "- Iteration " << i + 1 << " out of " << nBatches + std::cout << "- Iteration " << i + 1 << " / " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; if (verbose) std::cout << "--> Computing vertex degrees" << std::endl; @@ -149,7 +149,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); - if (verbose) std::cout << "--> Compuing connected components" << std::endl; + if (verbose) std::cout << "--> Computing connected components" << std::endl; MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, From 8a0122bdc46724c9857d88391c5d6fec259528eb Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 16:51:17 -0400 Subject: [PATCH 17/63] Updating changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b25cf8c994..0d9c26f8ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - PR #1086: Ensure RegressorMixin scorer uses device arrays - PR #1114: K-means: Exposing useful params, removing unused params, proxying params in Dask - PR #1136: CUDA 10.1 CI updates +- PR #1163: Some more correctness improvements. Better verbose printing ## Bug Fixes From 0b6db552fcadfe139cab44280bc9ab20e6877e92 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Sat, 28 Sep 2019 11:56:13 -0400 Subject: [PATCH 18/63] Dask RF updated to fix the random seed bug. 
--- cpp/src/randomforest/randomforest_impl.cuh | 16 +++++++--------- cpp/src/randomforest/randomforest_impl.h | 2 +- .../cuml/dask/ensemble/randomforestclassifier.py | 9 +++++++++ .../cuml/dask/ensemble/randomforestregressor.py | 9 +++++++++ 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh index 60096c2e12..471e6da754 100644 --- a/cpp/src/randomforest/randomforest_impl.cuh +++ b/cpp/src/randomforest/randomforest_impl.cuh @@ -70,10 +70,10 @@ void random_uniformInt(int treeid, unsigned int* data, int len, int n_rows, template void rf::prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - int seed, const int num_sms, const cudaStream_t stream, + const int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator) { - int rs = tree_id * 1000; - if (seed != -1) rs = seed * 1000; + int rs = tree_id; + if (rf_params.seed > -1) rs = rf_params.seed + tree_id; srand(rs * 1000); if (rf_params.bootstrap) { @@ -225,9 +225,8 @@ void rfClassifier::fit(const cumlHandle& user_handle, const T* input, rowids = selected_rows[stream_id]->data(); this->prepare_fit_per_tree( - i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), - tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + i, n_rows, n_sampled_rows, rowids, tempmem[stream_id]->num_sms, + tempmem[stream_id]->stream, handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. 
@@ -488,9 +487,8 @@ void rfRegressor::fit(const cumlHandle& user_handle, const T* input, int stream_id = omp_get_thread_num(); unsigned int* rowids = selected_rows[stream_id]->data(); this->prepare_fit_per_tree( - i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), - tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + i, n_rows, n_sampled_rows, rowids, tempmem[stream_id]->num_sms, + tempmem[stream_id]->stream, handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. diff --git a/cpp/src/randomforest/randomforest_impl.h b/cpp/src/randomforest/randomforest_impl.h index cafa675838..3369c82ae1 100644 --- a/cpp/src/randomforest/randomforest_impl.h +++ b/cpp/src/randomforest/randomforest_impl.h @@ -30,7 +30,7 @@ class rf { virtual ~rf() = default; void prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - int seed, int num_sms, const cudaStream_t stream, + int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator); void error_checking(const T* input, L* predictions, int n_rows, int n_cols, diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 9e063e18a4..8187af137a 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -194,6 +194,12 @@ def __init__( self.n_estimators_per_worker[i] + 1 ) + seeds = list() + seeds.append(0) + for i in range(1, len(self.n_estimators_per_worker)): + sd = self.n_estimators_per_worker[i-1] + seeds[i-1] + seeds.append(sd) + key = str(uuid1()) self.rfs = { worker: c.submit( @@ -213,6 +219,7 @@ def __init__( rows_sample, max_leaves, quantile_per_tree, + seeds[n], dtype, key="%s-%s" % (key, n), workers=[worker], @@ -243,6 +250,7 @@ def _func_build_rf( rows_sample, max_leaves, quantile_per_tree, + seed, 
dtype, ): return cuRFC( @@ -262,6 +270,7 @@ def _func_build_rf( max_leaves=max_leaves, n_streams=n_streams, quantile_per_tree=quantile_per_tree, + seed=seed, gdf_datatype=dtype, ) diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py index b86e1a9269..e12bb75056 100755 --- a/python/cuml/dask/ensemble/randomforestregressor.py +++ b/python/cuml/dask/ensemble/randomforestregressor.py @@ -197,6 +197,12 @@ def __init__( self.n_estimators_per_worker[i] + 1 ) + seeds = list() + seeds.append(0) + for i in range(1, len(self.n_estimators_per_worker)): + sd = self.n_estimators_per_worker[i-1] + seeds[i-1] + seeds.append(sd) + key = str(uuid1()) self.rfs = { worker: c.submit( @@ -216,6 +222,7 @@ def __init__( max_leaves, accuracy_metric, quantile_per_tree, + seeds[n], key="%s-%s" % (key, n), workers=[worker], ) @@ -245,6 +252,7 @@ def _func_build_rf( max_leaves, accuracy_metric, quantile_per_tree, + seed, ): return cuRFR( @@ -264,6 +272,7 @@ def _func_build_rf( n_streams=n_streams, accuracy_metric=accuracy_metric, quantile_per_tree=quantile_per_tree, + seed=seed, ) @staticmethod From 0eacd14a382869208ebc04d30dda61a2c0dc0d53 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Sat, 28 Sep 2019 12:02:49 -0400 Subject: [PATCH 19/63] Fixed the formatting issues. 
--- python/cuml/ensemble/randomforestclassifier.pyx | 4 ++-- python/cuml/ensemble/randomforestregressor.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 8dc308bd9d..5d7aadb656 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -304,8 +304,8 @@ class RandomForestClassifier(Base): min_samples_leaf=None, min_weight_fraction_leaf=None, max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, n_jobs=None, - random_state=None, warm_start=None, class_weight=None, - int seed=-1): + random_state=None, warm_start=None, class_weight=None, + seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index f86ff332ad..b3cd2f2bf6 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -288,7 +288,7 @@ class RandomForestRegressor(Base): max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None, int seed=-1): + quantile_per_tree=False, criterion=None, seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, From c9b3f3d48cd224f852c86bf80e1616b1b91ef1c7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 13:06:03 -0400 Subject: [PATCH 20/63] Increasing number of batches when single precision isn't large enough to store the batch size * N (for adjacency graph & CSR row in array. Will eventually expose an argument through Python to allow the user to control the precision of the outputs (and thus the precision of the internal structures.). 
It's important to allow the user to set this explicitly because there's a huge tradeoff to be made between a larger number of much faster batches and a smaller number of much slower batches. --- cpp/src/dbscan/adjgraph/naive.h | 2 +- cpp/src/dbscan/adjgraph/pack.h | 2 +- cpp/src/dbscan/dbscan.h | 17 ++++++++++++--- cpp/src/dbscan/runner.h | 35 ++++++++++++++++++++++++------- cpp/src/dbscan/vertexdeg/algo.h | 2 +- cpp/src/dbscan/vertexdeg/naive.h | 2 +- cpp/src/dbscan/vertexdeg/runner.h | 2 +- cpp/src_prims/distance/distance.h | 2 +- python/cuml/cluster/dbscan.pyx | 20 ++++++++---------- 9 files changed, 57 insertions(+), 27 deletions(-) diff --git a/cpp/src/dbscan/adjgraph/naive.h b/cpp/src/dbscan/adjgraph/naive.h index 41d10b2558..ff211ad46c 100644 --- a/cpp/src/dbscan/adjgraph/naive.h +++ b/cpp/src/dbscan/adjgraph/naive.h @@ -26,7 +26,7 @@ namespace Dbscan { namespace AdjGraph { namespace Naive { -template +template void launcher(const ML::cumlHandle_impl& handle, Pack data, Index_ batchSize, cudaStream_t stream) { Index_ k = 0; diff --git a/cpp/src/dbscan/adjgraph/pack.h b/cpp/src/dbscan/adjgraph/pack.h index f1d028a2e6..fd002363cd 100644 --- a/cpp/src/dbscan/adjgraph/pack.h +++ b/cpp/src/dbscan/adjgraph/pack.h @@ -19,7 +19,7 @@ namespace Dbscan { namespace AdjGraph { -template +template struct Pack { /** * vertex degree array diff --git a/cpp/src/dbscan/dbscan.h b/cpp/src/dbscan/dbscan.h index b38793215d..b0dd3d5f9c 100644 --- a/cpp/src/dbscan/dbscan.h +++ b/cpp/src/dbscan/dbscan.h @@ -27,7 +27,7 @@ static const size_t DEFAULT_MAX_MEM_BYTES = 13e9; // Default max mem set to a reasonable value for a 16gb card. 
-template +template Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { Index_ n_batches = 1; // There seems to be a weird overflow bug with cutlass gemm kernels @@ -37,16 +37,27 @@ Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { if (max_bytes_per_batch <= 0) max_bytes_per_batch = DEFAULT_MAX_MEM_BYTES; + Index_ MAX_LABEL = std::numeric_limits::max(); + while (true) { size_t batchSize = ceildiv(n_rows, n_batches); - if (batchSize * n_rows * sizeof(T) < max_bytes_per_batch || batchSize == 1) + if (((batchSize * n_rows * sizeof(T) < max_bytes_per_batch) && + /** + * Though single precision can be faster per execution of each kernel, + * there's a trade-off to be made between using single precision with + * many more batches (which become smaller as n_rows grows) and using + * double precision, which will have less batches but could become 8-10x + * slower per batch. + */ + (batchSize * n_rows < MAX_LABEL)) || + batchSize == 1) break; ++n_batches; } return n_batches; } -template +template void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows, Index_ n_cols, T eps, int min_pts, Index_ *labels, size_t max_bytes_per_batch, cudaStream_t stream, diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 8bd97a008b..0e2136c1b6 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -36,7 +36,7 @@ static const int TPB = 256; * 1. Turn any labels matching MAX_LABEL into -1 * 2. Subtract 1 from all other labels. */ -template +template __global__ void relabelForSkl(Index_* labels, Index_ N, Index_ MAX_LABEL) { Index_ tid = threadIdx.x + blockDim.x * blockIdx.x; if (labels[tid] == MAX_LABEL) @@ -49,7 +49,7 @@ __global__ void relabelForSkl(Index_* labels, Index_ N, Index_ MAX_LABEL) { * Turn the non-monotonic labels from weak_cc primitive into * an array of labels drawn from a monotonically increasing set. 
*/ -template +template void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { Index_ MAX_LABEL = std::numeric_limits::max(); MLCommon::Label::make_monotonic( @@ -69,13 +69,25 @@ void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { * @param stream the cudaStream where to launch the kernels * @return in case the temp buffer is null, this returns the size needed. */ -template +template size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, Type_f eps, Type minPts, Index_* labels, int algoVd, int algoAdj, int algoCcl, void* workspace, Index_ nBatches, cudaStream_t stream, bool verbose = false) { const size_t align = 256; - Index_ batchSize = ceildiv(N, nBatches); + size_t batchSize = ceildiv(N, nBatches); + + /** + * Note on coupling between data types: + * - adjacency graph has a worst case size of N * batchSize elements. Thus, + * if N is very close to being greater than the maximum 32-bit IdxType type used, a + * 64-bit IdxType should probably be used instead. + * - exclusive scan is the CSR row index for the adjacency graph and its values have a + * risk of overflowing when N * batchSize becomes larger what can be stored in IdxType + * - the vertex degree array has a worst case of each element having all other + * elements in their neighborhood, so any IdxType can be safely used, so long as N doesn't + * overflow. 
+ */ size_t adjSize = alignTo(sizeof(bool) * N * batchSize, align); size_t corePtsSize = alignTo(sizeof(bool) * batchSize, align); size_t xaSize = alignTo(sizeof(bool) * N, align); @@ -83,13 +95,23 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, size_t vdSize = alignTo(sizeof(Index_) * (batchSize + 1), align); size_t exScanSize = alignTo(sizeof(Index_) * batchSize, align); + // TODO: We should ASSERT that N * batchSize is greater than the maximum value used + Index_ MAX_LABEL = std::numeric_limits::max(); + + ASSERT( + N * batchSize < MAX_LABEL, + "An overflow occurred with the current choice of precision " + "and the number of samples. (Max allowed batch size is %d, but was %d)", + MAX_LABEL / N, batchSize); + if (workspace == NULL) { auto size = adjSize + corePtsSize + 2 * xaSize + mSize + vdSize + exScanSize; return size; } - // partition the temporary workspace needed for different stages of dbscan + // partition the temporary workspace needed for different stages of dbscan. 
+ Index_ adjlen = 0; Index_ curradjlen = 0; char* temp = (char*)workspace; @@ -117,7 +139,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, stream); Index_ startVertexId = i * batchSize; - Index_ nPoints = min(N - startVertexId, batchSize); + Index_ nPoints = min(size_t(N - startVertexId), batchSize); if (nPoints <= 0) continue; if (verbose) @@ -162,7 +184,6 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::FinalRelabel"); if (algoCcl == 2) final_relabel(labels, N, stream); - Index_ MAX_LABEL = std::numeric_limits::max(); size_t nblks = ceildiv(N, TPB); relabelForSkl<<>>(labels, N, MAX_LABEL); CUDA_CHECK(cudaPeekAtLastError()); diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index abefb26d0a..6b1bf29b24 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -32,7 +32,7 @@ namespace Algo { /** * Calculates the vertex degree array and the epsilon neighborhood adjacency matrix for the batch. 
*/ -template +template void launcher(const ML::cumlHandle_impl &handle, Pack data, index_t startVertexId, index_t batchSize, cudaStream_t stream) { data.resetArray(stream, batchSize + 1); diff --git a/cpp/src/dbscan/vertexdeg/naive.h b/cpp/src/dbscan/vertexdeg/naive.h index d5f8ba67f8..f2a9cf4b13 100644 --- a/cpp/src/dbscan/vertexdeg/naive.h +++ b/cpp/src/dbscan/vertexdeg/naive.h @@ -38,7 +38,7 @@ static const int TPB_Y = 8; * @param N number of rows * @param D number of columns */ -template +template __global__ void vertex_degree_kernel(Pack data, Index_ startVertexId, Index_ batchSize) { const Type Zero = (Type)0; diff --git a/cpp/src/dbscan/vertexdeg/runner.h b/cpp/src/dbscan/vertexdeg/runner.h index ebfb23c350..b24e808053 100644 --- a/cpp/src/dbscan/vertexdeg/runner.h +++ b/cpp/src/dbscan/vertexdeg/runner.h @@ -24,7 +24,7 @@ namespace Dbscan { namespace VertexDeg { -template +template void run(const ML::cumlHandle_impl& handle, bool* adj, Index_* vd, Type_f* x, Type_f eps, Index_ N, Index_ D, int algo, Index_ startVertexId, Index_ batchSize, cudaStream_t stream) { diff --git a/cpp/src_prims/distance/distance.h b/cpp/src_prims/distance/distance.h index 97bdbdc7af..19f9d2c77a 100644 --- a/cpp/src_prims/distance/distance.h +++ b/cpp/src_prims/distance/distance.h @@ -326,7 +326,7 @@ void pairwiseDistance(const Type *x, const Type *y, Type *dist, Index_ m, * the epsilon neighborhood. 
*/ template + typename Index_ = int, typename OutputTile_ = OutputTile_8x128x128> size_t epsilon_neighborhood(const T *a, const T *b, bool *adj, Index_ m, Index_ n, Index_ k, T eps, void *workspace, size_t worksize, cudaStream_t stream, diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 1a7d4dfd11..5fad5228fe 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -48,11 +48,11 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, double *input, - long n_rows, - long n_cols, + int n_rows, + int n_cols, double eps, int min_pts, - long *labels, + int *labels, size_t max_bytes_per_batch, bool verbose) except + @@ -187,10 +187,10 @@ class DBSCAN(Base): cdef cumlHandle* handle_ = self.handle.getHandle() - cdef uintptr_t labels_ptr + self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) + cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) + if self.dtype == np.float32: - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) - labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, n_rows, @@ -201,15 +201,13 @@ class DBSCAN(Base): self.max_bytes_per_batch, self.verbose) else: - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int64)) - labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following From 4c1b4a8ffd905493fa40fd4ee18ebb0f684a34cc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 21:33:43 -0400 Subject: [PATCH 21/63] Adding `out_dtype` argument to dbscan fit() and tests for valid output --- python/cuml/cluster/dbscan.pyx | 101 +++++++++++++++++++++++++------- python/cuml/test/test_dbscan.py | 18 +++++- 2 files changed, 94 insertions(+), 25 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 5fad5228fe..2f002e5c36 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -56,6 +56,27 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": size_t max_bytes_per_batch, bool verbose) except + + cdef void dbscanFit(cumlHandle& handle, + float *input, + long n_rows, + long n_cols, + double eps, + int min_pts, + long *labels, + size_t max_bytes_per_batch, + bool verbose) except + + + cdef void dbscanFit(cumlHandle& handle, + double *input, + long n_rows, + long n_cols, + double eps, + int min_pts, + long *labels, + size_t max_bytes_per_batch, + bool verbose) except + + + class DBSCAN(Base): """ @@ -164,7 +185,7 @@ class DBSCAN(Base): if attr == 'labels_array': return self.labels_._column._data.mem - def fit(self, X): + def fit(self, X, out_dtype="auto"): """ Perform DBSCAN clustering from features. @@ -174,11 +195,23 @@ class DBSCAN(Base): Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + out_dtype: dtype Determines the precision of the output labels array. + default: "auto". Valid values are { "auto", "int32", + np.int32, "int64", np.int64} """ if self.labels_ is not None: del self.labels_ + if out_dtype == "auto": + out_dtype = np.int32 if X.shape[0] < 1e6 else np.int64 + elif out_dtype not in ["int32", np.int32, "int64", np.int64]: + raise ValueError("Invalid value for out_dtype. 
" + "Valid values are {'auto', 'int32', 'int64', " + "np.int32, np.int64}") + + + cdef uintptr_t input_ptr X_m, input_ptr, n_rows, n_cols, self.dtype = \ @@ -187,36 +220,60 @@ class DBSCAN(Base): cdef cumlHandle* handle_ = self.handle.getHandle() - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) + self.labels_ = cudf.Series(zeros(n_rows, dtype=out_dtype)) cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) if self.dtype == np.float32: - dbscanFit(handle_[0], - input_ptr, - n_rows, - n_cols, - self.eps, - self.min_samples, - labels_ptr, - self.max_bytes_per_batch, - self.verbose) + if out_dtype is "int32" or out_dtype is np.int32: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + else: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + else: - dbscanFit(handle_[0], - input_ptr, - n_rows, - n_cols, - self.eps, - self.min_samples, - labels_ptr, - self.max_bytes_per_batch, - self.verbose) + if out_dtype is "int32" or out_dtype is np.int32: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + else: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + # make sure that the `dbscanFit` is complete before the following # delete call happens self.handle.sync() del(X_m) return self - def fit_predict(self, X): + def fit_predict(self, X, out_dtype="auto"): """ Performs clustering on input_gdf and returns cluster labels. 
@@ -232,7 +289,7 @@ class DBSCAN(Base): y : cuDF Series, shape (n_samples) cluster labels """ - self.fit(X) + self.fit(X, out_dtype) return self.labels_ def get_param_names(self): diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 8768e6161a..7783b0f2b3 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -51,8 +51,13 @@ def stress_param(*args, **kwargs): stress_param(500000)]) @pytest.mark.parametrize('ncols', [unit_param(3), quality_param(100), stress_param(1000)]) +@pytest.mark.parametrize('out_dtype', [unit_param("int32"), + unit_param(np.int32), + unit_param("int64"), + unit_param(np.int64), unit_param("auto"), + quality_param("auto"), stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, - nrows, ncols, max_bytes_per_batch): + nrows, ncols, max_bytes_per_batch, out_dtype): n_samples = nrows n_feats = ncols X, y = make_blobs(n_samples=n_samples, cluster_std=0.01, @@ -66,9 +71,9 @@ def test_dbscan(datatype, input_type, use_handle, X = pd.DataFrame( {'fea%d' % i: X[0:, i] for i in range(X.shape[1])}) X_cudf = cudf.DataFrame.from_pandas(X) - cu_labels = cudbscan.fit_predict(X_cudf) + cu_labels = cudbscan.fit_predict(X_cudf, out_dtype=out_dtype) else: - cu_labels = cudbscan.fit_predict(X) + cu_labels = cudbscan.fit_predict(X, out_dtype=out_dtype) if nrows < 500000: skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute") @@ -109,3 +114,10 @@ def test_dbscan_sklearn_comparison(name, nrows): score = adjusted_rand_score(sk_y_pred, cu_y_pred) assert(score == 1.0) + +@pytest.mark.xfail(strict=True, raises=ValueError) +def test_dbscan_out_dtype_fails_invalid_input(): + X, _ = make_blobs(n_samples=100) + + cudbscan = cuDBSCAN() + cudbscan.fit_predict(X, out_dtype="bad_input") From 6836727e7098df06f9f4aad9d3ae19678769e489 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 21:38:11 -0400 Subject: [PATCH 22/63] Adding more explicit asserts for output label dtypes --- python/cuml/test/test_dbscan.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 7783b0f2b3..28e2052dfc 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -54,7 +54,8 @@ def stress_param(*args, **kwargs): @pytest.mark.parametrize('out_dtype', [unit_param("int32"), unit_param(np.int32), unit_param("int64"), - unit_param(np.int64), unit_param("auto"), + unit_param(np.int64), + unit_param("auto"), quality_param("auto"), stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, nrows, ncols, max_bytes_per_batch, out_dtype): @@ -81,6 +82,13 @@ def test_dbscan(datatype, input_type, use_handle, score = adjusted_rand_score(cu_labels, sk_labels) assert score == 1 + if out_dtype is "int32" or out_dtype is np.int32: + assert cu_labels.dtype == np.int32 + elif out_dtype is "int64" or out_dtype is np.int64: + assert cu_labels.dtype == np.int64 + else: # out_dtype was "auto" and we shouldn't have any inputs > threshold + assert cu_labels.dtype == np.int32 + @pytest.mark.parametrize("name", [ 'noisy_moons', From f46cdaae40a659d71ee69bd6c6bfb3fab583ee82 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:24:40 -0400 Subject: [PATCH 23/63] Changing max_bytes_per_batch to max_mbytes_per_batch to avoid potential overflow and allow increased overall batch size. 
--- cpp/src/dbscan/dbscan.h | 15 +++++++-------- python/cuml/cluster/dbscan.pyx | 33 ++++++++++++++++----------------- python/cuml/test/test_dbscan.py | 6 +++--- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/cpp/src/dbscan/dbscan.h b/cpp/src/dbscan/dbscan.h index b0dd3d5f9c..6eabc54418 100644 --- a/cpp/src/dbscan/dbscan.h +++ b/cpp/src/dbscan/dbscan.h @@ -23,25 +23,24 @@ namespace ML { using namespace Dbscan; -static const size_t DEFAULT_MAX_MEM_BYTES = 13e9; +static const size_t DEFAULT_MAX_MEM_MBYTES = 13e3; // Default max mem set to a reasonable value for a 16gb card. - template -Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { +Index_ computeBatchCount(Index_ n_rows, size_t max_mbytes_per_batch) { Index_ n_batches = 1; // There seems to be a weird overflow bug with cutlass gemm kernels // hence, artifically limiting to a smaller batchsize! ///TODO: in future, when we bump up the underlying cutlass version, this should go away // paving way to cudaMemGetInfo based workspace allocation - if (max_bytes_per_batch <= 0) max_bytes_per_batch = DEFAULT_MAX_MEM_BYTES; + if (max_mbytes_per_batch <= 0) max_mbytes_per_batch = DEFAULT_MAX_MEM_MBYTES; Index_ MAX_LABEL = std::numeric_limits::max(); while (true) { size_t batchSize = ceildiv(n_rows, n_batches); - if (((batchSize * n_rows * sizeof(T) < max_bytes_per_batch) && + if (((batchSize * n_rows * sizeof(T) * 1e-6 < max_mbytes_per_batch) && /** * Though single precision can be faster per execution of each kernel, * there's a trade-off to be made between using single precision with @@ -60,7 +59,7 @@ Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { template void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows, Index_ n_cols, T eps, int min_pts, Index_ *labels, - size_t max_bytes_per_batch, cudaStream_t stream, + size_t max_mbytes_per_batch, cudaStream_t stream, bool verbose) { ML::PUSH_RANGE("ML::Dbscan::Fit"); int algoVd = 1; @@ -68,14 
+67,14 @@ void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows, int algoCcl = 2; // @todo: Query device for remaining memory - Index_ n_batches = computeBatchCount(n_rows, max_bytes_per_batch); + Index_ n_batches = computeBatchCount(n_rows, max_mbytes_per_batch); if (verbose) { Index_ batchSize = ceildiv(n_rows, n_batches); if (n_batches > 1) { std::cout << "Running batched training on " << n_batches << " batches w/ "; - std::cout << batchSize * n_rows * sizeof(T) << " bytes." << std::endl; + std::cout << batchSize * n_rows * sizeof(T) * 1e-6 << " megabytes." << std::endl; } } diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 2f002e5c36..5b3717fc83 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -43,7 +43,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": float eps, int min_pts, int *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, @@ -53,7 +53,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": double eps, int min_pts, int *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, @@ -63,7 +63,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": double eps, int min_pts, long *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, @@ -73,7 +73,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": double eps, int min_pts, long *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + @@ -129,7 +129,7 @@ class DBSCAN(Base): considered as an important core point (including the point itself). 
verbose : bool Whether to print debug spews - max_bytes_per_batch : (optional) int64 + max_mbytes_per_batch : (optional) int64 Calculate batch size using no more than this number of bytes for the pairwise distance computation. This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance @@ -169,17 +169,17 @@ class DBSCAN(Base): """ def __init__(self, eps=0.5, handle=None, min_samples=5, verbose=False, - max_bytes_per_batch=None): + max_mbytes_per_batch=None): super(DBSCAN, self).__init__(handle, verbose) self.eps = eps self.min_samples = min_samples self.labels_ = None - self.max_bytes_per_batch = max_bytes_per_batch + self.max_mbytes_per_batch = max_mbytes_per_batch self.verbose = verbose # C++ API expects this to be numeric. - if self.max_bytes_per_batch is None: - self.max_bytes_per_batch = 0 + if self.max_mbytes_per_batch is None: + self.max_mbytes_per_batch = 0 def __getattr__(self, attr): if attr == 'labels_array': @@ -196,8 +196,9 @@ class DBSCAN(Base): Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy out_dtype: dtype Determines the precision of the output labels array. - default: "auto". Valid values are { "auto", "int32", - np.int32, "int64", np.int64} + default: "auto". Valid values are { "auto", "int32", np.int32, + "int64", np.int64}. 
When the number of samples exceed + """ if self.labels_ is not None: @@ -210,8 +211,6 @@ class DBSCAN(Base): "Valid values are {'auto', 'int32', 'int64', " "np.int32, np.int64}") - - cdef uintptr_t input_ptr X_m, input_ptr, n_rows, n_cols, self.dtype = \ @@ -232,7 +231,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) else: dbscanFit(handle_[0], @@ -242,7 +241,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) else: @@ -254,7 +253,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) else: dbscanFit(handle_[0], @@ -264,7 +263,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 28e2052dfc..c930a64b02 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -43,7 +43,7 @@ def stress_param(*args, **kwargs): 'noisy_circles', 'no_structure'] -@pytest.mark.parametrize('max_bytes_per_batch', [1e9, 5e9]) +@pytest.mark.parametrize('max_mbytes_per_batch', [1e9, 5e9]) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('input_type', ['ndarray']) @pytest.mark.parametrize('use_handle', [True, False]) @@ -58,7 +58,7 @@ def stress_param(*args, **kwargs): unit_param("auto"), quality_param("auto"), stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, - nrows, ncols, max_bytes_per_batch, out_dtype): + nrows, ncols, max_mbytes_per_batch, out_dtype): n_samples = nrows n_feats = ncols X, y = make_blobs(n_samples=n_samples, cluster_std=0.01, @@ -66,7 +66,7 @@ def test_dbscan(datatype, input_type, use_handle, handle, stream = 
get_handle(use_handle) cudbscan = cuDBSCAN(handle=handle, eps=1, min_samples=2, - max_bytes_per_batch=max_bytes_per_batch) + max_mbytes_per_batch=max_mbytes_per_batch) if input_type == 'dataframe': X = pd.DataFrame( From 2f36ec1b75b2ccb687656e98b7643b273d26f9c3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:31:38 -0400 Subject: [PATCH 24/63] Changing scale of max_bytes to max_mbytes. This allows us to no longer need to worry about overflows from bytes being larger than int. Now there's a hard-set rule for when long needs to be used- if N * batchSize > max_integer, an explicit exception will be thrown on the C++ side. --- python/cuml/cluster/dbscan.pyx | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 5b3717fc83..0a3d76097b 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -185,7 +185,7 @@ class DBSCAN(Base): if attr == 'labels_array': return self.labels_._column._data.mem - def fit(self, X, out_dtype="auto"): + def fit(self, X, out_dtype="int32"): """ Perform DBSCAN clustering from features. @@ -196,17 +196,14 @@ class DBSCAN(Base): Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy out_dtype: dtype Determines the precision of the output labels array. - default: "auto". Valid values are { "auto", "int32", np.int32, + default: "int32". Valid values are { "int32", np.int32, "int64", np.int64}. When the number of samples exceed - """ if self.labels_ is not None: del self.labels_ - if out_dtype == "auto": - out_dtype = np.int32 if X.shape[0] < 1e6 else np.int64 - elif out_dtype not in ["int32", np.int32, "int64", np.int64]: + if out_dtype not in ["int32", np.int32, "int64", np.int64]: raise ValueError("Invalid value for out_dtype. 
" "Valid values are {'auto', 'int32', 'int64', " "np.int32, np.int64}") From ff6464335e2b35bd12a0f5b8b803deaea9b00897 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:36:55 -0400 Subject: [PATCH 25/63] Making style checker happy --- python/cuml/test/test_dbscan.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index c930a64b02..9d15043f67 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -56,7 +56,8 @@ def stress_param(*args, **kwargs): unit_param("int64"), unit_param(np.int64), unit_param("auto"), - quality_param("auto"), stress_param("auto")]) + quality_param("auto"), + stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, nrows, ncols, max_mbytes_per_batch, out_dtype): n_samples = nrows @@ -82,12 +83,10 @@ def test_dbscan(datatype, input_type, use_handle, score = adjusted_rand_score(cu_labels, sk_labels) assert score == 1 - if out_dtype is "int32" or out_dtype is np.int32: + if out_dtype == "int32" or out_dtype == np.int32: assert cu_labels.dtype == np.int32 - elif out_dtype is "int64" or out_dtype is np.int64: + elif out_dtype == "int64" or out_dtype == np.int64: assert cu_labels.dtype == np.int64 - else: # out_dtype was "auto" and we shouldn't have any inputs > threshold - assert cu_labels.dtype == np.int32 @pytest.mark.parametrize("name", [ @@ -123,6 +122,7 @@ def test_dbscan_sklearn_comparison(name, nrows): score = adjusted_rand_score(sk_y_pred, cu_y_pred) assert(score == 1.0) + @pytest.mark.xfail(strict=True, raises=ValueError) def test_dbscan_out_dtype_fails_invalid_input(): X, _ = make_blobs(n_samples=100) From d81bd803b969f3cf05cfeffe85eba387b36b16e6 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 22:39:40 -0400 Subject: [PATCH 26/63] Better assertion statement for users --- cpp/src/dbscan/runner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 0e2136c1b6..4cba104d14 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -95,13 +95,13 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, size_t vdSize = alignTo(sizeof(Index_) * (batchSize + 1), align); size_t exScanSize = alignTo(sizeof(Index_) * batchSize, align); - // TODO: We should ASSERT that N * batchSize is greater than the maximum value used Index_ MAX_LABEL = std::numeric_limits::max(); ASSERT( N * batchSize < MAX_LABEL, "An overflow occurred with the current choice of precision " - "and the number of samples. (Max allowed batch size is %d, but was %d)", + "and the number of samples. (Max allowed batch size is %d, but was %d). " + "Consider using double precision for the output labels.", MAX_LABEL / N, batchSize); if (workspace == NULL) { From c6e98bc3f8f8a16496e682a67939203a608b911d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:43:59 -0400 Subject: [PATCH 27/63] Making style checker happy --- python/cuml/cluster/dbscan.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 0a3d76097b..275974ee30 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -76,8 +76,6 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": size_t max_mbytes_per_batch, bool verbose) except + - - class DBSCAN(Base): """ DBSCAN is a very powerful yet fast clustering technique that finds clusters From f2e0a2d6bbe3ffa5438bea949ae3a222b083c8aa Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 22:46:25 -0400 Subject: [PATCH 28/63] Updating docs to use megabytes instead of bytes --- cpp/src/dbscan/dbscan.hpp | 2 +- python/cuml/cluster/dbscan.pyx | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/dbscan/dbscan.hpp b/cpp/src/dbscan/dbscan.hpp index 8f1e5ae4c8..364eae5efe 100644 --- a/cpp/src/dbscan/dbscan.hpp +++ b/cpp/src/dbscan/dbscan.hpp @@ -29,7 +29,7 @@ namespace ML { * @param[in] eps the epsilon value to use for epsilon-neighborhood determination * @param[in] min_pts minimum number of points to determine a cluster * @param[out] labels (size n_rows) output labels array - * @param[in] max_mem_bytes: the maximum number of bytes to be used for each batch of + * @param[in] max_mem_mbytes: the maximum number of megabytes to be used for each batch of * the pairwise distance calculation. This enables the trade off between * memory usage and algorithm execution time. * @param[in] verbose: print useful information as algorithm executes diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 275974ee30..cba90fa0be 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -128,15 +128,15 @@ class DBSCAN(Base): verbose : bool Whether to print debug spews max_mbytes_per_batch : (optional) int64 - Calculate batch size using no more than this number of bytes for the - pairwise distance computation. This enables the trade-off between + Calculate batch size using no more than this number of megabytes for + the pairwise distance computation. This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples. If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device. 
Note: this option does not set the maximum total memory used in the - DBSCAN computation and so this value will not - be able to be set to the total memory available on the device. + DBSCAN computation and so this value will not be able to be set to + the total memory available on the device. Attributes ----------- From 70aedbc9133c51f51769301b60c71629f0d133af Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 23:55:46 -0400 Subject: [PATCH 29/63] Style checker fixes --- python/cuml/cluster/dbscan.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index cba90fa0be..29687f0883 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -76,6 +76,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": size_t max_mbytes_per_batch, bool verbose) except + + class DBSCAN(Base): """ DBSCAN is a very powerful yet fast clustering technique that finds clusters From d0a9d9a64c1217416d2376716e3f5a776047c7f5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 23:57:00 -0400 Subject: [PATCH 30/63] Fixing dbscan test --- python/cuml/cluster/dbscan.pyx | 2 +- python/cuml/test/test_dbscan.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 29687f0883..8e7a5215fa 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -204,7 +204,7 @@ class DBSCAN(Base): if out_dtype not in ["int32", np.int32, "int64", np.int64]: raise ValueError("Invalid value for out_dtype. 
" - "Valid values are {'auto', 'int32', 'int64', " + "Valid values are {'int32', 'int64', " "np.int32, np.int64}") cdef uintptr_t input_ptr diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 9d15043f67..48d4b5641a 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -55,9 +55,8 @@ def stress_param(*args, **kwargs): unit_param(np.int32), unit_param("int64"), unit_param(np.int64), - unit_param("auto"), - quality_param("auto"), - stress_param("auto")]) + quality_param("int32"), + stress_param("int32")]) def test_dbscan(datatype, input_type, use_handle, nrows, ncols, max_mbytes_per_batch, out_dtype): n_samples = nrows From f0884eed92368a882d4d8e13c33ae81968d4050a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sun, 29 Sep 2019 00:22:08 -0400 Subject: [PATCH 31/63] Adding double precision outputs to dbscan gtests. Removing long and using int64_t for portability --- cpp/src/datasets/make_blobs.cu | 14 ++--- cpp/src/datasets/make_blobs.hpp | 9 ++-- cpp/src/dbscan/dbscan.cu | 22 ++++---- cpp/src/dbscan/dbscan.hpp | 8 +-- cpp/src/metrics/metrics.cu | 8 +-- cpp/src/metrics/metrics.hpp | 8 +-- cpp/test/sg/dbscan_test.cu | 93 ++++++++++++++++++++++----------- python/cuml/cluster/dbscan.pyx | 12 ++--- 8 files changed, 103 insertions(+), 71 deletions(-) diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu index 3b476cd5ca..3c173fc4d7 100644 --- a/cpp/src/datasets/make_blobs.cu +++ b/cpp/src/datasets/make_blobs.cu @@ -23,19 +23,19 @@ namespace ML { namespace Datasets { -void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows, - long n_cols, long n_clusters, const float* centers, - const float* cluster_std, const float cluster_std_scalar, - bool shuffle, float center_box_min, float center_box_max, - uint64_t seed) { +void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, + const float* 
centers, const float* cluster_std, + const float cluster_std_scalar, bool shuffle, + float center_box_min, float center_box_max, uint64_t seed) { MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(), handle.getStream(), centers, cluster_std, cluster_std_scalar, shuffle, center_box_min, center_box_max, seed); } -void make_blobs(const cumlHandle& handle, double* out, long* labels, - long n_rows, long n_cols, long n_clusters, +void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, + int64_t n_rows, long n_cols, int64_t n_clusters, const double* centers, const double* cluster_std, const double cluster_std_scalar, bool shuffle, double center_box_min, double center_box_max, uint64_t seed) { diff --git a/cpp/src/datasets/make_blobs.hpp b/cpp/src/datasets/make_blobs.hpp index 645f075e38..ca2ed1611a 100644 --- a/cpp/src/datasets/make_blobs.hpp +++ b/cpp/src/datasets/make_blobs.hpp @@ -49,15 +49,16 @@ namespace Datasets { * centers. Useful only if 'centers' is nullptr * @param seed seed for the RNG */ -void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows, - long n_cols, long n_clusters, const float* centers = nullptr, +void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, + const float* centers = nullptr, const float* cluster_std = nullptr, const float cluster_std_scalar = 1.f, bool shuffle = true, float center_box_min = 10.f, float center_box_max = 10.f, uint64_t seed = 0ULL); -void make_blobs(const cumlHandle& handle, double* out, long* labels, - long n_rows, long n_cols, long n_clusters, +void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, const double* centers = nullptr, const double* cluster_std = nullptr, const double cluster_std_scalar = 1.f, bool shuffle = true, diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu index 
055c234847..833d6ef701 100644 --- a/cpp/src/dbscan/dbscan.cu +++ b/cpp/src/dbscan/dbscan.cu @@ -40,20 +40,20 @@ void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, handle.getStream(), verbose); } -void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, - float eps, int min_pts, long *labels, size_t max_bytes_per_batch, - bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); +void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows, + int64_t n_cols, float eps, int min_pts, int64_t *labels, + size_t max_bytes_per_batch, bool verbose) { + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } -void dbscanFit(const cumlHandle &handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, size_t max_bytes_per_batch, bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } }; // end namespace ML diff --git a/cpp/src/dbscan/dbscan.hpp b/cpp/src/dbscan/dbscan.hpp index 364eae5efe..aebe3ad8f4 100644 --- a/cpp/src/dbscan/dbscan.hpp +++ b/cpp/src/dbscan/dbscan.hpp @@ -42,11 +42,11 @@ void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, double eps, int min_pts, int *labels, size_t max_bytes_per_batch = 0, bool verbose = false); -void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, - float eps, int min_pts, long *labels, +void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows, + int64_t 
n_cols, float eps, int min_pts, int64_t *labels, size_t max_bytes_per_batch = 0, bool verbose = false); -void dbscanFit(const cumlHandle &handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, size_t max_bytes_per_batch = 0, bool verbose = false); /** @} */ diff --git a/cpp/src/metrics/metrics.cu b/cpp/src/metrics/metrics.cu index 79007262cb..0a1faed817 100644 --- a/cpp/src/metrics/metrics.cu +++ b/cpp/src/metrics/metrics.cu @@ -50,10 +50,10 @@ double silhouetteScore(const cumlHandle &handle, double *y, int nRows, handle.getStream(), metric); } -double adjustedRandIndex(const cumlHandle &handle, const long *y, - const long *y_hat, const long n, - const long lower_class_range, - const long upper_class_range) { +double adjustedRandIndex(const cumlHandle &handle, const int64_t *y, + const int64_t *y_hat, const int64_t n, + const int64_t lower_class_range, + const int64_t upper_class_range) { return MLCommon::Metrics::computeAdjustedRandIndex( y, y_hat, n, lower_class_range, upper_class_range, handle.getDeviceAllocator(), handle.getStream()); diff --git a/cpp/src/metrics/metrics.hpp b/cpp/src/metrics/metrics.hpp index 881d5a9aab..cc1184ecac 100644 --- a/cpp/src/metrics/metrics.hpp +++ b/cpp/src/metrics/metrics.hpp @@ -106,10 +106,10 @@ double silhouetteScore(const cumlHandle &handle, double *y, int nRows, * @param upper_class_range: the highest value in the range of classes * @return: The adjusted rand index value */ -double adjustedRandIndex(const cumlHandle &handle, const long *y, - const long *y_hat, const long n, - const long lower_class_range, - const long upper_class_range); +double adjustedRandIndex(const cumlHandle &handle, const int64_t *y, + const int64_t *y_hat, const int64_t n, + const int64_t lower_class_range, + const int64_t upper_class_range); double adjustedRandIndex(const cumlHandle 
&handle, const int *y, const int *y_hat, const int n, diff --git a/cpp/test/sg/dbscan_test.cu b/cpp/test/sg/dbscan_test.cu index 9518b466d2..bc58d89887 100644 --- a/cpp/test/sg/dbscan_test.cu +++ b/cpp/test/sg/dbscan_test.cu @@ -40,11 +40,11 @@ using namespace Datasets; using namespace Metrics; using namespace std; -template +template struct DbscanInputs { - int n_row; - int n_col; - int n_centers; + IdxT n_row; + IdxT n_col; + IdxT n_centers; T cluster_std; T eps; int min_pts; @@ -52,23 +52,24 @@ struct DbscanInputs { unsigned long long int seed; }; -template -::std::ostream& operator<<(::std::ostream& os, const DbscanInputs& dims) { +template +::std::ostream& operator<<(::std::ostream& os, + const DbscanInputs& dims) { return os; } -template -class DbscanTest : public ::testing::TestWithParam> { +template +class DbscanTest : public ::testing::TestWithParam> { protected: void basicTest() { cumlHandle handle; - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); device_buffer out(handle.getDeviceAllocator(), handle.getStream(), params.n_row * params.n_col); - device_buffer l(handle.getDeviceAllocator(), handle.getStream(), - params.n_row); + device_buffer l(handle.getDeviceAllocator(), handle.getStream(), + params.n_row); make_blobs(handle, out.data(), l.data(), params.n_row, params.n_col, params.n_centers, nullptr, nullptr, params.cluster_std, true, @@ -96,6 +97,8 @@ class DbscanTest : public ::testing::TestWithParam> { std::cout << "y_hat: " << arr2Str(labels, 25, "labels", handle.getStream()) << std::endl; + + std::cout << "Score = " << score << std::endl; } } @@ -107,38 +110,66 @@ class DbscanTest : public ::testing::TestWithParam> { } protected: - DbscanInputs params; - int *labels, *labels_ref; + DbscanInputs params; + IdxT *labels, *labels_ref; double score; }; -const std::vector> inputsf2 = { - {50000, 16, 5, 0.01, 2, 2, (size_t)13e8, 1234ULL}, +const std::vector> inputsf2 = { + {50000, 16, 5, 0.01, 2, 2, 
(size_t)13e3, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {50000, 16, 5l, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e3, 1234ULL}}; + +const std::vector> inputsf3 = { + {50000, 16, 5, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {50000, 16, 5l, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)9e3, 1234ULL}}; + +const std::vector> inputsd2 = { + {50000, 16, 5, 0.01, 2, 2, (size_t)13e3, 1234ULL}, {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000, 1000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {50000, 16, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000, 10000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000, 100, 5000, 0.01, 2, 2, (size_t)13e8, 1234ULL}}; + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {100, 10000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e3, 1234ULL}}; -const std::vector> inputsd2 = { - {50000, 16, 5, 0.01, 2, 2, (size_t)13e9, 1234ULL}, +const std::vector> inputsd3 = { + {50000, 16, 5, 0.01, 2, 2, (size_t)9e3, 1234ULL}, {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000, 1000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {100, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000, 100, 5000, 0.01, 2, 2, (size_t)13e9, 1234ULL}}; + {1000, 1000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {100, 10000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)9e3, 1234ULL}}; -typedef DbscanTest DbscanTestF; -TEST_P(DbscanTestF, Result) { ASSERT_TRUE(score == 1.0); } 
+typedef DbscanTest DbscanTestF_Int; +TEST_P(DbscanTestF_Int, Result) { ASSERT_TRUE(score == 1.0); } -typedef DbscanTest DbscanTestD; -TEST_P(DbscanTestD, Result) { ASSERT_TRUE(score == 1.0); } +typedef DbscanTest DbscanTestF_Int64; +TEST_P(DbscanTestF_Int64, Result) { ASSERT_TRUE(score == 1.0); } -INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestF, +typedef DbscanTest DbscanTestD_Int; +TEST_P(DbscanTestD_Int, Result) { ASSERT_TRUE(score == 1.0); } + +typedef DbscanTest DbscanTestD_Int64; +TEST_P(DbscanTestD_Int64, Result) { ASSERT_TRUE(score == 1.0); } + +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestF_Int, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestD, +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestF_Int64, + ::testing::ValuesIn(inputsf3)); + +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestD_Int, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestD_Int64, + ::testing::ValuesIn(inputsd3)); + } // end namespace ML diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 8e7a5215fa..059944144d 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -232,11 +232,11 @@ class DBSCAN(Base): else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_mbytes_per_batch, self.verbose) @@ -254,11 +254,11 @@ class DBSCAN(Base): else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_mbytes_per_batch, self.verbose) From 2e775769c4ae1cc4bf3666ba134d186eaf8f1924 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sun, 29 Sep 2019 00:33:56 -0400 Subject: [PATCH 32/63] Catching any missing longs in make_blobs and dbscan --- cpp/src/datasets/make_blobs.cu | 2 +- cpp/src/dbscan/adjgraph/algo.h | 2 +- cpp/src/dbscan/vertexdeg/naive.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu index 3c173fc4d7..f9bf1d8d99 100644 --- a/cpp/src/datasets/make_blobs.cu +++ b/cpp/src/datasets/make_blobs.cu @@ -35,7 +35,7 @@ void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, } void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, - int64_t n_rows, long n_cols, int64_t n_clusters, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, const double* centers, const double* cluster_std, const double cluster_std_scalar, bool shuffle, double center_box_min, double center_box_max, uint64_t seed) { diff --git a/cpp/src/dbscan/adjgraph/algo.h b/cpp/src/dbscan/adjgraph/algo.h index 0793ede1b0..1df95dfa21 100644 --- a/cpp/src/dbscan/adjgraph/algo.h +++ b/cpp/src/dbscan/adjgraph/algo.h @@ -40,7 +40,7 @@ static const int TPB_X = 256; * Takes vertex degree array (vd) and CSR row_ind array (ex_scan) to produce the * CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts. */ -template +template void launcher(const ML::cumlHandle_impl &handle, Pack data, Index_ batchSize, cudaStream_t stream) { device_ptr dev_vd = device_pointer_cast(data.vd); diff --git a/cpp/src/dbscan/vertexdeg/naive.h b/cpp/src/dbscan/vertexdeg/naive.h index f2a9cf4b13..8710640295 100644 --- a/cpp/src/dbscan/vertexdeg/naive.h +++ b/cpp/src/dbscan/vertexdeg/naive.h @@ -71,7 +71,7 @@ __global__ void vertex_degree_kernel(Pack data, } } -template +template void launcher(Pack data, Index_ startVertexId, Index_ batchSize, cudaStream_t stream) { ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, From 36cd51a9ef055a489b774e68682feec91ded8715 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sun, 29 Sep 2019 08:25:29 -0400 Subject: [PATCH 33/63] Importing int64_t --- python/cuml/cluster/kmeans.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index dca1316e21..fcfa070de4 100644 --- a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -27,7 +27,7 @@ import warnings from librmm_cffi import librmm as rmm from libcpp cimport bool -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, int64_t from libc.stdlib cimport calloc, malloc, free from cuml.common.base import Base From 2503d4fdb91e9131cdc3b280810818f185ff1fd6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:51:37 +1000 Subject: [PATCH 34/63] Updated Barnes Hut TSNE documentation --- python/cuml/manifold/t_sne.pyx | 35 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 453fa60a42..2196fbba4f 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -71,24 +71,21 @@ cdef extern from "tsne/tsne.h" namespace "ML" nogil: class TSNE(Base): - """ +""" TSNE (T-Distributed Stochastic Neighbor Embedding) is an extremely powerful dimensionality reduction technique that aims to maintain local distances between data points. It is extremely robust to whatever dataset you give it, and is used in many areas including cancer research, music analysis and neural network weight visualizations. - The current cuML TSNE implementation is a first experimental release. It - defaults to use the 'exact' fitting algorithm, which is signficantly slower - then the Barnes-Hut algorithm as data sizes grow. 
A preview implementation - of Barnes-Hut (derived from CannyLabs' BH open source CUDA code) is also - available for problems with n_components = 2, though this implementation - currently has outstanding issues that can lead to crashes in rare - scenarios. Future releases of TSNE will fix these issues (tracked as cuML - Issue #1002) and switch Barnes-Hut to be the default. + Currently, cuML's TSNE supports the fast Barnes Hut O(NlogN) TSNE + approximation (derived from CannyLabs' BH open source CUDA code). This + allows TSNE to produce extremely fast embeddings when n_components = 2. + cuML defaults to this algorithm. A slower but more accurate Exact + algorithm is also provided. Parameters - ---------- + ----------- n_components : int (default 2) The output dimensionality size. Currently only size=2 is tested, but the 'exact' algorithm will support greater dimensionality in future. @@ -110,13 +107,18 @@ class TSNE(Base): metric : str 'euclidean' only (default 'euclidean') Currently only supports euclidean distance. Will support cosine in a future release. - init : str 'random' only (default 'random') - Currently only supports random intialization. Will support PCA - intialization in a future release. + init : str 'random' (default 'random') + Currently supports random intialization. verbose : int (default 0) Level of verbosity. If > 0, prints all help messages and warnings. + Most messages will be printed inside the Python Console. random_state : int (default None) - Setting this can allow future runs of TSNE to look the same. + Setting this can allow future runs of TSNE to look mostly the same. + It is known that TSNE tends to have vastly different outputs on + many runs. Try using PCA intialization (in future release) to + possibly counteract this problem. + It is known that small perturbations can directly + change the result of the embedding for parallel TSNE implementations. 
method : str 'barnes_hut' or 'exact' (default 'barnes_hut') Options are either barnes_hut or exact. It is recommend that you use the barnes hut approximation for superior O(nlogn) complexity. @@ -147,7 +149,7 @@ class TSNE(Base): one for you anew! References - ---------- + ----------- * van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding https://lvdmaaten.github.io/tsne/ @@ -169,6 +171,9 @@ class TSNE(Base): specifying random_state and fixing it across runs can help, but TSNE does not guarantee similar results each time. + As suggested, PCA (in future release) can also help to alleviate this + issue. + Reference Implementation ------------------------- The CUDA implementation is derived from the excellent CannyLabs open source From 4aa11d8f8d717f36cf895093ef474228d1eb6e85 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:53:10 +1000 Subject: [PATCH 35/63] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b471839e0..51c3016454 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - PR #1115: Moving dask_make_blobs to cuml.dask.datasets. Adding conversion to dask.DataFrame - PR #1136: CUDA 10.1 CI updates - PR #1165: Adding except + in all remaining cython +- PR #1173: Docs: Barnes Hut TSNE documentation ## Bug Fixes From 1da825e6fae4011b3631aee237c209328c6d2516 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:55:42 +1000 Subject: [PATCH 36/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa8a4dfe22..d2ff95409a 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). 
| | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | (Experimental) | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Barnes-Hut O(NlogN) and Exact O(N^2) | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From b739ed0bafa9ce0eec13aee47c4a7d22ea0e7338 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:56:46 +1000 Subject: [PATCH 37/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d2ff95409a..4a2958b7da 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). | | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Barnes-Hut O(NlogN) and Exact O(N^2) | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut O(NlogN) and Exact O(N^2) are provided | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From 166160d025d308b44370c9853da44bede6ac9641 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:58:50 +1000 Subject: [PATCH 38/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 2196fbba4f..55149da812 100644 --- 
a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -71,7 +71,7 @@ cdef extern from "tsne/tsne.h" namespace "ML" nogil: class TSNE(Base): -""" + """ TSNE (T-Distributed Stochastic Neighbor Embedding) is an extremely powerful dimensionality reduction technique that aims to maintain local distances between data points. It is extremely robust to whatever @@ -306,9 +306,10 @@ class TSNE(Base): return def fit(self, X): - """Fit X into an embedded space. + """ + Fit X into an embedded space. Parameters - ---------- + ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device @@ -418,15 +419,16 @@ class TSNE(Base): self.Y = None def fit_transform(self, X): - """Fit X into an embedded space and return that transformed output. + """ + Fit X into an embedded space and return that transformed output. Parameters - ---------- + ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy Returns - ------- + -------- X_new : array, shape (n_samples, n_components) Embedding of the training data in low-dimensional space. """ From 9f37cba9551f0f65a9a12e3974b72ea7737663d7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 02:00:45 +1000 Subject: [PATCH 39/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 55149da812..339c3e24c7 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -115,7 +115,7 @@ class TSNE(Base): random_state : int (default None) Setting this can allow future runs of TSNE to look mostly the same. It is known that TSNE tends to have vastly different outputs on - many runs. 
Try using PCA intialization (in future release) to + many runs. Try using PCA intialization (in future release) to possibly counteract this problem. It is known that small perturbations can directly change the result of the embedding for parallel TSNE implementations. From d4df9b743f1d39155882fe29f61b71645d6f28c9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 02:09:48 +1000 Subject: [PATCH 40/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a2958b7da..1ff2549f7f 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). | | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut O(NlogN) and Exact O(N^2) are provided | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut and Exact algorithms are provided | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From d3954f3078efe23d7a947a0c6a0761188ba8c49d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 03:49:39 +1000 Subject: [PATCH 41/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ff2549f7f..ec7c281282 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). 
| | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut and Exact algorithms are provided | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From 421c9c3ff5f67c1a96eadf3c878476dc15fb2501 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 03:51:05 +1000 Subject: [PATCH 42/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 339c3e24c7..20fea2e2b6 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -115,8 +115,8 @@ class TSNE(Base): random_state : int (default None) Setting this can allow future runs of TSNE to look mostly the same. It is known that TSNE tends to have vastly different outputs on - many runs. Try using PCA intialization (in future release) to - possibly counteract this problem. + many runs. Try using PCA intialization (upcoming with change #1098) + to possibly counteract this problem. It is known that small perturbations can directly change the result of the embedding for parallel TSNE implementations. method : str 'barnes_hut' or 'exact' (default 'barnes_hut') @@ -171,8 +171,8 @@ class TSNE(Base): specifying random_state and fixing it across runs can help, but TSNE does not guarantee similar results each time. - As suggested, PCA (in future release) can also help to alleviate this - issue. + As suggested, PCA (upcoming with change #1098) can also help to alleviate + this issue. 
Reference Implementation ------------------------- From 24ae3d724ec731e8862cefddff535fe89f5e141b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 03:51:54 +1000 Subject: [PATCH 43/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 20fea2e2b6..17ba9765b8 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -419,8 +419,8 @@ class TSNE(Base): self.Y = None def fit_transform(self, X): - """ - Fit X into an embedded space and return that transformed output. + """Fit X into an embedded space and return that transformed output. + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) From 2a66c48837c08a13a3d15a69fd0a43219781db55 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 30 Sep 2019 14:34:21 -0400 Subject: [PATCH 44/63] Adding timing information to verbose printing --- cpp/src/dbscan/runner.h | 29 ++++++++++++++++++++++++++++- cpp/src/dbscan/vertexdeg/algo.h | 9 ++++----- cpp/src/dbscan/vertexdeg/naive.h | 4 ++-- cpp/src_prims/sparse/csr.h | 19 +++++++++++++------ 4 files changed, 47 insertions(+), 14 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 4cba104d14..112d4ded9d 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -25,6 +25,8 @@ #include "sparse/csr.h" #include "vertexdeg/runner.h" +#include + namespace Dbscan { using namespace MLCommon; @@ -57,6 +59,12 @@ void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { [MAX_LABEL] __device__(Index_ val) { return val == MAX_LABEL; }); } +int64_t curTimeMillis() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000 + tp.tv_usec / 1000; +} + /* @param N number of points * @param D dimensionality of the points * @param eps epsilon neighborhood criterion @@ -146,6 +154,8 @@ size_t run(const 
ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, std::cout << "- Iteration " << i + 1 << " / " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; + int64_t start_time = curTimeMillis(); + if (verbose) std::cout << "--> Computing vertex degrees" << std::endl; VertexDeg::run(handle, adj, vd, x, eps, N, D, algoVd, startVertexId, nPoints, stream); @@ -153,9 +163,15 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, CUDA_CHECK(cudaStreamSynchronize(stream)); ML::POP_RANGE(); + int64_t cur_time = curTimeMillis(); + if (verbose) + std::cout << " |-> Took " << (cur_time - start_time) << "ms." + << std::endl; + if (verbose) std::cout << "--> Computing adjacency graph of size " << curradjlen << " samples." << std::endl; + start_time = curTimeMillis(); // Running AdjGraph ML::PUSH_RANGE("Trace::Dbscan::AdjGraph"); if (curradjlen > adjlen || adj_graph.data() == NULL) { @@ -171,14 +187,25 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); + cur_time = curTimeMillis(); + if (verbose) + std::cout << " |-> Took " << (cur_time - start_time) << "ms." + << std::endl; + if (verbose) std::cout << "--> Computing connected components" << std::endl; - MLCommon::Sparse::weak_cc_batched( + start_time = curTimeMillis(); + MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, &state, stream, [core_pts] __device__(Index_ tid) { return core_pts[tid]; }); ML::POP_RANGE(); + cur_time = curTimeMillis(); + if (verbose) + std::cout << " |-> Took " << (cur_time - start_time) << "ms." 
+ << std::endl; + if (verbose) std::cout << " " << std::endl; } diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 6b1bf29b24..160281d697 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -65,12 +65,11 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, index_t batch_vertex = fmod(global_c_idx, n); if (sizeof(index_t) == 4) { - atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); - atomicAdd((int *)(vd + n), (int)in_neigh); + atomicAdd((unsigned int *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned int *)(vd + n), in_neigh); } else if (sizeof(index_t) == 8) { - atomicAdd((unsigned long long *)(vd + batch_vertex), - (unsigned long long)in_neigh); - atomicAdd((unsigned long long *)(vd + n), (unsigned long long)in_neigh); + atomicAdd((unsigned long long int *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned long long int *)(vd + n), in_neigh); } }; diff --git a/cpp/src/dbscan/vertexdeg/naive.h b/cpp/src/dbscan/vertexdeg/naive.h index 8710640295..db17d0fe34 100644 --- a/cpp/src/dbscan/vertexdeg/naive.h +++ b/cpp/src/dbscan/vertexdeg/naive.h @@ -66,8 +66,8 @@ __global__ void vertex_degree_kernel(Pack data, atomicAdd((int *)(vd + row), (int)res); atomicAdd((int *)(vd + batchSize), (int)res); } else if (sizeof(Index_) == 8) { - atomicAdd((unsigned long long *)(vd + row), (unsigned long long)res); - atomicAdd((unsigned long long *)(vd + batchSize), (unsigned long long)res); + atomicAdd((unsigned long long *)(vd + row), res); + atomicAdd((unsigned long long *)(vd + batchSize), res); } } diff --git a/cpp/src_prims/sparse/csr.h b/cpp/src_prims/sparse/csr.h index 03cbd8a0e8..f35f8268c2 100644 --- a/cpp/src_prims/sparse/csr.h +++ b/cpp/src_prims/sparse/csr.h @@ -708,9 +708,9 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, cj = labels[j_ind]; if (ci < cj) { if (sizeof(Index_) == 4) - atomicMin((unsigned int *)(labels + j_ind), ci); + atomicMin((int 
*)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((unsigned long long int *)(labels + j_ind), ci); + atomicMin((long long int *)(labels + j_ind), ci); xa[j_ind] = true; m[0] = true; } else if (ci > cj) { @@ -720,11 +720,9 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((unsigned int *)(labels + startVertexId + tid), ci); + atomicMin((int *)(labels + startVertexId + tid), ci); else if (sizeof(Index_) == 8) - atomicMin((unsigned long long int *)(labels + startVertexId + tid), - ci); - + atomicMin((long long int *)(labels + startVertexId + tid), ci); xa[startVertexId + tid] = true; m[0] = true; } @@ -777,6 +775,7 @@ void weak_cc_label_batched(Index_ *labels, const Index_ *row_ind, labels, startVertexId, batchSize, MAX_LABEL, filter_op); CUDA_CHECK(cudaPeekAtLastError()); + int n_iters = 0; do { CUDA_CHECK(cudaMemsetAsync(state->m, false, sizeof(bool), stream)); @@ -784,6 +783,8 @@ void weak_cc_label_batched(Index_ *labels, const Index_ *row_ind, labels, row_ind, row_ind_ptr, nnz, state->fa, state->xa, state->m, startVertexId, batchSize); CUDA_CHECK(cudaPeekAtLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + //** swapping F1 and F2 MLCommon::updateHost(host_fa, state->fa, N, stream); @@ -794,7 +795,12 @@ void weak_cc_label_batched(Index_ *labels, const Index_ *row_ind, //** Updating m * MLCommon::updateHost(&host_m, state->m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); + + n_iters++; } while (host_m); + + free(host_fa); + free(host_xa); } /** @@ -833,6 +839,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, dim3 threads(TPB_X); Index_ MAX_LABEL = std::numeric_limits::max(); + if (startVertexId == 0) { weak_cc_init_all_kernel<<>>( labels, state->fa, state->xa, N, MAX_LABEL); From 4014df40e4f17d759542ff0ff1513adc3bcd47db Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 05:00:04 +1000 Subject: [PATCH 45/63] Fix style 
--- python/cuml/manifold/t_sne.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 17ba9765b8..98c72b1cca 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -420,7 +420,6 @@ class TSNE(Base): def fit_transform(self, X): """Fit X into an embedded space and return that transformed output. - Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) From c00beb564d4ac955c8cbe032ed3ed64fa24a515f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 30 Sep 2019 15:34:52 -0400 Subject: [PATCH 46/63] Fixing Python to use int64_t --- python/cuml/cluster/dbscan.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 059944144d..02eaafd66c 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -24,7 +24,7 @@ import cudf import numpy as np from libcpp cimport bool -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, int64_t from libc.stdlib cimport calloc, malloc, free from cuml.common.base import Base @@ -58,21 +58,21 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, float *input, - long n_rows, - long n_cols, + int64_t n_rows, + int64_t n_cols, double eps, int min_pts, - long *labels, + int64_t *labels, size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, double *input, - long n_rows, - long n_cols, + int64_t n_rows, + int64_t n_cols, double eps, int min_pts, - long *labels, + int64_t *labels, size_t max_mbytes_per_batch, bool verbose) except + From f7407fa9e8f905f1d5473cad981b1381c5297fd1 Mon Sep 17 00:00:00 2001 From: John Zedlewski Date: Mon, 30 Sep 2019 15:20:40 -0700 Subject: [PATCH 47/63] Add metrics, utils, SVM to docs --- README.md | 1 + docs/source/api.rst | 32 +++++++++++++++++-- 
.../cuml/ensemble/randomforestclassifier.pyx | 19 ++++++++--- python/cuml/metrics/accuracy.pyx | 13 +++++--- python/cuml/svm/svm.pyx | 5 +-- python/cuml/utils/input_utils.py | 21 +++++++++--- 6 files changed, 72 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index fa8a4dfe22..9bd6206bcf 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). | **Nonlinear Models for Regression or Classification** | Random Forest (RF) Classification | Experimental multi-node, multi-GPU version available via Dask integration | | | Random Forest (RF) Regression | Experimental multi-node, multi-GPU version available via Dask integration | | | K-Nearest Neighbors (KNN) | Multi-GPU
Uses [Faiss](https://github.com/facebookresearch/faiss) | +| | Support Vector Machine Classifier (SVC) | | | **Time Series** | Linear Kalman Filter | | | | Holt-Winters Exponential Smoothing | | --- diff --git a/docs/source/api.rst b/docs/source/api.rst index fce956891b..1d1cc5f069 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,8 +4,8 @@ cuML API Reference -Preprocessing -============== +Preprocessing, Metrics, and Utilities +===================================== Model Selection and Data Splitting ---------------------------------- @@ -24,6 +24,28 @@ Dataset Generation .. automethod:: cuml.datasets.make_blobs +Metrics +--------- + + .. automodule:: cuml.metrics.regression + :members: + + .. automodule:: cuml.metrics.accuracy + :members: + + .. automodule:: cuml.metrics.trustworthiness + :members: + + +Utilities for I/O and Numba +--------------------------- + + .. automodule:: cuml.utils.input_utils + :members: + + .. automodule:: cuml.utils.numba_utils + :members: + Regression and Classification ============================= @@ -84,6 +106,12 @@ Quasi-Newton .. autoclass:: cuml.QN :members: +Support Vector Machines +------------------------ + +.. autoclass:: cuml.svm.SVC + :members: + Clustering ========== diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 04f848a92d..f5dc6c616f 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -608,6 +608,7 @@ class RandomForestClassifier(Base): num_classes=2): """ Predicts the labels for X. 
+ Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) @@ -639,9 +640,10 @@ class RandomForestClassifier(Base): It is applied if output_class == True, else it is ignored num_classes : integer number of different classes present in the dataset + Returns ---------- - y: NumPy + y : NumPy Dense vector (int) of shape (n_samples, 1) """ if self.dtype == np.float64: @@ -662,15 +664,17 @@ class RandomForestClassifier(Base): def _predict_get_all(self, X): """ Predicts the labels for X. + Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + Returns ---------- - y: NumPy + y : NumPy Dense vector (int) of shape (n_samples, 1) """ cdef uintptr_t X_ptr @@ -724,17 +728,20 @@ class RandomForestClassifier(Base): def score(self, X, y): """ Calculates the accuracy metric score of the model for X. + Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy - y: NumPy + y : NumPy Dense vector (int) of shape (n_samples, 1) + Returns - ---------- - accuracy of the model + ------- + float + Accuracy of the model [0.0 - 1.0] """ cdef uintptr_t X_ptr, y_ptr X_m, X_ptr, n_rows, n_cols, _ = \ @@ -795,6 +802,7 @@ class RandomForestClassifier(Base): """ Returns the value of all parameters required to configure this estimator as a dictionary. + Parameters ----------- deep : boolean (default = True) @@ -811,6 +819,7 @@ class RandomForestClassifier(Base): Sets the value of parameters required to configure this estimator, it functions similar to the sklearn set_params. 
+ Parameters ----------- params : dict of new params diff --git a/python/cuml/metrics/accuracy.pyx b/python/cuml/metrics/accuracy.pyx index dfebe1910f..5ce62bb591 100644 --- a/python/cuml/metrics/accuracy.pyx +++ b/python/cuml/metrics/accuracy.pyx @@ -42,13 +42,16 @@ def accuracy_score(ground_truth, predictions, handle=None): Parameters ---------- - handle : cuml.Handle - prediction : The lables predicted by the model - for the test dataset - ground_truth : The ground truth labels of the test dataset + handle : cuml.Handle + prediction : NumPy ndarray or Numba device + The lables predicted by the model for the test dataset + ground_truth : NumPy ndarray, Numba device + The ground truth labels of the test dataset + Returns ------- - The accuracy of the model used for prediction + float + The accuracy of the model used for prediction """ handle = cuml.common.handle.Handle() \ if handle is None else handle diff --git a/python/cuml/svm/svm.pyx b/python/cuml/svm/svm.pyx index 141e8a5002..8ebf541f49 100644 --- a/python/cuml/svm/svm.pyx +++ b/python/cuml/svm/svm.pyx @@ -442,9 +442,10 @@ class SVC(Base): Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + Returns - ---------- - y: cuDF Series + ------- + y : cuDF Series Dense vector (floats or doubles) of shape (n_samples, 1) """ diff --git a/python/cuml/utils/input_utils.py b/python/cuml/utils/input_utils.py index 6170ab4223..282a0d4bea 100644 --- a/python/cuml/utils/input_utils.py +++ b/python/cuml/utils/input_utils.py @@ -71,14 +71,20 @@ def input_to_dev_array(X, order='F', deepcopy=False, check_cols=False, check_rows=False, fail_on_order=False): """ - Convert input X to device array suitable for C++ methods + Convert input X to device array suitable for C++ methods. 
+ Acceptable input formats: + * cuDF Dataframe - returns a deep copy always + * cuDF Series - returns by reference or a deep copy depending on `deepcopy` + * Numpy array - returns a copy in device always + * cuda array interface compliant array (like Cupy) - returns a - reference unless deepcopy=True + reference unless `deepcopy`=True + * numba device array - returns a reference unless deepcopy=True Parameters @@ -309,13 +315,18 @@ def input_to_host_array(X, order='F', deepcopy=False, """ Convert input X to host array (NumPy) suitable for C++ methods that accept host arrays. + Acceptable input formats: + * Numpy array - returns a pointer to the original input + * cuDF Dataframe - returns a deep copy always - * cuDF Series - returns by reference or a deep copy depending on - `deepcopy` - * cuda array interface compliant array (like Cupy) - returns a + + * cuDF Series - returns by reference or a deep copy depending on `deepcopy` + + * cuda array interface compliant array (like Cupy) - returns a \ reference unless deepcopy=True + * numba device array - returns a reference unless deepcopy=True Parameters From af4fe5a02a8411cd70ae2dec8b2adc734ba9826e Mon Sep 17 00:00:00 2001 From: John Zedlewski Date: Mon, 30 Sep 2019 15:27:30 -0700 Subject: [PATCH 48/63] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b471839e0..4f72f7b6aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,6 +104,7 @@ - PR #978: Update README for 0.9 - PR #1009: Fix references to notebooks-contrib - PR #1015: Ability to control the number of internal streams in cumlHandle_impl via cumlHandle +- PR #1175: Add more modules to docs ToC ## Bug Fixes From 39ede99cc64a410ae2b1cd7c91c0b966c030bfed Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 30 Sep 2019 21:09:00 -0400 Subject: [PATCH 49/63] Fixing out_dtype for fit_predict --- python/cuml/benchmark/algorithms.py | 3 ++- python/cuml/benchmark/runners.py | 16 +++++++++++++--- python/cuml/cluster/dbscan.pyx | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 088e0e8910..63e7686455 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -109,6 +109,7 @@ def run_cuml(self, data, **override_args): all_args = {**all_args, **override_args} cuml_obj = self.cuml_class(**all_args) + print(str(cuml_obj)) if self.data_prep_hook: data = self.data_prep_hook(data) if self.accepts_labels: @@ -159,7 +160,7 @@ def all_algorithms(): AlgorithmPair( sklearn.neighbors.NearestNeighbors, cuml.neighbors.NearestNeighbors, - shared_args=dict(n_neighbors=1024), + shared_args=dict(n_neighbors=1000000), cpu_args=dict(algorithm="brute"), cuml_args={}, name="NearestNeighbors", diff --git a/python/cuml/benchmark/runners.py b/python/cuml/benchmark/runners.py index ca3f4d9bbe..b7a3d9b15e 100644 --- a/python/cuml/benchmark/runners.py +++ b/python/cuml/benchmark/runners.py @@ -42,11 +42,11 @@ def _run_one_size( cuml_param_overrides={}, cpu_param_overrides={}, run_cpu=True, + verbose=False ): data = datagen.gen_data( self.dataset_name, self.input_type, n_samples, n_features ) - print("data type: ", data[0].__class__) cu_start = time.time() algo_pair.run_cuml(data, **param_overrides, **cuml_param_overrides) @@ -59,10 +59,17 @@ def _run_one_size( else: cpu_elapsed = 0.0 + speedup = cpu_elapsed / float(cu_elapsed) + + if verbose is True: + print("Benchmark [n_samples=%d, " + "n_features=%d with datatype=%s] = %f speedup." 
% + (n_samples, n_features, data[0].__class__, speedup)) + return dict( cu_time=cu_elapsed, cpu_time=cpu_elapsed, - speedup=cpu_elapsed / float(cu_elapsed), + speedup=speedup, n_samples=n_samples, n_features=n_features, **param_overrides, @@ -77,7 +84,9 @@ def run( cpu_param_overrides={}, *, run_cpu=True, - raise_on_error=False + raise_on_error=False, + verbose=False + ): all_results = [] for ns in self.bench_rows: @@ -92,6 +101,7 @@ def run( cuml_param_overrides, cpu_param_overrides, run_cpu, + verbose ) ) except Exception as e: diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 02eaafd66c..26be6f54ea 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -268,7 +268,7 @@ class DBSCAN(Base): del(X_m) return self - def fit_predict(self, X, out_dtype="auto"): + def fit_predict(self, X, out_dtype="int32"): """ Performs clustering on input_gdf and returns cluster labels. From a4673e9dbb264ab679e49d0f14d18fb6e97aeb9b Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 20:16:59 -0500 Subject: [PATCH 50/63] FIX Change import of RMM from cffi to cython --- python/cuml/cluster/kmeans.pyx | 2 +- python/cuml/cluster/kmeans_mg.pyx | 2 +- python/cuml/dask/linear_model/linear_regression.py | 2 +- python/cuml/dask/neighbors/nearest_neighbors.py | 2 +- python/cuml/decomposition/pca.pyx | 2 +- python/cuml/decomposition/tsvd.pyx | 2 +- python/cuml/fil/fil.pyx | 2 +- python/cuml/filter/kalman_filter.pyx | 2 +- python/cuml/manifold/t_sne.pyx | 2 +- python/cuml/manifold/umap.pyx | 2 +- python/cuml/neighbors/nearest_neighbors.pyx | 4 ++-- python/cuml/random_projection/random_projection.pyx | 2 +- python/cuml/solvers/qn.pyx | 2 +- python/cuml/utils/input_utils.py | 2 +- python/cuml/utils/numba_utils.py | 2 +- 15 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index 802acdc907..aa1cd53fab 100644 --- 
a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -24,7 +24,7 @@ import cudf import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/cluster/kmeans_mg.pyx b/python/cuml/cluster/kmeans_mg.pyx index d04d448f7a..57785a4a5d 100644 --- a/python/cuml/cluster/kmeans_mg.pyx +++ b/python/cuml/cluster/kmeans_mg.pyx @@ -24,7 +24,7 @@ import cudf import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/dask/linear_model/linear_regression.py b/python/cuml/dask/linear_model/linear_regression.py index 1034af5489..a860c00b6f 100644 --- a/python/cuml/dask/linear_model/linear_regression.py +++ b/python/cuml/dask/linear_model/linear_regression.py @@ -26,7 +26,7 @@ from dask import delayed from dask.distributed import wait, default_client from math import ceil -from librmm_cffi import librmm as rmm +import rmm from toolz import first from tornado import gen diff --git a/python/cuml/dask/neighbors/nearest_neighbors.py b/python/cuml/dask/neighbors/nearest_neighbors.py index b37ee0568c..7128eb2dd1 100644 --- a/python/cuml/dask/neighbors/nearest_neighbors.py +++ b/python/cuml/dask/neighbors/nearest_neighbors.py @@ -24,7 +24,7 @@ import random from cuml.utils import numba_utils -from librmm_cffi import librmm as rmm +import rmm from dask import delayed from collections import defaultdict diff --git a/python/cuml/decomposition/pca.pyx b/python/cuml/decomposition/pca.pyx index 78aab743ad..cd28e929ed 100644 --- a/python/cuml/decomposition/pca.pyx +++ b/python/cuml/decomposition/pca.pyx @@ -23,7 +23,7 @@ import ctypes import cudf import numpy as np -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/decomposition/tsvd.pyx b/python/cuml/decomposition/tsvd.pyx index 
eb10bf9565..66e4866048 100644 --- a/python/cuml/decomposition/tsvd.pyx +++ b/python/cuml/decomposition/tsvd.pyx @@ -23,7 +23,7 @@ import ctypes import cudf import numpy as np -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 58ecb9645a..d55983abda 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -26,7 +26,7 @@ import math import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/filter/kalman_filter.pyx b/python/cuml/filter/kalman_filter.pyx index 7308fa14ab..22bf510a33 100644 --- a/python/cuml/filter/kalman_filter.pyx +++ b/python/cuml/filter/kalman_filter.pyx @@ -24,7 +24,7 @@ import numpy as np from numba import cuda from cuml.utils import numba_utils -from librmm_cffi import librmm as rmm +import rmm from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 453fa60a42..8e9a0ce10f 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -32,7 +32,7 @@ from cuml.common.base import Base from cuml.common.handle cimport cumlHandle from cuml.utils import input_to_dev_array as to_cuda -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index 1f120c0685..25a42669a5 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -31,7 +31,7 @@ from cuml.common.handle cimport cumlHandle from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \ input_to_dev_array, zeros, row_matrix -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git 
a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index 7e21a1aa87..49812e76f4 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -35,14 +35,14 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libcpp.memory cimport shared_ptr -from librmm_cffi import librmm as rmm +import rmm from libc.stdlib cimport malloc, free from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free from numba import cuda -from librmm_cffi import librmm as rmm +import rmm cimport cuml.common.handle cimport cuml.common.cuda diff --git a/python/cuml/random_projection/random_projection.pyx b/python/cuml/random_projection/random_projection.pyx index 7897eecc61..efa6ba8193 100644 --- a/python/cuml/random_projection/random_projection.pyx +++ b/python/cuml/random_projection/random_projection.pyx @@ -22,7 +22,7 @@ import cudf import numpy as np -from librmm_cffi import librmm as rmm +import rmm from libc.stdint cimport uintptr_t from libcpp cimport bool diff --git a/python/cuml/solvers/qn.pyx b/python/cuml/solvers/qn.pyx index f783590dfe..393ab2b379 100644 --- a/python/cuml/solvers/qn.pyx +++ b/python/cuml/solvers/qn.pyx @@ -23,7 +23,7 @@ import cudf import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/utils/input_utils.py b/python/cuml/utils/input_utils.py index 6170ab4223..51fea46e49 100644 --- a/python/cuml/utils/input_utils.py +++ b/python/cuml/utils/input_utils.py @@ -26,7 +26,7 @@ from collections.abc import Collection from numba import cuda -from librmm_cffi import librmm as rmm +import rmm inp_array = namedtuple('inp_array', 'array pointer n_rows n_cols dtype') diff --git a/python/cuml/utils/numba_utils.py b/python/cuml/utils/numba_utils.py index 288063b165..dc6b2d8c00 100644 --- a/python/cuml/utils/numba_utils.py +++ 
b/python/cuml/utils/numba_utils.py @@ -18,7 +18,7 @@ from numba import cuda from numba.cuda.cudadrv.driver import driver -from librmm_cffi import librmm as rmm +import rmm import numpy as np From 13f8962127cdbbaa2e0367f0b3277a3dce7ca254 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 20:17:33 -0500 Subject: [PATCH 51/63] FIX Change import of RMM from cffi to cython --- python/cuml/preprocessing/LabelEncoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index a62765b4c5..f5f28a0f27 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -16,7 +16,7 @@ import cudf import nvcategory -from librmm_cffi import librmm +import rmm import numpy as np @@ -194,7 +194,7 @@ def fit_transform(self, y: cudf.Series) -> cudf.Series: self._cats = nvcategory.from_strings(y.data) self._fitted = True - arr: librmm.device_array = librmm.device_array( + arr: rmm.device_array = rmm.device_array( y.data.size(), dtype=np.int32 ) self._cats.values(devptr=arr.device_ctypes_pointer.value) From 951cc4fabfad50e701c7a99a4ebc7de9678b732c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 20:18:53 -0500 Subject: [PATCH 52/63] DOC Add entry to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b471839e0..91d8feba0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - PR #1115: Moving dask_make_blobs to cuml.dask.datasets. 
Adding conversion to dask.DataFrame - PR #1136: CUDA 10.1 CI updates - PR #1165: Adding except + in all remaining cython +- PR #1176: Use new RMM API based on Cython ## Bug Fixes From 14ec7f55f8738f100445e112aca0c73955a504d2 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 22:43:54 -0500 Subject: [PATCH 53/63] FIX Updated dask and distributed versions --- ci/gpu/build.sh | 4 ++-- conda/environments/cuml_dev_cuda10.0.yml | 4 ++-- conda/environments/cuml_dev_cuda10.1.yml | 4 ++-- conda/environments/cuml_dev_cuda9.2.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 73f77369cf..8e9f3091e7 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -53,8 +53,8 @@ conda install -c conda-forge -c rapidsai -c rapidsai-nightly -c rapidsai/label/x "cmake==3.14.3" \ "umap-learn" \ "nccl>=2.4" \ - "dask=2.3.0" \ - "distributed=2.3.0" \ + "dask=2.5.0" \ + "distributed=2.5.1" \ "dask-ml" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ diff --git a/conda/environments/cuml_dev_cuda10.0.yml b/conda/environments/cuml_dev_cuda10.0.yml index 63159abc29..362dda8be3 100644 --- a/conda/environments/cuml_dev_cuda10.0.yml +++ b/conda/environments/cuml_dev_cuda10.0.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=0.21 - umap-learn>=0.3.9 - scikit-learn>=0.21 -- dask=2.3.0 -- distributed=2.3.0 +- dask=2.5.0 +- distributed=2.5.1 - dask-ml - dask-cuda=0.9* - dask-cudf=0.10* diff --git a/conda/environments/cuml_dev_cuda10.1.yml b/conda/environments/cuml_dev_cuda10.1.yml index 999aebcc20..d4a97d2a64 100644 --- a/conda/environments/cuml_dev_cuda10.1.yml +++ b/conda/environments/cuml_dev_cuda10.1.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=0.21 - umap-learn>=0.3.9 - scikit-learn>=0.21 -- dask=2.3.0 -- distributed=2.3.0 +- dask=2.5.0 +- distributed=2.5.1 - dask-ml - dask-cuda=0.9* - dask-cudf=0.10* diff --git a/conda/environments/cuml_dev_cuda9.2.yml 
b/conda/environments/cuml_dev_cuda9.2.yml index 40f928e884..6fd25a9bef 100644 --- a/conda/environments/cuml_dev_cuda9.2.yml +++ b/conda/environments/cuml_dev_cuda9.2.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=0.21 - umap-learn>=0.3.9 - scikit-learn>=0.21 -- dask=2.3.0 -- distributed=2.3.0 +- dask=2.5.0 +- distributed=2.5.1 - dask-ml - dask-cuda=0.9* - dask-cudf=0.10* From c0b8cacd730c0135d50d9c7e37989dd544ee6113 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 14:34:50 +1000 Subject: [PATCH 54/63] Add blank lines --- python/cuml/manifold/t_sne.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 98c72b1cca..9ee124b97f 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -306,8 +306,8 @@ class TSNE(Base): return def fit(self, X): - """ - Fit X into an embedded space. + """Fit X into an embedded space. + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) @@ -420,12 +420,14 @@ class TSNE(Base): def fit_transform(self, X): """Fit X into an embedded space and return that transformed output. + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + Returns -------- X_new : array, shape (n_samples, n_components) From 6f789ad5cadf1aaff0614c17cb43f4f649ecff6b Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Oct 2019 08:38:37 -0400 Subject: [PATCH 55/63] Adding utility in prims for getting the current time milliseconds --- cpp/src/dbscan/runner.h | 8 ++------ cpp/src_prims/utils.h | 9 +++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 112d4ded9d..c9cc0ad925 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -25,6 +25,8 @@ #include "sparse/csr.h" #include "vertexdeg/runner.h" +#include "utils.h" + #include namespace Dbscan { @@ -59,12 +61,6 @@ void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { [MAX_LABEL] __device__(Index_ val) { return val == MAX_LABEL; }); } -int64_t curTimeMillis() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000 + tp.tv_usec / 1000; -} - /* @param N number of points * @param D dimensionality of the points * @param eps epsilon neighborhood criterion diff --git a/cpp/src_prims/utils.h b/cpp/src_prims/utils.h index eb2b2bd59f..05a55cc0e9 100644 --- a/cpp/src_prims/utils.h +++ b/cpp/src_prims/utils.h @@ -18,6 +18,7 @@ #include #include +#include <chrono> #include #include #include @@ -177,6 +178,14 @@ void copyAsync(Type* dPtr1, const Type* dPtr2, size_t len, CUDA_CHECK(cudaMemcpyAsync(dPtr1, dPtr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } + +inline uint32_t curTimeMillis() { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast<std::chrono::milliseconds>(duration) + .count(); +} + /** @} */ /** Helper function to calculate need memory for allocate to store dense matrix. 
From 648b70279ab9363867edcb4568c6d536b146a771 Mon Sep 17 00:00:00 2001 From: John Zedlewski Date: Tue, 1 Oct 2019 09:03:24 -0700 Subject: [PATCH 56/63] Add benchmarking and cluster metrics to docs --- docs/source/api.rst | 16 ++++++++++++++++ python/cuml/benchmark/datagen.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 1d1cc5f069..ce0fb8d750 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -36,6 +36,22 @@ Metrics .. automodule:: cuml.metrics.trustworthiness :members: + .. automodule:: cuml.metrics.cluster + :members: + +Benchmarking +------------- + + .. automodule:: cuml.benchmark.algorithms + :members: + + .. automodule:: cuml.benchmark.runners + :members: + + .. automodule:: cuml.benchmark.datagen + :members: + + Utilities for I/O and Numba --------------------------- diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py index 8a881687ad..203d6a6759 100644 --- a/python/cuml/benchmark/datagen.py +++ b/python/cuml/benchmark/datagen.py @@ -22,7 +22,7 @@ * n_samples (set to 0 for 'default') * n_features (set to 0 for 'default') * random_state - * .. and optional generator-specific parameters + * (and optional generator-specific parameters) The function should return a 2-tuple (X, y), where X is a Pandas dataframe and y is a Pandas series. If the generator does not produce From 3d47170bb4bb1b30e159578f800bcaae858b791a Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Oct 2019 12:39:33 -0400 Subject: [PATCH 57/63] Undoing accidentally checked in benchmark changes --- python/cuml/benchmark/algorithms.py | 3 +-- python/cuml/benchmark/runners.py | 16 +++------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 63e7686455..088e0e8910 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -109,7 +109,6 @@ def run_cuml(self, data, **override_args): all_args = {**all_args, **override_args} cuml_obj = self.cuml_class(**all_args) - print(str(cuml_obj)) if self.data_prep_hook: data = self.data_prep_hook(data) if self.accepts_labels: @@ -160,7 +159,7 @@ def all_algorithms(): AlgorithmPair( sklearn.neighbors.NearestNeighbors, cuml.neighbors.NearestNeighbors, - shared_args=dict(n_neighbors=1000000), + shared_args=dict(n_neighbors=1024), cpu_args=dict(algorithm="brute"), cuml_args={}, name="NearestNeighbors", diff --git a/python/cuml/benchmark/runners.py b/python/cuml/benchmark/runners.py index b7a3d9b15e..ca3f4d9bbe 100644 --- a/python/cuml/benchmark/runners.py +++ b/python/cuml/benchmark/runners.py @@ -42,11 +42,11 @@ def _run_one_size( cuml_param_overrides={}, cpu_param_overrides={}, run_cpu=True, - verbose=False ): data = datagen.gen_data( self.dataset_name, self.input_type, n_samples, n_features ) + print("data type: ", data[0].__class__) cu_start = time.time() algo_pair.run_cuml(data, **param_overrides, **cuml_param_overrides) @@ -59,17 +59,10 @@ def _run_one_size( else: cpu_elapsed = 0.0 - speedup = cpu_elapsed / float(cu_elapsed) - - if verbose is True: - print("Benchmark [n_samples=%d, " - "n_features=%d with datatype=%s] = %f speedup." 
% - (n_samples, n_features, data[0].__class__, speedup)) - return dict( cu_time=cu_elapsed, cpu_time=cpu_elapsed, - speedup=speedup, + speedup=cpu_elapsed / float(cu_elapsed), n_samples=n_samples, n_features=n_features, **param_overrides, @@ -84,9 +77,7 @@ def run( cpu_param_overrides={}, *, run_cpu=True, - raise_on_error=False, - verbose=False - + raise_on_error=False ): all_results = [] for ns in self.bench_rows: @@ -101,7 +92,6 @@ def run( cuml_param_overrides, cpu_param_overrides, run_cpu, - verbose ) ) except Exception as e: From 779f6f803dad12d38f658c685dc011bc3272a29c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Oct 2019 12:52:12 -0400 Subject: [PATCH 58/63] Making style checker happy --- python/cuml/metrics/accuracy.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/metrics/accuracy.pyx b/python/cuml/metrics/accuracy.pyx index 5ce62bb591..7520178424 100644 --- a/python/cuml/metrics/accuracy.pyx +++ b/python/cuml/metrics/accuracy.pyx @@ -43,7 +43,7 @@ def accuracy_score(ground_truth, predictions, handle=None): Parameters ---------- handle : cuml.Handle - prediction : NumPy ndarray or Numba device + prediction : NumPy ndarray or Numba device The lables predicted by the model for the test dataset ground_truth : NumPy ndarray, Numba device The ground truth labels of the test dataset From a87e0642d77a1bdefc08cf57b078c72a3d1516e3 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 1 Oct 2019 12:32:47 -0500 Subject: [PATCH 59/63] DOC Add entry to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b81f80dff..be72cd2f59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,7 @@ - PR #1132: DBSCAN Batching Bug Fix - PR #1162: DASK RF random seed bug fix - PR #1164: Fix check_dtype arg handling for input_to_dev_array +- PR #1177: Update dask and distributed to 2.5 # cuML 0.9.0 (21 Aug 2019) From 079579bdf15843710f7b06d56aea442ea10efaf4 Mon Sep 17 
00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 1 Oct 2019 12:34:20 -0500 Subject: [PATCH 60/63] FIX PEP8 fixes --- python/cuml/metrics/accuracy.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/metrics/accuracy.pyx b/python/cuml/metrics/accuracy.pyx index 5ce62bb591..0dea749494 100644 --- a/python/cuml/metrics/accuracy.pyx +++ b/python/cuml/metrics/accuracy.pyx @@ -43,8 +43,8 @@ def accuracy_score(ground_truth, predictions, handle=None): Parameters ---------- handle : cuml.Handle - prediction : NumPy ndarray or Numba device - The lables predicted by the model for the test dataset + prediction : NumPy ndarray or Numba device + The labels predicted by the model for the test dataset ground_truth : NumPy ndarray, Numba device The ground truth labels of the test dataset From 0f6d4a623ea1fc28bb45bcec9d328dfa5b225595 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Oct 2019 16:06:12 -0400 Subject: [PATCH 61/63] Fixing adjusted rand score python to use int again --- .../metrics/cluster/adjustedrandindex.pyx | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cuml/metrics/cluster/adjustedrandindex.pyx b/python/cuml/metrics/cluster/adjustedrandindex.pyx index 971763e664..2a1c4933db 100644 --- a/python/cuml/metrics/cluster/adjustedrandindex.pyx +++ b/python/cuml/metrics/cluster/adjustedrandindex.pyx @@ -32,11 +32,11 @@ cimport cuml.common.cuda cdef extern from "metrics/metrics.hpp" namespace "ML::Metrics": double adjustedRandIndex(cumlHandle &handle, - long *y, - long *y_hat, - long n, - long lower_class_range, - long upper_class_range) + int *y, + int *y_hat, + int n, + int lower_class_range, + int upper_class_range) def adjusted_rand_score(labels_true, @@ -85,10 +85,10 @@ def adjusted_rand_score(labels_true, input_to_dev_array(labels_pred) rand_score = adjustedRandIndex(handle_[0], - y_ptr, - y_hat_ptr, - 
n_rows, + lower_class_range, + upper_class_range) return rand_score From 5190df2765cdd1d6662ea87471cfd05b6f1e30a7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Oct 2019 16:12:26 -0400 Subject: [PATCH 62/63] Fixing style checker issues from t-sne PR --- python/cuml/manifold/t_sne.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index cb27bbf9af..7499df0c7d 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -307,7 +307,7 @@ class TSNE(Base): def fit(self, X): """Fit X into an embedded space. - + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) @@ -420,14 +420,14 @@ class TSNE(Base): def fit_transform(self, X): """Fit X into an embedded space and return that transformed output. - + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy - + Returns -------- X_new : array, shape (n_samples, n_components) From 455741682f47ce095b7af4e85a22961b6efef89c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Oct 2019 21:01:04 -0400 Subject: [PATCH 63/63] Casting labels to int32 in adjusted rand score --- python/cuml/metrics/cluster/adjustedrandindex.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/metrics/cluster/adjustedrandindex.pyx b/python/cuml/metrics/cluster/adjustedrandindex.pyx index 2a1c4933db..e0c20538a3 100644 --- a/python/cuml/metrics/cluster/adjustedrandindex.pyx +++ b/python/cuml/metrics/cluster/adjustedrandindex.pyx @@ -65,11 +65,11 @@ def adjusted_rand_score(labels_true, if labels_true.astype != np.int64: warnings.warn(" The dtype of ground truth is not int32" " converting the ground truth to int32") - labels_true = labels_true.astype(np.int64) + labels_true = labels_true.astype(np.int32) if labels_pred.astype != np.int32: warnings.warn(" The dtype of predicted labels is not int32" " converting the predicted labels to int32") - labels_pred = labels_pred.astype(np.int64) + labels_pred = labels_pred.astype(np.int32) min_val_y = np.nanmin(labels_true) lower_class_range = np.nanmin(labels_pred) \