Merge pull request #3 from rapidsai/branch-0.10
merging
vishalmehta1991 authored Oct 2, 2019
2 parents 9f244b9 + 1ac1f33 commit 38aa355
Showing 64 changed files with 795 additions and 388 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -22,9 +22,13 @@
- PR #1086: Ensure RegressorMixin scorer uses device arrays
- PR #1108: input_to_host_array function in input_utils for input processing to host arrays
- PR #1114: K-means: Exposing useful params, removing unused params, proxying params in Dask
- PR #1142: prims: expose separate InType and OutType for unaryOp and binaryOp
- PR #1115: Moving dask_make_blobs to cuml.dask.datasets. Adding conversion to dask.DataFrame
- PR #1136: CUDA 10.1 CI updates
- PR #1163: Some more correctness improvements. Better verbose printing
- PR #1165: Adding except + in all remaining cython
- PR #1173: Docs: Barnes Hut TSNE documentation
- PR #1176: Use new RMM API based on Cython

## Bug Fixes

@@ -43,7 +47,9 @@
- PR #1106: Pinning Distributed version to match Dask for consistent CI results
- PR #1116: TSNE CUDA 10.1 Bug Fixes
- PR #1132: DBSCAN Batching Bug Fix
- PR #1162: DASK RF random seed bug fix
- PR #1164: Fix check_dtype arg handling for input_to_dev_array
- PR #1177: Update dask and distributed to 2.5

# cuML 0.9.0 (21 Aug 2019)

@@ -104,6 +110,7 @@
- PR #978: Update README for 0.9
- PR #1009: Fix references to notebooks-contrib
- PR #1015: Ability to control the number of internal streams in cumlHandle_impl via cumlHandle
- PR #1175: Add more modules to docs ToC

## Bug Fixes

3 changes: 2 additions & 1 deletion README.md
@@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib).
| | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) |
| | Uniform Manifold Approximation and Projection (UMAP) | |
| | Random Projection | |
| | t-Distributed Stochastic Neighbor Embedding (TSNE) | (Experimental) |
| | t-Distributed Stochastic Neighbor Embedding (TSNE) | |
| **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package |
| | Linear Regression with Lasso or Ridge Regularization | |
| | ElasticNet Regression | |
@@ -84,6 +84,7 @@ repo](https://github.com/rapidsai/notebooks-contrib).
| **Nonlinear Models for Regression or Classification** | Random Forest (RF) Classification | Experimental multi-node, multi-GPU version available via Dask integration |
| | Random Forest (RF) Regression | Experimental multi-node, multi-GPU version available via Dask integration |
| | K-Nearest Neighbors (KNN) | Multi-GPU <br> Uses [Faiss](https://github.com/facebookresearch/faiss) |
| | Support Vector Machine Classifier (SVC) | |
| **Time Series** | Linear Kalman Filter | |
| | Holt-Winters Exponential Smoothing | |
---
4 changes: 2 additions & 2 deletions ci/gpu/build.sh
@@ -53,8 +53,8 @@ conda install -c conda-forge -c rapidsai -c rapidsai-nightly -c rapidsai/label/x
"cmake==3.14.3" \
"umap-learn" \
"nccl>=2.4" \
"dask=2.3.0" \
"distributed=2.3.0" \
"dask=2.5.0" \
"distributed=2.5.1" \
"dask-ml" \
"dask-cudf=${MINOR_VERSION}" \
"dask-cuda=${MINOR_VERSION}" \
4 changes: 2 additions & 2 deletions conda/environments/cuml_dev_cuda10.0.yml
@@ -17,8 +17,8 @@ dependencies:
- scikit-learn>=0.21
- umap-learn>=0.3.9
- scikit-learn>=0.21
- dask=2.3.0
- distributed=2.3.0
- dask=2.5.0
- distributed=2.5.1
- dask-ml
- dask-cuda=0.9*
- dask-cudf=0.10*
4 changes: 2 additions & 2 deletions conda/environments/cuml_dev_cuda10.1.yml
@@ -17,8 +17,8 @@ dependencies:
- scikit-learn>=0.21
- umap-learn>=0.3.9
- scikit-learn>=0.21
- dask=2.3.0
- distributed=2.3.0
- dask=2.5.0
- distributed=2.5.1
- dask-ml
- dask-cuda=0.9*
- dask-cudf=0.10*
4 changes: 2 additions & 2 deletions conda/environments/cuml_dev_cuda9.2.yml
@@ -17,8 +17,8 @@ dependencies:
- scikit-learn>=0.21
- umap-learn>=0.3.9
- scikit-learn>=0.21
- dask=2.3.0
- distributed=2.3.0
- dask=2.5.0
- distributed=2.5.1
- dask-ml
- dask-cuda=0.9*
- dask-cudf=0.10*
6 changes: 3 additions & 3 deletions cpp/examples/dbscan/dbscan_example.cpp
@@ -241,10 +241,10 @@ int main(int argc, char* argv[]) {
cumlHandle.setStream(stream);

std::vector<int> h_labels(nRows);
long* d_labels = nullptr;
int* d_labels = nullptr;
float* d_inputData = nullptr;

CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(long)));
CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(int)));
CUDA_RT_CALL(cudaMalloc(&d_inputData, nRows * nCols * sizeof(float)));
CUDA_RT_CALL(cudaMemcpyAsync(d_inputData, h_inputData.data(),
nRows * nCols * sizeof(float),
@@ -259,7 +259,7 @@ int main(int argc, char* argv[]) {

ML::dbscanFit(cumlHandle, d_inputData, nRows, nCols, eps, minPts, d_labels,
max_bytes_per_batch, false);
CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(long),
CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int),
cudaMemcpyDeviceToHost, stream));
CUDA_RT_CALL(cudaStreamSynchronize(stream));

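The fix above is purely a type-consistency change: the labels buffer, both sizeof() expressions, and the label type dbscanFit now expects must all agree. A minimal sketch of the corrected pattern, with the CUDA_RT_CALL error-check macro and handle setup omitted (buffer size illustrative):

    #include <cuda_runtime.h>
    #include <vector>

    void copy_labels_example(int nRows, cudaStream_t stream) {
      std::vector<int> h_labels(nRows);  // host labels: int, matching the API
      int* d_labels = nullptr;           // device labels: same element type

      // Every sizeof() must use the same type as the buffers; the old mix of
      // std::vector<int> with sizeof(long) copies overran the host buffer on
      // LP64 systems, where long is 8 bytes.
      cudaMalloc(&d_labels, nRows * sizeof(int));
      // ... ML::dbscanFit(..., d_labels, ...) would run here ...
      cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int),
                      cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      cudaFree(d_labels);
    }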
37 changes: 29 additions & 8 deletions cpp/src/datasets/make_blobs.cu
@@ -23,19 +23,19 @@
namespace ML {
namespace Datasets {

void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows,
long n_cols, long n_clusters, const float* centers,
const float* cluster_std, const float cluster_std_scalar,
bool shuffle, float center_box_min, float center_box_max,
uint64_t seed) {
void make_blobs(const cumlHandle& handle, float* out, int64_t* labels,
int64_t n_rows, int64_t n_cols, int64_t n_clusters,
const float* centers, const float* cluster_std,
const float cluster_std_scalar, bool shuffle,
float center_box_min, float center_box_max, uint64_t seed) {
MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters,
handle.getDeviceAllocator(), handle.getStream(),
centers, cluster_std, cluster_std_scalar,
shuffle, center_box_min, center_box_max, seed);
}

void make_blobs(const cumlHandle& handle, double* out, long* labels,
long n_rows, long n_cols, long n_clusters,
void make_blobs(const cumlHandle& handle, double* out, int64_t* labels,
int64_t n_rows, int64_t n_cols, int64_t n_clusters,
const double* centers, const double* cluster_std,
const double cluster_std_scalar, bool shuffle,
double center_box_min, double center_box_max, uint64_t seed) {
@@ -45,5 +45,26 @@ void make_blobs(const cumlHandle& handle, double* out, long* labels,
shuffle, center_box_min, center_box_max, seed);
}

void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows,
int n_cols, int n_clusters, const float* centers,
const float* cluster_std, const float cluster_std_scalar,
bool shuffle, float center_box_min, float center_box_max,
uint64_t seed) {
MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters,
handle.getDeviceAllocator(), handle.getStream(),
centers, cluster_std, cluster_std_scalar,
shuffle, center_box_min, center_box_max, seed);
}

void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows,
int n_cols, int n_clusters, const double* centers,
const double* cluster_std, const double cluster_std_scalar,
bool shuffle, double center_box_min, double center_box_max,
uint64_t seed) {
MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters,
handle.getDeviceAllocator(), handle.getStream(),
centers, cluster_std, cluster_std_scalar,
shuffle, center_box_min, center_box_max, seed);
}
} // namespace Datasets
} // end namespace ML
} // namespace ML
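With the two new overloads, callers can stay in plain int indexing end to end. A minimal usage sketch (the include path and pre-allocated device buffers are assumptions; parameter values are illustrative):

    #include "datasets/make_blobs.hpp"

    void blobs_example(const ML::cumlHandle& handle, float* d_out,
                       int* d_labels) {
      int n_rows = 1000, n_cols = 16, n_clusters = 4;
      // Unlike the int64_t overloads, the int overloads declare no default
      // arguments, so every parameter is passed explicitly here.
      ML::Datasets::make_blobs(handle, d_out, d_labels, n_rows, n_cols,
                               n_clusters,
                               /*centers=*/nullptr,      // random centers
                               /*cluster_std=*/nullptr,  // use the scalar std
                               /*cluster_std_scalar=*/1.0f,
                               /*shuffle=*/true,
                               /*center_box_min=*/-10.0f,
                               /*center_box_max=*/10.0f,
                               /*seed=*/42ULL);
    }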
22 changes: 17 additions & 5 deletions cpp/src/datasets/make_blobs.hpp
@@ -49,21 +49,33 @@ namespace Datasets {
* centers. Useful only if 'centers' is nullptr
* @param seed seed for the RNG
*/
void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows,
long n_cols, long n_clusters, const float* centers = nullptr,
void make_blobs(const cumlHandle& handle, float* out, int64_t* labels,
int64_t n_rows, int64_t n_cols, int64_t n_clusters,
const float* centers = nullptr,
const float* cluster_std = nullptr,
const float cluster_std_scalar = 1.f, bool shuffle = true,
float center_box_min = 10.f, float center_box_max = 10.f,
uint64_t seed = 0ULL);

void make_blobs(const cumlHandle& handle, double* out, long* labels,
long n_rows, long n_cols, long n_clusters,
void make_blobs(const cumlHandle& handle, double* out, int64_t* labels,
int64_t n_rows, int64_t n_cols, int64_t n_clusters,
const double* centers = nullptr,
const double* cluster_std = nullptr,
const double cluster_std_scalar = 1.f, bool shuffle = true,
double center_box_min = 10.f, double center_box_max = 10.f,
uint64_t seed = 0ULL);
/** @} */

void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows,
int n_cols, int n_clusters, const float* centers,
const float* cluster_std, const float cluster_std_scalar,
bool shuffle, float center_box_min, float center_box_max,
uint64_t seed);

void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows,
int n_cols, int n_clusters, const double* centers,
const double* cluster_std, const double cluster_std_scalar,
bool shuffle, double center_box_min, double center_box_max,
uint64_t seed);

} // namespace Datasets
} // namespace ML
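One consequence of declaring the int overloads without default arguments: a call that relies on the defaults can only resolve to the int64_t versions. A sketch (device buffers assumed allocated elsewhere):

    void blobs_defaults(const ML::cumlHandle& handle, float* d_out,
                        int64_t* d_labels64) {
      // Resolves to the int64_t overload; only it declares defaults for
      // centers, cluster_std, shuffle, the center box, and the seed.
      ML::Datasets::make_blobs(handle, d_out, d_labels64, 10000, 32, 5);
    }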
2 changes: 1 addition & 1 deletion cpp/src/dbscan/adjgraph/algo.h
@@ -40,7 +40,7 @@ static const int TPB_X = 256;
* Takes vertex degree array (vd) and CSR row_ind array (ex_scan) to produce the
* CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts.
*/
template <typename Type, typename Index_ = long>
template <typename Type, typename Index_ = int>
void launcher(const ML::cumlHandle_impl &handle, Pack<Type, Index_> data,
Index_ batchSize, cudaStream_t stream) {
device_ptr<Index_> dev_vd = device_pointer_cast(data.vd);
2 changes: 1 addition & 1 deletion cpp/src/dbscan/adjgraph/naive.h
@@ -26,7 +26,7 @@ namespace Dbscan {
namespace AdjGraph {
namespace Naive {

template <typename Type, typename Index_ = long>
template <typename Type, typename Index_ = int>
void launcher(const ML::cumlHandle_impl& handle, Pack<Type, Index_> data,
Index_ batchSize, cudaStream_t stream) {
Index_ k = 0;
2 changes: 1 addition & 1 deletion cpp/src/dbscan/adjgraph/pack.h
@@ -19,7 +19,7 @@
namespace Dbscan {
namespace AdjGraph {

template <typename Type, typename Index_ = long>
template <typename Type, typename Index_ = int>
struct Pack {
/**
* vertex degree array
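These three adjgraph headers change only the default index template parameter, so instantiations that name Index_ explicitly are untouched; only defaulted uses switch from 64-bit to 32-bit indexing. For example (illustrative):

    Dbscan::AdjGraph::Pack<float> p32;           // Index_ = int (new default)
    Dbscan::AdjGraph::Pack<float, int64_t> p64;  // explicit 64-bit, unchanged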
27 changes: 20 additions & 7 deletions cpp/src/dbscan/dbscan.cu
@@ -24,19 +24,32 @@ namespace ML {

using namespace Dbscan;

// @todo
// In the below 2 calls, the Index type has been hard-coded to `int64_t`
// We should pick the right Index type based on the input dimensions.
void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols,
float eps, int min_pts, long *labels, size_t max_bytes_per_batch,
void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols,
float eps, int min_pts, int *labels, size_t max_bytes_per_batch,
bool verbose) {
dbscanFitImpl<float, int>(handle.getImpl(), input, n_rows, n_cols, eps,
min_pts, labels, max_bytes_per_batch,
handle.getStream(), verbose);
}

void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols,
double eps, int min_pts, int *labels, size_t max_bytes_per_batch,
bool verbose) {
dbscanFitImpl<double, int>(handle.getImpl(), input, n_rows, n_cols, eps,
min_pts, labels, max_bytes_per_batch,
handle.getStream(), verbose);
}

void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows,
int64_t n_cols, float eps, int min_pts, int64_t *labels,
size_t max_bytes_per_batch, bool verbose) {
dbscanFitImpl<float, int64_t>(handle.getImpl(), input, n_rows, n_cols, eps,
min_pts, labels, max_bytes_per_batch,
handle.getStream(), verbose);
}

void dbscanFit(const cumlHandle &handle, double *input, long n_rows,
long n_cols, double eps, int min_pts, long *labels,
void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows,
int64_t n_cols, double eps, int min_pts, int64_t *labels,
size_t max_bytes_per_batch, bool verbose) {
dbscanFitImpl<double, int64_t>(handle.getImpl(), input, n_rows, n_cols, eps,
min_pts, labels, max_bytes_per_batch,
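The @todo above leaves index-type selection to the caller: with both overload families exported, the types of the size arguments and the label buffer pick the instantiation. A caller-side dispatch along the lines the comment suggests might look like this (a sketch, not library code; the 2^31 threshold is an assumption about when 32-bit indexing is safe):

    #include <limits>

    // Chooses the 32-bit path when every index fits in int, else 64-bit.
    void fit_any_size(const ML::cumlHandle& handle, float* d_input,
                      int64_t n_rows, int64_t n_cols, float eps, int min_pts,
                      int* d_labels32, int64_t* d_labels64,
                      size_t max_bytes_per_batch, bool verbose) {
      const int64_t int_max = std::numeric_limits<int>::max();
      if (n_rows < int_max && n_rows * n_cols < int_max) {
        ML::dbscanFit(handle, d_input, static_cast<int>(n_rows),
                      static_cast<int>(n_cols), eps, min_pts, d_labels32,
                      max_bytes_per_batch, verbose);
      } else {
        ML::dbscanFit(handle, d_input, n_rows, n_cols, eps, min_pts,
                      d_labels64, max_bytes_per_batch, verbose);
      }
    }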
30 changes: 20 additions & 10 deletions cpp/src/dbscan/dbscan.h
@@ -23,48 +23,58 @@
namespace ML {

using namespace Dbscan;
static const size_t DEFAULT_MAX_MEM_BYTES = 13e9;
static const size_t DEFAULT_MAX_MEM_MBYTES = 13e3;

// Default max mem set to a reasonable value for a 16gb card.

template <typename T, typename Index_ = long>
Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) {
template <typename T, typename Index_ = int>
Index_ computeBatchCount(Index_ n_rows, size_t max_mbytes_per_batch) {
Index_ n_batches = 1;
// There seems to be a weird overflow bug with cutlass gemm kernels
// hence, artificially limiting to a smaller batch size!
///TODO: in future, when we bump up the underlying cutlass version, this should go away
// paving way to cudaMemGetInfo based workspace allocation

if (max_bytes_per_batch <= 0) max_bytes_per_batch = DEFAULT_MAX_MEM_BYTES;
if (max_mbytes_per_batch <= 0) max_mbytes_per_batch = DEFAULT_MAX_MEM_MBYTES;

Index_ MAX_LABEL = std::numeric_limits<Index_>::max();

while (true) {
size_t batchSize = ceildiv<size_t>(n_rows, n_batches);
if (batchSize * n_rows * sizeof(T) < max_bytes_per_batch || batchSize == 1)
if (((batchSize * n_rows * sizeof(T) * 1e-6 < max_mbytes_per_batch) &&
/**
* Though single precision can be faster per execution of each kernel,
* there's a trade-off to be made between using single precision with
* many more batches (which become smaller as n_rows grows) and using
* double precision, which will have less batches but could become 8-10x
* slower per batch.
*/
(batchSize * n_rows < MAX_LABEL)) ||
batchSize == 1)
break;
++n_batches;
}
return n_batches;
}

template <typename T, typename Index_ = long>
template <typename T, typename Index_ = int>
void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows,
Index_ n_cols, T eps, int min_pts, Index_ *labels,
size_t max_bytes_per_batch, cudaStream_t stream,
size_t max_mbytes_per_batch, cudaStream_t stream,
bool verbose) {
ML::PUSH_RANGE("ML::Dbscan::Fit");
int algoVd = 1;
int algoAdj = 1;
int algoCcl = 2;

// @todo: Query device for remaining memory
Index_ n_batches = computeBatchCount<T, Index_>(n_rows, max_bytes_per_batch);
Index_ n_batches = computeBatchCount<T, Index_>(n_rows, max_mbytes_per_batch);

if (verbose) {
Index_ batchSize = ceildiv<Index_>(n_rows, n_batches);
if (n_batches > 1) {
std::cout << "Running batched training on " << n_batches
<< " batches w/ ";
std::cout << batchSize * n_rows * sizeof(T) << " bytes." << std::endl;
std::cout << batchSize * n_rows * sizeof(T) * 1e-6 << " megabytes." << std::endl;
}
}

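Concretely, the loop above grows n_batches until a batch satisfies both guards: the pairwise-distance block fits in the megabyte budget, and batchSize * n_rows stays below the largest representable label. A standalone restatement plus a worked example (this mirrors the logic in the hunk above; it is not the library function itself):

    #include <cstddef>
    #include <cstdint>
    #include <limits>

    int compute_batches(int64_t n_rows, size_t max_mbytes, size_t elem_size) {
      int n_batches = 1;
      const int64_t max_label = std::numeric_limits<int>::max();
      while (true) {
        int64_t batch = (n_rows + n_batches - 1) / n_batches;  // ceildiv
        bool fits_mem = batch * n_rows * elem_size * 1e-6 < max_mbytes;
        bool fits_label = batch * n_rows < max_label;
        if ((fits_mem && fits_label) || batch == 1) return n_batches;
        ++n_batches;
      }
    }

    // Worked example: n_rows = 1,000,000 float rows, 13,000 MB budget.
    // Memory alone would allow batch sizes up to ~3250 rows, but the
    // int-label guard needs batch * n_rows < 2^31, i.e. batch <= 2147,
    // so the loop settles on ~466 batches.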
18 changes: 13 additions & 5 deletions cpp/src/dbscan/dbscan.hpp
@@ -29,18 +29,26 @@ namespace ML {
* @param[in] eps the epsilon value to use for epsilon-neighborhood determination
* @param[in] min_pts minimum number of points to determine a cluster
* @param[out] labels (size n_rows) output labels array
* @param[in] max_mem_bytes: the maximum number of bytes to be used for each batch of
* @param[in] max_mem_mbytes: the maximum number of megabytes to be used for each batch of
* the pairwise distance calculation. This enables the trade off between
* memory usage and algorithm execution time.
* @param[in] verbose: print useful information as algorithm executes
* @{
*/
void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols,
float eps, int min_pts, long *labels,
void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols,
float eps, int min_pts, int *labels,
size_t max_bytes_per_batch = 0, bool verbose = false);
void dbscanFit(const cumlHandle &handle, double *input, long n_rows,
long n_cols, double eps, int min_pts, long *labels,
void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols,
double eps, int min_pts, int *labels,
size_t max_bytes_per_batch = 0, bool verbose = false);

void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows,
int64_t n_cols, float eps, int min_pts, int64_t *labels,
size_t max_bytes_per_batch = 0, bool verbose = false);
void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows,
int64_t n_cols, double eps, int min_pts, int64_t *labels,
size_t max_bytes_per_batch = 0, bool verbose = false);

/** @} */

} // namespace ML
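A minimal call against the new 32-bit public signature (handle construction and device allocation omitted; parameter values illustrative — passing 0 for max_bytes_per_batch selects the default budget):

    #include "dbscan/dbscan.hpp"

    void run_dbscan(const ML::cumlHandle& handle, float* d_input,
                    int* d_labels) {
      int n_rows = 50000, n_cols = 8;
      ML::dbscanFit(handle, d_input, n_rows, n_cols, /*eps=*/0.5f,
                    /*min_pts=*/5, d_labels,
                    /*max_bytes_per_batch=*/0, /*verbose=*/false);
    }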
8 changes: 4 additions & 4 deletions cpp/src/dbscan/dbscan_api.cpp
@@ -18,8 +18,8 @@
#include "common/cumlHandle.hpp"
#include "dbscan.hpp"

cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows,
long n_cols, float eps, int min_pts, long *labels,
cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows,
int n_cols, float eps, int min_pts, int *labels,
size_t max_bytes_per_batch, int verbose) {
cumlError_t status;
ML::cumlHandle *handle_ptr;
@@ -42,8 +42,8 @@ cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows,
return status;
}

cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, long n_rows,
long n_cols, double eps, int min_pts, long *labels,
cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows,
int n_cols, double eps, int min_pts, int *labels,
size_t max_bytes_per_batch, int verbose) {
cumlError_t status;
ML::cumlHandle *handle_ptr;
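For completeness, a hedged sketch of driving the single-precision C entry point; the cumlCreate/cumlDestroy handle calls and the header names are assumptions about the surrounding C API, which this diff does not show:

    #include "cuml_api.h"           // assumed: C handle API
    #include "dbscan/dbscan_api.h"  // assumed: declares cumlSpDbscanFit

    int run_dbscan_c(float* d_input, int n_rows, int n_cols, int* d_labels) {
      cumlHandle_t handle;
      if (cumlCreate(&handle) != CUML_SUCCESS) return 1;
      cumlError_t err = cumlSpDbscanFit(handle, d_input, n_rows, n_cols,
                                        /*eps=*/0.5f, /*min_pts=*/5, d_labels,
                                        /*max_bytes_per_batch=*/0,
                                        /*verbose=*/0);
      cumlDestroy(handle);
      return err == CUML_SUCCESS ? 0 : 1;
    }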
(Diff truncated; the remaining changed files are not shown.)
