Merge pull request #4 from rapidsai/branch-0.10
Merge 0.10
vishalmehta1991 authored Oct 7, 2019
2 parents 38aa355 + 39aed94 commit 9a5f47c
Showing 27 changed files with 699 additions and 383 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -27,11 +27,13 @@
- PR #1136: CUDA 10.1 CI updates
- PR #1163: Some more correctness improvements. Better verbose printing
- PR #1165: Adding except + in all remaining cython
- PR #1186: Using LocalCUDACluster Pytest fixture
- PR #1173: Docs: Barnes Hut TSNE documentation
- PR #1176: Use new RMM API based on Cython

## Bug Fixes

- PR #1208: compile dbscan bug
- PR #1016: Use correct libcumlprims version in GPU CI
- PR #1040: Update version of numba in development conda yaml files
- PR #1043: Updates to accomodate cuDF python code reorganization
@@ -49,7 +51,12 @@
- PR #1132: DBSCAN Batching Bug Fix
- PR #1162: DASK RF random seed bug fix
- PR #1164: Fix check_dtype arg handling for input_to_dev_array
- PR #1171: SVM prediction bug fix
- PR #1177: Update dask and distributed to 2.5
- PR #1204: Fix SVM crash on Turing
- PR #1199: Replaced sprintf() with snprintf() in THROW()
- PR #1205: Update dask-cuda in yml envs
- PR #1211: Fixing Dask k-means transform bug and adding test

# cuML 0.9.0 (21 Aug 2019)

2 changes: 1 addition & 1 deletion conda/environments/cuml_dev_cuda10.0.yml
@@ -20,7 +20,7 @@ dependencies:
- dask=2.5.0
- distributed=2.5.1
- dask-ml
- dask-cuda=0.9*
- dask-cuda=0.10*
- dask-cudf=0.10*
- nccl>=2.4
- libcumlprims=0.10*
2 changes: 1 addition & 1 deletion conda/environments/cuml_dev_cuda10.1.yml
@@ -20,7 +20,7 @@ dependencies:
- dask=2.5.0
- distributed=2.5.1
- dask-ml
- dask-cuda=0.9*
- dask-cuda=0.10*
- dask-cudf=0.10*
- nccl>=2.4
- libcumlprims=0.10*
2 changes: 1 addition & 1 deletion cpp/src/dbscan/vertexdeg/algo.h
@@ -62,7 +62,7 @@ void launcher(const ML::cumlHandle_impl &handle, Pack<value_t, index_t> data,

auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) {
// fused construction of vertex degree
index_t batch_vertex = fmod(global_c_idx, n);
index_t batch_vertex = global_c_idx % n;

if (sizeof(index_t) == 4) {
atomicAdd((unsigned int *)(vd + batch_vertex), in_neigh);
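The change above replaces fmod(global_c_idx, n) with the integer remainder global_c_idx % n when computing the batch vertex index. fmod operates on doubles, so the index has to round-trip through floating point; beyond 2^53 that conversion is no longer exact, and the % operator also avoids the conversion cost inside the fused kernel. A minimal host-side sketch (illustrative only, not part of this commit) showing where the two diverge:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t n = 3;
  // 2^53 + 1 is the first integer a double cannot represent exactly.
  const int64_t global_c_idx = (int64_t{1} << 53) + 1;
  // fmod path: the index is rounded to 2^53 before the remainder is taken.
  const int64_t via_fmod = static_cast<int64_t>(
      std::fmod(static_cast<double>(global_c_idx), static_cast<double>(n)));
  // Integer path: exact for the full range of the index type.
  const int64_t via_mod = global_c_idx % n;
  printf("fmod: %lld, %%: %lld\n", (long long)via_fmod, (long long)via_mod);
  return 0;
}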
15 changes: 11 additions & 4 deletions cpp/src/svm/smosolver.h
@@ -71,20 +71,21 @@ class SmoSolver {
bool verbose = false;
SmoSolver(const cumlHandle_impl &handle, math_t C, math_t tol,
MLCommon::Matrix::GramMatrixBase<math_t> *kernel,
float cache_size = 200)
float cache_size = 200, int nochange_steps = 1000)
: handle(handle),
n_rows(n_rows),
C(C),
tol(tol),
kernel(kernel),
cache_size(cache_size),
nochange_steps(nochange_steps),
stream(handle.getStream()),
return_buff(handle.getDeviceAllocator(), stream, 2),
alpha(handle.getDeviceAllocator(), stream),
delta_alpha(handle.getDeviceAllocator(), stream),
f(handle.getDeviceAllocator(), stream) {}

#define SMO_WS_SIZE 1024
#define SMO_WS_SIZE 512
/**
* Solve the quadratic optimization problem.
*
@@ -229,19 +230,25 @@
// Variables to track convergence of training
math_t diff_prev;
int n_small_diff;
int nochange_steps;

bool CheckStoppingCondition(math_t diff) {
// TODO improve stopping condition to detect oscillationsq, see Issue #947
// TODO improve stopping condition to detect oscillations, see Issue #947
bool keep_going = true;
if (abs(diff - diff_prev) < 0.001 * tol) {
n_small_diff++;
} else {
diff_prev = diff;
n_small_diff = 0;
}
if (diff < tol || n_small_diff > 10) {
if (n_small_diff > nochange_steps) {
if (verbose) {
std::cout << "SMO error: Stopping due to unchanged diff over "
<< nochange_steps << " consecutive steps\n";
}
keep_going = false;
}
if (diff < tol) keep_going = false;
// ASSERT(!isnan(diff), "SMO: NaN found during fitting")
if (isnan(diff)) {
std::cout << "SMO error: NaN found during fitting\n";
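The SmoSolver changes above make the stall detection configurable: a near-unchanged diff now has to persist for more than nochange_steps consecutive outer iterations (default 1000, previously a hard-coded 10) before training is aborted, while plain convergence (diff < tol) still stops immediately. A standalone sketch of that control flow with the member fields passed explicitly (simplified for readability, not the committed code):

#include <cmath>
#include <iostream>

// Returns false when the outer SMO loop should stop.
bool check_stopping_condition(double diff, double tol, int nochange_steps,
                              double &diff_prev, int &n_small_diff,
                              bool verbose) {
  bool keep_going = true;
  if (std::abs(diff - diff_prev) < 0.001 * tol) {
    n_small_diff++;       // diff barely moved: count another stalled step
  } else {
    diff_prev = diff;     // diff moved: remember it and reset the counter
    n_small_diff = 0;
  }
  if (n_small_diff > nochange_steps) {
    if (verbose)
      std::cout << "SMO error: Stopping due to unchanged diff over "
                << nochange_steps << " consecutive steps\n";
    keep_going = false;   // stalled for too long
  }
  if (diff < tol) keep_going = false;  // converged
  if (std::isnan(diff)) {
    std::cout << "SMO error: NaN found during fitting\n";
    keep_going = false;   // assumed: abort on NaN (handling is truncated above)
  }
  return keep_going;
}

int main() {
  double diff_prev = 0;
  int n_small_diff = 0;
  // A diff below tol stops immediately, regardless of the stall counter.
  std::cout << check_stopping_condition(1e-4, 1e-3, 1000, diff_prev,
                                        n_small_diff, false)
            << "\n";  // prints 0
  return 0;
}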
14 changes: 9 additions & 5 deletions cpp/src/svm/svc.cu
@@ -47,12 +47,14 @@ template void svcFit<double>(const cumlHandle &handle, double *input,
template void svcPredict<float>(const cumlHandle &handle, float *input,
int n_rows, int n_cols,
MLCommon::Matrix::KernelParams &kernel_params,
const svmModel<float> &model, float *preds);
const svmModel<float> &model, float *preds,
float buffer_size);

template void svcPredict<double>(const cumlHandle &handle, double *input,
int n_rows, int n_cols,
MLCommon::Matrix::KernelParams &kernel_params,
const svmModel<double> &model, double *preds);
const svmModel<double> &model, double *preds,
double buffer_size);

template void svmFreeBuffers(const cumlHandle &handle, svmModel<float> &m);

@@ -61,9 +63,9 @@ template void svmFreeBuffers(const cumlHandle &handle, svmModel<double> &m);
template <typename math_t>
SVC<math_t>::SVC(cumlHandle &handle, math_t C, math_t tol,
Matrix::KernelParams kernel_params, math_t cache_size,
int max_iter, bool verbose)
int max_iter, int nochange_steps, bool verbose)
: handle(handle),
param(svmParameter{C, cache_size, max_iter, tol, verbose}),
param(svmParameter{C, cache_size, max_iter, nochange_steps, tol, verbose}),
kernel_params(kernel_params) {
model.n_support = 0;
model.dual_coefs = nullptr;
@@ -87,7 +89,9 @@ void SVC<math_t>::fit(math_t *input, int n_rows, int n_cols, math_t *labels) {
template <typename math_t>
void SVC<math_t>::predict(math_t *input, int n_rows, int n_cols,
math_t *preds) {
svcPredict(handle, input, n_rows, n_cols, kernel_params, model, preds);
math_t buffer_size = param.cache_size;
svcPredict(handle, input, n_rows, n_cols, kernel_params, model, preds,
buffer_size);
}

// Instantiate templates for the shared library
7 changes: 5 additions & 2 deletions cpp/src/svm/svc.hpp
@@ -71,11 +71,13 @@ void svcFit(const cumlHandle &handle, math_t *input, int n_rows, int n_cols,
* @param [in] model SVM model parameters
* @param [out] preds device pointer to store the predicted class labels.
* Size [n_rows]. Should be allocated on entry.
* @param [in] buffer_size size of temporary buffer in MiB
*/
template <typename math_t>
void svcPredict(const cumlHandle &handle, math_t *input, int n_rows, int n_cols,
MLCommon::Matrix::KernelParams &kernel_params,
const svmModel<math_t> &model, math_t *preds);
const svmModel<math_t> &model, math_t *preds,
math_t buffer_size);

/**
* Deallocate device buffers in the svmModel struct.
@@ -126,7 +128,8 @@ class SVC {
SVC(cumlHandle &handle, math_t C = 1, math_t tol = 1.0e-3,
MLCommon::Matrix::KernelParams kernel_params =
MLCommon::Matrix::KernelParams{MLCommon::Matrix::LINEAR, 3, 1, 0},
math_t cache_size = 200, int max_iter = -1, bool verbose = false);
math_t cache_size = 200, int max_iter = -1, int nochange_steps = 1000,
bool verbose = false);

~SVC();

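With the declaration above, nochange_steps becomes a new constructor parameter placed between max_iter and verbose, so positional callers that previously passed verbose directly after max_iter need updating. A usage sketch under stated assumptions: the include path and the ML::SVM namespace follow the conventions of this directory rather than being shown in the diff, and input, labels and preds must be device pointers:

#include "svm/svc.hpp"   // assumed include path within cpp/src

void train_and_predict(ML::cumlHandle &handle, float *input, int n_rows,
                       int n_cols, float *labels, float *preds) {
  MLCommon::Matrix::KernelParams kp{MLCommon::Matrix::LINEAR, 3, 1, 0};
  // New parameter order: ..., cache_size, max_iter, nochange_steps, verbose
  ML::SVM::SVC<float> svc(handle, /*C=*/1.0f, /*tol=*/1e-3f, kp,
                          /*cache_size=*/200.0f, /*max_iter=*/-1,
                          /*nochange_steps=*/1000, /*verbose=*/false);
  svc.fit(input, n_rows, n_cols, labels);     // device pointers
  svc.predict(input, n_rows, n_cols, preds);  // buffer limited by cache_size
}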
58 changes: 51 additions & 7 deletions cpp/src/svm/svc_impl.h
@@ -23,13 +23,17 @@
#include <iostream>

#include <cublas_v2.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include "common/cumlHandle.hpp"
#include "common/device_buffer.hpp"
#include "kernelcache.h"
#include "label/classlabels.h"
#include "linalg/cublas_wrappers.h"
#include "linalg/unary_op.h"
#include "matrix/kernelfactory.h"
#include "matrix/matrix.h"
#include "smosolver.h"
#include "svm_model.h"
#include "svm_parameter.h"
@@ -90,7 +94,7 @@ void svcFit(const cumlHandle &handle, math_t *input, int n_rows, int n_cols,
MLCommon::Matrix::KernelFactory<math_t>::create(
kernel_params, handle_impl.getCublasHandle());
SmoSolver<math_t> smo(handle_impl, param.C, param.tol, kernel,
param.cache_size);
param.cache_size, param.nochange_steps);
smo.verbose = param.verbose;
smo.Solve(input, n_rows, n_cols, y.data(), &(model.dual_coefs),
&(model.n_support), &(model.x_support), &(model.support_idx),
@@ -109,6 +113,13 @@
* We evaluate f(x_i), and then instead of taking the sign to return +/-1 labels,
* we map it to the original labels, and return those.
*
* We process the input vectors batchwise, and evaluate the full rows of kernel
* matrix K(x_i, x_j) for a batch (size n_batch * n_support). The maximum size
* of this buffer (i.e. the maximum batch_size) is controlled by the
* buffer_size input parameter. For models where n_support is large, increasing
* buffer_size might improve prediction performance.
*
*
* @tparam math_t floating point type
* @param handle the cuML handle
* @param [in] input device pointer for the input data in column major format,
@@ -119,42 +130,75 @@ void svcFit(const cumlHandle &handle, math_t *input, int n_rows, int n_cols,
* @param [in] model SVM model parameters
* @param [out] preds device pointer to store the predicted class labels.
* Size [n_rows]. Should be allocated on entry.
* @param [in] buffer_size size of temporary buffer in MiB
*/
template <typename math_t>
void svcPredict(const cumlHandle &handle, math_t *input, int n_rows, int n_cols,
MLCommon::Matrix::KernelParams &kernel_params,
const svmModel<math_t> &model, math_t *preds) {
const svmModel<math_t> &model, math_t *preds,
math_t buffer_size) {
ASSERT(n_cols == model.n_cols,
"Parameter n_cols: shall be the same that was used for fitting");
// We might want to query the available memory before selecting the batch size.
// We will need n_batch * n_support floats for the kernel matrix K.
#define N_PRED_BATCH 4096
const int N_PRED_BATCH = 4096;
int n_batch = N_PRED_BATCH < n_rows ? N_PRED_BATCH : n_rows;

// Limit the memory size of the prediction buffer
buffer_size = buffer_size * 1024 * 1024;
if (n_batch * model.n_support * sizeof(math_t) > buffer_size) {
n_batch = buffer_size / (model.n_support * sizeof(math_t));
if (n_batch < 1) n_batch = 1;
}

const cumlHandle_impl &handle_impl = handle.getImpl();
cudaStream_t stream = handle_impl.getStream();

MLCommon::device_buffer<math_t> K(handle_impl.getDeviceAllocator(), stream,
n_batch * model.n_support);
MLCommon::device_buffer<math_t> y(handle_impl.getDeviceAllocator(), stream,
n_rows);
MLCommon::device_buffer<math_t> x_rbf(handle_impl.getDeviceAllocator(),
stream);
MLCommon::device_buffer<int> idx(handle_impl.getDeviceAllocator(), stream);

cublasHandle_t cublas_handle = handle_impl.getCublasHandle();

MLCommon::Matrix::GramMatrixBase<math_t> *kernel =
MLCommon::Matrix::KernelFactory<math_t>::create(kernel_params,
cublas_handle);

if (kernel_params.kernel == MLCommon::Matrix::RBF) {
// Temporary buffers for the RBF kernel, see below
x_rbf.resize(n_batch * n_cols, stream);
idx.resize(n_batch, stream);
}
// We process the input data batchwise:
// - calculate the kernel values K[x_batch, x_support]
// - calculate y(x_batch) = K[x_batch, x_support] * dual_coeffs
for (int i = 0; i < n_rows; i += n_batch) {
if (i + n_batch >= n_rows) {
n_batch = n_rows - i;
}
kernel->evaluate(input + i, n_batch, n_cols, model.x_support,
model.n_support, K.data(), stream, n_rows, model.n_support,
n_batch);
math_t *x_ptr = nullptr;
int ld1 = 0;
if (kernel_params.kernel == MLCommon::Matrix::RBF) {
// The RBF kernel does not support ld parameters (See issue #1172)
// To come around this limitation, we copy the batch into a temporary
// buffer.
thrust::counting_iterator<int> first(i);
thrust::counting_iterator<int> last = first + n_batch;
thrust::device_ptr<int> idx_ptr(idx.data());
thrust::copy(thrust::cuda::par.on(stream), first, last, idx_ptr);
MLCommon::Matrix::copyRows(input, n_rows, n_cols, x_rbf.data(),
idx.data(), n_batch, stream, false);
x_ptr = x_rbf.data();
ld1 = n_batch;
} else {
x_ptr = input + i;
ld1 = n_rows;
}
kernel->evaluate(x_ptr, n_batch, n_cols, model.x_support, model.n_support,
K.data(), stream, ld1, model.n_support, n_batch);
math_t one = 1;
math_t null = 0;
CUBLAS_CHECK(MLCommon::LinAlg::cublasgemv(
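The batching logic above caps the kernel-row buffer K (n_batch * n_support elements) at buffer_size MiB, shrinking the default batch of 4096 rows when the model has many support vectors. A small worked example of that arithmetic (illustrative numbers, not from the commit):

#include <cstdio>

int main() {
  const int n_rows = 1000000;   // rows to predict
  const int n_support = 100000; // support vectors in the model
  double buffer_size = 200;     // MiB, same meaning as the new parameter
  int n_batch = 4096 < n_rows ? 4096 : n_rows;  // N_PRED_BATCH default
  double buffer_bytes = buffer_size * 1024 * 1024;
  if (double(n_batch) * n_support * sizeof(float) > buffer_bytes) {
    n_batch = int(buffer_bytes / (n_support * sizeof(float)));
    if (n_batch < 1) n_batch = 1;
  }
  // With these numbers the 200 MiB cap reduces the batch from 4096 to 524
  // rows, trading a few more kernel evaluations for bounded memory use.
  printf("n_batch = %d\n", n_batch);
  return 0;
}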
25 changes: 14 additions & 11 deletions cpp/src/svm/svm_api.cpp
@@ -25,16 +25,17 @@

cumlError_t cumlSpSvcFit(cumlHandle_t handle, float *input, int n_rows,
int n_cols, float *labels, float C, float cache_size,
int max_iter, float tol, int verbose,
cumlSvmKernelType kernel, int degree, float gamma,
float coef0, int *n_support, float *b,
int max_iter, int nochange_steps, float tol,
int verbose, cumlSvmKernelType kernel, int degree,
float gamma, float coef0, int *n_support, float *b,
float **dual_coefs, float **x_support,
int **support_idx, int *n_classes,
float **unique_labels) {
ML::SVM::svmParameter param;
param.C = C;
param.cache_size = cache_size;
param.max_iter = max_iter;
param.nochange_steps = nochange_steps;
param.tol = tol;
param.verbose = verbose;

@@ -76,16 +77,17 @@ cumlError_t cumlSpSvcFit(cumlHandle_t handle, float *input, int n_rows,

cumlError_t cumlDpSvcFit(cumlHandle_t handle, double *input, int n_rows,
int n_cols, double *labels, double C,
double cache_size, int max_iter, double tol,
int verbose, cumlSvmKernelType kernel, int degree,
double gamma, double coef0, int *n_support, double *b,
double **dual_coefs, double **x_support,
double cache_size, int max_iter, int nochange_steps,
double tol, int verbose, cumlSvmKernelType kernel,
int degree, double gamma, double coef0, int *n_support,
double *b, double **dual_coefs, double **x_support,
int **support_idx, int *n_classes,
double **unique_labels) {
ML::SVM::svmParameter param;
param.C = C;
param.cache_size = cache_size;
param.max_iter = max_iter;
param.nochange_steps = nochange_steps;
param.tol = tol;
param.verbose = verbose;

@@ -129,7 +131,8 @@ cumlError_t cumlSpSvcPredict(cumlHandle_t handle, float *input, int n_rows,
int n_cols, cumlSvmKernelType kernel, int degree,
float gamma, float coef0, int n_support, float b,
float *dual_coefs, float *x_support, int n_classes,
float *unique_labels, float *preds) {
float *unique_labels, float *preds,
float buffer_size) {
MLCommon::Matrix::KernelParams kernel_param;
kernel_param.kernel = (MLCommon::Matrix::KernelType)kernel;
kernel_param.degree = degree;
@@ -151,7 +154,7 @@ cumlError_t cumlSpSvcPredict(cumlHandle_t handle, float *input, int n_rows,
if (status == CUML_SUCCESS) {
try {
ML::SVM::svcPredict(*handle_ptr, input, n_rows, n_cols, kernel_param,
model, preds);
model, preds, buffer_size);
}
//TODO: Implement this
//catch (const MLCommon::Exception& e)
@@ -171,7 +174,7 @@ cumlError_t cumlDpSvcPredict(cumlHandle_t handle, double *input, int n_rows,
double gamma, double coef0, int n_support,
double b, double *dual_coefs, double *x_support,
int n_classes, double *unique_labels,
double *preds) {
double *preds, double buffer_size) {
MLCommon::Matrix::KernelParams kernel_param;
kernel_param.kernel = (MLCommon::Matrix::KernelType)kernel;
kernel_param.degree = degree;
Expand All @@ -193,7 +196,7 @@ cumlError_t cumlDpSvcPredict(cumlHandle_t handle, double *input, int n_rows,
if (status == CUML_SUCCESS) {
try {
ML::SVM::svcPredict(*handle_ptr, input, n_rows, n_cols, kernel_param,
model, preds);
model, preds, buffer_size);
}
//TODO: Implement this
//catch (const MLCommon::Exception& e)
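For callers of the plain C API, the two signature changes above mean one extra int argument (nochange_steps, between max_iter and tol) in cumlSpSvcFit/cumlDpSvcFit and a trailing buffer_size argument in the predict entry points. A call-sequence sketch: the argument order is taken from the signatures above, but the header name and the LINEAR enumerator are assumptions, and all float pointers must be device pointers prepared by the caller:

// Sketch only; assumed: "svm_api.h" is the matching public header, LINEAR is
// a valid cumlSvmKernelType enumerator, and X, y, X_test, preds are device
// pointers (e.g. allocated with cudaMalloc and filled with cudaMemcpy).
#include "svm_api.h"

cumlError_t fit_and_predict(cumlHandle_t handle, float *X, float *y,
                            int n_rows, int n_cols, float *X_test,
                            int n_test_rows, float *preds) {
  int n_support, n_classes;
  float b;
  float *dual_coefs, *x_support, *unique_labels;
  int *support_idx;

  cumlError_t err = cumlSpSvcFit(
      handle, X, n_rows, n_cols, y,
      /*C=*/1.0f, /*cache_size=*/200.0f, /*max_iter=*/-1,
      /*nochange_steps=*/1000,                 // new in 0.10
      /*tol=*/1e-3f, /*verbose=*/0,
      LINEAR, /*degree=*/3, /*gamma=*/1.0f, /*coef0=*/0.0f,
      &n_support, &b, &dual_coefs, &x_support, &support_idx,
      &n_classes, &unique_labels);
  if (err != CUML_SUCCESS) return err;

  return cumlSpSvcPredict(handle, X_test, n_test_rows, n_cols,
                          LINEAR, /*degree=*/3, /*gamma=*/1.0f, /*coef0=*/0.0f,
                          n_support, b, dual_coefs, x_support,
                          n_classes, unique_labels, preds,
                          /*buffer_size=*/200.0f);  // new in 0.10, MiB
}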