Merge pull request rapidsai#2 from rapidsai/branch-0.10
merge
vishalmehta1991 authored Sep 28, 2019
2 parents ddce47d + 91c53fb commit be9e8c0
Showing 63 changed files with 5,583 additions and 141 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,9 @@
- PR #1113: prims: new batched make-symmetric-matrix primitive
- PR #1112: prims: new batched-gemv primitive
- PR #855: Added benchmark tools
- PR #1149: Add YYMMDD to version tag for nightly conda packages
- PR #892: General Gram matrices prim
- PR #912: Support Vector Machine

## Improvements
- PR #961: High Performance RF; HIST algo
@@ -21,6 +24,7 @@
- PR #1114: K-means: Exposing useful params, removing unused params, proxying params in Dask
- PR #1115: Moving dask_make_blobs to cuml.dask.datasets. Adding conversion to dask.DataFrame
- PR #1136: CUDA 10.1 CI updates
- PR #1165: Adding except + in all remaining cython

## Bug Fixes

@@ -39,6 +43,7 @@
- PR #1106: Pinning Distributed version to match Dask for consistent CI results
- PR #1116: TSNE CUDA 10.1 Bug Fixes
- PR #1132: DBSCAN Batching Bug Fix
- PR #1164: Fix check_dtype arg handling for input_to_dev_array

# cuML 0.9.0 (21 Aug 2019)

5 changes: 5 additions & 0 deletions ci/cpu/build.sh
@@ -29,6 +29,11 @@ cd $WORKSPACE
export GIT_DESCRIBE_TAG=`git describe --abbrev=0 --tags`
export GIT_DESCRIBE_NUMBER=`git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count`

# If nightly build, append current YYMMDD to version
if [[ "$BUILD_MODE" = "branch" && "$SOURCE_BRANCH" = branch-* ]] ; then
export VERSION_SUFFIX=`date +%y%m%d`
fi

################################################################################
# SETUP - Check environment
################################################################################
3 changes: 2 additions & 1 deletion conda/recipes/cuml/meta.yaml
@@ -2,7 +2,7 @@

# Usage:
# conda build . -c defaults -c conda-forge -c numba -c rapidsai -c pytorch
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %}
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %}
{% set cuda_version='.'.join(environ.get('CUDA', 'unknown').split('.')[:2]) %}
@@ -22,6 +22,7 @@ build:
- CC
- CXX
- BUILD_ABI
- VERSION_SUFFIX

requirements:
build:
3 changes: 2 additions & 1 deletion conda/recipes/libcuml/meta.yaml
@@ -2,7 +2,7 @@

# Usage:
# conda build . -c defaults -c conda-forge -c nvidia -c rapidsai -c pytorch
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %}
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %}
{% set cuda_version='.'.join(environ.get('CUDA', '9.2').split('.')[:2]) %}
@@ -24,6 +24,7 @@ build:
- CUDAHOSTCXX
- BUILD_ABI
- PARALLEL_LEVEL
- VERSION_SUFFIX

requirements:
build:
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
@@ -334,6 +334,8 @@ if(BUILD_CUML_CPP_LIBRARY)
src/random_projection/rproj.cu
src/solver/solver.cu
src/spectral/spectral.cu
src/svm/svc.cu
src/svm/ws_util.cu
src/tsne/tsne.cu
src/tsvd/tsvd.cu
src/umap/umap.cu
@@ -376,7 +378,8 @@ if(BUILD_CUML_C_LIBRARY)
src/common/cuml_api.cpp
src/dbscan/dbscan_api.cpp
src/glm/glm_api.cpp
src/holtwinters/holtwinters_api.cpp)
src/holtwinters/holtwinters_api.cpp
src/svm/svm_api.cpp)
target_link_libraries(${CUML_C_TARGET} ${CUML_CPP_TARGET})
endif(BUILD_CUML_C_LIBRARY)

152 changes: 152 additions & 0 deletions cpp/src/svm/kernelcache.h
@@ -0,0 +1,152 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuda_utils.h>
#include <linalg/gemm.h>
#include "cache/cache.h"
#include "common/cumlHandle.hpp"
#include "common/host_buffer.hpp"
#include "matrix/grammatrix.h"
#include "matrix/matrix.h"
#include "ml_utils.h"

namespace ML {
namespace SVM {

/**
* @brief Buffer to store a kernel tile
*
* We calculate the kernel matrix for the vectors in the working set.
* For every vector x_i in the working set, we always calculate a full row of the
* kernel matrix K(x_j, x_i), j=1..n_rows.
*
* A kernel tile stores all the kernel rows for the working set, i.e. K(x_j, x_i)
* for all i in the working set, and j in 1..n_rows.
*
* The kernel values can be cached to avoid repeated calculation of the kernel
* function.
*/
template <typename math_t>
class KernelCache {
private:
const math_t *x; //!< pointer to the training vectors

MLCommon::device_buffer<math_t>
x_ws; //!< feature vectors in the current working set
MLCommon::device_buffer<int>
ws_cache_idx; //!< cache positions of the working set vectors
MLCommon::device_buffer<math_t> tile; //!< Kernel matrix tile

int n_rows; //!< number of rows in x
int n_cols; //!< number of columns in x
int n_ws; //!< number of elements in the working set

cublasHandle_t cublas_handle;

MLCommon::Matrix::GramMatrixBase<math_t> *kernel;

const cumlHandle_impl handle;

const int TPB = 256; //!< threads per block for kernels launched

MLCommon::Cache::Cache<math_t> cache;

cudaStream_t stream;

public:
/**
* Construct an object to manage kernel cache
*
* @param handle reference to cumlHandle implementation
* @param x device array of training vectors in column major format,
* size [n_rows x n_cols]
* @param n_rows number of training vectors
* @param n_cols number of features
* @param n_ws size of working set
* @param kernel pointer to kernel (default linear)
* @param cache_size (default 200 MiB)
*/
KernelCache(const cumlHandle_impl &handle, const math_t *x, int n_rows,
int n_cols, int n_ws,
MLCommon::Matrix::GramMatrixBase<math_t> *kernel,
float cache_size = 200)
: cache(handle.getDeviceAllocator(), handle.getStream(), n_rows,
cache_size),
kernel(kernel),
x(x),
n_rows(n_rows),
n_cols(n_cols),
n_ws(n_ws),
cublas_handle(handle.getCublasHandle()),
x_ws(handle.getDeviceAllocator(), handle.getStream(), n_ws * n_cols),
tile(handle.getDeviceAllocator(), handle.getStream(), n_ws * n_rows),
ws_cache_idx(handle.getDeviceAllocator(), handle.getStream(), n_ws) {
ASSERT(kernel != nullptr, "Kernel pointer required for KernelCache!");

stream = handle.getStream();
}

~KernelCache(){};

/**
* @brief Get all the kernel matrix rows for the working set.
* @param ws_idx indices of the working set
* @return pointer to the kernel tile [n_rows x n_ws], K_j,i = K(x_j, x_q),
* where j = 1..n_rows and q = ws_idx[i]; j is the contiguous dimension
*/
math_t *GetTile(int *ws_idx) {
if (cache.GetSize() > 0) {
int n_cached;
cache.GetCacheIdxPartitioned(ws_idx, n_ws, ws_cache_idx.data(), &n_cached,
stream);
// collect already cached values
cache.GetVecs(ws_cache_idx.data(), n_cached, tile.data(), stream);

int non_cached = n_ws - n_cached;
if (non_cached > 0) {
int *ws_idx_new = ws_idx + n_cached;
// AssignCacheIdx can permute ws_idx_new, therefore it has to come
// before the kernel calculation. It could run on a separate stream so
// that copyRows overlaps with AssignCacheIdx.
cache.AssignCacheIdx(ws_idx_new, non_cached,
ws_cache_idx.data() + n_cached,
stream); // cache stream

// collect training vectors for the kernel elements that need to be calculated
MLCommon::Matrix::copyRows(x, n_rows, n_cols, x_ws.data(), ws_idx_new,
non_cached, stream, false);
math_t *tile_new = tile.data() + n_cached * n_rows;
(*kernel)(x, n_rows, n_cols, x_ws.data(), non_cached, tile_new, stream);
// We need AssignCacheIdx to be finished before calling StoreVecs
cache.StoreVecs(tile_new, n_rows, non_cached,
ws_cache_idx.data() + n_cached, stream);
}
} else {
if (n_ws > 0) {
// collect all the feature vectors in the working set
MLCommon::Matrix::copyRows(x, n_rows, n_cols, x_ws.data(), ws_idx, n_ws,
stream, false);
(*kernel)(x, n_rows, n_cols, x_ws.data(), n_ws, tile.data(), stream);
}
}
return tile.data();
}
};

}; // end namespace SVM
}; // end namespace ML
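
For orientation, the sketch below shows how a caller might drive KernelCache from a solver loop. It is a minimal, hypothetical example and is not code from this commit: the names solver_loop, select_working_set, and solve_subproblem are placeholders, and it assumes that MLCommon::Matrix::GramMatrixBase<float> (the linear kernel used as the default here) can be constructed from the handle's cuBLAS handle.

// Hypothetical usage sketch (not part of this commit).
// Assumes GramMatrixBase<float> is constructible from a cuBLAS handle and
// that ws_idx is a device array holding n_ws working-set indices.
#include "kernelcache.h"
#include "matrix/grammatrix.h"

namespace ML {
namespace SVM {

void solver_loop(const cumlHandle_impl &handle, const float *x, int n_rows,
                 int n_cols, int *ws_idx, int n_ws, int n_iter) {
  // The GramMatrixBase base class implements the linear (dot product) kernel.
  MLCommon::Matrix::GramMatrixBase<float> kernel(handle.getCublasHandle());

  // Cache up to 200 MiB of kernel rows, reused across iterations.
  KernelCache<float> cache(handle, x, n_rows, n_cols, n_ws, &kernel, 200);

  for (int it = 0; it < n_iter; it++) {
    // select_working_set(ws_idx, n_ws);      // hypothetical solver step
    // tile is [n_rows x n_ws], column major: K(x_j, x_{ws_idx[i]}) is stored
    // at tile[i * n_rows + j]; the buffer is owned by the cache object.
    float *tile = cache.GetTile(ws_idx);
    (void)tile;
    // solve_subproblem(tile, ws_idx, n_ws);  // hypothetical solver step
  }
}

}  // end namespace SVM
}  // end namespace ML

Note that GetTile returns a pointer into the cache's internal tile buffer, so it stays valid only while the KernelCache object is alive and must not be freed by the caller; GetTile may also reorder ws_idx while partitioning cached and uncached entries.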