Merge pull request #20 from rapidsai/branch-0.18

Sync with upstream
daxiongshu · Dec 28, 2020 · 8b1b7c3 · 8b1b7c3
2 parents e6d8ec3 + ae7e444
commit 8b1b7c3
Show file tree

Hide file tree

Showing 236 changed files with 14,344 additions and 2,861 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,19 @@
+# cuML 0.18.0 (Date TBD)
+
+## New Features
+
+## Improvements
+
+## Bug Fixes
+- PR #3279: Correct pure virtual declaration in manifold_inputs_t
+
 # cuML 0.17.0 (Date TBD)
 
 ## New Features
+- PR #3160: Least Angle Regression (experimental)
+- PR #2659: Add initial max inner product sparse knn
+- PR #2836: Refactor UMAP to accept sparse inputs
+- PR #3126: Experimental versions of GPU accelerated Kernel and Permutation SHAP
 
 ## Improvements
 - PR #3077: Improve runtime for test_kmeans
@@ -14,9 +27,12 @@
 - PR #2956: Follow cuML array conventions in ARIMA and remove redundancy
 - PR #3000: Pin cmake policies to cmake 3.17 version, bump project version to 0.17
 - PR #3083: Improving test_make_blobs testing time
+- PR #3223: Increase default SVM kernel cache to 2000 MiB
 - PR #2906: Moving `linalg` decomp to RAFT namespaces
+- PR #2988: FIL: use tree-per-class reduction for GROVE_PER_CLASS_FEW_CLASSES
 - PR #2996: Removing the max_depth restriction for switching to the batched backend
 - PR #3004: Remove Single Process Multi GPU (SPMG) code
+- PR #3032: FIL: Add optimization parameter `blocks_per_sm` that will help all but tiniest models
 - PR #3044: Move leftover `linalg` and `stats` to RAFT namespaces
 - PR #3067: Deleting prims moved to RAFT and updating header paths
 - PR #3074: Reducing dask coordinate descent test runtime
@@ -26,11 +42,34 @@
 - PR #3115: Speeding up MNMG UMAP testing
 - PR #3112: Speed test_array
 - PR #3111: Adding Cython to Code Coverage
-- PR #3129:  Update notebooks README
+- PR #3129: Update notebooks README
+- PR #3002: Update flake8 Config to Use Per-File Settings
+- PR #3135: Add QuasiNewton tests
 - PR #3040: Improved Array Conversion with CumlArrayDescriptor and Decorators
 - PR #3134: Improving the Deprecation Message Formatting in Documentation
+- PR #3154: Adding estimator pickling demo notebooks (and docs)
+- PR #3151: MNMG Logistic Regression via dask-glm
+- PR #3113: Add tags and preferred memory order tags to estimators
+- PR #3137: Reorganize Pytest Config and Add Quick Run Option
+- PR #3144: Adding Ability to Set Arbitrary Cmake Flags in ./build.sh
+- PR #3155: Eliminate unnecessary warnings from random projection test
+- PR #3176: Add probabilistic SVM tests with various input array types
+- PR #3180: FIL: `blocks_per_sm` support in Python
+- PR #3186: Add gain to RF JSON dump
+- PR #3219: Update CI to use XGBoost 1.3.0 RCs
+- PR #3221: Update contributing doc for label support
+- PR #3177: Make Multinomial Naive Bayes inherit from `ClassifierMixin` and use it for score
+- PR #3241: Updating RAFT to latest
+- PR #3240: Minor doc updates
+- PR #3275: Return confusion matrix as int unless float weights are used
 
 ## Bug Fixes
+- PR #3164: Expose silhouette score in Python
+- PR #3258: Revert silhouette_score Python exposure due to memory issue
+- PR #3218: Specify dependency branches in conda dev environment to avoid pip resolver issue
+- PR #3196: Disable ascending=false path for sortColumnsPerRow
+- PR #3051: MNMG KNN Cl&Re fix + multiple improvements
+- PR #3179: Remove unused metrics.cu file
 - PR #3069: Prevent conversion of DataFrames to Series in preprocessing
 - PR #3065: Refactoring prims metrics function names from camelcase to underscore format
 - PR #3033: Splitting ml metrics to individual files
@@ -40,6 +79,8 @@
 - PR #3011: Fix unused initialize_embeddings parameter in Barnes-Hut t-SNE
 - PR #3008: Check number of columns in check_array validator
 - PR #3012: Increasing learning rate for SGD log loss and invscaling pytests
+- PR #2950: Fix includes in UMAP
+- PR #3194: Fix cuDF to cuPy conversion (missing value)
 - PR #3021: Fix a hang in cuML RF experimental backend
 - PR #3039: Update RF and decision tree parameter initializations in benchmark codes
 - PR #3060: Speed up test suite `test_fil`
@@ -48,13 +89,34 @@
 - PR #3062: Bumping xgboost version to match cuml version
 - PR #3084: Fix artifacts in t-SNE results
 - PR #3086: Reverting FIL Notebook Testing
+- PR #3192: Enable pipeline usage for OneHotEncoder and LabelEncoder
 - PR #3114: Fixed a typo in SVC's predict_proba AttributeError
 - PR #3117: Fix two crashes in experimental RF backend
-- PR #3119: Fix memset args for benchmark 
+- PR #3119: Fix memset args for benchmark
 - PR #3130: Return Python string from `dump_as_json()` of RF
+- PR #3132: Add `min_samples_split` + Rename `min_rows_per_node` -> `min_samples_leaf`
 - PR #3136: Fix stochastic gradient descent example
-
-# cuML 0.16.0 (Date TBD)
+- PR #3152: Fix access to attributes of individual NB objects in dask NB
+- PR #3156: Force local conda artifact install
+- PR #3162: Removing accidentally checked in debug file
+- PR #3191: Fix __repr__ function for preprocessing models
+- PR #3175: Fix gtest pinned cmake version for build from source option
+- PR #3182: Fix a bug in MSE metric calculation
+- PR #3187: Update docstring to document behavior of `bootstrap=False`
+- PR #3215: Add a missing `__syncthreads()`
+- PR #3246: Fix MNMG KNN doc (adding batch_size)
+- PR #3185: Add documentation for Distributed TFIDF Transformer
+- PR #3190: Fix Attribute error on IPCA #3183 and PCA input type
+- PR #3208: Fix EXITCODE override in notebook test script
+- PR #3250: Fixing label binarizer bug with multiple partitions
+- PR #3214: Correct flaky silhouette score test by setting atol
+- PR #3216: Ignore splits that do not satisfy constraints
+- PR #3239: Fix intermittent dask random forest failure
+- PR #3243: Avoid unnecessary split for degenerate case where all labels are identical
+- PR #3245: Rename `rows_sample` -> `max_samples` to be consistent with sklearn's RF
+- PR #3282: Add secondary test to kernel explainer pytests for stability in Volta
+
+# cuML 0.16.0 (23 Oct 2020)
 
 ## New Features
 - PR #2922: Install RAFT headers with cuML
@@ -158,7 +220,7 @@
 - PR #2990: Reduce MNMG kneighbors regressor test threshold
 - PR #2997: Changing ARIMA `get/set_params` to `get/set_fit_params`
 
-# cuML 0.15.0 (Date TBD)
+# cuML 0.15.0 (26 Aug 2020)
 
 ## New Features
 - PR #2581: Added model persistence via joblib in each section of estimator_intro.ipynb

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -41,10 +41,21 @@ into three categories:
 ### A note related to our CI process
 After you have started a PR (refer to step 6 in the previous section), every time you do a `git push <yourRemote> <pr-branch>`, it triggers a new CI run on all the commits thus far. Even though GPUCI has mechanisms to deal with this to a certain extent, if you keep `push`ing too frequently, it might just clog our GPUCI servers and slow down every PR and conda package generation! So, please be mindful of this and try not to do many frequent pushes.
 
-To quantify this, the average check in our CI takes between 25 and 32 minutes on our servers. The GPUCI infrastructure has limited resources, so if the servers get overwhelmed, every current active PR will not be able to correctly schedule CI.
+To quantify this, the average check in our CI takes between 80 and 90 minutes on our servers. The GPUCI infrastructure has limited resources, so if the servers get overwhelmed, none of the currently active PRs will be able to schedule CI correctly.
 
 Remember, if you are unsure about anything, don't hesitate to comment on issues and ask for clarifications!
 
+### Managing PR labels
+
+Each PR must be labeled according to whether it is a "breaking" or "non-breaking" change (using GitHub labels). This is used to highlight changes that users should know about when upgrading.
+
+For cuML, a "breaking" change is one that modifies the public, non-experimental, Python API in a
+non-backward-compatible way. The C++ API does not have an expectation of backward compatibility at this
+time, so changes to it are not typically considered breaking. Backward-compatible API changes to the Python
+API (such as adding a new keyword argument to a function) do not need to be labeled.
+
+Additional labels must be applied to indicate whether the change is a feature, improvement, bugfix, or documentation change. See the shared RAPIDS documentation for these labels: https://github.com/rapidsai/kb/issues/42.
+
 ### Seasoned developers
 
 Once you have gotten your feet wet and are more comfortable with the code, you

diff --git a/README.md b/README.md
@@ -96,7 +96,7 @@ repo](https://github.com/rapidsai/notebooks-contrib).
 | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-node multi-GPU via Dask |
 | | Linear Regression with Lasso or Ridge Regularization | Multi-node multi-GPU via Dask |
 | | ElasticNet Regression | |
-| | Logistic Regression | |
+| | Logistic Regression | Multi-node multi-GPU via Dask-GLM [demo](https://github.com/daxiongshu/rapids-demos) |
 | | Naive Bayes | Multi-node multi-GPU via Dask |
 | | Stochastic Gradient Descent (SGD), Coordinate Descent (CD), and Quasi-Newton (QN) (including L-BFGS and OWL-QN) solvers for linear models  | |
 | **Nonlinear Models for Regression or Classification** | Random Forest (RF) Classification | Experimental multi-node multi-GPU via Dask |
@@ -108,7 +108,10 @@ repo](https://github.com/rapidsai/notebooks-contrib).
 |  | Epsilon-Support Vector Regression (SVR) | |
 | **Time Series** | Holt-Winters Exponential Smoothing | |
 |  | Auto-regressive Integrated Moving Average (ARIMA) | Supports seasonality (SARIMA) |
-| **Other** | K-Nearest Neighbors (KNN) Search | Multi-node multi-GPU via Dask+[UCX](https://github.com/rapidsai/ucx-py), uses [Faiss](https://github.com/facebookresearch/faiss) for Nearest Neighbors Query. |
+| **Model Explanation**                                 | SHAP Kernel Explainer                                                                                                               | [Based on SHAP](https://shap.readthedocs.io/en/latest/) (experimental)                                                                                                                                               |
+|                                                       | SHAP Permutation Explainer                       | [Based on SHAP](https://shap.readthedocs.io/en/latest/) (experimental)                                                                                                                                                |
+| **Other**                                             | K-Nearest Neighbors (KNN) Search                                                                                                          | Multi-node multi-GPU via Dask+[UCX](https://github.com/rapidsai/ucx-py), uses [Faiss](https://github.com/facebookresearch/faiss) for Nearest Neighbors Query. |
+
 ---
 
 ## Installation
@@ -127,11 +130,13 @@ Please see our [guide for contributing to cuML](CONTRIBUTING.md).
 
 ## References
 
+The RAPIDS team has a number of blogs with deeper technical dives and examples. [You can find them here on Medium.](https://medium.com/rapids-ai/tagged/machine-learning)
+
 For additional details on the technologies behind cuML, as well as a broader overview of the Python Machine Learning landscape, see [_Machine Learning in Python: Main developments and technology trends in data science, machine learning, and artificial intelligence_ (2020)](https://arxiv.org/abs/2002.04803) by Sebastian Raschka, Joshua Patterson, and Corey Nolet.
 
 Please consider citing this when using cuML in a project. You can use the citation BibTeX:
 
-```
+```bibtex
 @article{raschka2020machine,
   title={Machine Learning in Python: Main developments and technology trends in data science, machine learning, and artificial intelligence},
   author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey},

diff --git a/build.sh b/build.sh
@@ -48,6 +48,13 @@ HELP="$0 [<target> ...] [<flag> ...]
    -h               - print this text
 
  default action (no args) is to build and install 'libcuml', 'cuml', and 'prims' targets only for the detected GPU arch
+
+ The following environment variables are also accepted to allow further customization:
+   PARALLEL_LEVEL         - Number of parallel threads to use in compilation.
+   CUML_EXTRA_CMAKE_ARGS  - Extra arguments to pass directly to cmake. Values listed in environment
+                            variable will override existing arguments. Example:
+                            CUML_EXTRA_CMAKE_ARGS=\"-DBUILD_CUML_C_LIBRARY=OFF\" ./build.sh
+   CUML_EXTRA_PYTHON_ARGS - Extra argument to pass directly to python setup.py
 "
 LIBCUML_BUILD_DIR=${LIBCUML_BUILD_DIR:=${REPODIR}/cpp/build}
 CUML_BUILD_DIR=${REPODIR}/python/build
@@ -60,7 +67,7 @@ BUILD_TYPE=Release
 INSTALL_TARGET=install
 BUILD_ALL_GPU_ARCH=0
 SINGLEGPU_CPP_FLAG=""
-BUILD_PYTHON_ARGS=${BUILD_PYTHON_ARGS:=""}
+CUML_EXTRA_PYTHON_ARGS=${CUML_EXTRA_PYTHON_ARGS:=""}
 NVTX=OFF
 CLEAN=0
 BUILD_DISABLE_DEPRECATION_WARNING=ON
@@ -74,6 +81,12 @@ BUILD_STATIC_FAISS=OFF
 INSTALL_PREFIX=${INSTALL_PREFIX:=${PREFIX:=${CONDA_PREFIX}}}
 PARALLEL_LEVEL=${PARALLEL_LEVEL:=""}
 
+# Allow setting arbitrary cmake args via the $CUML_EXTRA_CMAKE_ARGS variable. Any
+# values listed here will override existing arguments. For example:
+# CUML_EXTRA_CMAKE_ARGS="-DBUILD_CUML_C_LIBRARY=OFF" ./build.sh
+# Will disable building the C library even though it is hard coded to ON
+CUML_EXTRA_CMAKE_ARGS=${CUML_EXTRA_CMAKE_ARGS:=""}
+
 function hasArg {
     (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
 }
@@ -117,7 +130,7 @@ if hasArg --allgpuarch; then
     BUILD_ALL_GPU_ARCH=1
 fi
 if hasArg --singlegpu; then
-    BUILD_PYTHON_ARGS="${BUILD_PYTHON_ARGS} --singlegpu"
+    CUML_EXTRA_PYTHON_ARGS="${CUML_EXTRA_PYTHON_ARGS} --singlegpu"
     SINGLEGPU_CPP_FLAG=ON
 fi
 if hasArg cpp-mgtests; then
@@ -136,7 +149,7 @@ if hasArg --show_depr_warn; then
     BUILD_DISABLE_DEPRECATION_WARNING=OFF
 fi
 if hasArg --codecov; then
-    BUILD_PYTHON_ARGS="${BUILD_PYTHON_ARGS} --linetrace=1 --profile"
+    CUML_EXTRA_PYTHON_ARGS="${CUML_EXTRA_PYTHON_ARGS} --linetrace=1 --profile"
 fi
 if hasArg clean; then
     CLEAN=1
@@ -189,6 +202,7 @@ if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg pri
           -DNCCL_PATH=${INSTALL_PREFIX} \
           -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
           -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} \
+          ${CUML_EXTRA_CMAKE_ARGS} \
           ..
 fi
 
@@ -229,9 +243,9 @@ fi
 if completeBuild || hasArg cuml || hasArg pydocs; then
     cd ${REPODIR}/python
     if [[ ${INSTALL_TARGET} != "" ]]; then
-        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${BUILD_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR} install --single-version-externally-managed --record=record.txt
+        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${CUML_EXTRA_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR} install --single-version-externally-managed --record=record.txt
     else
-        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${BUILD_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR}
+        python setup.py build_ext -j${PARALLEL_LEVEL:-1} ${CUML_EXTRA_PYTHON_ARGS} --library-dir=${LIBCUML_BUILD_DIR}
     fi
 
     if hasArg pydocs; then

diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
@@ -26,7 +26,6 @@
     re.compile(r"CMakeLists[.]txt$"),
     re.compile(r"CMakeLists_standalone[.]txt$"),
     re.compile(r"setup[.]cfg$"),
-    re.compile(r"[.]flake8[.]cython$"),
     re.compile(r"meta[.]yaml$")
 ]
 

diff --git a/ci/checks/style.sh b/ci/checks/style.sh
@@ -16,7 +16,7 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
 conda install "ucx-py=${MINOR_VERSION}"
 
 # Run flake8 and get results/return code
-FLAKE=`flake8 --exclude=cpp,thirdparty,__init__.py,versioneer.py && flake8 --config=python/.flake8.cython`
+FLAKE=`flake8 --config=python/setup.cfg`
 RETVAL=$?
 
 # Output results if failure otherwise show pass

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -53,7 +53,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid
       "dask-cudf=${MINOR_VERSION}" \
       "dask-cuda=${MINOR_VERSION}" \
       "ucx-py=${MINOR_VERSION}" \
-      "xgboost=1.2.0dev.rapidsai${MINOR_VERSION}" \
+      "xgboost=1.3.0dev.rapidsai${MINOR_VERSION}" \
       "rapids-build-env=${MINOR_VERSION}.*" \
       "rapids-notebook-env=${MINOR_VERSION}.*" \
       "rapids-doc-env=${MINOR_VERSION}.*"
@@ -70,8 +70,8 @@ fi
 
 gpuci_logger "Install the master version of dask and distributed"
 set -x
-pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps
-pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps
+pip install "git+https://github.com/dask/distributed.git@master" --upgrade --no-deps
+pip install "git+https://github.com/dask/dask.git@master" --upgrade --no-deps
 set +x
 
 gpuci_logger "Check compiler versions"
@@ -110,9 +110,6 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
     ################################################################################
     # TEST - Run GoogleTest and py.tests for libcuml and cuML
     ################################################################################
-    set +e -Eo pipefail
-    EXITCODE=0
-    trap "EXITCODE=1" ERR
 
     if hasArg --skip-tests; then
         gpuci_logger "Skipping Tests"
@@ -139,6 +136,9 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
     ################################################################################
     # TEST - Run notebook tests
     ################################################################################
+    set +e -Eo pipefail
+    EXITCODE=0
+    trap "EXITCODE=1" ERR
 
     gpuci_logger "Notebook tests"
     ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log
@@ -186,8 +186,11 @@ else
     patchelf --replace-needed `patchelf --print-needed ./test/ml | grep faiss` libfaiss.so ./test/ml
     GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml
 
-    gpuci_logger "Installing libcuml"
-    conda install -c $WORKSPACE/ci/artifacts/cuml/cpu/conda-bld/ libcuml
+    CONDA_FILE=`find $WORKSPACE/ci/artifacts/cuml/cpu/conda-bld/ -name "libcuml*.tar.bz2"`
+    CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension
+    CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install
+    gpuci_logger "Installing $CONDA_FILE"
+    conda install -c $WORKSPACE/ci/artifacts/cuml/cpu/conda-bld/ "$CONDA_FILE"
 
     gpuci_logger "Building cuml"
     "$WORKSPACE/build.sh" -v cuml --codecov
@@ -238,4 +241,8 @@ else
 
 fi
 
+if [ -n "${CODECOV_TOKEN:-}" ]; then  # run codecov only when a token is set in the environment
+    codecov -t "${CODECOV_TOKEN}"
+fi
+
 return ${EXITCODE}
diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh
@@ -15,7 +15,7 @@ SKIPNBS="cuml_benchmarks.ipynb"
 ## Check env
 env
 
-EXITCODE=0
+NOTEBOOKS_EXITCODE=0
 
 # Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure
 # if any run fails
@@ -36,12 +36,12 @@ for nb in $(find . -name "*.ipynb"); do
     else
         nvidia-smi
         ${NBTEST} ${nbBasename}
-        EXITCODE=$((EXITCODE | $?))
+        NOTEBOOKS_EXITCODE=$((NOTEBOOKS_EXITCODE | $?))
         rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
     fi
 done
 
 
 nvidia-smi
 
-exit ${EXITCODE}
+exit ${NOTEBOOKS_EXITCODE}
diff --git a/ci/local/README.md b/ci/local/README.md
@@ -32,7 +32,7 @@ Style Check:
 $ bash ci/local/build.sh -r ~/rapids/cuml -s
 $ source activate rapids    # Activate gpuCI conda environment
 $ cd rapids
-$ flake8 python
+$ flake8 --config=python/setup.cfg
 ```
 
 ## Information