Commit e277399: Merge branch 'master' into span

trivialfis committed Aug 13, 2018
2 parents 5002c8d + 96826a3 commit e277399

Showing 42 changed files with 246 additions and 89 deletions.
26 changes: 24 additions & 2 deletions Jenkinsfile
@@ -9,7 +9,7 @@ dockerRun = 'tests/ci_build/ci_build.sh'
def buildMatrix = [
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ],
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
[ "enabled": false, "os" : "linux", "withGpu": false, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "" ],
[ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
]

pipeline {
@@ -34,6 +34,28 @@ pipeline {
milestone label: 'Sources ready', ordinal: 1
}
}
stage('Build doc') {
agent any
steps {
script {
if (env.CHANGE_ID == null) { // This is a branch
def commit_id = "${GIT_COMMIT}"
def branch_name = "${GIT_LOCAL_BRANCH}"
echo 'Building doc...'
dir ('jvm-packages') {
sh "bash ./build_doc.sh ${commit_id}"
archiveArtifacts artifacts: "${commit_id}.tar.bz2", allowEmptyArchive: true
echo 'Deploying doc...'
withAWS(credentials:'xgboost-doc-bucket') {
s3Upload file: "${commit_id}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${branch_name}.tar.bz2"
}
}
} else { // This is a pull request
echo 'Skipping doc build step for pull request'
}
}
}
}
stage('Build & Test') {
steps {
script {
@@ -121,7 +143,7 @@ def cmakeOptions(conf) {
}

def getBuildName(conf) {
def gpuLabel = conf['withGpu'] ? "_cuda" + conf['cudaVersion'] : "_cpu"
def gpuLabel = conf['withGpu'] ? ("_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu"
def ompLabel = conf['withOmp'] ? "_omp" : ""
def pyLabel = "_py${conf['pythonVersion']}"
return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}"
49 changes: 49 additions & 0 deletions NEWS.md
@@ -3,6 +3,55 @@ XGBoost Change Log

This file records the changes in the xgboost library in reverse chronological order.

## v0.80 (2018.08.13)
* **JVM packages received a major upgrade**: To consolidate the APIs and improve the user experience, we refactored the design of XGBoost4J-Spark in a significant manner. (#3387)
- Consolidated APIs: It is now much easier to integrate XGBoost models into a Spark ML pipeline. Users can control behaviors like output leaf prediction results by setting corresponding column names. Training is now more consistent with other Estimators in Spark MLlib: there is now a single method `fit()` to train decision trees.
- Better user experience: we refactored the parameter-handling modules in XGBoost4J-Spark to provide both camel-case (Spark ML style) and underscore (XGBoost style) parameters.
- A brand-new tutorial is [available](https://xgboost.readthedocs.io/en/release_0.80/jvm/xgboost4j_spark_tutorial.html) for XGBoost4J-Spark.
- Latest API documentation is now hosted at https://xgboost.readthedocs.io/.
* XGBoost documentation now keeps track of multiple versions:
- Latest master: https://xgboost.readthedocs.io/en/latest
- 0.80 stable: https://xgboost.readthedocs.io/en/release_0.80
- 0.72 stable: https://xgboost.readthedocs.io/en/release_0.72
* Ranking task now uses instance weights (#3379)
* Fix inaccurate decimal parsing (#3546)
* New functionality
- Query ID column support in LIBSVM data files (#2749). This is convenient for performing the ranking task in a distributed setting.
- Hinge loss for binary classification (`binary:hinge`) (#3477)
- Ability to specify delimiter and instance weight column for CSV files (#3546)
- Ability to use 1-based indexing instead of 0-based (#3546)
* GPU support
- Quantile sketch, binning, and index compression are now performed on GPU, eliminating PCIe transfer for the 'gpu_hist' algorithm (#3319, #3393)
- Upgrade to NCCL2 for multi-GPU training (#3404).
- Use shared memory atomics for faster training (#3384).
- Dynamically allocate GPU memory, to prevent large allocations for deep trees (#3519)
- Fix memory copy bug for large files (#3472)
* Python package
- Importing data from Python datatable (#3272)
- Pre-built binary wheels available for 64-bit Linux and Windows (#3424, #3443)
- Add new importance measures 'total_gain', 'total_cover' (#3498)
- Sklearn API now supports saving and loading models (#3192)
- Arbitrary cross validation fold indices (#3353)
- `predict()` function in Sklearn API uses `best_ntree_limit` if available, to make early stopping easier to use (#3445); a usage sketch follows this list
- Informational messages are now directed to Python's `print()` rather than standard output (#3438). This way, messages appear inside Jupyter notebooks.
* R package
- Oracle Solaris support, per CRAN policy (#3372)
* JVM packages
- Single-instance prediction (#3464)
- Pre-built JARs are now available from Maven Central (#3401)
- Add NULL pointer check (#3021)
- Consider `spark.task.cpus` when controlling parallelism (#3530)
- Handle missing values in prediction (#3529)
- Eliminate outputs of `System.out` (#3572)
* Refactored C++ DMatrix class for simplicity and de-duplication (#3301)
* Refactored C++ histogram facilities (#3564)
* Refactored constraints / regularization mechanism for split finding (#3335, #3429). Users may specify an elastic net (L2 + L1 regularization) on leaf weights as well as monotonic constraints on test nodes. The refactor will be useful for a future addition of feature interaction constraints.
* Statically link `libstdc++` for MinGW32 (#3430)
* Enable loading from `group`, `base_margin` and `weight` (see [here](http://xgboost.readthedocs.io/en/latest/tutorials/input_format.html#auxiliary-files-for-additional-information)) for Python, R, and JVM packages (#3431)
* Fix model saving for `count:poisson` so that `max_delta_step` doesn't get truncated (#3515)
* Fix loading of sparse CSC matrix (#3553)
* Fix incorrect handling of `base_score` parameter for Tweedie regression (#3295)
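
A minimal sketch of a few of the Python-side changes above (the `binary:hinge` objective, the new importance measures, and early-stopping-aware `predict()` in the Sklearn API). The data and parameter values are illustrative assumptions, not code from the release:

```python
import numpy as np
from xgboost import XGBClassifier

# Toy binary-classification data (illustrative assumption)
rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)

# New in 0.80: hinge loss for binary classification (#3477)
clf = XGBClassifier(objective='binary:hinge', n_estimators=100)
clf.fit(X[:150], y[:150],
        eval_set=[(X[150:], y[150:])],
        eval_metric='error',
        early_stopping_rounds=5)

# predict() now honors best_ntree_limit when early stopping was used (#3445)
preds = clf.predict(X[150:])

# New importance measures 'total_gain' and 'total_cover' (#3498)
print(clf.get_booster().get_score(importance_type='total_gain'))
```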

## v0.72.1 (2018.07.08)
This version is only applicable for the Python package. The content is identical to that of v0.72.

4 changes: 2 additions & 2 deletions R-package/DESCRIPTION
@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 0.71.2
Date: 2018-06-08
Version: 0.80.1
Date: 2018-08-13
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),
25 changes: 18 additions & 7 deletions doc/build.rst
@@ -4,15 +4,17 @@ Installation Guide

.. note:: Pre-built binary wheel for Python

If you are planning to use Python on a Linux system, consider installing XGBoost from a pre-built binary wheel. The wheel is available from Python Package Index (PyPI). You may download and install it by running
If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, available from Python Package Index (PyPI). You may download and install it by running

.. code-block:: bash
# Ensure that you are downloading xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
# Ensure that you are downloading one of the following:
# * xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
# * xgboost-{version}-py2.py3-none-win_amd64.whl
pip3 install xgboost
* This package will support GPU algorithms (`gpu_exact`, `gpu_hist`) on machines with NVIDIA GPUs.
* Currently, PyPI has a binary wheel only for 64-bit Linux.
* The binary wheel will support GPU algorithms (`gpu_exact`, `gpu_hist`) on machines with NVIDIA GPUs. **However, it will not support multi-GPU training; only a single GPU will be used.** To enable multi-GPU training, download and install the binary wheel from `this page <https://s3-us-west-2.amazonaws.com/xgboost-wheels/list.html>`_. A quick GPU smoke test follows this list.
* Currently, we provide binary wheels for 64-bit Linux and Windows.
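
As a quick check that an installed wheel was built with GPU support, you can train a tiny model with ``tree_method`` set to ``gpu_hist``. Below is a minimal sketch, assuming an NVIDIA GPU and driver are present:

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   # Minimal GPU smoke test (assumes an NVIDIA GPU is available)
   dtrain = xgb.DMatrix(np.random.rand(100, 4), label=np.random.randint(2, size=100))
   xgb.train({'tree_method': 'gpu_hist', 'objective': 'binary:logistic'},
             dtrain, num_boost_round=2)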

****************************
Building XGBoost from source
@@ -187,13 +189,15 @@ After the build process successfully ends, you will find a ``xgboost.dll`` library

Unofficial Windows binaries and instructions on how to use them are hosted on `Guido Tapia's blog <http://www.picnet.com.au/blogs/guido/post/2016/09/22/xgboost-windows-x64-binaries-for-download/>`_.

.. _build_gpu_support:

Building with GPU support
=========================
XGBoost can be built with GPU support for both Linux and Windows using CMake. GPU support works with the Python package as well as the CLI version. See `Installing R package with GPU support`_ for special instructions for R.

An up-to-date version of the CUDA toolkit is required.

From the command line on Linux starting from the xgboost directory:
From the command line on Linux starting from the XGBoost directory:

.. code-block:: bash
@@ -202,9 +206,16 @@ From the command line on Linux starting from the xgboost directory:
cmake .. -DUSE_CUDA=ON
make -j
.. note:: Windows requirements for GPU build
.. note:: Enabling multi-GPU training

By default, multi-GPU training is disabled and only a single GPU will be used. To enable multi-GPU training, set the option ``USE_NCCL=ON``. Multi-GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **multi-GPU training is available only for Linux**.

.. code-block:: bash
Only Visual C++ 2015 or 2013 with CUDA v8.0 were fully tested. Either install Visual C++ 2015 Build Tools separately, or as a part of Visual Studio 2015. If you already have Visual Studio 2017, the Visual C++ 2015 Toolchain component has to be installed using the VS 2017 Installer. Likely, you would need to use the VS2015 x64 Native Tools command prompt to run the cmake commands given below. In some situations, however, things run just fine from the MSYS2 bash command line.
mkdir build
cd build
cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON
make -j
On Windows, check which generator options CMake offers, and choose one with ``[arch]`` replaced by Win64:

11 changes: 11 additions & 0 deletions doc/conf.py
@@ -12,11 +12,21 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
from subprocess import call
from sh.contrib import git
import urllib.request
from recommonmark.parser import CommonMarkParser
import sys
import re
import os, subprocess
import shlex
import guzzle_sphinx_theme

# Find the remote branches that contain HEAD, stripping the 'origin/' prefix
git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
git_branch = [x for x in git_branch if 'HEAD' not in x]
print('git_branch = {}'.format(git_branch[0]))
# Download the pre-built JVM doc bundle for this branch and unpack it into tmp/jvm
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -94,6 +104,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
html_extra_path = ['./tmp']

# The reST default role (used for this markup: `text`) to use for all
# documents.
8 changes: 6 additions & 2 deletions doc/gpu/index.rst
@@ -8,7 +8,7 @@ To install GPU support, check out :doc:`/build`.
*********************************************
CUDA Accelerated Tree Construction Algorithms
*********************************************
This plugin adds GPU accelerated tree construction and prediction algorithms to XGBoost.
Tree construction (training) and prediction can be accelerated with CUDA-capable GPUs.

Usage
=====
@@ -59,7 +59,7 @@ The device ordinal can be selected using the ``gpu_id`` parameter, which defaults

Multiple GPUs can be used with the ``gpu_hist`` tree method via the ``n_gpus`` parameter, which defaults to 1. If this is set to -1, all available GPUs will be used. If ``gpu_id`` is specified as non-zero, the GPU device ordering is ``(gpu_id + i) % n_visible_devices`` for ``i=0`` to ``n_gpus-1``. As with GPU vs. CPU, multi-GPU training will not always be faster than a single GPU, since PCIe bus bandwidth can limit performance.

This plugin currently works with the CLI, python and R - see :doc:`/build` for details.
.. note:: Enabling multi-GPU training

Default installation may not enable multi-GPU training. To use multiple GPUs, make sure to read :ref:`build_gpu_support`.

The GPU algorithms currently work with the CLI, Python, and R packages. See :doc:`/build` for details.

.. code-block:: python
:caption: Python example
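# Minimal sketch with assumed file name and parameter values
import xgboost as xgb

dtrain = xgb.DMatrix('train.svm.txt')
param = {
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',   # GPU-accelerated histogram algorithm
    'n_gpus': -1,                # use all visible GPUs (requires an NCCL2 build)
}
bst = xgb.train(param, dtrain, num_boost_round=10)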
13 changes: 6 additions & 7 deletions doc/jvm/index.rst
@@ -58,10 +58,9 @@ For sbt, please add the repository and dependency in build.sbt as follows:
If you want to use XGBoost4J-Spark, replace ``xgboost4j`` with ``xgboost4j-spark``.

.. note:: Spark 2.0 Required

After integrating with Dataframe/Dataset APIs of Spark 2.0, XGBoost4J-Spark only supports compile with Spark 2.x. You can build XGBoost4J-Spark as a component of XGBoost4J by running ``mvn package``, and you can specify the version of spark with ``mvn -Dspark.version=2.0.0 package``. (To continue working with Spark 1.x, the users are supposed to update pom.xml by modifying the properties like ``spark.version``, ``scala.version``, and ``scala.binary.version``. Users also need to change the implementation by replacing ``SparkSession`` with ``SQLContext`` and the type of API parameters from ``Dataset[_]`` to ``Dataframe``)
.. note:: XGBoost4J-Spark requires Spark 2.3+

XGBoost4J-Spark now requires Spark 2.3+. The latest version of XGBoost4J-Spark uses facilities of `org.apache.spark.ml.param.shared` extensively to provide tight integration with the Spark MLlib framework, and these facilities are not fully available in earlier versions of Spark.

Installation from maven repo
============================
@@ -150,7 +149,7 @@ Contents
java_intro
XGBoost4J-Spark Tutorial <xgboost4j_spark_tutorial>
Code Examples <https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example>
XGBoost4J Java API <http://dmlc.ml/docs/javadocs/index.html>
XGBoost4J Scala API <http://dmlc.ml/docs/scaladocs/xgboost4j/index.html>
XGBoost4J-Spark Scala API <http://dmlc.ml/docs/scaladocs/xgboost4j-spark/index.html>
XGBoost4J-Flink Scala API <http://dmlc.ml/docs/scaladocs/xgboost4j-flink/index.html>
XGBoost4J Java API <javadocs/index>
XGBoost4J Scala API <scaladocs/xgboost4j/index>
XGBoost4J-Spark Scala API <scaladocs/xgboost4j-spark/index>
XGBoost4J-Flink Scala API <scaladocs/xgboost4j-flink/index>
39 changes: 22 additions & 17 deletions doc/jvm/java_intro.rst
@@ -6,15 +6,15 @@ This tutorial introduces the Java API for XGBoost.
**************
Data Interface
**************
Like the XGBoost python module, XGBoost4J uses DMatrix to handle data,
LIBSVM txt format file, sparse matrix in CSR/CSC format, and dense matrix is
Like the XGBoost python module, XGBoost4J uses DMatrix to handle data.
LIBSVM txt format file, sparse matrix in CSR/CSC format, and dense matrix are
supported.

* The first step is to import DMatrix:

.. code-block:: java
import org.dmlc.xgboost4j.java.DMatrix;
import ml.dmlc.xgboost4j.java.DMatrix;
* Use DMatrix constructor to load data from a libsvm text format file:

@@ -39,7 +39,8 @@ supported.
long[] rowHeaders = new long[] {0,2,4,7};
float[] data = new float[] {1f,2f,4f,3f,3f,1f,2f};
int[] colIndex = new int[] {0,2,0,3,0,1,2};
DMatrix dmat = new DMatrix(rowHeaders, colIndex, data, DMatrix.SparseType.CSR);
int numColumn = 4;
DMatrix dmat = new DMatrix(rowHeaders, colIndex, data, DMatrix.SparseType.CSR, numColumn);
... or in `Compressed Sparse Column (CSC) <https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_(CSC_or_CCS)>`_ format:

Expand All @@ -48,7 +49,8 @@ supported.
long[] colHeaders = new long[] {0,3,4,6,7};
float[] data = new float[] {1f,4f,3f,1f,2f,2f,3f};
int[] rowIndex = new int[] {0,1,2,2,0,2,1};
DMatrix dmat = new DMatrix(colHeaders, rowIndex, data, DMatrix.SparseType.CSC);
int numRow = 3;
DMatrix dmat = new DMatrix(colHeaders, rowIndex, data, DMatrix.SparseType.CSC, numRow);
* You may also load your data from a dense matrix. Let's assume we have a matrix of the form

Expand All @@ -66,7 +68,7 @@ supported.
int nrow = 3;
int ncol = 2;
float missing = 0.0f;
DMatrix dmat = new Matrix(data, nrow, ncol, missing);
DMatrix dmat = new DMatrix(data, nrow, ncol, missing);
* To set weight:

@@ -82,7 +84,7 @@ To set parameters, specify them as a Map:

.. code-block:: java
Map<String, Object> params = new HashMap<>() {
Map<String, Object> params = new HashMap<String, Object>() {
{
put("eta", 1.0);
put("max_depth", 2);
@@ -101,20 +103,22 @@ With parameters and data, you are able to train a booster model.

.. code-block:: java
import org.dmlc.xgboost4j.java.Booster;
import org.dmlc.xgboost4j.java.XGBoost;
import ml.dmlc.xgboost4j.java.Booster;
import ml.dmlc.xgboost4j.java.XGBoost;
* Training

.. code-block:: java
DMatrix trainMat = new DMatrix("train.svm.txt");
DMatrix validMat = new DMatrix("valid.svm.txt");
// Specify a watchList to see the performance
// Any Iterable<Entry<String, DMatrix>> object could be used as watchList
List<Entry<String, DMatrix>> watches = new ArrayList<>();
watches.add(new SimpleEntry<>("train", trainMat));
watches.add(new SimpleEntry<>("test", testMat));
// Specify a watch list to see model accuracy on data sets
Map<String, DMatrix> watches = new HashMap<String, DMatrix>() {
{
put("train", trainMat);
put("test", testMat);
}
};
int nround = 2;
Booster booster = XGBoost.train(trainMat, params, nround, watches, null, null);
@@ -130,15 +134,16 @@ With parameters and data, you are able to train a booster model.

.. code-block:: java
String[] model_dump = booster.getModelDump(null, false)
// dump without feature map
String[] model_dump = booster.getModelDump(null, false);
// dump with feature map
String[] model_dump_with_feature_map = booster.getModelDump("featureMap.txt", false)
String[] model_dump_with_feature_map = booster.getModelDump("featureMap.txt", false);
* Load a model

.. code-block:: java
Booster booster = Booster.loadModel("model.bin");
Booster booster = XGBoost.loadModel("model.bin");
**********
Prediction
3 changes: 3 additions & 0 deletions doc/jvm/javadocs/index.rst
@@ -0,0 +1,3 @@
==================
XGBoost4J Java API
==================
3 changes: 3 additions & 0 deletions doc/jvm/scaladocs/xgboost4j-flink/index.rst
@@ -0,0 +1,3 @@
=========================
XGBoost4J-Flink Scala API
=========================
3 changes: 3 additions & 0 deletions doc/jvm/scaladocs/xgboost4j-spark/index.rst
@@ -0,0 +1,3 @@
=========================
XGBoost4J-Spark Scala API
=========================
3 changes: 3 additions & 0 deletions doc/jvm/scaladocs/xgboost4j/index.rst
@@ -0,0 +1,3 @@
===================
XGBoost4J Scala API
===================
4 changes: 4 additions & 0 deletions doc/jvm/xgboost4j_spark_tutorial.rst
@@ -61,6 +61,10 @@ and then refer to the snapshot dependency by adding:
<version>next_version_num-SNAPSHOT</version>
</dependency>
.. note:: XGBoost4J-Spark requires Spark 2.3+

XGBoost4J-Spark now requires Spark 2.3+. The latest version of XGBoost4J-Spark uses facilities of `org.apache.spark.ml.param.shared` extensively to provide tight integration with the Spark MLlib framework, and these facilities are not fully available in earlier versions of Spark.

Data Preparation
================
