diff --git a/Jenkinsfile b/Jenkinsfile
index a46c081b169d..26dc1617d316 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -9,7 +9,7 @@ dockerRun = 'tests/ci_build/ci_build.sh'
 def buildMatrix = [
     [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ],
     [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
-    [ "enabled": false, "os" : "linux", "withGpu": false, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "" ],
+    [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ],
 ]

 pipeline {
@@ -34,6 +34,28 @@ pipeline {
         milestone label: 'Sources ready', ordinal: 1
       }
     }
+    stage('Build doc') {
+      agent any
+      steps {
+        script {
+          if (env.CHANGE_ID == null) { // This is a branch
+            def commit_id = "${GIT_COMMIT}"
+            def branch_name = "${GIT_LOCAL_BRANCH}"
+            echo 'Building doc...'
+            dir ('jvm-packages') {
+              sh "bash ./build_doc.sh ${commit_id}"
+              archiveArtifacts artifacts: "${commit_id}.tar.bz2", allowEmptyArchive: true
+              echo 'Deploying doc...'
+              withAWS(credentials:'xgboost-doc-bucket') {
+                s3Upload file: "${commit_id}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${branch_name}.tar.bz2"
+              }
+            }
+          } else { // This is a pull request
+            echo 'Skipping doc build step for pull request'
+          }
+        }
+      }
+    }
     stage('Build & Test') {
       steps {
         script {
@@ -121,7 +143,7 @@ def cmakeOptions(conf) {
 }

 def getBuildName(conf) {
-    def gpuLabel = conf['withGpu'] ? "_cuda" + conf['cudaVersion'] : "_cpu"
+    def gpuLabel = conf['withGpu'] ? ("_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu"
     def ompLabel = conf['withOmp'] ? "_omp" : ""
     def pyLabel = "_py${conf['pythonVersion']}"
     return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}"
diff --git a/NEWS.md b/NEWS.md
index 10d42ce77b67..5c7e11794c49 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,6 +3,55 @@ XGBoost Change Log

 This file records the changes in xgboost library in reverse chronological order.

+## v0.80 (2018.08.13)
+* **JVM packages received a major upgrade**: To consolidate the APIs and improve the user experience, we refactored the design of XGBoost4J-Spark in a significant manner. (#3387)
+  - Consolidated APIs: It is now much easier to integrate XGBoost models into a Spark ML pipeline. Users can control behaviors like output leaf prediction results by setting corresponding column names. Training is now more consistent with other Estimators in Spark MLLIB: there is now one single method `fit()` to train decision trees.
+  - Better user experience: we refactored the parameter-related modules in XGBoost4J-Spark to provide both camel-case (Spark ML style) and underscore (XGBoost style) parameters
+  - A brand-new tutorial is [available](https://xgboost.readthedocs.io/en/release_0.80/jvm/xgboost4j_spark_tutorial.html) for XGBoost4J-Spark.
+  - Latest API documentation is now hosted at https://xgboost.readthedocs.io/.
+* XGBoost documentation now keeps track of multiple versions:
+  - Latest master: https://xgboost.readthedocs.io/en/latest
+  - 0.80 stable: https://xgboost.readthedocs.io/en/release_0.80
+  - 0.72 stable: https://xgboost.readthedocs.io/en/release_0.72
+* Ranking task now uses instance weights (#3379)
+* Fix inaccurate decimal parsing (#3546)
+* New functionality
+  - Query ID column support in LIBSVM data files (#2749). This is convenient for performing ranking tasks in a distributed setting.
+  - Hinge loss for binary classification (`binary:hinge`) (#3477)
+  - Ability to specify delimiter and instance weight column for CSV files (#3546)
+  - Ability to use 1-based indexing instead of 0-based (#3546)
+* GPU support
+  - Quantile sketch, binning, and index compression are now performed on GPU, eliminating PCIe transfer for 'gpu_hist' algorithm (#3319, #3393)
+  - Upgrade to NCCL2 for multi-GPU training (#3404).
+  - Use shared memory atomics for faster training (#3384).
+  - Dynamically allocate GPU memory, to prevent large allocations for deep trees (#3519)
+  - Fix memory copy bug for large files (#3472)
+* Python package
+  - Importing data from Python datatable (#3272)
+  - Pre-built binary wheels available for 64-bit Linux and Windows (#3424, #3443)
+  - Add new importance measures 'total_gain', 'total_cover' (#3498)
+  - Sklearn API now supports saving and loading models (#3192)
+  - Arbitrary cross validation fold indices (#3353)
+  - `predict()` function in Sklearn API uses `best_ntree_limit` if available, to make early stopping easier to use (#3445)
+  - Informational messages are now directed to Python's `print()` rather than standard output (#3438). This way, messages appear inside Jupyter notebooks.
+* R package
+  - Oracle Solaris support, per CRAN policy (#3372)
+* JVM packages
+  - Single-instance prediction (#3464)
+  - Pre-built JARs are now available from Maven Central (#3401)
+  - Add NULL pointer check (#3021)
+  - Consider `spark.task.cpus` when controlling parallelism (#3530)
+  - Handle missing values in prediction (#3529)
+  - Eliminate outputs of `System.out` (#3572)
+* Refactored C++ DMatrix class for simplicity and de-duplication (#3301)
+* Refactored C++ histogram facilities (#3564)
+* Refactored constraints / regularization mechanism for split finding (#3335, #3429). Users may specify an elastic net (L2 + L1 regularization) on leaf weights as well as monotonic constraints on test nodes. The refactor will be useful for a future addition of feature interaction constraints.
+* Statically link `libstdc++` for MinGW32 (#3430)
+* Enable loading from `group`, `base_margin` and `weight` (see [here](http://xgboost.readthedocs.io/en/latest/tutorials/input_format.html#auxiliary-files-for-additional-information)) for Python, R, and JVM packages (#3431)
+* Fix model saving for `count:poisson` so that `max_delta_step` doesn't get truncated (#3515)
+* Fix loading of sparse CSC matrix (#3553)
+* Fix incorrect handling of `base_score` parameter for Tweedie regression (#3295)
+
 ## v0.72.1 (2018.07.08)

 This version is only applicable for the Python package. The content is identical to that of v0.72.
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 4a2f01bbc025..0a69d9549890 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.71.2
-Date: 2018-06-08
+Version: 0.80.1
+Date: 2018-08-13
 Authors@R: c(
   person("Tianqi", "Chen", role = c("aut"),
          email = "tianqi.tchen@gmail.com"),
diff --git a/doc/build.rst b/doc/build.rst
index e2065fc5e81d..162aaf853104 100644
--- a/doc/build.rst
+++ b/doc/build.rst
@@ -4,15 +4,17 @@ Installation Guide

 .. note:: Pre-built binary wheel for Python

-  If you are planning to use Python on a Linux system, consider installing XGBoost from a pre-built binary wheel. The wheel is available from Python Package Index (PyPI).
You may download and install it by running + If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, available from Python Package Index (PyPI). You may download and install it by running .. code-block:: bash - # Ensure that you are downloading xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl + # Ensure that you are downloading one of the following: + # * xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl + # * xgboost-{version}-py2.py3-none-win_amd64.whl pip3 install xgboost - * This package will support GPU algorithms (`gpu_exact`, `gpu_hist`) on machines with NVIDIA GPUs. - * Currently, PyPI has a binary wheel only for 64-bit Linux. + * The binary wheel will support GPU algorithms (`gpu_exact`, `gpu_hist`) on machines with NVIDIA GPUs. **However, it will not support multi-GPU training; only single GPU will be used.** To enable multi-GPU training, download and install the binary wheel from `this page `_. + * Currently, we provide binary wheels for 64-bit Linux and Windows. **************************** Building XGBoost from source @@ -187,13 +189,15 @@ After the build process successfully ends, you will find a ``xgboost.dll`` libra Unofficial windows binaries and instructions on how to use them are hosted on `Guido Tapia's blog `_. +.. _build_gpu_support: + Building with GPU support ========================= XGBoost can be built with GPU support for both Linux and Windows using CMake. GPU support works with the Python package as well as the CLI version. See `Installing R package with GPU support`_ for special instructions for R. An up-to-date version of the CUDA toolkit is required. -From the command line on Linux starting from the xgboost directory: +From the command line on Linux starting from the XGBoost directory: .. code-block:: bash @@ -202,9 +206,16 @@ From the command line on Linux starting from the xgboost directory: cmake .. -DUSE_CUDA=ON make -j -.. note:: Windows requirements for GPU build +.. note:: Enabling multi-GPU training + + By default, multi-GPU training is disabled and only a single GPU will be used. To enable multi-GPU training, set the option ``USE_NCCL=ON``. Multi-GPU training depends on NCCL2, available at `this link `_. Since NCCL2 is only available for Linux machines, **multi-GPU training is available only for Linux**. + + .. code-block:: bash - Only Visual C++ 2015 or 2013 with CUDA v8.0 were fully tested. Either install Visual C++ 2015 Build Tools separately, or as a part of Visual Studio 2015. If you already have Visual Studio 2017, the Visual C++ 2015 Toolchain componenet has to be installed using the VS 2017 Installer. Likely, you would need to use the VS2015 x64 Native Tools command prompt to run the cmake commands given below. In some situations, however, things run just fine from MSYS2 bash command line. + mkdir build + cd build + cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON + make -j On Windows, see what options for generators you have for CMake, and choose one with ``[arch]`` replaced with Win64: diff --git a/doc/conf.py b/doc/conf.py index 51dbde538e04..7efdf2070f9c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -12,11 +12,21 @@ # All configuration values have a default; values that are commented out # serve to show the default. 
from subprocess import call +from sh.contrib import git +import urllib.request from recommonmark.parser import CommonMarkParser import sys +import re import os, subprocess import shlex import guzzle_sphinx_theme + +git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')] +git_branch = [x for x in git_branch if 'HEAD' not in x] +print('git_branch = {}'.format(git_branch[0])) +filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0])) +call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True) + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -94,6 +104,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = ['_build'] +html_extra_path = ['./tmp'] # The reST default role (used for this markup: `text`) to use for all # documents. diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst index f561768e9950..dc0f877e6009 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -8,7 +8,7 @@ To install GPU support, checkout the :doc:`/build`. ********************************************* CUDA Accelerated Tree Construction Algorithms ********************************************* -This plugin adds GPU accelerated tree construction and prediction algorithms to XGBoost. +Tree construction (training) and prediction can be accelerated with CUDA-capable GPUs. Usage ===== @@ -59,7 +59,11 @@ The device ordinal can be selected using the ``gpu_id`` parameter, which default Multiple GPUs can be used with the ``gpu_hist`` tree method using the ``n_gpus`` parameter. which defaults to 1. If this is set to -1 all available GPUs will be used. If ``gpu_id`` is specified as non-zero, the gpu device order is ``mod(gpu_id + i) % n_visible_devices`` for ``i=0`` to ``n_gpus-1``. As with GPU vs. CPU, multi-GPU will not always be faster than a single GPU due to PCI bus bandwidth that can limit performance. -This plugin currently works with the CLI, python and R - see :doc:`/build` for details. +.. note:: Enabling multi-GPU training + + Default installation may not enable multi-GPU training. To use multiple GPUs, make sure to read :ref:`build_gpu_support`. + +The GPU algorithms currently work with CLI, Python and R packages. See :doc:`/build` for details. .. code-block:: python :caption: Python example diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst index 8cfa4221338d..9c84a1e17ecf 100644 --- a/doc/jvm/index.rst +++ b/doc/jvm/index.rst @@ -58,10 +58,9 @@ For sbt, please add the repository and dependency in build.sbt as following: If you want to use XGBoost4J-Spark, replace ``xgboost4j`` with ``xgboost4j-spark``. -.. note:: Spark 2.0 Required - - After integrating with Dataframe/Dataset APIs of Spark 2.0, XGBoost4J-Spark only supports compile with Spark 2.x. You can build XGBoost4J-Spark as a component of XGBoost4J by running ``mvn package``, and you can specify the version of spark with ``mvn -Dspark.version=2.0.0 package``. (To continue working with Spark 1.x, the users are supposed to update pom.xml by modifying the properties like ``spark.version``, ``scala.version``, and ``scala.binary.version``. 
Users also need to change the implementation by replacing ``SparkSession`` with ``SQLContext`` and the type of API parameters from ``Dataset[_]`` to ``Dataframe``) +.. note:: XGBoost4J-Spark requires Spark 2.3+ + XGBoost4J-Spark now requires Spark 2.3+. Latest versions of XGBoost4J-Spark uses facilities of `org.apache.spark.ml.param.shared` extensively to provide for a tight integration with Spark MLLIB framework, and these facilities are not fully available on earlier versions of Spark. Installation from maven repo ============================ @@ -150,7 +149,7 @@ Contents java_intro XGBoost4J-Spark Tutorial Code Examples - XGBoost4J Java API - XGBoost4J Scala API - XGBoost4J-Spark Scala API - XGBoost4J-Flink Scala API + XGBoost4J Java API + XGBoost4J Scala API + XGBoost4J-Spark Scala API + XGBoost4J-Flink Scala API diff --git a/doc/jvm/java_intro.rst b/doc/jvm/java_intro.rst index c36b489ab36c..57a5866f94f8 100644 --- a/doc/jvm/java_intro.rst +++ b/doc/jvm/java_intro.rst @@ -6,15 +6,15 @@ This tutorial introduces Java API for XGBoost. ************** Data Interface ************** -Like the XGBoost python module, XGBoost4J uses DMatrix to handle data, -LIBSVM txt format file, sparse matrix in CSR/CSC format, and dense matrix is +Like the XGBoost python module, XGBoost4J uses DMatrix to handle data. +LIBSVM txt format file, sparse matrix in CSR/CSC format, and dense matrix are supported. * The first step is to import DMatrix: .. code-block:: java - import org.dmlc.xgboost4j.java.DMatrix; + import ml.dmlc.xgboost4j.java.DMatrix; * Use DMatrix constructor to load data from a libsvm text format file: @@ -39,7 +39,8 @@ supported. long[] rowHeaders = new long[] {0,2,4,7}; float[] data = new float[] {1f,2f,4f,3f,3f,1f,2f}; int[] colIndex = new int[] {0,2,0,3,0,1,2}; - DMatrix dmat = new DMatrix(rowHeaders, colIndex, data, DMatrix.SparseType.CSR); + int numColumn = 4; + DMatrix dmat = new DMatrix(rowHeaders, colIndex, data, DMatrix.SparseType.CSR, numColumn); ... or in `Compressed Sparse Column (CSC) `_ format: @@ -48,7 +49,8 @@ supported. long[] colHeaders = new long[] {0,3,4,6,7}; float[] data = new float[] {1f,4f,3f,1f,2f,2f,3f}; int[] rowIndex = new int[] {0,1,2,2,0,2,1}; - DMatrix dmat = new DMatrix(colHeaders, rowIndex, data, DMatrix.SparseType.CSC); + int numRow = 3; + DMatrix dmat = new DMatrix(colHeaders, rowIndex, data, DMatrix.SparseType.CSC, numRow); * You may also load your data from a dense matrix. Let's assume we have a matrix of form @@ -66,7 +68,7 @@ supported. int nrow = 3; int ncol = 2; float missing = 0.0f; - DMatrix dmat = new Matrix(data, nrow, ncol, missing); + DMatrix dmat = new DMatrix(data, nrow, ncol, missing); * To set weight: @@ -82,7 +84,7 @@ To set parameters, parameters are specified as a Map: .. code-block:: java - Map params = new HashMap<>() { + Map params = new HashMap() { { put("eta", 1.0); put("max_depth", 2); @@ -101,8 +103,8 @@ With parameters and data, you are able to train a booster model. .. code-block:: java - import org.dmlc.xgboost4j.java.Booster; - import org.dmlc.xgboost4j.java.XGBoost; + import ml.dmlc.xgboost4j.java.Booster; + import ml.dmlc.xgboost4j.java.XGBoost; * Training @@ -110,11 +112,13 @@ With parameters and data, you are able to train a booster model. 
DMatrix trainMat = new DMatrix("train.svm.txt"); DMatrix validMat = new DMatrix("valid.svm.txt"); - // Specify a watchList to see the performance - // Any Iterable> object could be used as watchList - List> watches = new ArrayList<>(); - watches.add(new SimpleEntry<>("train", trainMat)); - watches.add(new SimpleEntry<>("test", testMat)); + // Specify a watch list to see model accuracy on data sets + Map watches = new HashMap() { + { + put("train", trainMat); + put("test", testMat); + } + }; int nround = 2; Booster booster = XGBoost.train(trainMat, params, nround, watches, null, null); @@ -130,15 +134,16 @@ With parameters and data, you are able to train a booster model. .. code-block:: java - String[] model_dump = booster.getModelDump(null, false) + // dump without feature map + String[] model_dump = booster.getModelDump(null, false); // dump with feature map - String[] model_dump_with_feature_map = booster.getModelDump("featureMap.txt", false) + String[] model_dump_with_feature_map = booster.getModelDump("featureMap.txt", false); * Load a model .. code-block:: java - Booster booster = Booster.loadModel("model.bin"); + Booster booster = XGBoost.loadModel("model.bin"); ********** Prediction diff --git a/doc/jvm/javadocs/index.rst b/doc/jvm/javadocs/index.rst new file mode 100644 index 000000000000..33bf528129e4 --- /dev/null +++ b/doc/jvm/javadocs/index.rst @@ -0,0 +1,3 @@ +================== +XGBoost4J Java API +================== diff --git a/doc/jvm/scaladocs/xgboost4j-flink/index.rst b/doc/jvm/scaladocs/xgboost4j-flink/index.rst new file mode 100644 index 000000000000..fe35703c0a54 --- /dev/null +++ b/doc/jvm/scaladocs/xgboost4j-flink/index.rst @@ -0,0 +1,3 @@ +========================= +XGBoost4J-Flink Scala API +========================= diff --git a/doc/jvm/scaladocs/xgboost4j-spark/index.rst b/doc/jvm/scaladocs/xgboost4j-spark/index.rst new file mode 100644 index 000000000000..c702f6e42196 --- /dev/null +++ b/doc/jvm/scaladocs/xgboost4j-spark/index.rst @@ -0,0 +1,3 @@ +========================= +XGBoost4J-Spark Scala API +========================= diff --git a/doc/jvm/scaladocs/xgboost4j/index.rst b/doc/jvm/scaladocs/xgboost4j/index.rst new file mode 100644 index 000000000000..542dbc4d338b --- /dev/null +++ b/doc/jvm/scaladocs/xgboost4j/index.rst @@ -0,0 +1,3 @@ +=================== +XGBoost4J Scala API +=================== diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 8d679a9563e1..8abe24196319 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -61,6 +61,10 @@ and then refer to the snapshot dependency by adding: next_version_num-SNAPSHOT +.. note:: XGBoost4J-Spark requires Spark 2.3+ + + XGBoost4J-Spark now requires Spark 2.3+. Latest versions of XGBoost4J-Spark uses facilities of `org.apache.spark.ml.param.shared` extensively to provide for a tight integration with Spark MLLIB framework, and these facilities are not fully available on earlier versions of Spark. + Data Preparation ================ diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst index bf7d800175f1..06ac40292a41 100644 --- a/doc/python/python_intro.rst +++ b/doc/python/python_intro.rst @@ -25,7 +25,8 @@ The XGBoost python module is able to load data from: - LibSVM text format file - Comma-separated values (CSV) file - NumPy 2D array -- SciPy 2D sparse array, and +- SciPy 2D sparse array +- Pandas data frame, and - XGBoost binary buffer file. 
(See :doc:`/tutorials/input_format` for detailed description of text input format.) @@ -66,6 +67,14 @@ The data is stored in a :py:class:`DMatrix ` object. csr = scipy.sparse.csr_matrix((dat, (row, col))) dtrain = xgb.DMatrix(csr) +* To load a Pandas data frame into :py:class:`DMatrix `: + + .. code-block:: python + + data = pandas.DataFrame(np.arange(12).reshape((4,3)), columns=['a', 'b', 'c']) + label = pandas.DataFrame(np.random.randint(2, size=4)) + dtrain = xgb.DMatrix(data, label=label) + * Saving :py:class:`DMatrix ` into a XGBoost binary file will make loading faster: .. code-block:: python diff --git a/doc/requirements.txt b/doc/requirements.txt index b97bacf9ce25..74c434357fde 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -2,3 +2,4 @@ sphinx mock guzzle_sphinx_theme breathe +sh>=1.12.14 diff --git a/doc/tutorials/aws_yarn.rst b/doc/tutorials/aws_yarn.rst index 9988c0248341..62c30967f820 100644 --- a/doc/tutorials/aws_yarn.rst +++ b/doc/tutorials/aws_yarn.rst @@ -5,9 +5,9 @@ This is a step-by-step tutorial on how to setup and run distributed `XGBoost `_, which supports distributed training on Resilient Distributed Dataset (RDD). + If you are preprocessing training data with Spark, consider using :doc:`XGBoost4J-Spark `. ************ Prerequisite diff --git a/jvm-packages/build_doc.sh b/jvm-packages/build_doc.sh new file mode 100755 index 000000000000..614ea611424b --- /dev/null +++ b/jvm-packages/build_doc.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Usage: $0 [commit id]" + exit 1 +fi + +set -e +set -x + +commit_id=$1 + +# Install JVM packages in local Maven repository +mvn install -DskipTests +# Build Scaladocs +mvn scala:doc -DskipTests +# Build Javadocs +mvn javadoc:javadoc -DskipTests + +# Package JVM docs in a tarball +mkdir -p tmp/scaladocs +cp -rv xgboost4j/target/site/apidocs/ ./tmp/javadocs/ +cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/ +cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ +cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/ + +cd tmp +tar cvjf ${commit_id}.tar.bz2 javadocs/ scaladocs/ +mv ${commit_id}.tar.bz2 .. +cd .. 
+rm -rfv tmp/ + +set +x +set +e diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index da4e90b02cd3..2e6075f4fc65 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -6,7 +6,7 @@ ml.dmlc xgboost-jvm - 0.80-SNAPSHOT + 0.80 pom XGBoost JVM Package JVM Package for XGBoost diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml index 9c9b6cef2e33..f1d516378c1f 100644 --- a/jvm-packages/xgboost4j-example/pom.xml +++ b/jvm-packages/xgboost4j-example/pom.xml @@ -6,10 +6,10 @@ ml.dmlc xgboost-jvm - 0.80-SNAPSHOT + 0.80 xgboost4j-example - 0.80-SNAPSHOT + 0.80 jar @@ -26,7 +26,7 @@ ml.dmlc xgboost4j-spark - 0.80-SNAPSHOT + 0.80 org.apache.spark @@ -37,7 +37,7 @@ ml.dmlc xgboost4j-flink - 0.80-SNAPSHOT + 0.80 org.apache.commons diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml index 57da4ab2aa8a..e9c5d06eb785 100644 --- a/jvm-packages/xgboost4j-flink/pom.xml +++ b/jvm-packages/xgboost4j-flink/pom.xml @@ -6,10 +6,10 @@ ml.dmlc xgboost-jvm - 0.80-SNAPSHOT + 0.80 xgboost4j-flink - 0.80-SNAPSHOT + 0.80 @@ -26,7 +26,7 @@ ml.dmlc xgboost4j - 0.80-SNAPSHOT + 0.80 org.apache.commons diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index 971a45e12c7b..f66f748d6ca3 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -6,7 +6,7 @@ ml.dmlc xgboost-jvm - 0.80-SNAPSHOT + 0.80 xgboost4j-spark @@ -24,7 +24,7 @@ ml.dmlc xgboost4j - 0.80-SNAPSHOT + 0.80 org.apache.spark diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala index 89e2ef1021ef..d67d0d4e0db3 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala @@ -240,7 +240,7 @@ private[spark] trait BoosterParams extends Params { final val treeLimit = new IntParam(this, name = "treeLimit", doc = "number of trees used in the prediction; defaults to 0 (use all trees).") - final def getTreeLimit: Double = $(lambdaBias) + final def getTreeLimit: Double = $(treeLimit) setDefault(eta -> 0.3, gamma -> 0, maxDepth -> 6, minChildWeight -> 1, maxDeltaStep -> 0, diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala index 0f430b21920b..7a950eed7b65 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala @@ -82,7 +82,7 @@ class SparkParallelismTracker( try { body } finally { - sc.listenerBus.removeListener(listener) + sc.removeSparkListener(listener) } } diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index e73bafd0cc81..e9db2d63e24a 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -6,10 +6,10 @@ ml.dmlc xgboost-jvm - 0.80-SNAPSHOT + 0.80 xgboost4j - 0.80-SNAPSHOT + 0.80 jar diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java index 91fc2cb4d264..0faa6bb5805e 100644 --- 
a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java @@ -531,13 +531,11 @@ public void write(Kryo kryo, Output output) { try { byte[] serObj = this.toByteArray(); int serObjSize = serObj.length; - System.out.println("==== serialized obj size " + serObjSize); output.writeInt(serObjSize); output.writeInt(version); output.write(serObj); } catch (XGBoostError ex) { - ex.printStackTrace(); - logger.error(ex.getMessage()); + logger.error(ex.getMessage(), ex); } } @@ -547,13 +545,11 @@ public void read(Kryo kryo, Input input) { this.init(null); int serObjSize = input.readInt(); this.version = input.readInt(); - System.out.println("==== the size of the object: " + serObjSize); byte[] bytes = new byte[serObjSize]; input.readBytes(bytes); XGBoostJNI.checkCall(XGBoostJNI.XGBoosterLoadModelFromBuffer(this.handle, bytes)); } catch (XGBoostError ex) { - ex.printStackTrace(); - logger.error(ex.getMessage()); + logger.error(ex.getMessage(), ex); } } } diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION index b214dd991b6b..885b0568652e 100644 --- a/python-package/xgboost/VERSION +++ b/python-package/xgboost/VERSION @@ -1 +1 @@ -0.72 +0.80 diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 6d883cb7870f..c0bea5980c56 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -36,7 +36,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) { auto iter = p_fmat->RowIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); #pragma omp parallel num_threads(nthread) { CHECK_EQ(nthread, omp_get_num_threads()); @@ -137,7 +137,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) { iter->BeforeFirst(); row_ptr.push_back(0); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); const size_t rbegin = row_ptr.size() - 1; for (size_t i = 0; i < batch.Size(); ++i) { row_ptr.push_back(batch[i].size() + row_ptr.back()); diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 6df2f607ba4c..c14faf0ced4e 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -67,7 +67,7 @@ void SimpleDMatrix::MakeOneBatch(SparsePage* pcol, bool sorted) { iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); #pragma omp parallel for schedule(static) for (long i = 0; i < static_cast(batch.Size()); ++i) { // NOLINT(*) int tid = omp_get_thread_num(); diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index ec2bd2b67305..55e078d847b2 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -185,7 +185,7 @@ void SparsePageDMatrix::InitColAccess( while (true) { if (batch_ptr != batch_top) { - auto batch = iter->Value(); + auto &batch = iter->Value(); CHECK_EQ(batch_top, batch.Size()); for (size_t i = batch_ptr; i < batch_top; ++i) { auto ridx = static_cast(batch.base_rowid + i); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index e50a6089e072..6a432c057658 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -155,7 +155,7 @@ class GBLinear : public GradientBooster { auto iter = p_fmat->RowIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // parallel over local batch const auto nsize = static_cast(batch.Size()); #pragma omp parallel for schedule(static) 
@@ -207,7 +207,7 @@ class GBLinear : public GradientBooster { const int ngroup = model_.param.num_output_group; preds.resize(p_fmat->Info().num_row_ * ngroup); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // output convention: nrow * k, where nrow is number of rows // k is number of group // parallel over local batch diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index a619114d8fec..739acde3b6be 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -441,7 +441,7 @@ class Dart : public GBTree { auto* self = static_cast(this); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // parallel over local batch constexpr int kUnroll = 8; const auto nsize = static_cast(batch.Size()); diff --git a/src/linear/coordinate_common.h b/src/linear/coordinate_common.h index 3e41188648b0..72b0c9802167 100644 --- a/src/linear/coordinate_common.h +++ b/src/linear/coordinate_common.h @@ -67,7 +67,7 @@ inline std::pair GetGradient(int group_idx, int num_group, int f double sum_grad = 0.0, sum_hess = 0.0; auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); auto col = batch[fidx]; const auto ndata = static_cast(col.size()); for (bst_omp_uint j = 0; j < ndata; ++j) { @@ -98,7 +98,7 @@ inline std::pair GetGradientParallel(int group_idx, int num_grou double sum_grad = 0.0, sum_hess = 0.0; auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); auto col = batch[fidx]; const auto ndata = static_cast(col.size()); #pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess) @@ -156,7 +156,7 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group, if (dw == 0.0f) return; auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); auto col = batch[fidx]; // update grad value const auto num_row = static_cast(col.size()); @@ -327,7 +327,7 @@ class GreedyFeatureSelector : public FeatureSelector { std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.)); auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < nfeat; ++i) { const auto col = batch[i]; @@ -394,7 +394,7 @@ class ThriftyFeatureSelector : public FeatureSelector { std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.)); auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // column-parallel is usually faster than row-parallel #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < nfeat; ++i) { diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index c8dd8190a336..ca1536cd1e36 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -237,7 +237,7 @@ class GPUCoordinateUpdater : public LinearUpdater { auto iter = p_fmat->ColIterator(); CHECK(p_fmat->SingleColBlock()); iter->Next(); - auto batch = iter->Value(); + auto &batch = iter->Value(); shards.resize(n_devices); // Create device shards diff --git a/src/linear/updater_shotgun.cc b/src/linear/updater_shotgun.cc index 995afa69d145..fc666cfa1d43 100644 --- a/src/linear/updater_shotgun.cc +++ b/src/linear/updater_shotgun.cc @@ -81,7 +81,7 @@ class ShotgunUpdater : public 
LinearUpdater { param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0); auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); const auto nfeat = static_cast(batch.Size()); #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < nfeat; ++i) { diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 58a95a9e08bb..964bbaa0d5ae 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -236,7 +236,7 @@ class CPUPredictor : public Predictor { auto iter = p_fmat->RowIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // parallel over local batch const auto nsize = static_cast(batch.Size()); #pragma omp parallel for schedule(static) @@ -285,7 +285,7 @@ class CPUPredictor : public Predictor { const std::vector& base_margin = info.base_margin_; iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // parallel over local batch const auto nsize = static_cast(batch.Size()); #pragma omp parallel for schedule(static) diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 9e576adb44d8..21afc37f3f9c 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -64,7 +64,7 @@ struct DeviceMatrix { iter->BeforeFirst(); size_t data_offset = 0; while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // Copy row ptr dh::safe_cuda(cudaMemcpy( row_ptr.Data() + batch.base_rowid, batch.offset.data(), diff --git a/src/tree/updater_basemaker-inl.h b/src/tree/updater_basemaker-inl.h index c64c5b29a5b1..55ff2c7743bb 100644 --- a/src/tree/updater_basemaker-inl.h +++ b/src/tree/updater_basemaker-inl.h @@ -46,7 +46,7 @@ class BaseMaker: public TreeUpdater { auto iter = p_fmat->ColIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); for (bst_uint fid = 0; fid < batch.Size(); ++fid) { auto c = batch[fid]; if (c.size() != 0) { @@ -305,7 +305,7 @@ class BaseMaker: public TreeUpdater { this->GetSplitSet(nodes, tree, &fsplits); auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); for (auto fid : fsplits) { auto col = batch[fid]; const auto ndata = static_cast(col.size()); diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 533638c516e8..d4eaab7afa11 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -731,7 +731,7 @@ class ColMaker: public TreeUpdater { fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin()); auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); for (auto fid : fsplits) { auto col = batch[fid]; const auto ndata = static_cast(col.size()); @@ -862,7 +862,7 @@ class DistColMaker : public ColMaker { } auto iter = p_fmat->ColIterator(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); for (auto fid : fsplits) { auto col = batch[fid]; const auto ndata = static_cast(col.size()); diff --git a/src/tree/updater_gpu.cu b/src/tree/updater_gpu.cu index 425ebd415902..ee23299ac813 100644 --- a/src/tree/updater_gpu.cu +++ b/src/tree/updater_gpu.cu @@ -666,7 +666,7 @@ class GPUMaker : public TreeUpdater { auto iter = dmat->ColIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = 
iter->Value(); + auto &batch = iter->Value(); for (int i = 0; i < batch.Size(); i++) { auto col = batch[i]; for (const Entry* it = col.data(); it != col.data() + col.size(); diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index cdf7e64d09c9..62b5b13e19cd 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -347,7 +347,7 @@ class CQHistMaker: public HistMaker { auto iter = p_fmat->ColIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // start enumeration const auto nsize = static_cast(fset.size()); #pragma omp parallel for schedule(dynamic, 1) @@ -429,7 +429,7 @@ class CQHistMaker: public HistMaker { auto iter = p_fmat->ColIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // TWOPASS: use the real set + split set in the column iteration. this->CorrectNonDefaultPositionByBatch(batch, fsplit_set_, tree); @@ -717,7 +717,7 @@ class GlobalProposalHistMaker: public CQHistMaker { auto iter = p_fmat->ColIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // TWOPASS: use the real set + split set in the column iteration. this->CorrectNonDefaultPositionByBatch(batch, this->fsplit_set_, tree); @@ -775,7 +775,7 @@ class QuantileHistMaker: public HistMaker { auto iter = p_fmat->RowIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // parallel convert to column major format common::ParallelGroupBuilder builder(&col_ptr_, &col_data_, &thread_col_ptr_); diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index c365c6a9635b..b14fa248d51b 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -60,7 +60,7 @@ class TreeRefresher: public TreeUpdater { auto *iter = p_fmat->RowIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); CHECK_LT(batch.Size(), std::numeric_limits::max()); const auto nbatch = static_cast(batch.Size()); #pragma omp parallel for schedule(static) diff --git a/src/tree/updater_skmaker.cc b/src/tree/updater_skmaker.cc index 0696d6329666..50f1a56c407f 100644 --- a/src/tree/updater_skmaker.cc +++ b/src/tree/updater_skmaker.cc @@ -147,7 +147,7 @@ class SketchMaker: public BaseMaker { auto iter = p_fmat->ColIterator(); iter->BeforeFirst(); while (iter->Next()) { - auto batch = iter->Value(); + auto &batch = iter->Value(); // start enumeration const auto nsize = static_cast(batch.Size()); #pragma omp parallel for schedule(dynamic, 1)