diff --git a/.travis.yml b/.travis.yml index 645504631e16..d3925cf03fab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ sudo: required # Enabling test on Linux and OS X os: - - linux - osx osx_image: xcode9.3 @@ -11,61 +10,15 @@ osx_image: xcode9.3 # Use Build Matrix to do lint and build seperately env: matrix: - # code lint - - TASK=lint - # r package test - - TASK=r_test # python package test - TASK=python_test - - TASK=python_lightweight_test # java package test - TASK=java_test # cmake test - - TASK=cmake_test - # c++ test - - TASK=cpp_test - # distributed test - - TASK=distributed_test - # address sanitizer test - - TASK=sanitizer_test - -matrix: - exclude: - - os: osx - env: TASK=lint - - os: osx - env: TASK=cmake_test - - os: linux - env: TASK=r_test - - os: osx - env: TASK=python_lightweight_test - - os: osx - env: TASK=cpp_test - - os: osx - env: TASK=distributed_test - - os: osx - env: TASK=sanitizer_test + # - TASK=cmake_test # dependent apt packages addons: - apt: - sources: - - llvm-toolchain-trusty-5.0 - - ubuntu-toolchain-r-test - - george-edison55-precise-backports - packages: - - clang - - clang-tidy-5.0 - - cmake-data - - doxygen - - wget - - libcurl4-openssl-dev - - unzip - - graphviz - - gcc-5 - - g++-5 - - gcc-7 - - g++-7 homebrew: packages: - gcc@7 diff --git a/CMakeLists.txt b/CMakeLists.txt index 65614d0c54c3..f8aa879119a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ else () endif (MINGW OR R_LIB) add_library(rabit STATIC ${RABIT_SOURCES}) target_include_directories(rabit PRIVATE - $ + $ $) set_target_properties(rabit PROPERTIES diff --git a/Jenkinsfile b/Jenkinsfile index 96268901ee70..f7dc488fad99 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3,129 +3,325 @@ // Jenkins pipeline // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ -import groovy.transform.Field - -/* Unrestricted tasks: tasks that do NOT generate artifacts */ - // Command to run command inside a docker container -def dockerRun = 'tests/ci_build/ci_build.sh' -// Utility functions -@Field -def utils - -def buildMatrix = [ - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "10.0", "multiGpu": true], - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ], - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ], - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ], -] +dockerRun = 'tests/ci_build/ci_build.sh' pipeline { - // Each stage specify its own agent - agent none + // Each stage specify its own agent + agent none - environment { - DOCKER_CACHE_REPO = '492475357299.dkr.ecr.us-west-2.amazonaws.com' - } + environment { + DOCKER_CACHE_REPO = '492475357299.dkr.ecr.us-west-2.amazonaws.com' + } - // Setup common job properties - options { - ansiColor('xterm') - timestamps() - timeout(time: 120, unit: 'MINUTES') - buildDiscarder(logRotator(numToKeepStr: '10')) - } + // Setup common job properties + options { + ansiColor('xterm') + timestamps() + timeout(time: 120, unit: 'MINUTES') + buildDiscarder(logRotator(numToKeepStr: '10')) + preserveStashes() + } - // Build stages - stages { - stage('Jenkins: Get sources') { - agent { - label 'unrestricted' - } - steps { - script { - utils = load('tests/ci_build/jenkins_tools.Groovy') - 
utils.checkoutSrcs() - } - stash name: 'srcs', excludes: '.git/' - milestone label: 'Sources ready', ordinal: 1 - } + // Build stages + stages { + stage('Get sources') { + agent { label 'linux && cpu' } + steps { + script { + checkoutSrcs() } - stage('Jenkins: Build & Test') { - steps { - script { - parallel (buildMatrix.findAll{it['enabled']}.collectEntries{ c -> - def buildName = utils.getBuildName(c) - utils.buildFactory(buildName, c, false, this.&buildPlatformCmake) - } + [ "clang-tidy" : { buildClangTidyJob() } ]) - } - } + stash name: 'srcs' + milestone ordinal: 1 + } + } + stage('Formatting Check') { + agent none + steps { + script { + parallel ([ + 'clang-tidy': { ClangTidy() }, + 'lint': { Lint() }, + 'sphinx-doc': { SphinxDoc() }, + 'doxygen': { Doxygen() } + ]) } + milestone ordinal: 2 + } } -} - -/** - * Build platform and test it via cmake. - */ -def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) { - def opts = utils.cmakeOptions(conf) - // Destination dir for artifacts - def distDir = "dist/${buildName}" - def dockerArgs = "" - if (conf["withGpu"]) { - dockerArgs = "--build-arg CUDA_VERSION=" + conf["cudaVersion"] + stage('Build') { + agent none + steps { + script { + parallel ([ + 'build-cpu': { BuildCPU() }, + 'build-gpu-cuda8.0': { BuildCUDA(cuda_version: '8.0') }, + 'build-gpu-cuda9.2': { BuildCUDA(cuda_version: '9.2') }, + 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, + 'build-jvm-packages': { BuildJVMPackages(spark_version: '2.4.1') }, + 'build-jvm-doc': { BuildJVMDoc() } + ]) + } + milestone ordinal: 3 + } } - def test_suite = conf["withGpu"] ? (conf["multiGpu"] ? "mgpu" : "gpu") : "cpu" - // Build node - this is returned result - retry(1) { - node(nodeReq) { - unstash name: 'srcs' - echo """ - |===== XGBoost CMake build ===== - | dockerTarget: ${dockerTarget} - | cmakeOpts : ${opts} - |========================= - """.stripMargin('|') - // Invoke command inside docker - sh """ - ${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/build_via_cmake.sh ${opts} - ${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/test_${test_suite}.sh - """ - if (!conf["multiGpu"]) { - sh """ - ${dockerRun} ${dockerTarget} ${dockerArgs} bash -c "cd python-package; rm -f dist/*; python setup.py bdist_wheel --universal" - rm -rf "${distDir}"; mkdir -p "${distDir}/py" - cp xgboost "${distDir}" - cp -r python-package/dist "${distDir}/py" - # Test the wheel for compatibility on a barebones CPU container - ${dockerRun} release ${dockerArgs} bash -c " \ - pip install --user python-package/dist/xgboost-*-none-any.whl && \ - pytest -v --fulltrace -s tests/python" - # Test the wheel for compatibility on CUDA 10.0 container - ${dockerRun} gpu --build-arg CUDA_VERSION=10.0 bash -c " \ - pip install --user python-package/dist/xgboost-*-none-any.whl && \ - pytest -v -s --fulltrace -m '(not mgpu) and (not slow)' tests/python-gpu" - """ - } + stage('Test') { + agent none + steps { + script { + parallel ([ + 'test-python-cpu': { TestPythonCPU() }, + 'test-python-gpu-cuda8.0': { TestPythonGPU(cuda_version: '8.0') }, + 'test-python-gpu-cuda9.2': { TestPythonGPU(cuda_version: '9.2') }, + 'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') }, + 'test-python-mgpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0', multi_gpu: true) }, + 'test-cpp-gpu': { TestCppGPU(cuda_version: '10.0') }, + 'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.0', multi_gpu: true) }, + 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8') }, + 'test-jvm-jdk11': { 
CrossTestJVMwithJDK(jdk_version: '11') }, + 'test-jvm-jdk12': { CrossTestJVMwithJDK(jdk_version: '12') }, + 'test-r-3.4.4': { TestR(use_r35: false) }, + 'test-r-3.5.3': { TestR(use_r35: true) } + ]) } + milestone ordinal: 4 + } } + } } -/** - * Run a clang-tidy job on a GPU machine - */ -def buildClangTidyJob() { - def nodeReq = "linux && gpu && unrestricted" - node(nodeReq) { - unstash name: 'srcs' - echo "Running clang-tidy job..." - // Invoke command inside docker - // Install Google Test and Python yaml - dockerTarget = "clang_tidy" - dockerArgs = "--build-arg CUDA_VERSION=9.2" - sh """ - ${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/clang_tidy.sh - """ +// check out source code from git +def checkoutSrcs() { + retry(5) { + try { + timeout(time: 2, unit: 'MINUTES') { + checkout scm + sh 'git submodule update --init' } + } catch (exc) { + deleteDir() + error "Failed to fetch source codes" + } + } +} + +def ClangTidy() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Running clang-tidy job..." + def container_type = "clang_tidy" + def docker_binary = "docker" + def dockerArgs = "--build-arg CUDA_VERSION=9.2" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${dockerArgs} tests/ci_build/clang_tidy.sh + """ + deleteDir() + } +} + +def Lint() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Running lint..." + def container_type = "cpu" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} make lint + """ + deleteDir() + } +} + +def SphinxDoc() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Running sphinx-doc..." + def container_type = "cpu" + def docker_binary = "docker" + def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e SPHINX_GIT_BRANCH=${BRANCH_NAME}'" + sh """#!/bin/bash + ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} make -C doc html + """ + deleteDir() + } +} + +def Doxygen() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Running doxygen..." 
+ def container_type = "cpu" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh + """ + deleteDir() + } +} + +def BuildCPU() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Build CPU" + def container_type = "cpu" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh + ${dockerRun} ${container_type} ${docker_binary} build/testxgboost + """ + // Sanitizer test + def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer -e ASAN_OPTIONS=symbolize=1 --cap-add SYS_PTRACE'" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address" \ + -DCMAKE_BUILD_TYPE=Debug -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ + ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} build/testxgboost + """ + deleteDir() + } +} + +def BuildCUDA(args) { + node('linux && cpu') { + unstash name: 'srcs' + echo "Build with CUDA ${args.cuda_version}" + def container_type = "gpu_build" + def docker_binary = "docker" + def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON -DOPEN_MP:BOOL=ON + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" + """ + // Only stash wheel for CUDA 8.0 target + if (args.cuda_version == '8.0') { + echo 'Stashing Python wheel...' + stash name: 'xgboost_whl', includes: 'python-package/dist/*.whl' + archiveArtifacts artifacts: "python-package/dist/*.whl", allowEmptyArchive: true + echo 'Stashing C++ test executable (testxgboost)...' + stash name: 'xgboost_cpp_tests', includes: 'build/testxgboost' + } + deleteDir() } +} +def BuildJVMPackages(args) { + node('linux && cpu') { + unstash name: 'srcs' + echo "Build XGBoost4J-Spark with Spark ${args.spark_version}" + def container_type = "jvm" + def docker_binary = "docker" + // Use only 4 CPU cores + def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='--cpuset-cpus 0-3'" + sh """ + ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_jvm_packages.sh + """ + echo 'Stashing XGBoost4J JAR...' + stash name: 'xgboost4j_jar', includes: 'jvm-packages/xgboost4j/target/*.jar,jvm-packages/xgboost4j-spark/target/*.jar,jvm-packages/xgboost4j-example/target/*.jar' + deleteDir() + } +} + +def BuildJVMDoc() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Building JVM doc..." + def container_type = "jvm" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} + """ + archiveArtifacts artifacts: "jvm-packages/${BRANCH_NAME}.tar.bz2", allowEmptyArchive: true + echo 'Uploading doc...' + s3Upload file: "jvm-packages/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${BRANCH_NAME}.tar.bz2" + deleteDir() + } +} + +def TestPythonCPU() { + node('linux && cpu') { + unstash name: 'xgboost_whl' + unstash name: 'srcs' + echo "Test Python CPU" + def container_type = "cpu" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/test_python.sh cpu + """ + deleteDir() + } +} + +def TestPythonGPU(args) { + nodeReq = (args.multi_gpu) ? 
'linux && mgpu' : 'linux && gpu' + node(nodeReq) { + unstash name: 'xgboost_whl' + unstash name: 'srcs' + echo "Test Python GPU: CUDA ${args.cuda_version}" + def container_type = "gpu" + def docker_binary = "nvidia-docker" + def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + if (args.multi_gpu) { + echo "Using multiple GPUs" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh mgpu + """ + } else { + echo "Using a single GPU" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh gpu + """ + } + deleteDir() + } +} + +def TestCppGPU(args) { + nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu' + node(nodeReq) { + unstash name: 'xgboost_cpp_tests' + unstash name: 'srcs' + echo "Test C++, CUDA ${args.cuda_version}" + def container_type = "gpu" + def docker_binary = "nvidia-docker" + def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + if (args.multi_gpu) { + echo "Using multiple GPUs" + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost --gtest_filter=*.MGPU_*" + } else { + echo "Using a single GPU" + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost --gtest_filter=-*.MGPU_*" + } + deleteDir() + } +} + +def CrossTestJVMwithJDK(args) { + node('linux && cpu') { + unstash name: 'xgboost4j_jar' + unstash name: 'srcs' + echo "Test XGBoost4J on a machine with JDK ${args.jdk_version}" + def container_type = "jvm_cross" + def docker_binary = "docker" + def docker_args = "--build-arg JDK_VERSION=${args.jdk_version}" + // Only run integration tests for JDK 8, as Spark doesn't support later JDKs yet + def docker_extra_params = (args.jdk_version == '8') ? "CI_DOCKER_EXTRA_PARAMS_INIT='-e RUN_INTEGRATION_TEST=1'" : "" + sh """ + ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_jvm_cross.sh + """ + deleteDir() + } +} + +def TestR(args) { + node('linux && cpu') { + unstash name: 'srcs' + echo "Test R package" + def container_type = "rproject" + def docker_binary = "docker" + def use_r35_flag = (args.use_r35) ? 
"1" : "0" + def docker_args = "--build-arg USE_R35=${use_r35_flag}" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_test_rpkg.sh + """ + deleteDir() + } +} diff --git a/Jenkinsfile-restricted b/Jenkinsfile-restricted deleted file mode 100644 index f55997640d9d..000000000000 --- a/Jenkinsfile-restricted +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/groovy -// -*- mode: groovy -*- -// Jenkins pipeline -// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ - -import groovy.transform.Field - -/* Restricted tasks: tasks generating artifacts, such as binary wheels and - documentation */ - -// Command to run command inside a docker container -def dockerRun = 'tests/ci_build/ci_build.sh' -// Utility functions -@Field -def utils -@Field -def commit_id -@Field -def branch_name - -def buildMatrix = [ - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "10.0" ], - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "9.2" ], - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": true, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ], - [ "enabled": true, "os" : "linux", "withGpu": true, "withNccl": false, "withOmp": true, "pythonVersion": "2.7", "cudaVersion": "8.0" ], -] - -pipeline { - // Each stage specify its own agent - agent none - - // Setup common job properties - options { - ansiColor('xterm') - timestamps() - timeout(time: 120, unit: 'MINUTES') - buildDiscarder(logRotator(numToKeepStr: '10')) - } - - // Build stages - stages { - stage('Jenkins: Get sources') { - agent { - label 'restricted' - } - steps { - script { - utils = load('tests/ci_build/jenkins_tools.Groovy') - utils.checkoutSrcs() - commit_id = "${GIT_COMMIT}" - branch_name = "${GIT_LOCAL_BRANCH}" - } - stash name: 'srcs', excludes: '.git/' - milestone label: 'Sources ready', ordinal: 1 - } - } - stage('Jenkins: Build doc') { - steps { - script { - retry(1) { - node('linux && cpu && restricted') { - unstash name: 'srcs' - echo 'Building doc...' - dir ('jvm-packages') { - sh "bash ./build_doc.sh ${commit_id}" - archiveArtifacts artifacts: "${commit_id}.tar.bz2", allowEmptyArchive: true - echo 'Deploying doc...' - withAWS(credentials:'xgboost-doc-bucket') { - s3Upload file: "${commit_id}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "${branch_name}.tar.bz2" - } - } - } - } - } - } - } - - stage('Jenkins: Build artifacts') { - steps { - script { - parallel (buildMatrix.findAll{it['enabled']}.collectEntries{ c -> - def buildName = utils.getBuildName(c) - utils.buildFactory(buildName, c, true, this.&buildPlatformCmake) - }) - } - } - } - } -} - -/** - * Build platform and test it via cmake. 
- */ -def buildPlatformCmake(buildName, conf, nodeReq, dockerTarget) { - def opts = utils.cmakeOptions(conf) - // Destination dir for artifacts - def distDir = "dist/${buildName}" - def dockerArgs = "" - if(conf["withGpu"]){ - dockerArgs = "--build-arg CUDA_VERSION=" + conf["cudaVersion"] - } - // Build node - this is returned result - retry(1) { - node(nodeReq) { - unstash name: 'srcs' - echo """ - |===== XGBoost CMake build ===== - | dockerTarget: ${dockerTarget} - | cmakeOpts : ${opts} - |========================= - """.stripMargin('|') - // Invoke command inside docker - sh """ - ${dockerRun} ${dockerTarget} ${dockerArgs} tests/ci_build/build_via_cmake.sh ${opts} - ${dockerRun} ${dockerTarget} ${dockerArgs} bash -c "cd python-package; rm -f dist/*; python setup.py bdist_wheel --universal" - rm -rf "${distDir}"; mkdir -p "${distDir}/py" - cp xgboost "${distDir}" - cp -r lib "${distDir}" - cp -r python-package/dist "${distDir}/py" - """ - archiveArtifacts artifacts: "${distDir}/**/*.*", allowEmptyArchive: true - } - } -} diff --git a/Makefile b/Makefile index 6c8477186675..42d3bfe1a0ca 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,11 @@ rcpplint: python3 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} R-package/src lint: rcpplint - python3 dmlc-core/scripts/lint.py --pylint-rc ${PWD}/python-package/.pylintrc xgboost ${LINT_LANG} include src plugin python-package + python3 dmlc-core/scripts/lint.py --exclude_path python-package/xgboost/dmlc-core \ + python-package/xgboost/include python-package/xgboost/lib \ + python-package/xgboost/make python-package/xgboost/rabit \ + python-package/xgboost/src --pylint-rc ${PWD}/python-package/.pylintrc xgboost \ + ${LINT_LANG} include src plugin python-package pylint: flake8 --ignore E501 python-package diff --git a/demo/README.md b/demo/README.md index 08a157456fec..3f93f1fab38d 100644 --- a/demo/README.md +++ b/demo/README.md @@ -119,6 +119,7 @@ If you have particular usecase of xgboost that you would like to highlight. Send a PR to add a one sentence description:) - XGBoost is used in [Kaggle Script](https://www.kaggle.com/scripts) to solve data science challenges. +- Distribute XGBoost as Rest API server from Jupyter notebook with [BentoML](https://github.com/bentoml/bentoml). [Link to notebook](https://github.com/bentoml/BentoML/blob/master/examples/xgboost-predict-titanic-survival/XGBoost-titanic-survival-prediction.ipynb) - [Seldon predictive service powered by XGBoost](http://docs.seldon.io/iris-demo.html) - XGBoost Distributed is used in [ODPS Cloud Service by Alibaba](https://yq.aliyun.com/articles/6355) (in Chinese) - XGBoost is incoporated as part of [Graphlab Create](https://dato.com/products/create/) for scalable machine learning. 
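Note on reproducing the CI stages above locally: every Jenkinsfile step is funnelled through tests/ci_build/ci_build.sh, which takes a container type, a docker binary, optional --build-arg flags, and the command to run inside the container. A minimal sketch, assuming Docker/nvidia-docker is installed locally and that ci_build.sh accepts its arguments exactly as the pipeline passes them:

    # test-python-gpu-cuda9.2 stage: <container type> <docker binary> [docker args...] <command>
    tests/ci_build/ci_build.sh gpu nvidia-docker --build-arg CUDA_VERSION=9.2 tests/ci_build/test_python.sh gpu
    # lint job, run inside the CPU container
    tests/ci_build/ci_build.sh cpu docker make lint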
diff --git a/doc/conf.py b/doc/conf.py index 7e0d81a769ea..0ccfe8039e66 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -22,8 +22,11 @@ import shlex import guzzle_sphinx_theme -git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')] -git_branch = [x for x in git_branch if 'HEAD' not in x] +git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None) +if git_branch is None: + # If SPHINX_GIT_BRANCH environment variable is not given, run git to determine branch name + git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')] + git_branch = [x for x in git_branch if 'HEAD' not in x] print('git_branch = {}'.format(git_branch[0])) try: filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0])) diff --git a/doc/tutorials/model.rst b/doc/tutorials/model.rst index 5c394724dac7..a4c2cf98c192 100644 --- a/doc/tutorials/model.rst +++ b/doc/tutorials/model.rst @@ -82,7 +82,7 @@ that classifies whether someone will like computer games. We classify the members of a family into different leaves, and assign them the score on the corresponding leaf. A CART is a bit different from decision trees, in which the leaf only contains decision values. In CART, a real score is associated with each of the leaves, which gives us richer interpretations that go beyond classification. -This also allows for a pricipled, unified approach to optimization, as we will see in a later part of this tutorial. +This also allows for a principled, unified approach to optimization, as we will see in a later part of this tutorial. Usually, a single tree is not strong enough to be used in practice. What is actually used is the ensemble model, which sums the prediction of multiple trees together. diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 31dc15093f77..37f61dfae8d1 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -62,6 +62,13 @@ struct TreeParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0) .describe("Size of leaf vector, reserved for vector tree"); } + + bool operator==(const TreeParam& b) const { + return num_roots == b.num_roots && num_nodes == b.num_nodes && + num_deleted == b.num_deleted && max_depth == b.max_depth && + num_feature == b.num_feature && + size_leaf_vector == b.size_leaf_vector; + } }; /*! \brief node statistics used in regression tree */ @@ -74,6 +81,10 @@ struct RTreeNodeStat { bst_float base_weight; /*! \brief number of child that is leaf node known up to now */ int leaf_child_cnt; + bool operator==(const RTreeNodeStat& b) const { + return loss_chg == b.loss_chg && sum_hess == b.sum_hess && + base_weight == b.base_weight && leaf_child_cnt == b.leaf_child_cnt; + } }; /*! @@ -179,11 +190,20 @@ class RegTree { XGBOOST_DEVICE void MarkDelete() { this->sindex_ = std::numeric_limits::max(); } + /*! \brief Reuse this deleted node. */ + XGBOOST_DEVICE void Reuse() { + this->sindex_ = 0; + } // set parent XGBOOST_DEVICE void SetParent(int pidx, bool is_left_child = true) { if (is_left_child) pidx |= (1U << 31); this->parent_ = pidx; } + bool operator==(const Node& b) const { + return parent_ == b.parent_ && cleft_ == b.cleft_ && + cright_ == b.cright_ && sindex_ == b.sindex_ && + info_.leaf_value == b.info_.leaf_value; + } private: /*! 
@@ -300,6 +320,11 @@ class RegTree { fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); } + bool operator==(const RegTree& b) const { + return nodes_ == b.nodes_ && stats_ == b.stats_ && + deleted_nodes_ == b.deleted_nodes_ && param == b.param; + } + /** * \brief Expands a leaf node into two additional leaf nodes. * @@ -503,10 +528,11 @@ class RegTree { // !!!!!! NOTE: may cause BUG here, nodes.resize int AllocNode() { if (param.num_deleted != 0) { - int nd = deleted_nodes_.back(); + int nid = deleted_nodes_.back(); deleted_nodes_.pop_back(); + nodes_[nid].Reuse(); --param.num_deleted; - return nd; + return nid; } int nd = param.num_nodes++; CHECK_LT(param.num_nodes, std::numeric_limits::max()) diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java index a4a1cb703850..349098ae1386 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java @@ -32,8 +32,8 @@ public static void main(String[] args) throws XGBoostError { //this is the only difference, add a # followed by a cache prefix name //several cache file with the prefix will be generated //currently only support convert from libsvm file - DMatrix trainMat = new DMatrix("../demo/data/agaricus.txt.train#dtrain.cache"); - DMatrix testMat = new DMatrix("../demo/data/agaricus.txt.test#dtest.cache"); + DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache"); + DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache"); //specify parameters HashMap params = new HashMap(); diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitTrackerRobustnessSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitRobustnessSuite.scala similarity index 67% rename from jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitTrackerRobustnessSuite.scala rename to jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitRobustnessSuite.scala index 276eb1ba61ba..322131affb18 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitTrackerRobustnessSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/RabitRobustnessSuite.scala @@ -16,14 +16,73 @@ package ml.dmlc.xgboost4j.scala.spark +import java.util.concurrent.LinkedBlockingDeque + +import scala.util.Random + import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, RabitTracker => PyRabitTracker} import ml.dmlc.xgboost4j.scala.rabit.{RabitTracker => ScalaRabitTracker} import ml.dmlc.xgboost4j.java.IRabitTracker.TrackerStatus +import ml.dmlc.xgboost4j.scala.DMatrix + import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.FunSuite -class RabitTrackerRobustnessSuite extends FunSuite with PerTest { +class RabitSuite extends FunSuite with PerTest { + + test("training with Scala-implemented Rabit tracker") { + val eval = new EvalError() + val training = buildDataFrame(Classification.train) + val testDM = new DMatrix(Classification.test.iterator) + val paramMap = Map("eta" -> "1", "max_depth" -> "6", + "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers, + "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala")) + val model = 
new XGBoostClassifier(paramMap).fit(training) + assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) + } + + test("test Rabit allreduce to validate Scala-implemented Rabit tracker") { + val vectorLength = 100 + val rdd = sc.parallelize( + (1 to numWorkers * vectorLength).toArray.map { _ => Random.nextFloat() }, numWorkers).cache() + + val tracker = new ScalaRabitTracker(numWorkers) + tracker.start(0) + val trackerEnvs = tracker.getWorkerEnvs + val collectedAllReduceResults = new LinkedBlockingDeque[Array[Float]]() + + val rawData = rdd.mapPartitions { iter => + Iterator(iter.toArray) + }.collect() + + val maxVec = (0 until vectorLength).toArray.map { j => + (0 until numWorkers).toArray.map { i => rawData(i)(j) }.max + } + + val allReduceResults = rdd.mapPartitions { iter => + Rabit.init(trackerEnvs) + val arr = iter.toArray + val results = Rabit.allReduce(arr, Rabit.OpType.MAX) + Rabit.shutdown() + Iterator(results) + }.cache() + + val sparkThread = new Thread() { + override def run(): Unit = { + allReduceResults.foreachPartition(() => _) + val byPartitionResults = allReduceResults.collect() + assert(byPartitionResults(0).length == vectorLength) + collectedAllReduceResults.put(byPartitionResults(0)) + } + } + sparkThread.start() + assert(tracker.waitFor(0L) == 0) + sparkThread.join() + + assert(collectedAllReduceResults.poll().sameElements(maxVec)) + } + test("test Java RabitTracker wrapper's exception handling: it should not hang forever.") { /* Deliberately create new instances of SparkContext in each unit test to avoid reusing the @@ -148,4 +207,23 @@ class RabitTrackerRobustnessSuite extends FunSuite with PerTest { // should fail due to connection timeout assert(tracker.waitFor(0L) == TrackerStatus.FAILURE.getStatusCode) } + + test("should allow the dataframe containing rabit calls to be partially evaluated for" + + " multiple times (ISSUE-4406)") { + val paramMap = Map( + "eta" -> "1", + "max_depth" -> "6", + "silent" -> "1", + "objective" -> "binary:logistic") + val trainingDF = buildDataFrame(Classification.train) + val model = new XGBoostClassifier(paramMap ++ Array("num_round" -> 10, + "num_workers" -> numWorkers)).fit(trainingDF) + val prediction = model.transform(trainingDF) + // a partial evaluation of dataframe will cause rabit initialized but not shutdown in some + // threads + prediction.show() + // a full evaluation here will re-run init and shutdown all rabit proxy + // expecting no error + prediction.collect() + } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala index 64eca4b6701f..45ff6e060bcb 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala @@ -17,67 +17,17 @@ package ml.dmlc.xgboost4j.scala.spark import java.nio.file.Files -import java.util.concurrent.LinkedBlockingDeque import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} import ml.dmlc.xgboost4j.scala.DMatrix -import ml.dmlc.xgboost4j.scala.rabit.RabitTracker import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.TaskContext -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.sql._ import org.scalatest.FunSuite -import scala.util.Random - -import 
ml.dmlc.xgboost4j.java.Rabit - -import org.apache.spark.ml.feature.VectorAssembler class XGBoostGeneralSuite extends FunSuite with PerTest { - test("test Rabit allreduce to validate Scala-implemented Rabit tracker") { - val vectorLength = 100 - val rdd = sc.parallelize( - (1 to numWorkers * vectorLength).toArray.map { _ => Random.nextFloat() }, numWorkers).cache() - - val tracker = new RabitTracker(numWorkers) - tracker.start(0) - val trackerEnvs = tracker.getWorkerEnvs - val collectedAllReduceResults = new LinkedBlockingDeque[Array[Float]]() - - val rawData = rdd.mapPartitions { iter => - Iterator(iter.toArray) - }.collect() - - val maxVec = (0 until vectorLength).toArray.map { j => - (0 until numWorkers).toArray.map { i => rawData(i)(j) }.max - } - - val allReduceResults = rdd.mapPartitions { iter => - Rabit.init(trackerEnvs) - val arr = iter.toArray - val results = Rabit.allReduce(arr, Rabit.OpType.MAX) - Rabit.shutdown() - Iterator(results) - }.cache() - - val sparkThread = new Thread() { - override def run(): Unit = { - allReduceResults.foreachPartition(() => _) - val byPartitionResults = allReduceResults.collect() - assert(byPartitionResults(0).length == vectorLength) - collectedAllReduceResults.put(byPartitionResults(0)) - } - } - sparkThread.start() - assert(tracker.waitFor(0L) == 0) - sparkThread.join() - - assert(collectedAllReduceResults.poll().sameElements(maxVec)) - } - test("distributed training with the specified worker number") { val trainingRDD = sc.parallelize(Classification.train) val (booster, metrics) = XGBoost.trainDistributed( @@ -101,18 +51,6 @@ class XGBoostGeneralSuite extends FunSuite with PerTest { assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) } - - test("training with Scala-implemented Rabit tracker") { - val eval = new EvalError() - val training = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - val paramMap = Map("eta" -> "1", "max_depth" -> "6", - "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers, - "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala")) - val model = new XGBoostClassifier(paramMap).fit(training) - assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1) - } - test("test with quantile hist with monotone_constraints (lossguide)") { val eval = new EvalError() val training = buildDataFrame(Classification.train) diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py new file mode 100644 index 000000000000..3f1099c61a24 --- /dev/null +++ b/jvm-packages/xgboost4j-tester/generate_pom.py @@ -0,0 +1,208 @@ +import sys + +pom_template = """ + + + + 4.0.0 + + ml.dmlc + xgboost4j-tester + 1.0-SNAPSHOT + + xgboost4j-tester + + + UTF-8 + {maven_compiler_source} + {maven_compiler_target} + {spark_version} + {scala_version} + {scala_binary_version} + + + + + com.esotericsoftware.kryo + kryo + 2.21 + + + org.scala-lang + scala-compiler + ${{scala.version}} + + + org.scala-lang + scala-reflect + ${{scala.version}} + + + org.scala-lang + scala-library + ${{scala.version}} + + + commons-logging + commons-logging + 1.2 + + + com.typesafe.akka + akka-actor_${{scala.binary.version}} + 2.3.11 + compile + + + com.typesafe.akka + akka-testkit_${{scala.binary.version}} + 2.3.11 + test + + + org.scalatest + scalatest_${{scala.binary.version}} + 3.0.0 + test + + + org.apache.commons + commons-lang3 + 3.4 + + + org.apache.spark + spark-core_${{scala.binary.version}} + 
${{spark.version}} + provided + + + org.apache.spark + spark-sql_${{scala.binary.version}} + ${{spark.version}} + provided + + + org.apache.spark + spark-mllib_${{scala.binary.version}} + ${{spark.version}} + provided + + + junit + junit + 4.11 + test + + + ml.dmlc + xgboost4j + {xgboost4j_version} + + + ml.dmlc + xgboost4j + {xgboost4j_version} + tests + test-jar + test + + + ml.dmlc + xgboost4j-spark + {xgboost4j_version} + + + ml.dmlc + xgboost4j-example + {xgboost4j_version} + + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + org.apache.maven.plugins + maven-assembly-plugin + 2.4 + + + jar-with-dependencies + + + + ml.dmlc.xgboost4j.tester.App + + + + + + package + + single + + + + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.1 + + + ml.dmlc:xgboost4j + + + + + + +""" + +if __name__ == '__main__': + if len(sys.argv) != 7: + print('Usage: {} [xgboost4j version] [maven compiler source level] [maven compiler target level] [spark version] [scala version] [scala binary version]'.format(sys.argv[0])) + sys.exit(1) + with open('pom.xml', 'w') as f: + print(pom_template.format(xgboost4j_version=sys.argv[1], + maven_compiler_source=sys.argv[2], + maven_compiler_target=sys.argv[3], + spark_version=sys.argv[4], + scala_version=sys.argv[5], + scala_binary_version=sys.argv[6]), file=f) diff --git a/jvm-packages/xgboost4j-tester/get_iris.py b/jvm-packages/xgboost4j-tester/get_iris.py new file mode 100644 index 000000000000..f234bb95e198 --- /dev/null +++ b/jvm-packages/xgboost4j-tester/get_iris.py @@ -0,0 +1,10 @@ +from sklearn.datasets import load_iris +import numpy as np +import pandas + +X, y = load_iris(return_X_y=True) +y = y.astype(np.int) +df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width']) +class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'} +df['class'] = np.vectorize(class_id_to_name.get)(y) +df.to_csv('./iris.csv', float_format='%.1f', header=False, index=False) diff --git a/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java b/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java new file mode 100644 index 000000000000..917f5062061c --- /dev/null +++ b/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java @@ -0,0 +1,26 @@ +package ml.dmlc.xgboost4j.tester; + +import ml.dmlc.xgboost4j.java.example.*; + +import java.io.IOException; +import ml.dmlc.xgboost4j.java.XGBoostError; + +public class App { + public static void main(String[] args) throws IOException, XGBoostError { + String[] args2 = new String[0]; + System.out.println("BoostFromPrediction"); + BoostFromPrediction.main(args2); + System.out.println("CrossValidation"); + CrossValidation.main(args2); + System.out.println("CustomObjective"); + CustomObjective.main(args2); + System.out.println("ExternalMemory"); + ExternalMemory.main(args2); + System.out.println("GeneralizedLinearModel"); + GeneralizedLinearModel.main(args2); + System.out.println("PredictFirstNtree"); + PredictFirstNtree.main(args2); + System.out.println("PredictLeafIndices"); + PredictLeafIndices.main(args2); + } +} diff --git a/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java 
b/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java new file mode 100644 index 000000000000..2df69374806a --- /dev/null +++ b/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java @@ -0,0 +1,20 @@ +package ml.dmlc.xgboost4j.tester; + +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +/** + * Unit test for simple App. + */ +public class AppTest +{ + /** + * Rigorous Test :-) + */ + @Test + public void shouldAnswerWithTrue() + { + assertTrue( true ); + } +} diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 83372a88ca7a..2a1f0068282d 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -71,6 +71,18 @@ + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + test-jar + + + + org.apache.maven.plugins maven-resources-plugin diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Rabit.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Rabit.java index 710165d4cb01..35b500757544 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Rabit.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Rabit.java @@ -9,6 +9,7 @@ * Rabit global class for synchronization. */ public class Rabit { + public enum OpType implements Serializable { MAX(0), MIN(1), SUM(2), BITWISE_OR(3); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 89c2d2f575a2..cacf3582f6be 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,6 +57,12 @@ if (USE_CUDA) target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1) endif (USE_NVTX) + # OpenMP is mandatory for cuda version + find_package(OpenMP REQUIRED) + target_compile_options(objxgboost PRIVATE + $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> + ) + set_target_properties(objxgboost PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) else (USE_CUDA) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index fdcc7b41b35e..33d74b5b1bb5 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -12,6 +12,7 @@ #include "span.h" #include +#include #include #include #include @@ -752,6 +753,29 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) { }); } +class SaveCudaContext { + private: + int saved_device_; + + public: + template + explicit SaveCudaContext (Functor func) : saved_device_{-1} { + // When compiled with CUDA but running on CPU only device, + // cudaGetDevice will fail. + try { + safe_cuda(cudaGetDevice(&saved_device_)); + } catch (const dmlc::Error &except) { + saved_device_ = -1; + } + func(); + } + ~SaveCudaContext() { + if (saved_device_ != -1) { + safe_cuda(cudaSetDevice(saved_device_)); + } + } +}; + /** * \class AllReducer * @@ -777,8 +801,18 @@ class AllReducer { allreduce_calls_(0) {} /** - * \fn void Init(const std::vector &device_ordinals) - * + * \brief If we are using a single GPU only + */ + bool IsSingleGPU() { +#ifdef XGBOOST_USE_NCCL + CHECK(device_counts.size() > 0) << "AllReducer not initialised."; + return device_counts.size() <= 1 && device_counts.at(0) == 1; +#else + return true; +#endif + } + + /** * \brief Initialise with the desired device ordinals for this communication * group. * @@ -956,6 +990,22 @@ class AllReducer { #endif }; + /** + * \brief Synchronizes the device + * + * \param device_id Identifier for the device. 
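+   * \note Looks up device_id in device_ordinals and blocks until the all-reduce stream assigned to that device has completed; no-op when built without NCCL.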
+ */ + void Synchronize(int device_id) { +#ifdef XGBOOST_USE_NCCL + SaveCudaContext([&]() { + dh::safe_cuda(cudaSetDevice(device_id)); + int idx = std::find(device_ordinals.begin(), device_ordinals.end(), device_id) - device_ordinals.begin(); + CHECK(idx < device_ordinals.size()); + dh::safe_cuda(cudaStreamSynchronize(streams[idx])); + }); +#endif + }; + #ifdef XGBOOST_USE_NCCL /** * \fn ncclUniqueId GetUniqueId() @@ -980,29 +1030,6 @@ class AllReducer { #endif }; -class SaveCudaContext { - private: - int saved_device_; - - public: - template - explicit SaveCudaContext (Functor func) : saved_device_{-1} { - // When compiled with CUDA but running on CPU only device, - // cudaGetDevice will fail. - try { - safe_cuda(cudaGetDevice(&saved_device_)); - } catch (const dmlc::Error &except) { - saved_device_ = -1; - } - func(); - } - ~SaveCudaContext() { - if (saved_device_ != -1) { - safe_cuda(cudaSetDevice(saved_device_)); - } - } -}; - /** * \brief Executes some operation on each element of the input vector, using a * single controlling thread for each element. In addition, passes the shard index @@ -1017,11 +1044,15 @@ class SaveCudaContext { template void ExecuteIndexShards(std::vector *shards, FunctionT f) { SaveCudaContext{[&]() { + // Temporarily turn off dynamic so we have a guaranteed number of threads + bool dynamic = omp_get_dynamic(); + omp_set_dynamic(false); const long shards_size = static_cast(shards->size()); #pragma omp parallel for schedule(static, 1) if (shards_size > 1) for (long shard = 0; shard < shards_size; ++shard) { f(shard, shards->at(shard)); } + omp_set_dynamic(dynamic); }}; } diff --git a/src/common/random.h b/src/common/random.h index 5e28e8878b77..d81d76a824ab 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -115,8 +115,16 @@ class ColumnSampler { public: /** * \brief Column sampler constructor. - * \note This constructor synchronizes the RNG seed across processes. + * \note This constructor manually sets the rng seed */ + explicit ColumnSampler(uint32_t seed) { + rng_.seed(seed); + } + + /** + * \brief Column sampler constructor. + * \note This constructor synchronizes the RNG seed across processes. 
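+   *       (the seed is drawn locally, then broadcast from rank 0 via rabit so that every worker samples the same columns)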
+ */ ColumnSampler() { uint32_t seed = common::GlobalRandom()(); rabit::Broadcast(&seed, sizeof(seed), 0); diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index b9fd83ccfac9..0fcd0270ec66 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -342,7 +342,8 @@ class GPUPredictor : public xgboost::Predictor { } public: - GPUPredictor() : cpu_predictor_(Predictor::Create("cpu_predictor")) {} + GPUPredictor() // NOLINT + : cpu_predictor_(Predictor::Create("cpu_predictor")) {} // NOLINT void PredictBatch(DMatrix* dmat, HostDeviceVector* out_preds, const gbm::GBTreeModel& model, int tree_begin, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 4b4d7d03ac9a..a980661e7aa9 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -38,6 +38,7 @@ struct GPUHistMakerTrainParam bool single_precision_histogram; // number of rows in a single GPU batch int gpu_batch_nrows; + bool debug_synchronize; // declare parameters DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) { DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe( @@ -47,6 +48,8 @@ struct GPUHistMakerTrainParam .set_default(0) .describe("Number of rows in a GPU batch, used for finding quantiles on GPU; " "-1 to use all rows assignted to a GPU, and 0 to auto-deduce"); + DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe( + "Check if all distributed tree are identical after tree construction."); } }; #if !defined(GTEST_TEST) @@ -598,12 +601,23 @@ inline void SortPosition(dh::CubMemory* temp_memory, common::Span position, } /*! \brief Count how many rows are assigned to left node. */ -__forceinline__ __device__ void CountLeft(int64_t* d_count, int val, int left_nidx) { +__forceinline__ __device__ void CountLeft(int64_t* d_count, int val, + int left_nidx) { +#if __CUDACC_VER_MAJOR__ > 8 + int mask = __activemask(); + unsigned ballot = __ballot_sync(mask, val == left_nidx); + int leader = __ffs(mask) - 1; + if (threadIdx.x % 32 == leader) { + atomicAdd(reinterpret_cast(d_count), // NOLINT + static_cast(__popc(ballot))); // NOLINT + } +#else unsigned ballot = __ballot(val == left_nidx); if (threadIdx.x % 32 == 0) { atomicAdd(reinterpret_cast(d_count), // NOLINT static_cast(__popc(ballot))); // NOLINT } +#endif } template @@ -621,6 +635,7 @@ template struct DeviceShard { int n_bins; int device_id; + int shard_idx; // Position in the local array of shards dh::BulkAllocator ba; @@ -670,18 +685,31 @@ struct DeviceShard { std::vector streams; + common::Monitor monitor; + std::vector node_value_constraints; + common::ColumnSampler column_sampler; + std::unique_ptr> hist_builder; + using ExpandQueue = + std::priority_queue, + std::function>; + std::unique_ptr qexpand; + // TODO(canonizer): do add support multi-batch DMatrix here - DeviceShard(int _device_id, bst_uint row_begin, bst_uint row_end, - TrainParam _param) + DeviceShard(int _device_id, int shard_idx, bst_uint row_begin, + bst_uint row_end, TrainParam _param, uint32_t column_sampler_seed) : device_id(_device_id), + shard_idx(shard_idx), row_begin_idx(row_begin), row_end_idx(row_end), n_rows(row_end - row_begin), n_bins(0), param(std::move(_param)), - prediction_cache_initialised(false) {} + prediction_cache_initialised(false), + column_sampler(column_sampler_seed) { + monitor.Init(std::string("DeviceShard") + std::to_string(device_id)); + } /* Init row_ptrs and row_stride */ size_t InitRowPtrs(const SparsePage& row_batch) { @@ -736,7 +764,16 @@ struct 
DeviceShard { } // Reset values for each update iteration - void Reset(HostDeviceVector* dh_gpair) { + // Note that the column sampler must be passed by value because it is not + // thread safe + void Reset(HostDeviceVector* dh_gpair, int64_t num_columns) { + if (param.grow_policy == TrainParam::kLossGuide) { + qexpand.reset(new ExpandQueue(LossGuide)); + } else { + qexpand.reset(new ExpandQueue(DepthWise)); + } + this->column_sampler.Init(num_columns, param.colsample_bynode, + param.colsample_bylevel, param.colsample_bytree); dh::safe_cuda(cudaSetDevice(device_id)); thrust::fill( thrust::device_pointer_cast(position.Current()), @@ -764,8 +801,6 @@ struct DeviceShard { std::vector EvaluateSplits( std::vector nidxs, const RegTree& tree, - common::ColumnSampler* column_sampler, - const std::vector& value_constraints, size_t num_columns) { dh::safe_cuda(cudaSetDevice(device_id)); auto result = pinned_memory.GetSpan(nidxs.size()); @@ -800,7 +835,7 @@ struct DeviceShard { auto& streams = this->GetStreams(nidxs.size()); for (auto i = 0ull; i < nidxs.size(); i++) { auto nidx = nidxs[i]; - auto p_feature_set = column_sampler->GetFeatureSet(tree.GetDepth(nidx)); + auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx)); p_feature_set->Shard(GPUSet(device_id, 1)); auto d_feature_set = p_feature_set->DeviceSpan(device_id); auto d_split_candidates = @@ -812,7 +847,7 @@ struct DeviceShard { EvaluateSplitKernel <<>>( hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix, - gpu_param, d_split_candidates, value_constraints[nidx], + gpu_param, d_split_candidates, node_value_constraints[nidx], monotone_constraints); // Reduce over features to find best feature @@ -997,6 +1032,179 @@ struct DeviceShard { out_preds_d, prediction_cache.data(), prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault)); } + + void AllReduceHist(int nidx, dh::AllReducer* reducer) { + monitor.StartCuda("AllReduce"); + auto d_node_hist = hist.GetNodeHistogram(nidx).data(); + reducer->AllReduceSum( + shard_idx, + reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + ellpack_matrix.BinCount() * + (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); + reducer->Synchronize(device_id); + + monitor.StopCuda("AllReduce"); + } + + /** + * \brief Build GPU local histograms for the left and right child of some parent node + */ + void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right, dh::AllReducer* reducer) { + auto build_hist_nidx = nidx_left; + auto subtraction_trick_nidx = nidx_right; + + // If we are using a single GPU, build the histogram for the node with the + // fewest training instances + // If we are distributed, don't bother + if (reducer->IsSingleGPU()) { + bool fewer_right = + ridx_segments[nidx_right].Size() < ridx_segments[nidx_left].Size(); + if (fewer_right) { + std::swap(build_hist_nidx, subtraction_trick_nidx); + } + } + + this->BuildHist(build_hist_nidx); + this->AllReduceHist(build_hist_nidx, reducer); + + // Check whether we can use the subtraction trick to calculate the other + bool do_subtraction_trick = this->CanDoSubtractionTrick( + nidx_parent, build_hist_nidx, subtraction_trick_nidx); + + if (do_subtraction_trick) { + // Calculate other histogram using subtraction trick + this->SubtractionTrick(nidx_parent, build_hist_nidx, + subtraction_trick_nidx); + } else { + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer); + } + } + + void ApplySplit(const 
ExpandEntry& candidate, RegTree* p_tree) { + RegTree& tree = *p_tree; + + GradStats left_stats; + left_stats.Add(candidate.split.left_sum); + GradStats right_stats; + right_stats.Add(candidate.split.right_sum); + GradStats parent_sum; + parent_sum.Add(left_stats); + parent_sum.Add(right_stats); + node_value_constraints.resize(tree.GetNodes().size()); + auto base_weight = node_value_constraints[candidate.nid].CalcWeight(param, parent_sum); + auto left_weight = + node_value_constraints[candidate.nid].CalcWeight(param, left_stats)*param.learning_rate; + auto right_weight = + node_value_constraints[candidate.nid].CalcWeight(param, right_stats)*param.learning_rate; + tree.ExpandNode(candidate.nid, candidate.split.findex, + candidate.split.fvalue, candidate.split.dir == kLeftDir, + base_weight, left_weight, right_weight, + candidate.split.loss_chg, parent_sum.sum_hess); + // Set up child constraints + node_value_constraints.resize(tree.GetNodes().size()); + node_value_constraints[candidate.nid].SetChild( + param, tree[candidate.nid].SplitIndex(), left_stats, right_stats, + &node_value_constraints[tree[candidate.nid].LeftChild()], + &node_value_constraints[tree[candidate.nid].RightChild()]); + node_sum_gradients[tree[candidate.nid].LeftChild()] = + candidate.split.left_sum; + node_sum_gradients[tree[candidate.nid].RightChild()] = + candidate.split.right_sum; + } + + void InitRoot(RegTree* p_tree, HostDeviceVector* gpair_all, + dh::AllReducer* reducer, int64_t num_columns) { + constexpr int kRootNIdx = 0; + + const auto &gpair = gpair_all->DeviceSpan(device_id); + + dh::SumReduction(temp_memory, gpair, node_sum_gradients_d, + gpair.size()); + reducer->AllReduceSum( + shard_idx, reinterpret_cast(node_sum_gradients_d.data()), + reinterpret_cast(node_sum_gradients_d.data()), 2); + reducer->Synchronize(device_id); + dh::safe_cuda(cudaMemcpy(node_sum_gradients.data(), + node_sum_gradients_d.data(), sizeof(GradientPair), + cudaMemcpyDeviceToHost)); + + this->BuildHist(kRootNIdx); + this->AllReduceHist(kRootNIdx, reducer); + + // Remember root stats + p_tree->Stat(kRootNIdx).sum_hess = node_sum_gradients[kRootNIdx].GetHess(); + auto weight = CalcWeight(param, node_sum_gradients[kRootNIdx]); + p_tree->Stat(kRootNIdx).base_weight = weight; + (*p_tree)[kRootNIdx].SetLeaf(param.learning_rate * weight); + + // Initialise root constraint + node_value_constraints.resize(p_tree->GetNodes().size()); + + // Generate first split + auto split = this->EvaluateSplits({kRootNIdx}, *p_tree, num_columns); + qexpand->push( + ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split.at(0), 0)); + } + + void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, + RegTree* p_tree, dh::AllReducer* reducer) { + auto& tree = *p_tree; + monitor.StartCuda("Reset"); + this->Reset(gpair_all, p_fmat->Info().num_col_); + monitor.StopCuda("Reset"); + + monitor.StartCuda("InitRoot"); + this->InitRoot(p_tree, gpair_all, reducer, p_fmat->Info().num_col_); + monitor.StopCuda("InitRoot"); + + auto timestamp = qexpand->size(); + auto num_leaves = 1; + + while (!qexpand->empty()) { + ExpandEntry candidate = qexpand->top(); + qexpand->pop(); + if (!candidate.IsValid(param, num_leaves)) { + continue; + } + + this->ApplySplit(candidate, p_tree); + + num_leaves++; + + int left_child_nidx = tree[candidate.nid].LeftChild(); + int right_child_nidx = tree[candidate.nid].RightChild(); + // Only create child entries if needed + if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), + num_leaves)) { + 
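+        // Assign rows to the two new children, build their histograms (using the subtraction trick where possible), then evaluate candidate splits for each child.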
monitor.StartCuda("UpdatePosition"); + this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]); + monitor.StopCuda("UpdatePosition"); + + monitor.StartCuda("BuildHist"); + this->BuildHistLeftRight(candidate.nid, left_child_nidx, right_child_nidx, reducer); + monitor.StopCuda("BuildHist"); + + monitor.StartCuda("EvaluateSplits"); + auto splits = this->EvaluateSplits({left_child_nidx, right_child_nidx}, + *p_tree, p_fmat->Info().num_col_); + monitor.StopCuda("EvaluateSplits"); + + qexpand->push(ExpandEntry(left_child_nidx, + tree.GetDepth(left_child_nidx), splits.at(0), + timestamp++)); + qexpand->push(ExpandEntry(right_child_nidx, + tree.GetDepth(right_child_nidx), + splits.at(1), timestamp++)); + } + } + + monitor.StartCuda("FinalisePosition"); + this->FinalisePosition(p_tree); + monitor.StopCuda("FinalisePosition"); + } }; template @@ -1179,12 +1387,6 @@ class GPUHistMakerSpecialised{ dh::CheckComputeCapability(); - if (param_.grow_policy == TrainParam::kLossGuide) { - qexpand_.reset(new ExpandQueue(LossGuide)); - } else { - qexpand_.reset(new ExpandQueue(DepthWise)); - } - monitor_.Init("updater_gpu_hist"); } @@ -1223,17 +1425,23 @@ class GPUHistMakerSpecialised{ auto batch_iter = dmat->GetRowBatches().begin(); const SparsePage& batch = *batch_iter; + + // Synchronise the column sampling seed + uint32_t column_sampling_seed = common::GlobalRandom()(); + rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); + // Create device shards shards_.resize(n_devices); dh::ExecuteIndexShards( &shards_, - [&](int i, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(i))); - size_t start = dist_.ShardStart(info_->num_row_, i); - size_t size = dist_.ShardSize(info_->num_row_, i); + [&](int idx, std::unique_ptr>& shard) { + dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx))); + size_t start = dist_.ShardStart(info_->num_row_, idx); + size_t size = dist_.ShardSize(info_->num_row_, idx); shard = std::unique_ptr>( - new DeviceShard(dist_.Devices().DeviceId(i), start, - start + size, param_)); + new DeviceShard(dist_.Devices().DeviceId(idx), idx, + start, start + size, param_, + column_sampling_seed)); }); // Find the cuts. 
@@ -1264,277 +1472,61 @@ class GPUHistMakerSpecialised{ this->InitDataOnce(dmat); monitor_.StopCuda("InitDataOnce"); } - - column_sampler_.Init(info_->num_col_, param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree); - - // Copy gpair & reset memory - monitor_.StartCuda("InitDataReset"); - - gpair->Shard(dist_); - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->Reset(gpair); - }); - monitor_.StopCuda("InitDataReset"); } - void AllReduceHist(int nidx) { - if (shards_.size() == 1 && !rabit::IsDistributed()) { - return; + // Only call this method for testing + void CheckTreesSynchronized(const std::vector& local_trees) const { + std::string s_model; + common::MemoryBufferStream fs(&s_model); + int rank = rabit::GetRank(); + if (rank == 0) { + local_trees.front().Save(&fs); } - monitor_.StartCuda("AllReduce"); - - reducer_.GroupStart(); - for (auto& shard : shards_) { - auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data(); - reducer_.AllReduceSum( - dist_.Devices().Index(shard->device_id), - reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); - } - reducer_.GroupEnd(); - reducer_.Synchronize(); - - monitor_.StopCuda("AllReduce"); - } - - /** - * \brief Build GPU local histograms for the left and right child of some parent node - */ - void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right) { - size_t left_node_max_elements = 0; - size_t right_node_max_elements = 0; - for (auto& shard : shards_) { - left_node_max_elements = (std::max)( - left_node_max_elements, shard->ridx_segments[nidx_left].Size()); - right_node_max_elements = (std::max)( - right_node_max_elements, shard->ridx_segments[nidx_right].Size()); - } - - rabit::Allreduce(&left_node_max_elements, 1); - rabit::Allreduce(&right_node_max_elements, 1); - - auto build_hist_nidx = nidx_left; - auto subtraction_trick_nidx = nidx_right; - - if (right_node_max_elements < left_node_max_elements) { - build_hist_nidx = nidx_right; - subtraction_trick_nidx = nidx_left; - } - - // Build histogram for node with the smallest number of training examples - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->BuildHist(build_hist_nidx); - }); - - this->AllReduceHist(build_hist_nidx); - - // Check whether we can use the subtraction trick to calculate the other - bool do_subtraction_trick = true; - for (auto& shard : shards_) { - do_subtraction_trick &= shard->CanDoSubtractionTrick( - nidx_parent, build_hist_nidx, subtraction_trick_nidx); - } - - if (do_subtraction_trick) { - // Calculate other histogram using subtraction trick - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->SubtractionTrick(nidx_parent, build_hist_nidx, - subtraction_trick_nidx); - }); - } else { - // Calculate other histogram manually - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->BuildHist(subtraction_trick_nidx); - }); - - this->AllReduceHist(subtraction_trick_nidx); - } - } - - std::vector EvaluateSplits(std::vector nidx, - RegTree* p_tree) { - dh::safe_cuda(cudaSetDevice(shards_.front()->device_id)); - return shards_.front()->EvaluateSplits(nidx, *p_tree, &column_sampler_, - node_value_constraints_, 
- info_->num_col_); - } - - void InitRoot(RegTree* p_tree) { - constexpr int kRootNIdx = 0; - // Sum gradients - std::vector tmp_sums(shards_.size()); - - dh::ExecuteIndexShards( - &shards_, - [&](int i, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - tmp_sums[i] = dh::SumReduction( - shard->temp_memory, shard->gpair.data(), shard->gpair.size()); - }); - - GradientPair sum_gradient = - std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair()); - - rabit::Allreduce( - reinterpret_cast(&sum_gradient), 2); - - // Generate root histogram - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->BuildHist(kRootNIdx); - }); - - this->AllReduceHist(kRootNIdx); - - // Remember root stats - p_tree->Stat(kRootNIdx).sum_hess = sum_gradient.GetHess(); - auto weight = CalcWeight(param_, sum_gradient); - p_tree->Stat(kRootNIdx).base_weight = weight; - (*p_tree)[kRootNIdx].SetLeaf(param_.learning_rate * weight); - - // Store sum gradients - for (auto& shard : shards_) { - shard->node_sum_gradients[kRootNIdx] = sum_gradient; - } - - // Initialise root constraint - node_value_constraints_.resize(p_tree->GetNodes().size()); - - // Generate first split - auto split = this->EvaluateSplits({ kRootNIdx }, p_tree); - qexpand_->push( - ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split.at(0), 0)); - } - - void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) { - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->UpdatePosition(candidate.nid, - p_tree->GetNodes()[candidate.nid]); - }); - } - void FinalisePosition(RegTree* p_tree) { - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - shard->FinalisePosition(p_tree); - }); - } - - void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) { - RegTree& tree = *p_tree; - - GradStats left_stats; - left_stats.Add(candidate.split.left_sum); - GradStats right_stats; - right_stats.Add(candidate.split.right_sum); - GradStats parent_sum; - parent_sum.Add(left_stats); - parent_sum.Add(right_stats); - node_value_constraints_.resize(tree.GetNodes().size()); - auto base_weight = node_value_constraints_[candidate.nid].CalcWeight(param_, parent_sum); - auto left_weight = - node_value_constraints_[candidate.nid].CalcWeight(param_, left_stats)*param_.learning_rate; - auto right_weight = - node_value_constraints_[candidate.nid].CalcWeight(param_, right_stats)*param_.learning_rate; - tree.ExpandNode(candidate.nid, candidate.split.findex, - candidate.split.fvalue, candidate.split.dir == kLeftDir, - base_weight, left_weight, right_weight, - candidate.split.loss_chg, parent_sum.sum_hess); - // Set up child constraints - node_value_constraints_.resize(tree.GetNodes().size()); - node_value_constraints_[candidate.nid].SetChild( - param_, tree[candidate.nid].SplitIndex(), left_stats, right_stats, - &node_value_constraints_[tree[candidate.nid].LeftChild()], - &node_value_constraints_[tree[candidate.nid].RightChild()]); - - // Store sum gradients - for (auto& shard : shards_) { - shard->node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum; - shard->node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum; + fs.Seek(0); + rabit::Broadcast(&s_model, 0); + RegTree reference_tree; + reference_tree.Load(&fs); + for (const auto& tree : local_trees) { + CHECK(tree == reference_tree); } } void 
UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree) { - auto& tree = *p_tree; - monitor_.StartCuda("InitData"); this->InitData(gpair, p_fmat); monitor_.StopCuda("InitData"); - monitor_.StartCuda("InitRoot"); - this->InitRoot(p_tree); - monitor_.StopCuda("InitRoot"); - - auto timestamp = qexpand_->size(); - auto num_leaves = 1; - while (!qexpand_->empty()) { - ExpandEntry candidate = qexpand_->top(); - qexpand_->pop(); - if (!candidate.IsValid(param_, num_leaves)) { - continue; - } + std::vector trees(shards_.size()); + for (auto& tree : trees) { + tree = *p_tree; + } + gpair->Reshard(dist_); - this->ApplySplit(candidate, p_tree); - num_leaves++; + // Launch one thread for each device "shard" containing a subset of rows. + // Threads will cooperatively build the tree, synchronising over histograms. + // Each thread will redundantly build its own copy of the tree + dh::ExecuteIndexShards( + &shards_, + [&](int idx, std::unique_ptr>& shard) { + shard->UpdateTree(gpair, p_fmat, &trees.at(idx), &reducer_); + }); - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - // Only create child entries if needed - if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx), - num_leaves)) { - monitor_.StartCuda("UpdatePosition"); - this->UpdatePosition(candidate, p_tree); - monitor_.StopCuda("UpdatePosition"); - - monitor_.StartCuda("BuildHist"); - this->BuildHistLeftRight(candidate.nid, left_child_nidx, - right_child_nidx); - monitor_.StopCuda("BuildHist"); - - monitor_.StartCuda("EvaluateSplits"); - auto splits = - this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree); - qexpand_->push(ExpandEntry(left_child_nidx, - tree.GetDepth(left_child_nidx), splits.at(0), - timestamp++)); - qexpand_->push(ExpandEntry(right_child_nidx, - tree.GetDepth(right_child_nidx), - splits.at(1), timestamp++)); - monitor_.StopCuda("EvaluateSplits"); - } + // All trees are expected to be identical + if (hist_maker_param_.debug_synchronize) { + this->CheckTreesSynchronized(trees); } - monitor_.StartCuda("FinalisePosition"); - this->FinalisePosition(p_tree); - monitor_.StopCuda("FinalisePosition"); + // Write the output tree + *p_tree = trees.front(); } bool UpdatePredictionCache( const DMatrix* data, HostDeviceVector* p_out_preds) { - monitor_.StartCuda("UpdatePredictionCache"); if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } + monitor_.StartCuda("UpdatePredictionCache"); p_out_preds->Shard(dist_.Devices()); dh::ExecuteIndexShards( &shards_, @@ -1552,9 +1544,6 @@ class GPUHistMakerSpecialised{ MetaInfo* info_; // NOLINT std::vector>> shards_; // NOLINT - common::ColumnSampler column_sampler_; // NOLINT - - std::vector node_value_constraints_; // NOLINT private: bool initialised_; @@ -1565,10 +1554,6 @@ class GPUHistMakerSpecialised{ GPUHistMakerTrainParam hist_maker_param_; common::GHistIndexMatrix gmat_; - using ExpandQueue = - std::priority_queue, - std::function>; - std::unique_ptr qexpand_; dh::AllReducer reducer_; DMatrix* p_last_fmat_; diff --git a/tests/ci_build/Dockerfile.clang_tidy b/tests/ci_build/Dockerfile.clang_tidy index ec68e2c666dc..18941999f81f 100644 --- a/tests/ci_build/Dockerfile.clang_tidy +++ b/tests/ci_build/Dockerfile.clang_tidy @@ -16,17 +16,6 @@ RUN \ update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-7 100 && \ update-alternatives --install /usr/bin/clang clang /usr/bin/clang-7 100 -# NCCL2 (License: 
https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \ - if [ "${CUDA_SHORT}" != "10.0" ]; then \ - wget https://developer.download.nvidia.com/compute/redist/nccl/v2.2/nccl_2.2.13-1%2Bcuda${CUDA_SHORT}_x86_64.txz && \ - tar xf "nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz" && \ - cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/include/nccl.h /usr/include && \ - cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/lib/* /usr/lib && \ - rm -f nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz && \ - rm -r nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64; fi - # Install Python packages RUN \ pip3 install pyyaml diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu new file mode 100644 index 000000000000..4f8849218423 --- /dev/null +++ b/tests/ci_build/Dockerfile.cpu @@ -0,0 +1,38 @@ +FROM ubuntu:18.04 + +# Environment +ENV DEBIAN_FRONTEND noninteractive + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.12/cmake-3.12.0-Linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.12.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ + # Python + wget https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh && \ + bash Miniconda3-4.5.12-Linux-x86_64.sh -b -p /opt/python + +ENV PATH=/opt/python/bin:$PATH + +ENV GOSU_VERSION 1.10 + +# Install Python packages +RUN \ + pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh recommonmark guzzle_sphinx_theme mock \ + breathe matplotlib graphviz pytest scikit-learn wheel kubernetes urllib3 && \ + pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 1bf705558ac0..2d646824b158 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -1,47 +1,22 @@ ARG CUDA_VERSION -FROM nvidia/cuda:$CUDA_VERSION-devel-centos6 +FROM nvidia/cuda:$CUDA_VERSION-runtime-ubuntu16.04 # Environment ENV DEBIAN_FRONTEND noninteractive # Install all basic requirements RUN \ - yum -y update && \ - yum install -y tar unzip wget xz git centos-release-scl yum-utils && \ - yum-config-manager --enable centos-sclo-rh-testing && \ - yum -y update && \ - yum install -y devtoolset-4-gcc devtoolset-4-binutils devtoolset-4-gcc-c++ && \ + apt-get update && \ + apt-get install -y wget unzip bzip2 libgomp1 && \ # Python - wget https://repo.continuum.io/miniconda/Miniconda2-4.3.27-Linux-x86_64.sh && \ - bash Miniconda2-4.3.27-Linux-x86_64.sh -b -p /opt/python && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.12/cmake-3.12.0-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.12.0-Linux-x86_64.sh --skip-license --prefix=/usr - -# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \ - if [ "${CUDA_SHORT}" != "10.0" ]; then \ - wget 
https://developer.download.nvidia.com/compute/redist/nccl/v2.2/nccl_2.2.13-1%2Bcuda${CUDA_SHORT}_x86_64.txz && \ - tar xf "nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz" && \ - cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/include/nccl.h /usr/include && \ - cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/lib/* /usr/lib && \ - rm -f nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz && \ - rm -r nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64; else \ - wget https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ - rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ - yum -y update && \ - yum install -y libnccl-2.4.2-1+cuda10.0 libnccl-devel-2.4.2-1+cuda10.0 libnccl-static-2.4.2-1+cuda10.0 && \ - rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm; fi + wget https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh && \ + bash Miniconda3-4.5.12-Linux-x86_64.sh -b -p /opt/python ENV PATH=/opt/python/bin:$PATH -ENV CC=/opt/rh/devtoolset-4/root/usr/bin/gcc -ENV CXX=/opt/rh/devtoolset-4/root/usr/bin/c++ -ENV CPP=/opt/rh/devtoolset-4/root/usr/bin/cpp # Install Python packages RUN \ - pip install numpy pytest scipy scikit-learn wheel kubernetes urllib3==1.22 + pip install numpy pytest scipy scikit-learn pandas matplotlib wheel kubernetes urllib3 graphviz ENV GOSU_VERSION 1.10 diff --git a/tests/ci_build/Dockerfile.gpu_build b/tests/ci_build/Dockerfile.gpu_build new file mode 100644 index 000000000000..ffc3442681a5 --- /dev/null +++ b/tests/ci_build/Dockerfile.gpu_build @@ -0,0 +1,59 @@ +ARG CUDA_VERSION +FROM nvidia/cuda:$CUDA_VERSION-devel-centos6 + +# Environment +ENV DEBIAN_FRONTEND noninteractive + +# Install all basic requirements +RUN \ + yum -y update && \ + yum install -y tar unzip wget xz git centos-release-scl yum-utils && \ + yum-config-manager --enable centos-sclo-rh-testing && \ + yum -y update && \ + yum install -y devtoolset-4-gcc devtoolset-4-binutils devtoolset-4-gcc-c++ && \ + # Python + wget https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh && \ + bash Miniconda3-4.5.12-Linux-x86_64.sh -b -p /opt/python && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.12/cmake-3.12.0-Linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.12.0-Linux-x86_64.sh --skip-license --prefix=/usr + +# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) +RUN \ + export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \ + if [ "${CUDA_SHORT}" != "10.0" ]; then \ + wget https://developer.download.nvidia.com/compute/redist/nccl/v2.2/nccl_2.2.13-1%2Bcuda${CUDA_SHORT}_x86_64.txz && \ + tar xf "nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz" && \ + cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/include/nccl.h /usr/include && \ + cp nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64/lib/* /usr/lib && \ + rm -f nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64.txz && \ + rm -r nccl_2.2.13-1+cuda${CUDA_SHORT}_x86_64; else \ + wget https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ + rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \ + yum -y update && \ + yum install -y libnccl-2.4.2-1+cuda10.0 libnccl-devel-2.4.2-1+cuda10.0 libnccl-static-2.4.2-1+cuda10.0 && \ + rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm; fi + +ENV PATH=/opt/python/bin:$PATH +ENV CC=/opt/rh/devtoolset-4/root/usr/bin/gcc +ENV CXX=/opt/rh/devtoolset-4/root/usr/bin/c++ +ENV 
CPP=/opt/rh/devtoolset-4/root/usr/bin/cpp + +# Install Python packages +RUN \ + pip install numpy pytest scipy scikit-learn wheel kubernetes urllib3==1.22 + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm b/tests/ci_build/Dockerfile.jvm new file mode 100644 index 000000000000..17a54830ed84 --- /dev/null +++ b/tests/ci_build/Dockerfile.jvm @@ -0,0 +1,43 @@ +FROM centos:6 + +# Install all basic requirements +RUN \ + yum -y update && \ + yum install -y tar unzip wget xz git centos-release-scl yum-utils java-1.8.0-openjdk-devel && \ + yum-config-manager --enable centos-sclo-rh-testing && \ + yum -y update && \ + yum install -y devtoolset-4-gcc devtoolset-4-binutils devtoolset-4-gcc-c++ && \ + # Python + wget https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh && \ + bash Miniconda3-4.5.12-Linux-x86_64.sh -b -p /opt/python && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.12/cmake-3.12.0-Linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.12.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ + # Maven + wget http://apache.osuosl.org/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ + tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ + ln -s /opt/apache-maven-3.6.1/ /opt/maven + +ENV PATH=/opt/python/bin:/opt/maven/bin:$PATH +ENV CC=/opt/rh/devtoolset-4/root/usr/bin/gcc +ENV CXX=/opt/rh/devtoolset-4/root/usr/bin/c++ +ENV CPP=/opt/rh/devtoolset-4/root/usr/bin/cpp + +# Install Python packages +RUN \ + pip install numpy pytest scipy scikit-learn wheel kubernetes urllib3==1.22 awscli + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm_cross b/tests/ci_build/Dockerfile.jvm_cross new file mode 100644 index 000000000000..d6a6a5d69a11 --- /dev/null +++ b/tests/ci_build/Dockerfile.jvm_cross @@ -0,0 +1,48 @@ +FROM ubuntu:19.04 +ARG JDK_VERSION=8 + +# Environment +ENV DEBIAN_FRONTEND noninteractive + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:openjdk-r/ppa && \ + apt-get update && \ + apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \ + # Python + wget https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh && \ + bash Miniconda3-4.5.12-Linux-x86_64.sh -b -p /opt/python && \ + # Maven + wget http://apache.osuosl.org/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ + tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ + ln -s /opt/apache-maven-3.6.1/ /opt/maven && \ + # Spark + wget https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz && \ + tar xvf spark-2.4.1-bin-hadoop2.7.tgz -C /opt && \ + ln -s /opt/spark-2.4.1-bin-hadoop2.7 
/opt/spark + +ENV PATH=/opt/python/bin:/opt/spark/bin:/opt/maven/bin:$PATH + +# Install Python packages +RUN \ + pip install numpy scipy pandas scikit-learn + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Set default JDK version +RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64 + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.release b/tests/ci_build/Dockerfile.release index baf4d3d94461..f1da067898ca 100644 --- a/tests/ci_build/Dockerfile.release +++ b/tests/ci_build/Dockerfile.release @@ -1,8 +1,5 @@ FROM centos:6 -# Environment -ENV DEBIAN_FRONTEND noninteractive - # Install all basic requirements RUN \ yum -y update && \ diff --git a/tests/ci_build/Dockerfile.rproject b/tests/ci_build/Dockerfile.rproject new file mode 100644 index 000000000000..efacf2e29478 --- /dev/null +++ b/tests/ci_build/Dockerfile.rproject @@ -0,0 +1,44 @@ +FROM ubuntu:18.04 +ARG USE_R35=0 + +# Environment +ENV DEBIAN_FRONTEND noninteractive + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y software-properties-common tar unzip wget git build-essential doxygen graphviz libcurl4-openssl-dev libssl-dev libxml2-dev && \ + if [ $USE_R35 -eq 1 ]; then \ + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9 && \ + add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' && \ + apt-get update; \ + fi && \ + apt-get install -y r-base r-base-core r-recommended && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.12/cmake-3.12.0-Linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.12.0-Linux-x86_64.sh --skip-license --prefix=/usr + +# Use 16 workers to compile R packages +ENV MAKE 'make -j16' + +# Install R packages +RUN Rscript -e "install.packages( \ + c('devtools', 'testthat', 'lintr', 'DiagrammeR', 'Ckmeans.1d.dp', 'vcd', 'data.table', 'stringi'), \ + repos = 'http://cloud.r-project.org', \ + dependencies = c('Depends', 'Imports', 'LinkingTo') \ + )" + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/jvm-packages/build_doc.sh b/tests/ci_build/build_jvm_doc.sh similarity index 57% rename from jvm-packages/build_doc.sh rename to tests/ci_build/build_jvm_doc.sh index 614ea611424b..a536b0efeeb3 100755 --- a/jvm-packages/build_doc.sh +++ b/tests/ci_build/build_jvm_doc.sh @@ -1,21 +1,27 @@ #!/bin/bash if [ $# -ne 1 ]; then - echo "Usage: $0 [commit id]" + echo "Usage: $0 [branch name]" exit 1 fi set -e set -x -commit_id=$1 +# Initialize local Maven repository +./tests/ci_build/initialize_maven.sh + +rm -rf build/ +cd jvm-packages + +branch_name=$1 # Install JVM packages in local Maven repository -mvn install -DskipTests +mvn --no-transfer-progress install -DskipTests # Build Scaladocs -mvn scala:doc -DskipTests +mvn 
--no-transfer-progress scala:doc -DskipTests # Build Javadocs -mvn javadoc:javadoc -DskipTests +mvn --no-transfer-progress javadoc:javadoc -DskipTests # Package JVM docs in a tarball mkdir -p tmp/scaladocs @@ -25,8 +31,8 @@ cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/ cd tmp -tar cvjf ${commit_id}.tar.bz2 javadocs/ scaladocs/ -mv ${commit_id}.tar.bz2 .. +tar cvjf ${branch_name}.tar.bz2 javadocs/ scaladocs/ +mv ${branch_name}.tar.bz2 .. cd .. rm -rfv tmp/ diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh new file mode 100755 index 000000000000..5dcc95a0feb0 --- /dev/null +++ b/tests/ci_build/build_jvm_packages.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -e +set -x + +# Initialize local Maven repository +./tests/ci_build/initialize_maven.sh + +rm -rf build/ +cd jvm-packages + +mvn --no-transfer-progress package + +set +x +set +e diff --git a/tests/ci_build/build_test_rpkg.sh b/tests/ci_build/build_test_rpkg.sh new file mode 100755 index 000000000000..7a28a7d47027 --- /dev/null +++ b/tests/ci_build/build_test_rpkg.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -e +set -x + +make Rpack +cd xgboost/ + +# Run tests +echo "Building with R CMD build" +R CMD build --no-build-vignettes --no-manual . + +echo "Running R tests" +R_PACKAGE_TARBALL=$(ls -1t *.tar.gz | head -n 1) + +export _R_CHECK_TIMINGS_=0 +export _R_CHECK_FORCE_SUGGESTS_=false +R CMD check \ + ${R_PACKAGE_TARBALL} \ + --no-vignettes \ + --no-manual \ + --as-cran \ + --install-args=--build diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh index 46bdd152cb87..cbfc9725548d 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/tests/ci_build/build_via_cmake.sh @@ -16,5 +16,5 @@ mkdir build cd build cmake .. "$@" -DGOOGLE_TEST=ON -DGTEST_ROOT=$PWD/../gtest -DCMAKE_VERBOSE_MAKEFILE=ON make clean -make -j +make -j$(nproc) cd .. diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index 676613d9ab8f..3b3fd07476f6 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -2,16 +2,21 @@ # # Execute command within a docker container # -# Usage: ci_build.sh [--dockerfile ] [-it] -# +# Usage: ci_build.sh +# [--dockerfile ] [-it] +# [--build-arg ] # # CONTAINER_TYPE: Type of the docker container used the run the build: e.g., # (cpu | gpu) # +# DOCKER_BINARY: Command to invoke docker, e.g. (docker | nvidia-docker). +# # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. If # this optional value is not supplied (via the --dockerfile # flag), will use Dockerfile.CONTAINER_TYPE in default # +# BUILD_ARG: (Optional) an argument to be passed to docker build +# # COMMAND: Command to be executed in the docker container # SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -24,6 +29,10 @@ shift 1 DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" DOCKER_CONTEXT_PATH="${SCRIPT_DIR}" +# Get docker binary command (should be either docker or nvidia-docker) +DOCKER_BINARY="$1" +shift 1 + if [[ "$1" == "--dockerfile" ]]; then DOCKERFILE_PATH="$2" DOCKER_CONTEXT_PATH=$(dirname "${DOCKERFILE_PATH}") @@ -32,6 +41,11 @@ if [[ "$1" == "--dockerfile" ]]; then shift 2 fi +if [[ -n "${CI_DOCKER_EXTRA_PARAMS_INIT}" ]] +then + IFS=' ' read -r -a CI_DOCKER_EXTRA_PARAMS <<< "${CI_DOCKER_EXTRA_PARAMS_INIT}" +fi + if [[ "$1" == "-it" ]]; then CI_DOCKER_EXTRA_PARAMS+=('-it') shift 1 @@ -61,13 +75,6 @@ if [ "$#" -lt 1 ] || [ ! 
-e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then exit 1 fi -# Use nvidia-docker if the container is GPU. -if [[ "${CONTAINER_TYPE}" == *"gpu"* ]]; then - DOCKER_BINARY="nvidia-docker" -else - DOCKER_BINARY="docker" -fi - # Helper function to traverse directories up until given file is found. function upsearch () { test / == "$PWD" && return || \ @@ -84,7 +91,9 @@ DOCKER_IMG_NAME="xgb-ci.${CONTAINER_TYPE}" # Append cuda version if available CUDA_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep CUDA_VERSION | egrep -o '[0-9]*\.[0-9]*') -DOCKER_IMG_NAME=$DOCKER_IMG_NAME$CUDA_VERSION +# Append jdk version if available +JDK_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep JDK_VERSION | egrep -o '[0-9]*') +DOCKER_IMG_NAME=$DOCKER_IMG_NAME$CUDA_VERSION$JDK_VERSION # Under Jenkins matrix build, the build tag may contain characters such as # commas (,) and equal signs (=), which are not valid inside docker image names. @@ -98,7 +107,7 @@ UBUNTU_ON_WINDOWS=$([ -e /proc/version ] && grep -l Microsoft /proc/version || e # MSYS, Git Bash, etc. MSYS=$([ -e /proc/version ] && grep -l MINGW /proc/version || echo "") -if [[ -z "$UBUNTU_ON_WINDOWS" ]] && [[ -z "$MSYS" ]]; then +if [[ -z "$UBUNTU_ON_WINDOWS" ]] && [[ -z "$MSYS" ]] && [[ ! "$OSTYPE" == "darwin"* ]]; then USER_IDS="-e CI_BUILD_UID=$( id -u ) -e CI_BUILD_GID=$( id -g ) -e CI_BUILD_USER=$( id -un ) -e CI_BUILD_GROUP=$( id -gn ) -e CI_BUILD_HOME=${WORKSPACE}" fi @@ -181,6 +190,7 @@ echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..." # By default we cleanup - remove the container once it finish running (--rm) # and share the PID namespace (--pid=host) so the process inside does not have # pid 1 and SIGKILL is propagated to the process inside (jenkins can kill it). +set -x ${DOCKER_BINARY} run --rm --pid=host \ -v "${WORKSPACE}":/workspace \ -w /workspace \ diff --git a/tests/ci_build/doxygen.sh b/tests/ci_build/doxygen.sh new file mode 100755 index 000000000000..e96fdd9d863e --- /dev/null +++ b/tests/ci_build/doxygen.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e +set -x + +rm -rf build +mkdir build +cd build +cmake .. 
-DBUILD_C_DOC=ON +make -j diff --git a/tests/ci_build/initialize_maven.sh b/tests/ci_build/initialize_maven.sh new file mode 100755 index 000000000000..a41caa7a444e --- /dev/null +++ b/tests/ci_build/initialize_maven.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e +set -x + +if [ -z ${CI_BUILD_USER} ] +then + echo 'Must be run inside Jenkins CI' + exit 1 +fi +gosu root mkdir -p /cache +gosu root chown ${CI_BUILD_USER}:${CI_BUILD_GROUP} /cache + +# Download cached Maven repository, to speed up build +python3 -m awscli s3 cp s3://xgboost-ci-jenkins-artifacts/maven-repo-cache.tar.bz2 /cache/maven-repo-cache.tar.bz2 || true + +if [[ -f "/cache/maven-repo-cache.tar.bz2" ]] +then + tar xvf /cache/maven-repo-cache.tar.bz2 -C ${HOME} +fi diff --git a/tests/ci_build/jenkins_tools.Groovy b/tests/ci_build/jenkins_tools.Groovy index e7ac6a443f22..1bc2574c6ac0 100644 --- a/tests/ci_build/jenkins_tools.Groovy +++ b/tests/ci_build/jenkins_tools.Groovy @@ -6,20 +6,6 @@ // Command to run command inside a docker container dockerRun = 'tests/ci_build/ci_build.sh' -// initialize source codes -def checkoutSrcs() { - retry(5) { - try { - timeout(time: 2, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes" - } - } -} /** * Creates cmake and make builds diff --git a/tests/ci_build/test_gpu.sh b/tests/ci_build/test_gpu.sh deleted file mode 100755 index 0347a1538fdf..000000000000 --- a/tests/ci_build/test_gpu.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -set -e - -cd python-package -python setup.py install --user -cd .. -pytest -v -s --fulltrace -m "(not mgpu) and (not slow)" tests/python-gpu -pushd . -cd build -./testxgboost --gtest_filter=-*.MGPU_* -ctest --output-on-failure --tests-regex "TestXGBoostCLI" -popd diff --git a/tests/ci_build/test_jvm_cross.sh b/tests/ci_build/test_jvm_cross.sh new file mode 100755 index 000000000000..9a9f35011665 --- /dev/null +++ b/tests/ci_build/test_jvm_cross.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +set -x + +# Initialize local Maven repository +./tests/ci_build/initialize_maven.sh + +# Get version number of XGBoost4J and other auxiliary information +cd jvm-packages +xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) +maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout) +maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout) +spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout) +scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout) +scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout) + +# Install XGBoost4J JAR into local Maven repository +mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j -Dversion=${xgboost4j_version} -Dpackaging=jar +mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests +mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark -Dversion=${xgboost4j_version} -Dpackaging=jar +mvn --no-transfer-progress install:install-file 
-Dfile=./xgboost4j-example/target/xgboost4j-example-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example -Dversion=${xgboost4j_version} -Dpackaging=jar + +cd xgboost4j-tester +# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests +python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version} +# Run unit tests with XGBoost4J +mvn --no-transfer-progress package + +# Run integration tests with XGBoost4J +java -jar ./target/xgboost4j-tester-1.0-SNAPSHOT-jar-with-dependencies.jar + +# Run integration tests with XGBoost4J-Spark +if [ ! -z "$RUN_INTEGRATION_TEST" ] +then + python3 get_iris.py + spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv + spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model +fi + +set +x +set +e diff --git a/tests/ci_build/test_mgpu.sh b/tests/ci_build/test_mgpu.sh deleted file mode 100755 index b003d9c424d4..000000000000 --- a/tests/ci_build/test_mgpu.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -set -e - -cd python-package -python setup.py install --user -cd .. -pytest -v -s --fulltrace -m "(not slow) and mgpu" tests/python-gpu -./build/testxgboost --gtest_filter=*.MGPU_* - -cd tests/distributed -./runtests-gpu.sh diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh new file mode 100755 index 000000000000..439f6e59443b --- /dev/null +++ b/tests/ci_build/test_python.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -e +set -x + +suite=$1 + +# Install XGBoost Python package +wheel_found=0 +for file in python-package/dist/*.whl +do + pip install --user ${file} + wheel_found=1 + break # need just one +done +if [ "$wheel_found" -eq 0 ] +then + pushd . 
+ cd python-package + python setup.py install --user + popd +fi + +# Run specified test suite +case "$suite" in + gpu) + pytest -v -s --fulltrace -m "(not slow) and (not mgpu)" tests/python-gpu + ;; + + mgpu) + pytest -v -s --fulltrace -m "(not slow) and mgpu" tests/python-gpu + cd tests/distributed + ./runtests-gpu.sh + ;; + + cpu) + pytest -v -s --fulltrace tests/python + cd tests/distributed + ./runtests.sh + ;; + + *) + echo "Usage: $0 {gpu|mgpu|cpu}" + exit 1 + ;; +esac diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc index 702f29907d29..e408f11bc75c 100644 --- a/tests/cpp/common/test_random.cc +++ b/tests/cpp/common/test_random.cc @@ -1,3 +1,4 @@ +#include #include "../../../src/common/random.h" #include "../helpers.h" #include "gtest/gtest.h" @@ -33,7 +34,8 @@ TEST(ColumnSampler, Test) { // No level or node sampling, should be the same at different depth cs.Init(n, 1.0f, 1.0f, 0.5f); - ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector()); + ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), + cs.GetFeatureSet(1)->HostVector()); cs.Init(n, 1.0f, 1.0f, 1.0f); auto set5 = *cs.GetFeatureSet(0); @@ -45,7 +47,34 @@ TEST(ColumnSampler, Test) { // Should always be a minimum of one feature cs.Init(n, 1e-16f, 1e-16f, 1e-16f); ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1); +} +// Test if different threads using the same seed produce the same result +TEST(ColumnSampler, ThreadSynchronisation) { + const int64_t num_threads = 100; + int n = 128; + int iterations = 10; + int levels = 5; + std::vector reference_result; + bool success = + true; // Cannot use google test asserts in multithreaded region +#pragma omp parallel num_threads(num_threads) + { + for (auto j = 0ull; j < iterations; j++) { + ColumnSampler cs(j); + cs.Init(n, 0.5f, 0.5f, 0.5f); + for (auto level = 0ull; level < levels; level++) { + auto result = cs.GetFeatureSet(level)->ConstHostVector(); +#pragma omp single + { reference_result = result; } + if (result != reference_result) { + success = false; + } +#pragma omp barrier + } + } + } + ASSERT_TRUE(success); } } // namespace common } // namespace xgboost diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 37cce8b577df..897b9d77196d 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -89,7 +89,7 @@ TEST(GpuHist, BuildGidxDense) { param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, kNRows, param); + DeviceShard shard(0, 0, 0, kNRows, param, kNCols); BuildGidx(&shard, kNRows, kNCols); std::vector h_gidx_buffer(shard.gidx_buffer.size()); @@ -128,7 +128,7 @@ TEST(GpuHist, BuildGidxSparse) { param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, kNRows, param); + DeviceShard shard(0, 0, 0, kNRows, param, kNCols); BuildGidx(&shard, kNRows, kNCols, 0.9f); std::vector h_gidx_buffer(shard.gidx_buffer.size()); @@ -172,7 +172,7 @@ void TestBuildHist(GPUHistBuilderBase& builder) { param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, kNRows, param); + DeviceShard shard(0, 0, 0, kNRows, param, kNCols); BuildGidx(&shard, kNRows, kNCols); @@ -282,8 +282,8 @@ TEST(GpuHist, EvaluateSplits) { int max_bins = 4; // Initialize DeviceShard - std::unique_ptr> shard { - new DeviceShard(0, 0, kNRows, param)}; + std::unique_ptr> shard{ + new DeviceShard(0, 0, 0, kNRows, param, kNCols)}; // Initialize DeviceShard::node_sum_gradients shard->node_sum_gradients = {{6.4f, 12.8f}}; @@ -321,12 +321,7 @@ TEST(GpuHist, EvaluateSplits) { thrust::copy(hist.begin(), 
hist.end(), shard->hist.Data().begin()); - // Initialize GPUHistMaker - GPUHistMakerSpecialised hist_maker = - GPUHistMakerSpecialised(); - hist_maker.param_ = param; - hist_maker.shards_.push_back(std::move(shard)); - hist_maker.column_sampler_.Init(kNCols, + shard->column_sampler.Init(kNCols, param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree, @@ -337,13 +332,12 @@ TEST(GpuHist, EvaluateSplits) { info.num_row_ = kNRows; info.num_col_ = kNCols; - hist_maker.info_ = &info; - hist_maker.node_value_constraints_.resize(1); - hist_maker.node_value_constraints_[0].lower_bound = -1.0; - hist_maker.node_value_constraints_[0].upper_bound = 1.0; + shard->node_value_constraints.resize(1); + shard->node_value_constraints[0].lower_bound = -1.0; + shard->node_value_constraints[0].upper_bound = 1.0; std::vector res = - hist_maker.EvaluateSplits({ 0,0 }, &tree); + shard->EvaluateSplits({ 0,0 }, tree, kNCols); ASSERT_EQ(res[0].findex, 7); ASSERT_EQ(res[1].findex, 7); @@ -368,7 +362,8 @@ TEST(GpuHist, ApplySplit) { } hist_maker.shards_.resize(1); - hist_maker.shards_[0].reset(new DeviceShard(0, 0, kNRows, param)); + hist_maker.shards_[0].reset( + new DeviceShard(0, 0, 0, kNRows, param, kNCols)); auto& shard = hist_maker.shards_.at(0); shard->ridx_segments.resize(3); // 3 nodes. @@ -435,8 +430,8 @@ TEST(GpuHist, ApplySplit) { shard->gidx_buffer.data(), num_symbols); hist_maker.info_ = &info; - hist_maker.ApplySplit(candidate_entry, &tree); - hist_maker.UpdatePosition(candidate_entry, &tree); + shard->ApplySplit(candidate_entry, &tree); + shard->UpdatePosition(candidate_entry.nid, tree[candidate_entry.nid]); ASSERT_FALSE(tree[kNId].IsLeaf()); diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc index 21c2eeeda431..da0e6bcb6459 100644 --- a/tests/cpp/tree/test_tree_model.cc +++ b/tests/cpp/tree/test_tree_model.cc @@ -84,4 +84,21 @@ TEST(Tree, Load) { EXPECT_EQ(tree[1].LeafValue(), 0.1f); EXPECT_TRUE(tree[1].IsLeaf()); } + +TEST(Tree, AllocateNode) { + RegTree tree; + tree.ExpandNode( + 0, 0, 0.0f, false, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); + tree.CollapseToLeaf(0, 0); + ASSERT_EQ(tree.NumExtraNodes(), 0); + + tree.ExpandNode( + 0, 0, 0.0f, false, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); + ASSERT_EQ(tree.NumExtraNodes(), 2); + + auto& nodes = tree.GetNodes(); + ASSERT_FALSE(nodes.at(1).IsDeleted()); + ASSERT_TRUE(nodes.at(1).IsLeaf()); + ASSERT_TRUE(nodes.at(2).IsLeaf()); +} } // namespace xgboost diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py index 172b0443d010..0099051104b9 100644 --- a/tests/distributed/distributed_gpu.py +++ b/tests/distributed/distributed_gpu.py @@ -54,7 +54,8 @@ def run_test(name, params_fun): 'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic' + 'objective': 'binary:logistic', + 'debug_synchronize': True } diff --git a/tests/distributed/runtests-gpu.sh b/tests/distributed/runtests-gpu.sh index de8e71facae7..950704f9850b 100755 --- a/tests/distributed/runtests-gpu.sh +++ b/tests/distributed/runtests-gpu.sh @@ -3,7 +3,6 @@ rm -f *.model* export DMLC_SUBMIT_CLUSTER=local -export PYTHONPATH=../../python-package submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit" echo -e "\n ====== 1. 
Basic distributed-gpu test with Python: 4 workers; 1 GPU per worker ====== \n" diff --git a/tests/distributed/runtests.sh b/tests/distributed/runtests.sh index 2e60b1c5aa98..075456885efe 100755 --- a/tests/distributed/runtests.sh +++ b/tests/distributed/runtests.sh @@ -1,9 +1,13 @@ #!/bin/bash +rm -f *.model* + +export DMLC_SUBMIT_CLUSTER=local + +submit="timeout 30 python ../../dmlc-core/tracker/dmlc-submit" + echo "====== 1. Basic distributed test with Python ======" -PYTHONPATH=../../python-package/ python ../../dmlc-core/tracker/dmlc-submit --cluster=local --num-workers=3\ - python test_basic.py +$submit --cluster=local --num-workers=3 python test_basic.py echo "====== 2. Regression test for issue #3402 ======" -PYTHONPATH=../../python-package/ python ../../dmlc-core/tracker/dmlc-submit --cluster=local --num-workers=2 --worker-cores=1\ - python test_issue3402.py +$submit --cluster=local --num-workers=2 --worker-cores=1 python test_issue3402.py diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 4cb683a88450..5c26ae11cab9 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -51,7 +51,7 @@ def test_gpu_hist_mgpu(self): variable_param = {'n_gpus': [-1], 'max_depth': [2, 10], 'max_leaves': [255, 4], 'max_bin': [2, 256], - 'grow_policy': ['lossguide']} + 'grow_policy': ['lossguide'], 'debug_synchronize': [True]} for param in parameter_combinations(variable_param): param['tree_method'] = 'gpu_hist' gpu_results = run_suite(param, select_datasets=datasets) diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index e29bd53b714a..bf413cf8e75c 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -1,34 +1,8 @@ #!/bin/bash -if [ ${TASK} == "lint" ]; then - source activate python3 - conda install numpy scipy - python -m pip install cpplint pylint astroid - make lint || exit -1 - echo "Check documentations..." - - mkdir build_doc - cd build_doc - cmake .. 
-DBUILD_C_DOC=ON - make doc_doxygen 2> log.txt - - (cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag") > logclean.txt - echo "---------Error Log----------" - cat logclean.txt - echo "----------------------------" - (cat logclean.txt|grep warning) && exit -1 - (cat logclean.txt|grep error) && exit -1 - - cd - - rm -rf build_doc - - exit 0 -fi - cp make/travis.mk config.mk make -f dmlc-core/scripts/packages.mk lz4 - if [ ${TRAVIS_OS_NAME} == "osx" ]; then echo 'USE_OPENMP=0' >> config.mk echo 'TMPVAR := $(XGB_PLUGINS)' >> config.mk @@ -45,96 +19,10 @@ if [ ${TASK} == "python_test" ]; then python --version conda install numpy scipy pandas matplotlib scikit-learn - # Install data table from source - wget http://releases.llvm.org/5.0.2/clang+llvm-5.0.2-x86_64-linux-gnu-ubuntu-14.04.tar.xz - tar xf clang+llvm-5.0.2-x86_64-linux-gnu-ubuntu-14.04.tar.xz - export LLVM5=$(pwd)/clang+llvm-5.0.2-x86_64-linux-gnu-ubuntu-14.04 - python -m pip install datatable --no-binary datatable - python -m pip install graphviz pytest pytest-cov codecov + python -m pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl python -m pytest -v --fulltrace -s tests/python --cov=python-package/xgboost || exit -1 codecov - - source activate python2 - echo "-------------------------------" - python --version - conda install numpy scipy pandas matplotlib scikit-learn - python -m pip install graphviz pytest - python -m pytest -v --fulltrace -s tests/python || exit -1 - exit 0 -fi - -if [ ${TASK} == "python_lightweight_test" ]; then - make all || exit -1 - - echo "-------------------------------" - source activate python3 - python --version - conda install numpy scipy - python -m pip install graphviz pytest pytest-cov codecov - python -m pytest -v --fulltrace -s tests/python --cov=python-package/xgboost || exit -1 - codecov - - source activate python2 - echo "-------------------------------" - python --version - conda install numpy scipy pytest - python -m pip install graphviz - python -m pip install flake8==3.4.1 - python -m pytest -v --fulltrace -s tests/python || exit -1 - - flake8 --ignore E501 python-package || exit -1 - flake8 --ignore E501 tests/python || exit -1 - exit 0 -fi - -if [ ${TASK} == "r_test" ]; then - set -e - - make Rpack - cd ./xgboost - - # Install package deps - Rscript -e "install.packages( \ - c('devtools', 'testthat', 'lintr') \ - , repos = 'http://cloud.r-project.org' \ - , dependencies = c('Depends', 'Imports', 'LinkingTo') \ - )" - - Rscript -e \ - "devtools::install_deps( \ - repos = 'http://cloud.r-project.org' \ - , upgrade = 'never' \ - , dependencies = c('Depends', 'Imports', 'LinkingTo') \ - )" - - # install suggested packages separately to avoid huge build times - Rscript -e "install.packages( \ - c('DiagrammeR', 'Ckmeans.1d.dp', 'vcd') \ - , repos = 'https://cloud.r-project.org' \ - , dependencies = c('Depends', 'Imports', 'LinkingTo') \ - )" - - # Run tests - echo "Building with R CMD build" - R CMD build \ - --no-build-vignettes \ - --no-manual \ - . 
- - echo "Running R tests" - R_PACKAGE_TARBALL=$(ls -1t *.tar.gz | head -n 1) - - export _R_CHECK_TIMINGS_=0 - export _R_CHECK_FORCE_SUGGESTS_=false - R CMD check \ - ${R_PACKAGE_TARBALL} \ - --no-vignettes \ - --no-manual \ - --as-cran \ - --install-args=--build - - exit 0 fi if [ ${TASK} == "java_test" ]; then @@ -150,7 +38,7 @@ if [ ${TASK} == "cmake_test" ]; then wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip unzip -n release-1.7.0.zip mv googletest-release-1.7.0 gtest && cd gtest - cmake . && make + CC=gcc-7 CXX=g++-7 cmake . && make mkdir lib && mv libgtest.a lib cd .. rm -rf release-1.7.0.zip @@ -159,59 +47,9 @@ if [ ${TASK} == "cmake_test" ]; then rm -rf build mkdir build && cd build PLUGINS="-DPLUGIN_LZ4=ON -DPLUGIN_DENSE_PARSER=ON" - cmake .. -DGOOGLE_TEST=ON -DGTEST_ROOT=$PWD/../gtest/ ${PLUGINS} + CC=gcc-7 CXX=g++-7 cmake .. -DGOOGLE_TEST=ON -DGTEST_ROOT=$PWD/../gtest/ ${PLUGINS} make ./testxgboost cd .. rm -rf build fi - -if [ ${TASK} == "cpp_test" ]; then - set -e - make -f dmlc-core/scripts/packages.mk gtest - echo "TEST_COVER=1" >> config.mk - echo "GTEST_PATH="${CACHE_PREFIX} >> config.mk - make cover -fi - - -if [ ${TASK} == "distributed_test" ]; then - set -e - make all || exit -1 - echo "-------------------------------" - source activate python3 - python --version - conda install numpy scipy - python -m pip install kubernetes - cd tests/distributed - ./runtests.sh -fi - -if [ ${TASK} == "sanitizer_test" ]; then - set -e - # Build gtest via cmake - wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip - unzip -n release-1.7.0.zip - mv googletest-release-1.7.0 gtest && cd gtest - CC=gcc-7 CXX=g++-7 cmake -DCMAKE_CXX_FLAGS="-fuse-ld=gold" \ - -DCMAKE_C_FLAGS="-fuse-ld=gold" - make - mkdir lib && mv libgtest.a lib - cd .. - rm -rf release-1.7.0.zip - - mkdir build && cd build - CC=gcc-7 CXX=g++-7 cmake .. -DGOOGLE_TEST=ON -DGTEST_ROOT=$PWD/../gtest/ \ - -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address" \ - -DCMAKE_BUILD_TYPE=Debug \ - -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ \ - -DCMAKE_CXX_FLAGS="-fuse-ld=gold" \ - -DCMAKE_C_FLAGS="-fuse-ld=gold" - make - - export ASAN_SYMBOLIZER_PATH=$(which llvm-symbolizer) - ASAN_OPTIONS=symbolize=1 ./testxgboost - cd .. 
- rm -rf build - exit 0 -fi diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh index 751f6419d5f8..8288b9c9ff67 100755 --- a/tests/travis/setup.sh +++ b/tests/travis/setup.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ ${TASK} == "lint" ]; then +if [ ${TASK} == "python_test" ]; then if [ ${TRAVIS_OS_NAME} == "osx" ]; then wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh else @@ -16,20 +16,6 @@ if [ ${TASK} == "lint" ]; then conda create -n python3 python=3.7 fi - -if [ ${TASK} == "python_test" ] || [ ${TASK} == "python_lightweight_test" ] || [ ${TASK} == "distributed_test" ]; then - if [ ${TRAVIS_OS_NAME} == "osx" ]; then - wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - else - wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - fi - bash conda.sh -b -p $HOME/miniconda - export PATH="$HOME/miniconda/bin:$PATH" - hash -r - conda config --set always_yes yes --set changeps1 no - conda update -q conda - # Useful for debugging any issues with conda - conda info -a - conda create -n python3 python=3.7 - conda create -n python2 python=2.7 +if [ ${TASK} == "cmake_test" ] && [ ${TRAVIS_OS_NAME} == "osx" ]; then + sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3" fi
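
Note on the debug_synchronize option exercised by this diff: the multi-GPU and distributed tests above turn it on so that the per-shard trees built redundantly by each device are checked against rank 0's tree (CheckTreesSynchronized). Below is a minimal Python sketch, not part of the diff, of how the flag might be enabled when training with gpu_hist, mirroring the parameter dictionaries added in tests/python-gpu/test_gpu_updaters.py and tests/distributed/distributed_gpu.py. The dataset, number of rounds and n_gpus value are illustrative assumptions, and the flag only has an effect in builds containing the updated gpu_hist updater.

# Minimal sketch (assumed setup): enable the new debug_synchronize check
# when training with gpu_hist. Data, rounds and n_gpus are illustrative.
import numpy as np
import xgboost as xgb

X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, size=1000)
dtrain = xgb.DMatrix(X, label=y)

params = {
    'tree_method': 'gpu_hist',
    'n_gpus': -1,                 # one shard per visible GPU
    'debug_synchronize': True,    # verify all per-shard trees are identical
    'objective': 'binary:logistic',
    'max_depth': 2,
}
bst = xgb.train(params, dtrain, num_boost_round=10)

If the per-shard trees ever diverge, the synchronization check introduced in this diff fails the run, which is why the GPU and distributed test suites here enable the flag rather than leaving it at its default.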
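The new tests/ci_build/test_python.sh selects its gpu/mgpu/cpu suites purely through the existing pytest markers ("slow", "mgpu") used in the -m expressions above. The following sketch, also not part of the diff and with hypothetical test names, illustrates how a test under tests/python-gpu/ would opt into each suite.

# Hypothetical test file under tests/python-gpu/, showing how the "mgpu" and
# "slow" markers interact with the suite selection in test_python.sh.
import pytest

@pytest.mark.mgpu
def test_something_multi_gpu():
    pass  # collected only by: pytest -m "(not slow) and mgpu" tests/python-gpu

@pytest.mark.slow
def test_something_expensive():
    pass  # excluded from both GPU suites, which filter with "(not slow)"

def test_something_single_gpu():
    pass  # collected by: pytest -m "(not slow) and (not mgpu)" tests/python-gpu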