Changing the threshold value of the method selection #5

Merged: 11 commits, Aug 30, 2021
24 changes: 22 additions & 2 deletions Jenkinsfile-win64
@@ -40,7 +40,8 @@ pipeline {
steps {
script {
parallel ([
-         'build-win64-cuda10.1': { BuildWin64() }
+         'build-win64-cuda10.1': { BuildWin64() },
+         'build-rpkg-win64-cuda10.1': { BuildRPackageWithCUDAWin64() }
])
}
}
@@ -75,6 +76,7 @@ def checkoutSrcs() {

def BuildWin64() {
node('win64 && cuda10_unified') {
+   deleteDir()
unstash name: 'srcs'
echo "Building XGBoost for Windows AMD64 target..."
bat "nvcc --version"
@@ -115,8 +117,26 @@ def BuildWin64() {
}
}

+ def BuildRPackageWithCUDAWin64() {
+   node('win64 && cuda10_unified') {
+     deleteDir()
+     unstash name: 'srcs'
+     bat "nvcc --version"
+     if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
+       bat """
+       bash tests/ci_build/build_r_pkg_with_cuda_win64.sh ${commit_id}
+       """
+       echo 'Uploading R tarball...'
+       path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
+       s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', includePathPattern:'xgboost_r_gpu_win64_*.tar.gz'
+     }
+     deleteDir()
+   }
+ }
+
def TestWin64() {
node('win64 && cuda10_unified') {
+   deleteDir()
unstash name: 'srcs'
unstash name: 'xgboost_whl'
unstash name: 'xgboost_cli'
@@ -127,7 +147,7 @@
bat "build\\testxgboost.exe"
echo "Installing Python dependencies..."
def env_name = 'win64_' + UUID.randomUUID().toString().replaceAll('-', '')
bat "conda env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml"
bat "conda activate && mamba env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml"
echo "Installing Python wheel..."
bat """
conda activate ${env_name} && for /R %%i in (python-package\\dist\\*.whl) DO python -m pip install "%%i"
4 changes: 3 additions & 1 deletion cmake/Utils.cmake
@@ -90,7 +90,9 @@ function(format_gencode_flags flags out)
endif()
# Set up architecture flags
if(NOT flags)
if (CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
if (CUDA_VERSION VERSION_GREATER_EQUAL "11.1")
set(flags "50;52;60;61;70;75;80;86")
elseif (CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
set(flags "35;50;52;60;61;70;75;80")
elseif(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
set(flags "35;50;52;60;61;70;75")
13 changes: 9 additions & 4 deletions doc/install.rst
@@ -61,9 +61,12 @@ R
and then run ``install.packages("xgboost")``. Without OpenMP, XGBoost will only use a
single CPU core, leading to suboptimal training speed.

- * We also provide **experimental** pre-built binary on Linux x86_64 with GPU support.
+ * We also provide **experimental** pre-built binary with GPU support. With this binary,
+   you will be able to use the GPU algorithm without building XGBoost from the source.
Download the binary package from the Releases page. The file name will be of the form
-   ``xgboost_r_gpu_linux_[version].tar.gz``. Then install XGBoost by running:
+   ``xgboost_r_gpu_[os]_[version].tar.gz``, where ``[os]`` is either ``linux`` or ``win64``.
+   (We build the binaries for 64-bit Linux and Windows.)
+   Then install XGBoost by running:

.. code-block:: bash

@@ -142,9 +145,11 @@ R
-

Other than standard CRAN installation, we also provide *experimental* pre-built binary on
- Linux x86_64 with GPU support. You can go to `this page
+ with GPU support. You can go to `this page
<https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/list.html>`_, Find the commit
- ID you want to install: ``xgboost_r_gpu_linux_[commit].tar.gz``, download it then run:
+ ID you want to install and then locate the file ``xgboost_r_gpu_[os]_[commit].tar.gz``,
+ where ``[os]`` is either ``linux`` or ``win64``. (We build the binaries for 64-bit Linux
+ and Windows.) Download it and run the following commands:

.. code-block:: bash

10 changes: 0 additions & 10 deletions doc/parameter.rst
@@ -243,16 +243,6 @@ Additional parameters for ``hist`` and ``gpu_hist`` tree method

- Use single precision to build histograms instead of double precision.

- Additional parameters for ``gpu_hist`` tree method
- ==================================================
-
- * ``deterministic_histogram``, [default=``true``]
-
-   - Build histogram on GPU deterministically. Histogram building is not deterministic due
-     to the non-associative aspect of floating point summation. We employ a pre-rounding
-     routine to mitigate the issue, which may lead to slightly lower accuracy. Set to
-     ``false`` to disable it.

Additional parameters for Dart Booster (``booster=dart``)
=========================================================

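For reference, the non-associativity of floating-point summation that the deleted passage describes is easy to reproduce. Below is a minimal standalone C++ sketch (an illustration only, not part of this diff) showing how grouping changes a float sum:

#include <cstdio>

// With a 24-bit float mantissa, the spacing between representable values
// near 1e8f is 8, so adding 3.0f is lost to rounding while a pre-summed
// 6.0f rounds up to the next representable value.
int main() {
  float big = 1e8f, small = 3.0f;
  float left = (big + small) + small;   // each 3.0f is absorbed: 100000000
  float right = big + (small + small);  // 6.0f survives:         100000008
  std::printf("%.0f vs %.0f\n", left, right);
  return 0;
}

Since GPU threads commit histogram updates in a nondeterministic order, such rounding differences made results vary between runs; the integer (fixed-point) gradient pairs introduced elsewhere in this PR avoid the problem because integer addition is associative.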
5 changes: 4 additions & 1 deletion include/xgboost/base.h
@@ -255,9 +255,12 @@ class GradientPairInternal {

/*! \brief gradient statistics pair usually needed in gradient boosting */
using GradientPair = detail::GradientPairInternal<float>;

/*! \brief High precision gradient statistics pair */
using GradientPairPrecise = detail::GradientPairInternal<double>;
+ /*! \brief Fixed point representation for gradient pair. */
+ using GradientPairInt32 = detail::GradientPairInternal<int>;
+ /*! \brief Fixed point representation for high precision gradient pair. */
+ using GradientPairInt64 = detail::GradientPairInternal<int64_t>;

using Args = std::vector<std::pair<std::string, std::string> >;

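A hedged sketch of how a float gradient could be mapped onto a fixed-point type such as GradientPairInt64; the scale factor and helper names below are illustrative assumptions, not the quantisation XGBoost actually derives:

#include <cassert>
#include <cmath>
#include <cstdint>

// Hypothetical fixed-point codec. kScale is an assumed precision, chosen
// only for this example.
constexpr double kScale = static_cast<double>(1LL << 16);

int64_t EncodeGradient(float v) {
  return static_cast<int64_t>(std::llround(static_cast<double>(v) * kScale));
}

float DecodeGradient(int64_t v) {
  return static_cast<float>(static_cast<double>(v) / kScale);
}

int main() {
  // Integer sums are associative, so any accumulation order of encoded
  // gradients yields a bit-identical result.
  int64_t sum = EncodeGradient(0.5f) + EncodeGradient(-0.25f);
  assert(DecodeGradient(sum) == 0.25f);
  return 0;
}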
2 changes: 1 addition & 1 deletion python-package/setup.py
@@ -302,7 +302,7 @@ def run(self):

with open(os.path.join(CURRENT_DIR, 'README.rst'), encoding='utf-8') as fd:
description = fd.read()
- with open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')) as fd:
+ with open(os.path.join(CURRENT_DIR, 'xgboost/VERSION'), encoding="ascii") as fd:
version = fd.read().strip()

setup(name='xgboost',
2 changes: 1 addition & 1 deletion python-package/xgboost/__init__.py
@@ -22,7 +22,7 @@
pass

VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION')
- with open(VERSION_FILE) as f:
+ with open(VERSION_FILE, encoding="ascii") as f:
__version__ = f.read().strip()

__all__ = ['DMatrix', 'DeviceQuantileDMatrix', 'Booster', 'DataIter',
4 changes: 2 additions & 2 deletions python-package/xgboost/compat.py
@@ -70,7 +70,7 @@ class XGBoostLabelEncoder(LabelEncoder):
'''Label encoder with JSON serialization methods.'''
def to_json(self):
'''Returns a JSON compatible dictionary'''
-   meta = dict()
+   meta = {}
for k, v in self.__dict__.items():
if isinstance(v, np.ndarray):
meta[k] = v.tolist()
@@ -81,7 +81,7 @@ def to_json(self):
def from_json(self, doc):
# pylint: disable=attribute-defined-outside-init
'''Load the encoder back from a JSON compatible dict.'''
-   meta = dict()
+   meta = {}
for k, v in doc.items():
if k == 'classes_':
self.classes_ = np.array(v)
3 changes: 2 additions & 1 deletion python-package/xgboost/core.py
@@ -2197,7 +2197,8 @@ def dump_model(self, fout, fmap='', with_stats=False, dump_format="text"):
"""
if isinstance(fout, (STRING_TYPES, os.PathLike)):
fout = os.fspath(os.path.expanduser(fout))
-   fout = open(fout, 'w')  # pylint: disable=consider-using-with
+   # pylint: disable=consider-using-with
+   fout = open(fout, 'w', encoding="utf-8")
need_close = True
else:
need_close = False
6 changes: 3 additions & 3 deletions python-package/xgboost/sklearn.py
@@ -538,7 +538,7 @@ def get_xgb_params(self) -> Dict[str, Any]:
'importance_type', 'kwargs', 'missing', 'n_estimators', 'use_label_encoder',
"enable_categorical"
}
-   filtered = dict()
+   filtered = {}
for k, v in params.items():
if k not in wrapper_specific and not callable(v):
filtered[k] = v
@@ -557,7 +557,7 @@ def _get_type(self) -> str:
return self._estimator_type # pylint: disable=no-member

def save_model(self, fname: Union[str, os.PathLike]) -> None:
-   meta = dict()
+   meta = {}
for k, v in self.__dict__.items():
if k == '_le':
meta['_le'] = self._le.to_json()
@@ -596,7 +596,7 @@ def load_model(self, fname: Union[str, bytearray, os.PathLike]) -> None:
)
return
meta = json.loads(meta_str)
-   states = dict()
+   states = {}
for k, v in meta.items():
if k == '_le':
self._le = XGBoostLabelEncoder()
56 changes: 49 additions & 7 deletions src/common/device_helpers.cuh
@@ -1,5 +1,5 @@
/*!
- * Copyright 2017-2020 XGBoost contributors
+ * Copyright 2017-2021 XGBoost contributors
*/
#pragma once
#include <thrust/device_ptr.h>
@@ -98,24 +98,28 @@ template <typename T = size_t,
std::enable_if_t<std::is_same<size_t, T>::value &&
!std::is_same<size_t, unsigned long long>::value> * = // NOLINT
nullptr>
- T __device__ __forceinline__ atomicAdd(T *addr, T v) {  // NOLINT
+ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) {  // NOLINT
using Type = typename dh::detail::AtomicDispatcher<sizeof(T)>::Type;
Type ret = ::atomicAdd(reinterpret_cast<Type *>(addr), static_cast<Type>(v));
return static_cast<T>(ret);
}

namespace dh {

#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)

inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
-     int line) {
+                                      int line) {
if (code != ncclSuccess) {
std::stringstream ss;
ss << "NCCL failure :" << ncclGetErrorString(code) << " ";
ss << file << "(" << line << ")";
throw std::runtime_error(ss.str());
ss << "NCCL failure :" << ncclGetErrorString(code);
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
}
ss << " " << file << "(" << line << ")";
LOG(FATAL) << ss.str();
}

return code;
@@ -1104,6 +1108,44 @@ XGBOOST_DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,
static_cast<typename OutputGradientT::ValueT>(gpair.GetHess()));
}

+ /**
+  * \brief An atomicAdd designed for gradient pair with better performance. For general
+  * int64_t atomicAdd, one can simply cast it to unsigned long long.
+  */
+ XGBOOST_DEV_INLINE void AtomicAdd64As32(int64_t *dst, int64_t src) {
+   uint32_t* y_low = reinterpret_cast<uint32_t *>(dst);
+   uint32_t *y_high = y_low + 1;
+
+   auto cast_src = reinterpret_cast<uint64_t *>(&src);
+
+   uint32_t const x_low = static_cast<uint32_t>(src);
+   uint32_t const x_high = (*cast_src) >> 32;
+
+   auto const old = atomicAdd(y_low, x_low);
+   uint32_t const carry = old > (std::numeric_limits<uint32_t>::max() - x_low) ? 1 : 0;
+   uint32_t const sig = x_high + carry;
+   atomicAdd(y_high, sig);
+ }
+
+ XGBOOST_DEV_INLINE void
+ AtomicAddGpair(xgboost::GradientPairInt64 *dest,
+                xgboost::GradientPairInt64 const &gpair) {
+   auto dst_ptr = reinterpret_cast<int64_t *>(dest);
+   auto g = gpair.GetGrad();
+   auto h = gpair.GetHess();
+
+   AtomicAdd64As32(dst_ptr, g);
+   AtomicAdd64As32(dst_ptr + 1, h);
+ }
+
+ XGBOOST_DEV_INLINE void
+ AtomicAddGpair(xgboost::GradientPairInt32 *dest,
+                xgboost::GradientPairInt32 const &gpair) {
+   auto dst_ptr = reinterpret_cast<typename xgboost::GradientPairInt32::ValueT*>(dest);
+
+   ::atomicAdd(dst_ptr, static_cast<int>(gpair.GetGrad()));
+   ::atomicAdd(dst_ptr + 1, static_cast<int>(gpair.GetHess()));
+ }

// Thrust version of this function causes error on Windows
template <typename ReturnT, typename IterT, typename FuncT>
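To make the carry handling in AtomicAdd64As32 concrete, here is a single-threaded host-side sketch of the same arithmetic, with plain adds standing in for the two 32-bit atomicAdds (little-endian word order assumed, as on CUDA devices):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

// Emulate one 64-bit add as two 32-bit adds: add the low words, detect the
// unsigned wrap-around, and fold the resulting carry into the high-word add.
void Add64As32(int64_t *dst, int64_t src) {
  uint32_t parts[2];
  std::memcpy(parts, dst, sizeof(parts));  // parts[0] = low, parts[1] = high
  uint64_t const x = static_cast<uint64_t>(src);
  uint32_t const x_low = static_cast<uint32_t>(x);
  uint32_t const x_high = static_cast<uint32_t>(x >> 32);

  uint32_t const old = parts[0];
  parts[0] += x_low;  // stands in for the first atomicAdd
  uint32_t const carry =
      old > std::numeric_limits<uint32_t>::max() - x_low ? 1 : 0;
  parts[1] += x_high + carry;  // stands in for the second atomicAdd
  std::memcpy(dst, parts, sizeof(parts));
}

int main() {
  int64_t acc = 0xFFFFFFFFLL;  // low word on the verge of wrapping
  Add64As32(&acc, 1);          // must carry into the high word
  assert(acc == 0x100000000LL);
  Add64As32(&acc, -1);         // two's-complement negatives work as well
  assert(acc == 0xFFFFFFFFLL);
  return 0;
}

Under concurrency the high word can briefly lag the low word, but because atomicAdd returns the pre-add value, each low-word wrap contributes exactly one carry, so the 64-bit sum is exact once all threads have finished.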
2 changes: 1 addition & 1 deletion src/gbm/gbtree.cc
@@ -142,7 +142,7 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
LOG(INFO) << "Tree method is automatically set to 'approx' "
"since external-memory data matrix is used.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
} else if (fmat->Info().num_row_ >= (1UL << 18UL)) {
/* Choose tree_method='hist' automatically for large data matrix */
LOG(INFO) << "Tree method is automatically selected to be "
"'hist' for faster work. To use the old behavior "
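For scale, this swaps a cutoff of 4UL << 20 (4,194,304 rows) for 1UL << 18 (262,144 rows), so the automatic switch to 'hist' now triggers on much smaller data. A standalone C++ restatement of the updated branch (the function name is illustrative, not actual XGBoost API):

#include <cstdint>
#include <iostream>

// Mirrors the changed condition: matrices with at least 1UL << 18 rows are
// now routed to the 'hist' tree method by the heuristic.
bool AutoSelectsHist(std::uint64_t num_rows) {
  return num_rows >= (1UL << 18UL);
}

int main() {
  std::cout << AutoSelectsHist(1000000) << '\n';  // 1; was 0 under 4UL << 20
  std::cout << AutoSelectsHist(100000) << '\n';   // 0; still below the cutoff
  return 0;
}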