diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 06badec5f2e2..1a8098071ba3 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,7 +12,7 @@ updates: - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j" schedule: - interval: "daily" + interval: "monthly" - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-gpu" schedule: @@ -24,7 +24,7 @@ updates: - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-spark" schedule: - interval: "daily" + interval: "monthly" - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-spark-gpu" schedule: diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index a7c71a590cb8..78dc92bffff6 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -23,7 +23,7 @@ jobs: with: submodules: 'true' - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v3.4.0 with: driver-opts: network=host - name: Build and push container diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 001e17b510a3..0408f358fba5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -180,7 +180,7 @@ jobs: - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: submodules: 'true' - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 with: python-version: "3.8" architecture: 'x64' diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 83f0ad495fc3..e232cd754f7b 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -319,7 +319,7 @@ jobs: submodules: 'true' - name: Set up Python 3.8 - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 with: python-version: 3.8 diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 9fb9d4684ad1..4298fd9c6a5d 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -84,7 +84,7 @@ jobs: key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 with: python-version: "3.8" architecture: 'x64' diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 222700da4a58..a108f282214f 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -41,7 +41,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 with: name: SARIF file path: results.sarif diff --git a/CMakeLists.txt b/CMakeLists.txt index f7cf8a6cfa87..034d52164ad2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,7 +71,6 @@ option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF) option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binary dir" OFF) ## CUDA option(USE_CUDA "Build with GPU acceleration" OFF) -option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON) option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) # This is specifically designed for PyPI binary release and should be disabled for most of the cases. option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 5fbf479c5640..ed4b38f99ea7 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -113,6 +113,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/cuda_rt_utils.o \ $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index a5a5c131edf3..d4330120f890 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -113,6 +113,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/cuda_rt_utils.o \ $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ diff --git a/README.md b/README.md index b27cce673585..1126a588d8ae 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ [![Build Status](https://badge.buildkite.com/aca47f40a32735c00a8550540c5eeff6a4c1d246a580cae9b0.svg?branch=master)](https://buildkite.com/xgboost/xgboost-ci) [![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions) [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) -[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE) -[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost) +[![GitHub license](https://dmlc.github.io/img/apache2.svg)](./LICENSE) +[![CRAN Status Badge](https://www.r-pkg.org/badges/version/xgboost)](https://cran.r-project.org/web/packages/xgboost) [![PyPI version](https://badge.fury.io/py/xgboost.svg)](https://pypi.python.org/pypi/xgboost/) [![Conda version](https://img.shields.io/conda/vn/conda-forge/py-xgboost.svg)](https://anaconda.org/conda-forge/py-xgboost) [![Optuna](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org) @@ -35,7 +35,7 @@ Checkout the [Community Page](https://xgboost.ai/community). Reference --------- -- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](http://arxiv.org/abs/1603.02754). In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016 +- Tianqi Chen and Carlos Guestrin. [XGBoost: A Scalable Tree Boosting System](https://arxiv.org/abs/1603.02754). 
In 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016
 - XGBoost originates from research project at University of Washington.

 Sponsors
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 9563ec3076b2..266cf29b38b2 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -80,12 +80,8 @@ function(xgboost_set_cuda_flags target)
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
     $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}>
-    $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>)
-
-  if(USE_PER_THREAD_DEFAULT_STREAM)
-    target_compile_options(${target} PRIVATE
-      $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>)
-  endif()
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>
+    $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>)

 if(FORCE_COLORED_OUTPUT)
   if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
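Note on the Utils.cmake hunk above: together with the removal of the `USE_PER_THREAD_DEFAULT_STREAM` option in CMakeLists.txt, it makes `--default-stream per-thread` an unconditional CUDA compile flag. Below is a minimal, illustrative sketch (not part of this patch) of what the flag changes: when compiled with it, work that each host thread issues on the "default" stream goes to a per-thread stream instead of the single legacy stream, so the two memsets below may overlap.

```cuda
// sketch.cu -- illustrative only; build with: nvcc --default-stream per-thread sketch.cu
#include <cuda_runtime.h>

#include <thread>

int main() {
  float *a = nullptr, *b = nullptr;
  cudaMalloc(&a, 1 << 20);
  cudaMalloc(&b, 1 << 20);
  auto work = [](float *p) {
    // The implicit stream 0 here resolves to this thread's default stream
    // under --default-stream per-thread, not the global legacy stream.
    cudaMemsetAsync(p, 0, 1 << 20);
    cudaStreamSynchronize(cudaStreamPerThread);
  };
  std::thread t1(work, a), t2(work, b);
  t1.join();
  t2.join();
  cudaFree(a);
  cudaFree(b);
  return 0;
}
```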
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index 1e0b5723e89b..f53ef134630b 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -2,6 +2,7 @@
 tqdm, sh are required to run this script.
 """
+
 import argparse
 import os
 import shutil
@@ -106,6 +107,15 @@ def make_pysrc_wheel(
     if not os.path.exists(dist):
         os.mkdir(dist)

+    # Apply patch to remove NCCL dependency
+    # Save the original content of pyproject.toml so that we can restore it later
+    with DirectoryExcursion(ROOT):
+        with open("python-package/pyproject.toml", "r") as f:
+            orig_pyproj_lines = f.read()
+        with open("tests/buildkite/remove_nccl_dep.patch", "r") as f:
+            patch_lines = f.read()
+        subprocess.run(["patch", "-p0"], input=patch_lines, text=True)
+
     with DirectoryExcursion(os.path.join(ROOT, "python-package")):
         subprocess.check_call(["python", "-m", "build", "--sdist"])
         if rc is not None:
@@ -117,6 +127,10 @@ def make_pysrc_wheel(
         target = os.path.join(dist, name)
         shutil.move(src, target)

+    with DirectoryExcursion(ROOT):
+        with open("python-package/pyproject.toml", "w") as f:
+            print(orig_pyproj_lines, file=f, end="")
+

 def download_py_packages(
     branch: str, major: int, minor: int, commit_hash: str, outdir: str
diff --git a/doc/install.rst b/doc/install.rst
index e5229702e16c..79082a7ed581 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -76,6 +76,19 @@ Capabilities of binary wheels for each platform:
 | Windows             | |tick|  | |cross|              |
 +---------------------+---------+----------------------+

+Minimal installation (CPU-only)
+*******************************
+The default installation with ``pip`` installs the full XGBoost package, including support for the GPU algorithms and federated learning.
+
+You can reduce the size of the installed package and save disk space by installing ``xgboost-cpu`` instead:
+
+.. code-block:: bash
+
+  pip install xgboost-cpu
+
+The ``xgboost-cpu`` variant has a drastically smaller disk footprint, but lacks some features, such as the GPU algorithms and
+federated learning.
+
 Conda
 *****
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index 9abe72b87859..64aab5c41b0c 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -7,6 +7,8 @@
 #define XGBOOST_BASE_H_

 #include <dmlc/omp.h>  // for omp_uint, omp_ulong
+// Put the windefs here to guard as many files as possible.
+#include <xgboost/windefs.h>

 #include <cstdint>  // for int32_t, uint64_t, int16_t
 #include <ostream>  // for ostream
diff --git a/include/xgboost/collective/poll_utils.h b/include/xgboost/collective/poll_utils.h
index a4d2fbacda27..41b674964efc 100644
--- a/include/xgboost/collective/poll_utils.h
+++ b/include/xgboost/collective/poll_utils.h
@@ -4,13 +4,14 @@
 * \author Tianqi Chen
 */
#pragma once
-#include "xgboost/collective/result.h"
-#include "xgboost/collective/socket.h"
+#include <xgboost/collective/result.h>
+#include <xgboost/collective/socket.h>

 #if defined(_WIN32)
+#include <xgboost/windefs.h>
+// Socket API
 #include <winsock2.h>
 #include <ws2tcpip.h>
-
 #else

 #include
diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h
index c5dd977f6255..bf5fffdaf155 100644
--- a/include/xgboost/collective/socket.h
+++ b/include/xgboost/collective/socket.h
@@ -1,12 +1,8 @@
 /**
- * Copyright (c) 2022-2024, XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
 */
#pragma once

-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif  // !defined(NOMINMAX)
-
 #include <cerrno>   // errno, EINTR, EBADF
 #include <climits>  // HOST_NAME_MAX
 #include <cstddef>  // std::size_t
@@ -18,18 +14,12 @@

 #if defined(__linux__)
 #include <sys/ioctl.h>  // for TIOCOUTQ, FIONREAD
-#endif                  // defined(__linux__)
-
-#if !defined(xgboost_IS_MINGW)
-
-#if defined(__MINGW32__)
-#define xgboost_IS_MINGW 1
-#endif  // defined(__MINGW32__)
-
-#endif  // xgboost_IS_MINGW
+#endif  // defined(__linux__)

 #if defined(_WIN32)
-
+// Guard the include.
+#include <xgboost/windefs.h>
+// Socket API
 #include <winsock2.h>
 #include <ws2tcpip.h>

@@ -41,9 +31,9 @@ using in_port_t = std::uint16_t;

 #if !defined(xgboost_IS_MINGW)
 using ssize_t = int;
-#endif  // !xgboost_IS_MINGW()
+#endif  // !xgboost_IS_MINGW()

-#else  // UNIX
+#else   // UNIX

 #include <arpa/inet.h>  // inet_ntop
 #include <fcntl.h>      // fcntl, F_GETFL, O_NONBLOCK
@@ -839,7 +829,3 @@ Result INetNToP(H const &host, std::string *p_out) {
 }  // namespace xgboost

 #undef xgboost_CHECK_SYS_CALL
-
-#if defined(xgboost_IS_MINGW)
-#undef xgboost_IS_MINGW
-#endif
diff --git a/include/xgboost/windefs.h b/include/xgboost/windefs.h
new file mode 100644
index 000000000000..e7e743184a17
--- /dev/null
+++ b/include/xgboost/windefs.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ *
+ * @brief Macro for Windows.
+ */ +#pragma once + +#if !defined(xgboost_IS_WIN) + +#if defined(_MSC_VER) || defined(__MINGW32__) +#define xgboost_IS_WIN 1 +#endif // defined(_MSC_VER) || defined(__MINGW32__) + +#endif // !defined(xgboost_IS_WIN) + +#if defined(xgboost_IS_WIN) + +#if !defined(NOMINMAX) +#define NOMINMAX +#endif // !defined(NOMINMAX) + +// A macro used inside `windows.h` to avoid conflicts with `winsock2.h` +#define WIN32_LEAN_AND_MEAN + +#if !defined(xgboost_IS_MINGW) + +#if defined(__MINGW32__) +#define xgboost_IS_MINGW 1 +#endif // defined(__MINGW32__) + +#endif // xgboost_IS_MINGW + +#endif // defined(xgboost_IS_WIN) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index fb6aaf020c3e..a154f2d489ae 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -37,17 +37,17 @@ 4.13.2 3.5.1 3.5.1 - 2.15.2 + 2.17.2 2.12.18 2.12 3.4.0 5 OFF OFF - 24.04.0 - 24.04.1 + 24.06.0 + 24.06.0 cuda12 - 3.2.18 + 3.2.19 2.12.0 false diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index a273d8c135d8..8835def25858 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -7,13 +7,13 @@ build-backend = "packager.pep517" [project] name = "xgboost" -version = "2.2.0-dev" +description = "XGBoost Python Package" +readme = { file = "README.rst", content-type = "text/x-rst" } authors = [ { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" }, { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" } ] -description = "XGBoost Python Package" -readme = { file = "README.rst", content-type = "text/x-rst" } +version = "2.2.0-dev" requires-python = ">=3.8" license = { text = "Apache-2.0" } classifiers = [ diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index e0096c89c9a8..0ed1f3e68431 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -45,6 +45,7 @@ get_cancer, get_digits, get_sparse, + make_batches, memory, ) @@ -161,7 +162,16 @@ def no_cudf() -> PytestSkip: def no_cupy() -> PytestSkip: - return no_mod("cupy") + skip_cupy = no_mod("cupy") + if not skip_cupy["condition"] and system() == "Windows": + import cupy as cp + + # Cupy might run into issue on Windows due to missing compiler + try: + cp.array([1, 2, 3]).sum() + except Exception: # pylint: disable=broad-except + skip_cupy["condition"] = True + return skip_cupy def no_dask_cudf() -> PytestSkip: diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index f4e97e59d363..4071219c44ef 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -9,6 +9,7 @@ Callable, Dict, Generator, + List, NamedTuple, Optional, Tuple, @@ -506,6 +507,36 @@ def get_mq2008( ) +def make_batches( # pylint: disable=too-many-arguments,too-many-locals + n_samples_per_batch: int, + n_features: int, + n_batches: int, + use_cupy: bool = False, + *, + vary_size: bool = False, + random_state: int = 1994, +) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: + """Make batches of dense data.""" + X = [] + y = [] + w = [] + if use_cupy: + import cupy # pylint: disable=import-error + + rng = cupy.random.RandomState(random_state) + else: + rng = np.random.RandomState(random_state) + for i in range(n_batches): + n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch + _X = rng.randn(n_samples, n_features) + _y = rng.randn(n_samples) + _w = rng.uniform(low=0, high=1, size=n_samples) + X.append(_X) + y.append(_y) + w.append(_w) 
+ return X, y, w + + RelData = Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]] diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h index 0ad4ac073dbd..a1928e6b14d7 100644 --- a/src/c_api/c_api_error.h +++ b/src/c_api/c_api_error.h @@ -7,10 +7,9 @@ #define XGBOOST_C_API_C_API_ERROR_H_ #include -#include -#include "c_api_utils.h" -#include "xgboost/collective/result.h" +#include "c_api_utils.h" // for XGBoostAPIGuard +#include "xgboost/logging.h" /*! \brief macro to guard beginning and end section of all functions */ #ifdef LOG_CAPI_INVOCATION diff --git a/src/cli_main.cc b/src/cli_main.cc index 54a3450276f4..1c388cf845c2 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -4,29 +4,26 @@ * \brief The command line interface program of xgboost. * This file is not included in dynamic library. */ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include - -#include +#include #include #include +#include #include #include -#include -#include -#include #include #include +#include +#include +#include #include + +#include "c_api/c_api_utils.h" #include "common/common.h" #include "common/config.h" #include "common/io.h" #include "common/version.h" -#include "c_api/c_api_utils.h" namespace xgboost { enum CLITask { diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc index 6cb3601db7f4..bbc7a7c5a730 100644 --- a/src/collective/tracker.cc +++ b/src/collective/tracker.cc @@ -7,11 +7,10 @@ #include // socket, AF_INET6, AF_INET, connect, getsockname #endif // defined(__unix__) || defined(__APPLE__) -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #if defined(_WIN32) +// Guard the include +#include +// Socket API #include #include #endif // defined(_WIN32) diff --git a/src/common/common.cc b/src/common/common.cc index 086f4c00d167..10a667070da9 100644 --- a/src/common/common.cc +++ b/src/common/common.cc @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by Contributors + * Copyright 2015-2024, XGBoost Contributors */ #include "common.h" @@ -54,9 +54,4 @@ void EscapeU8(std::string const &string, std::string *p_buffer) { } } } - -#if !defined(XGBOOST_USE_CUDA) -int AllVisibleGPUs() { return 0; } -#endif // !defined(XGBOOST_USE_CUDA) - } // namespace xgboost::common diff --git a/src/common/common.cu b/src/common/common.cu index b6965904a2b0..958f93779308 100644 --- a/src/common/common.cu +++ b/src/common/common.cu @@ -1,29 +1,21 @@ -/*! - * Copyright 2018-2022 XGBoost contributors +/** + * Copyright 2018-2024, XGBoost contributors */ -#include "common.h" - -namespace xgboost { -namespace common { +#include +#include -void SetDevice(std::int32_t device) { - if (device >= 0) { - dh::safe_cuda(cudaSetDevice(device)); - } -} +#include "common.h" -int AllVisibleGPUs() { - int n_visgpus = 0; - try { - // When compiled with CUDA but running on CPU only device, - // cudaGetDeviceCount will fail. - dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); - } catch (const dmlc::Error &) { - cudaGetLastError(); // reset error. 
- return 0; +namespace dh { +void ThrowOnCudaError(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { + std::string f; + if (file != nullptr) { + f = file; + } + LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(), + f + ": " + std::to_string(line)) + .what(); } - return n_visgpus; } - -} // namespace common -} // namespace xgboost +} // namespace dh diff --git a/src/common/common.h b/src/common/common.h index 950dee5210b1..93151670b7be 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file common.h * \brief Common utilities */ @@ -19,9 +19,8 @@ #include "xgboost/base.h" // for XGBOOST_DEVICE #include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal +// magic to define functions based on the compiler. #if defined(__CUDACC__) -#include -#include #define WITH_CUDA() true @@ -31,23 +30,20 @@ #endif // defined(__CUDACC__) +#if defined(XGBOOST_USE_CUDA) +#include +#endif + namespace dh { -#if defined(__CUDACC__) +#if defined(XGBOOST_USE_CUDA) /* - * Error handling functions + * Error handling functions */ +void ThrowOnCudaError(cudaError_t code, const char *file, int line); + #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__) -inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, - int line) { - if (code != cudaSuccess) { - LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(), - std::string{file} + ": " + // NOLINT - std::to_string(line)).what(); - } - return code; -} -#endif // defined(__CUDACC__) +#endif // defined(XGBOOST_USE_CUDA) } // namespace dh namespace xgboost::common { @@ -167,8 +163,6 @@ class Range { Iterator end_; }; -int AllVisibleGPUs(); - inline void AssertGPUSupport() { #ifndef XGBOOST_USE_CUDA LOG(FATAL) << "XGBoost version not compiled with GPU support."; @@ -187,16 +181,6 @@ inline void AssertSYCLSupport() { #endif // XGBOOST_USE_SYCL } -void SetDevice(std::int32_t device); - -#if !defined(XGBOOST_USE_CUDA) -inline void SetDevice(std::int32_t device) { - if (device >= 0) { - AssertGPUSupport(); - } -} -#endif - /** * @brief Last index of a group in a CSR style of index pointer. */ diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc new file mode 100644 index 000000000000..d41981d8fb18 --- /dev/null +++ b/src/common/cuda_rt_utils.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2015-2024, XGBoost Contributors + */ +#include "cuda_rt_utils.h" + +#if defined(XGBOOST_USE_CUDA) +#include +#endif // defined(XGBOOST_USE_CUDA) + +#include // for int32_t + +#include "common.h" // for safe_cuda + +namespace xgboost::common { +#if defined(XGBOOST_USE_CUDA) +std::int32_t AllVisibleGPUs() { + int n_visgpus = 0; + try { + // When compiled with CUDA but running on CPU only device, + // cudaGetDeviceCount will fail. + dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); + } catch (const dmlc::Error &) { + cudaGetLastError(); // reset error. 
+ return 0; + } + return n_visgpus; +} + +std::int32_t CurrentDevice() { + std::int32_t device = 0; + dh::safe_cuda(cudaGetDevice(&device)); + return device; +} + +// alternatively: `nvidia-smi -q | grep Addressing` +bool SupportsPageableMem() { + std::int32_t res{0}; + dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccess, CurrentDevice())); + return res == 1; +} + +bool SupportsAts() { + std::int32_t res{0}; + dh::safe_cuda(cudaDeviceGetAttribute(&res, cudaDevAttrPageableMemoryAccessUsesHostPageTables, + CurrentDevice())); + return res == 1; +} + +void CheckComputeCapability() { + for (std::int32_t d_idx = 0; d_idx < AllVisibleGPUs(); ++d_idx) { + cudaDeviceProp prop; + dh::safe_cuda(cudaGetDeviceProperties(&prop, d_idx)); + std::ostringstream oss; + oss << "CUDA Capability Major/Minor version number: " << prop.major << "." << prop.minor + << " is insufficient. Need >=3.5"; + int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5); + if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx; + } +} + +void SetDevice(std::int32_t device) { + if (device >= 0) { + dh::safe_cuda(cudaSetDevice(device)); + } +} +#else +std::int32_t AllVisibleGPUs() { return 0; } + +std::int32_t CurrentDevice() { + AssertGPUSupport(); + return -1; +} + +bool SupportsPageableMem() { return false; } + +bool SupportsAts() { return false; } + +void CheckComputeCapability() {} + +void SetDevice(std::int32_t device) { + if (device >= 0) { + AssertGPUSupport(); + } +} +#endif // !defined(XGBOOST_USE_CUDA) +} // namespace xgboost::common diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h new file mode 100644 index 000000000000..210f1e07d7f8 --- /dev/null +++ b/src/common/cuda_rt_utils.h @@ -0,0 +1,58 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#pragma once +#include // for int32_t + +#if defined(XGBOOST_USE_NVTX) +#include +#endif // defined(XGBOOST_USE_NVTX) + +namespace xgboost::common { +std::int32_t AllVisibleGPUs(); + +std::int32_t CurrentDevice(); + +// Whether the device supports coherently accessing pageable memory without calling +// `cudaHostRegister` on it +bool SupportsPageableMem(); + +// Address Translation Service (ATS) +bool SupportsAts(); + +void CheckComputeCapability(); + +void SetDevice(std::int32_t device); + +struct NvtxDomain { + static constexpr char const *name{"libxgboost"}; // NOLINT +}; + +#if defined(XGBOOST_USE_NVTX) +using NvtxScopedRange = ::nvtx3::scoped_range_in; +using NvtxEventAttr = ::nvtx3::event_attributes; +using NvtxRgb = ::nvtx3::rgb; +#else +class NvtxScopedRange { + public: + template + explicit NvtxScopedRange(Args &&...) {} +}; +class NvtxEventAttr { + public: + template + explicit NvtxEventAttr(Args &&...) {} +}; +class NvtxRgb { + public: + template + explicit NvtxRgb(Args &&...) 
{} +}; +#endif // defined(XGBOOST_USE_NVTX) +} // namespace xgboost::common + +#if defined(XGBOOST_USE_NVTX) +#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain) +#else +#define xgboost_NVTX_FN_RANGE() +#endif // defined(XGBOOST_USE_NVTX) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 1754c9507036..34faa4eb013f 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -157,18 +157,6 @@ inline size_t MaxSharedMemoryOptin(int device_idx) { return static_cast(max_shared_memory); } -inline void CheckComputeCapability() { - for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) { - cudaDeviceProp prop; - safe_cuda(cudaGetDeviceProperties(&prop, d_idx)); - std::ostringstream oss; - oss << "CUDA Capability Major/Minor version number: " << prop.major << "." - << prop.minor << " is insufficient. Need >=3.5"; - int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5); - if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx; - } -} - XGBOOST_DEV_INLINE void AtomicOrByte(unsigned int *__restrict__ buffer, size_t ibyte, unsigned char b) { atomicOr(&buffer[ibyte / sizeof(unsigned int)], @@ -273,13 +261,15 @@ void Iota(Container array, cudaStream_t stream) { } // dh::DebugSyncDevice(__FILE__, __LINE__); -inline void DebugSyncDevice(std::string file="", int32_t line = -1) { - if (file != "" && line != -1) { - auto rank = xgboost::collective::GetRank(); - LOG(DEBUG) << "R:" << rank << ": " << file << ":" << line; +inline void DebugSyncDevice(char const *file = __builtin_FILE(), int32_t line = __builtin_LINE()) { + { + auto err = cudaDeviceSynchronize(); + ThrowOnCudaError(err, file, line); + } + { + auto err = cudaGetLastError(); + ThrowOnCudaError(err, file, line); } - safe_cuda(cudaDeviceSynchronize()); - safe_cuda(cudaGetLastError()); } // Faster to instantiate than caching_device_vector and invokes no synchronisation @@ -510,7 +500,7 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, template xgboost::common::Span ToSpan(DeviceUVector &vec) { - return {thrust::raw_pointer_cast(vec.data()), vec.size()}; + return {vec.data(), vec.size()}; } // thrust begin, similiar to std::begin diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 35386856cc9c..f99b7c3fe1f0 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -284,47 +284,64 @@ class LoggingResource : public rmm::mr::device_memory_resource { LoggingResource *GlobalLoggingResource(); +#endif // defined(XGBOOST_USE_RMM) + /** - * @brief Container class that doesn't initialize the data. + * @brief Container class that doesn't initialize the data when RMM is used. 
*/ template -class DeviceUVector : public rmm::device_uvector { - using Super = rmm::device_uvector; +class DeviceUVector { + private: +#if defined(XGBOOST_USE_RMM) + rmm::device_uvector data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()}; +#else + ::dh::device_vector data_; +#endif // defined(XGBOOST_USE_RMM) public: - DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + using value_type = T; // NOLINT + using pointer = value_type *; // NOLINT + using const_pointer = value_type const *; // NOLINT + using reference = value_type &; // NOLINT + using const_reference = value_type const &; // NOLINT - void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); } - void Resize(std::size_t n, T const &v) { + public: + DeviceUVector() = default; + DeviceUVector(DeviceUVector const &that) = delete; + DeviceUVector &operator=(DeviceUVector const &that) = delete; + DeviceUVector(DeviceUVector &&that) = default; + DeviceUVector &operator=(DeviceUVector &&that) = default; + + void resize(std::size_t n) { // NOLINT +#if defined(XGBOOST_USE_RMM) + data_.resize(n, rmm::cuda_stream_per_thread); +#else + data_.resize(n); +#endif + } + void resize(std::size_t n, T const &v) { // NOLINT +#if defined(XGBOOST_USE_RMM) auto orig = this->size(); - Super::resize(n, rmm::cuda_stream_per_thread); + data_.resize(n, rmm::cuda_stream_per_thread); if (orig < n) { thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v); } +#else + data_.resize(n, v); +#endif } + [[nodiscard]] std::size_t size() const { return data_.size(); } // NOLINT - private: - // undefined private, cannot be accessed. - void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT -}; - -#else + [[nodiscard]] auto begin() { return data_.begin(); } // NOLINT + [[nodiscard]] auto end() { return data_.end(); } // NOLINT -/** - * @brief Without RMM, the initialization will happen. - */ -template -class DeviceUVector : public thrust::device_vector> { - using Super = thrust::device_vector>; + [[nodiscard]] auto begin() const { return this->cbegin(); } // NOLINT + [[nodiscard]] auto end() const { return this->cend(); } // NOLINT - public: - void Resize(std::size_t n) { Super::resize(n); } - void Resize(std::size_t n, T const &v) { Super::resize(n, v); } + [[nodiscard]] auto cbegin() const { return data_.cbegin(); } // NOLINT + [[nodiscard]] auto cend() const { return data_.cend(); } // NOLINT - private: - // undefined private, cannot be accessed. 
- void resize(std::size_t n, T const &v = T{}); // NOLINT + [[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); } // NOLINT + [[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); } // NOLINT }; - -#endif // defined(XGBOOST_USE_RMM) } // namespace dh diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 16a1aa027f09..da4a55285765 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -67,12 +67,12 @@ class HostDeviceVectorImpl { T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); - return thrust::raw_pointer_cast(data_d_->data()); + return data_d_->data(); } const T* ConstDevicePointer() { LazySyncDevice(GPUAccess::kRead); - return thrust::raw_pointer_cast(data_d_->data()); + return data_d_->data(); } common::Span DeviceSpan() { @@ -181,7 +181,7 @@ class HostDeviceVectorImpl { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto old_size = data_d_->size(); - data_d_->Resize(new_size, std::forward(args)...); + data_d_->resize(new_size, std::forward(args)...); } else { // resize on host LazySyncHost(GPUAccess::kNone); @@ -200,8 +200,8 @@ class HostDeviceVectorImpl { gpu_access_ = access; if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); } SetDevice(); - dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()), - data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost)); + dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T), + cudaMemcpyDeviceToHost)); } void LazySyncDevice(GPUAccess access) { @@ -214,9 +214,8 @@ class HostDeviceVectorImpl { // data is on the host LazyResizeDevice(data_h_.size()); SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(), - data_d_->size() * sizeof(T), cudaMemcpyHostToDevice, - dh::DefaultStream())); + dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), data_h_.data(), data_d_->size() * sizeof(T), + cudaMemcpyHostToDevice, dh::DefaultStream())); gpu_access_ = access; } @@ -260,7 +259,7 @@ class HostDeviceVectorImpl { void LazyResizeDevice(size_t new_size) { if (data_d_ && new_size == data_d_->size()) { return; } SetDevice(); - data_d_->Resize(new_size); + data_d_->resize(new_size); } void SetDevice() { diff --git a/src/common/io.cc b/src/common/io.cc index 1715669b091a..4bc8d9de4f53 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -1,26 +1,21 @@ /** - * Copyright 2019-2023, by XGBoost Contributors + * Copyright 2019-2024, by XGBoost Contributors */ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) +#if defined(__unix__) || defined(__APPLE__) -#if !defined(xgboost_IS_WIN) +#include // for open, O_RDONLY +#include // for mmap, mmap64, munmap, madvise +#include // for close, getpagesize -#if defined(_MSC_VER) || defined(__MINGW32__) -#define xgboost_IS_WIN 1 -#endif // defined(_MSC_VER) || defined(__MINGW32__) +#else -#endif // !defined(xgboost_IS_WIN) +#include -#if defined(__unix__) || defined(__APPLE__) -#include // for open, O_RDONLY -#include // for mmap, mmap64, munmap -#include // for close, getpagesize -#elif defined(xgboost_IS_WIN) -#define WIN32_LEAN_AND_MEAN +#if defined(xgboost_IS_WIN) #include -#endif // defined(__unix__) +#endif // defined(xgboost_IS_WIN) + +#endif // defined(__unix__) || defined(__APPLE__) #include // for copy, transform #include // for tolower @@ -31,8 +26,7 @@ #include // for filesystem, weakly_canonical #include // for ifstream #include // for 
distance -#include // for numeric_limits -#include // for unique_ptr +#include // for unique_ptr, make_unique #include // for string #include // for error_code, system_category #include // for move @@ -40,7 +34,12 @@ #include "io.h" #include "xgboost/collective/socket.h" // for LastError -#include "xgboost/logging.h" +#include "xgboost/logging.h" // for CHECK_LE +#include "xgboost/string_view.h" // for StringView + +#if !defined(__linux__) && !defined(__GLIBC__) && !defined(xgboost_IS_WIN) +#include // for numeric_limits +#endif namespace xgboost::common { size_t PeekableInStream::Read(void* dptr, size_t size) { @@ -182,39 +181,9 @@ std::string FileExtension(std::string fname, bool lower) { // NVCC 11.8 doesn't allow `noexcept(false) = default` altogether. ResourceHandler::~ResourceHandler() noexcept(false) {} // NOLINT -struct MMAPFile { -#if defined(xgboost_IS_WIN) - HANDLE fd{INVALID_HANDLE_VALUE}; - HANDLE file_map{INVALID_HANDLE_VALUE}; -#else - std::int32_t fd{0}; -#endif - std::byte* base_ptr{nullptr}; - std::size_t base_size{0}; - std::size_t delta{0}; - std::string path; - - MMAPFile() = default; - -#if defined(xgboost_IS_WIN) - MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta, - std::string path) - : fd{fd}, - file_map{fm}, - base_ptr{base_ptr}, - base_size{base_size}, - delta{delta}, - path{std::move(path)} {} -#else - MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta, - std::string path) - : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {} -#endif -}; - -std::unique_ptr Open(std::string path, std::size_t offset, std::size_t length) { +MMAPFile* detail::OpenMmap(std::string path, std::size_t offset, std::size_t length) { if (length == 0) { - return std::make_unique(); + return new MMAPFile{}; } #if defined(xgboost_IS_WIN) @@ -234,10 +203,8 @@ std::unique_ptr Open(std::string path, std::size_t offset, std::size_t #if defined(__linux__) || defined(__GLIBC__) int prot{PROT_READ}; ptr = reinterpret_cast(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); - madvise(ptr, view_size, MADV_WILLNEED); CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg(); - auto handle = - std::make_unique(fd, ptr, view_size, offset - view_start, std::move(path)); + auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)}; #elif defined(xgboost_IS_WIN) auto file_size = GetFileSize(fd, nullptr); DWORD access = PAGE_READONLY; @@ -248,55 +215,62 @@ std::unique_ptr Open(std::string path, std::size_t offset, std::size_t CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg(); ptr = reinterpret_cast(MapViewOfFile(map_file, access, hoff, loff, view_size)); CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg(); - auto handle = std::make_unique(fd, map_file, ptr, view_size, offset - view_start, - std::move(path)); + auto handle = new MMAPFile{fd, map_file, ptr, view_size, offset - view_start, std::move(path)}; #else CHECK_LE(offset, std::numeric_limits::max()) << "File size has exceeded the limit on the current system."; int prot{PROT_READ}; ptr = reinterpret_cast(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". 
" << SystemErrorMsg(); - auto handle = - std::make_unique(fd, ptr, view_size, offset - view_start, std::move(path)); -#endif // defined(__linux__) + auto handle = new MMAPFile{fd, ptr, view_size, offset - view_start, std::move(path)}; +#endif // defined(__linux__) || defined(__GLIBC__) return handle; } -MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length) - : ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {} - -MmapResource::~MmapResource() noexcept(false) { - if (!handle_) { +void detail::CloseMmap(MMAPFile* handle) { + if (!handle) { return; } #if defined(xgboost_IS_WIN) - if (handle_->base_ptr) { - CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg(); + if (handle->base_ptr) { + CHECK(UnmapViewOfFile(handle->base_ptr)) "Faled to call munmap: " << SystemErrorMsg(); } - if (handle_->fd != INVALID_HANDLE_VALUE) { - CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg(); + if (handle->fd != INVALID_HANDLE_VALUE) { + CHECK(CloseHandle(handle->fd)) << "Failed to close handle: " << SystemErrorMsg(); } - if (handle_->file_map != INVALID_HANDLE_VALUE) { - CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg(); + if (handle->file_map != INVALID_HANDLE_VALUE) { + CHECK(CloseHandle(handle->file_map)) << "Failed to close mapping object: " << SystemErrorMsg(); } #else - if (handle_->base_ptr) { - CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1) - << "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg(); + if (handle->base_ptr) { + CHECK_NE(munmap(handle->base_ptr, handle->base_size), -1) + << "Faled to call munmap: `" << handle->path << "`. " << SystemErrorMsg(); } - if (handle_->fd != 0) { - CHECK_NE(close(handle_->fd), -1) - << "Faled to close: " << handle_->path << ". " << SystemErrorMsg(); + if (handle->fd != 0) { + CHECK_NE(close(handle->fd), -1) + << "Faled to close: `" << handle->path << "`. 
" << SystemErrorMsg(); } #endif + delete handle; } +MmapResource::MmapResource(StringView path, std::size_t offset, std::size_t length) + : ResourceHandler{kMmap}, + handle_{detail::OpenMmap(std::string{path}, offset, length), detail::CloseMmap}, + n_{length} { +#if defined(__unix__) || defined(__APPLE__) + madvise(handle_->base_ptr, handle_->base_size, MADV_WILLNEED); +#endif // defined(__unix__) || defined(__APPLE__) +} + +MmapResource::~MmapResource() noexcept(false) = default; + [[nodiscard]] void* MmapResource::Data() { if (!handle_) { return nullptr; } - return handle_->base_ptr + handle_->delta; + return this->handle_->Data(); } [[nodiscard]] std::size_t MmapResource::Size() const { return n_; } @@ -329,7 +303,3 @@ AlignedMemWriteStream::~AlignedMemWriteStream() = default; return this->pimpl_->Tell(); } } // namespace xgboost::common - -#if defined(xgboost_IS_WIN) -#undef xgboost_IS_WIN -#endif // defined(xgboost_IS_WIN) diff --git a/src/common/io.h b/src/common/io.h index 198ce70143be..5f2e2833625a 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -7,7 +7,11 @@ #ifndef XGBOOST_COMMON_IO_H_ #define XGBOOST_COMMON_IO_H_ -#include +#include + +#if defined(xgboost_IS_WIN) +#include +#endif // defined(xgboost_IS_WIN) #include // for min, fill_n, copy_n #include // for array @@ -15,6 +19,7 @@ #include // for malloc, realloc, free #include // for memcpy #include // for ifstream +#include // for function #include // for numeric_limits #include // for unique_ptr #include // for string @@ -23,6 +28,7 @@ #include // for vector #include "common.h" // for DivRoundUp +#include "dmlc/io.h" // for SeekStream #include "xgboost/string_view.h" // for StringView namespace xgboost::common { @@ -224,7 +230,48 @@ inline std::string ReadAll(std::string const &path) { return content; } -struct MMAPFile; +/** + * @brief A handle to mmap file. + */ +struct MMAPFile { +#if defined(xgboost_IS_WIN) + HANDLE fd{INVALID_HANDLE_VALUE}; + HANDLE file_map{INVALID_HANDLE_VALUE}; +#else + std::int32_t fd{0}; +#endif // defined(xgboost_IS_WIN) + std::byte* base_ptr{nullptr}; + std::size_t base_size{0}; + std::size_t delta{0}; + std::string path; + + MMAPFile() = default; + +#if defined(xgboost_IS_WIN) + MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, + file_map{fm}, + base_ptr{base_ptr}, + base_size{base_size}, + delta{delta}, + path{std::move(path)} {} +#else + MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {} +#endif // defined(xgboost_IS_WIN) + + void const* Data() const { return this->base_ptr + this->delta; } + void* Data() { return this->base_ptr + this->delta; } +}; + +namespace detail { +// call mmap +[[nodiscard]] MMAPFile* OpenMmap(std::string path, std::size_t offset, std::size_t length); +// close the mapped file handle. +void CloseMmap(MMAPFile* handle); +} // namespace detail /** * @brief Handler for one-shot resource. 
Unlike `std::pmr::*`, the resource handler is @@ -237,6 +284,8 @@ class ResourceHandler { enum Kind : std::uint8_t { kMalloc = 0, kMmap = 1, + kCudaMalloc = 2, + kCudaMmap = 3, }; private: @@ -251,6 +300,20 @@ class ResourceHandler { [[nodiscard]] virtual std::size_t Size() const = 0; [[nodiscard]] auto Type() const { return kind_; } + [[nodiscard]] StringView TypeName() const { + switch (this->Type()) { + case kMalloc: + return "Malloc"; + case kMmap: + return "Mmap"; + case kCudaMalloc: + return "CudaMalloc"; + case kCudaMmap: + return "CudaMmap"; + } + LOG(FATAL) << "Unreachable."; + return {}; + } // Allow exceptions for cleaning up resource. virtual ~ResourceHandler() noexcept(false); @@ -339,11 +402,11 @@ class MallocResource : public ResourceHandler { * @brief A class for wrapping mmap as a resource for RAII. */ class MmapResource : public ResourceHandler { - std::unique_ptr handle_; + std::unique_ptr> handle_; std::size_t n_; public: - MmapResource(std::string path, std::size_t offset, std::size_t length); + MmapResource(StringView path, std::size_t offset, std::size_t length); ~MmapResource() noexcept(false) override; [[nodiscard]] void* Data() override; @@ -471,9 +534,9 @@ class PrivateMmapConstStream : public AlignedResourceReadStream { * @param offset See the `offset` parameter of `mmap` for details. * @param length See the `length` parameter of `mmap` for details. */ - explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length) + explicit PrivateMmapConstStream(StringView path, std::size_t offset, std::size_t length) : AlignedResourceReadStream{std::shared_ptr{ // NOLINT - new MmapResource{std::move(path), offset, length}}} {} + new MmapResource{path, offset, length}}} {} ~PrivateMmapConstStream() noexcept(false) override; }; diff --git a/src/common/ref_resource_view.cuh b/src/common/ref_resource_view.cuh new file mode 100644 index 000000000000..d48b221a305d --- /dev/null +++ b/src/common/ref_resource_view.cuh @@ -0,0 +1,33 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#pragma once + +#include // for size_t +#include // for make_shared + +#include "cuda_context.cuh" // for CUDAContext +#include "ref_resource_view.h" // for RefResourceView +#include "resource.cuh" // for CudaAllocResource +#include "xgboost/context.h" // for Context + +namespace xgboost::common { +/** + * @brief Make a fixed size `RefResourceView` with cudaMalloc resource. + */ +template +[[nodiscard]] RefResourceView MakeFixedVecWithCudaMalloc(Context const*, + std::size_t n_elements) { + auto resource = std::make_shared(n_elements * sizeof(T)); + auto ref = RefResourceView{resource->DataAs(), n_elements, resource}; + return ref; +} + +template +[[nodiscard]] RefResourceView MakeFixedVecWithCudaMalloc(Context const* ctx, + std::size_t n_elements, T const& init) { + auto ref = MakeFixedVecWithCudaMalloc(ctx, n_elements); + thrust::fill_n(ctx->CUDACtx()->CTP(), ref.data(), ref.size(), init); + return ref; +} +} // namespace xgboost::common diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h index 61adfdb7bea8..81058d923d3b 100644 --- a/src/common/ref_resource_view.h +++ b/src/common/ref_resource_view.h @@ -43,24 +43,16 @@ class RefResourceView { } public: - RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem) - : ptr_{ptr}, size_{n}, mem_{std::move(mem)} { - CHECK_GE(mem_->Size(), n); - } /** * @brief Construct a view on ptr with length n. The ptr is held by the mem resource. * * @param ptr The pointer to view. 
* @param n The length of the view. * @param mem The owner of the pointer. - * @param init Initialize the view with this value. */ - RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem, - T const& init) - : RefResourceView{ptr, n, mem} { - if (n != 0) { - std::fill_n(ptr_, n, init); - } + RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem) + : ptr_{ptr}, size_{n}, mem_{std::move(mem)} { + CHECK_GE(mem_->Size(), n); } ~RefResourceView() = default; @@ -159,7 +151,9 @@ template template [[nodiscard]] RefResourceView MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) { auto resource = std::make_shared(n_elements * sizeof(T)); - return RefResourceView{resource->DataAs(), n_elements, resource, init}; + auto ref = RefResourceView{resource->DataAs(), n_elements, resource}; + std::fill_n(ref.data(), ref.size(), init); + return ref; } template diff --git a/src/common/resource.cu b/src/common/resource.cu new file mode 100644 index 000000000000..ef662e3bd6e0 --- /dev/null +++ b/src/common/resource.cu @@ -0,0 +1,43 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include "device_helpers.cuh" // for CurrentDevice +#include "resource.cuh" +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::common { +CudaMmapResource::CudaMmapResource(StringView path, std::size_t offset, std::size_t length) + : ResourceHandler{kCudaMmap}, + handle_{detail::OpenMmap(std::string{path}, offset, length), + [](MMAPFile* handle) { + // Don't close the mmap while CUDA kernel is running. + if (handle) { + dh::DefaultStream().Sync(); + } + detail::CloseMmap(handle); + }}, + n_{length} { + auto device = dh::CurrentDevice(); + dh::safe_cuda( + cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetReadMostly, device)); + dh::safe_cuda(cudaMemAdvise(handle_->base_ptr, handle_->base_size, + cudaMemAdviseSetPreferredLocation, device)); + dh::safe_cuda( + cudaMemAdvise(handle_->base_ptr, handle_->base_size, cudaMemAdviseSetAccessedBy, device)); + dh::safe_cuda( + cudaMemPrefetchAsync(handle_->base_ptr, handle_->base_size, device, dh::DefaultStream())); +} + +[[nodiscard]] void* CudaMmapResource::Data() { + if (!handle_) { + return nullptr; + } + return this->handle_->Data(); +} + +[[nodiscard]] std::size_t CudaMmapResource::Size() const { return n_; } + +CudaMmapResource::~CudaMmapResource() noexcept(false) = default; + +PrivateCudaMmapConstStream::~PrivateCudaMmapConstStream() noexcept(false) = default; +} // namespace xgboost::common diff --git a/src/common/resource.cuh b/src/common/resource.cuh new file mode 100644 index 000000000000..e950a8d90695 --- /dev/null +++ b/src/common/resource.cuh @@ -0,0 +1,52 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#pragma once +#include // for size_t +#include // for function + +#include "device_vector.cuh" // for DeviceUVector +#include "io.h" // for ResourceHandler, MMAPFile +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::common { +/** + * @brief Resource backed by `cudaMalloc`. 
+ */ +class CudaMallocResource : public ResourceHandler { + dh::DeviceUVector storage_; + + void Clear() noexcept(true) { this->Resize(0); } + + public: + explicit CudaMallocResource(std::size_t n_bytes) : ResourceHandler{kCudaMalloc} { + this->Resize(n_bytes); + } + ~CudaMallocResource() noexcept(true) override { this->Clear(); } + + [[nodiscard]] void* Data() override { return storage_.data(); } + [[nodiscard]] std::size_t Size() const override { return storage_.size(); } + void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); } +}; + +class CudaMmapResource : public ResourceHandler { + std::unique_ptr> handle_; + std::size_t n_; + + public: + CudaMmapResource() : ResourceHandler{kCudaMmap} {} + CudaMmapResource(StringView path, std::size_t offset, std::size_t length); + ~CudaMmapResource() noexcept(false) override; + + [[nodiscard]] void* Data() override; + [[nodiscard]] std::size_t Size() const override; +}; + +class PrivateCudaMmapConstStream : public AlignedResourceReadStream { + public: + explicit PrivateCudaMmapConstStream(StringView path, std::size_t offset, std::size_t length) + : AlignedResourceReadStream{ + std::shared_ptr{new CudaMmapResource{path, offset, length}}} {} + ~PrivateCudaMmapConstStream() noexcept(false) override; +}; +} // namespace xgboost::common diff --git a/src/common/timer.cc b/src/common/timer.cc index 9b1f49fbd5c8..0b55d1623dbc 100644 --- a/src/common/timer.cc +++ b/src/common/timer.cc @@ -6,9 +6,10 @@ #include #include "../collective/communicator-inl.h" +#include "cuda_rt_utils.h" #if defined(XGBOOST_USE_NVTX) -#include +#include #endif // defined(XGBOOST_USE_NVTX) namespace xgboost::common { @@ -17,8 +18,8 @@ void Monitor::Start(std::string const &name) { auto &stats = statistics_map_[name]; stats.timer.Start(); #if defined(XGBOOST_USE_NVTX) - std::string nvtx_name = "xgboost::" + label_ + "::" + name; - stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str()); + auto range_handle = nvtx3::start_range_in(label_ + "::" + name); + stats.nvtx_id = range_handle.get_value(); #endif // defined(XGBOOST_USE_NVTX) } } @@ -29,34 +30,32 @@ void Monitor::Stop(const std::string &name) { stats.timer.Stop(); stats.count++; #if defined(XGBOOST_USE_NVTX) - nvtxRangeEnd(stats.nvtx_id); + nvtx3::end_range_in(nvtx3::range_handle{stats.nvtx_id}); #endif // defined(XGBOOST_USE_NVTX) } } -void Monitor::PrintStatistics(StatMap const& statistics) const { +void Monitor::PrintStatistics(StatMap const &statistics) const { for (auto &kv : statistics) { if (kv.second.first == 0) { - LOG(WARNING) << - "Timer for " << kv.first << " did not get stopped properly."; + LOG(WARNING) << "Timer for " << kv.first << " did not get stopped properly."; continue; } - LOG(CONSOLE) << kv.first << ": " << static_cast(kv.second.second) / 1e+6 - << "s, " << kv.second.first << " calls @ " - << kv.second.second - << "us" << std::endl; + LOG(CONSOLE) << kv.first << ": " << static_cast(kv.second.second) / 1e+6 << "s, " + << kv.second.first << " calls @ " << kv.second.second << "us" << std::endl; } } void Monitor::Print() const { - if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; } + if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { + return; + } auto rank = collective::GetRank(); StatMap stat_map; for (auto const &kv : statistics_map_) { stat_map[kv.first] = std::make_pair( - kv.second.count, std::chrono::duration_cast( - kv.second.timer.elapsed) - .count()); + kv.second.count, + std::chrono::duration_cast(kv.second.timer.elapsed).count()); } if (stat_map.empty()) { 
return; diff --git a/src/context.cc b/src/context.cc index ef7110e7ce19..19060d5fc830 100644 --- a/src/context.cc +++ b/src/context.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * * \brief Context object used for controlling runtime parameters. */ @@ -11,8 +11,9 @@ #include // for optional #include // for regex_replace, regex_match -#include "common/common.h" // AssertGPUSupport -#include "common/error_msg.h" // WarnDeprecatedGPUId +#include "common/common.h" // AssertGPUSupport +#include "common/cuda_rt_utils.h" // for AllVisibleGPUs +#include "common/error_msg.h" // WarnDeprecatedGPUId #include "common/threading_utils.h" #include "xgboost/string_view.h" diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 81656284eb49..fc28b7c56f12 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -11,8 +11,9 @@ #include "../common/categorical.h" #include "../common/cuda_context.cuh" #include "../common/hist_util.cuh" -#include "../common/transform_iterator.h" // MakeIndexTransformIter -#include "device_adapter.cuh" // for NoInfInData +#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc +#include "../common/transform_iterator.h" // MakeIndexTransformIter +#include "device_adapter.cuh" // for NoInfInData #include "ellpack_page.cuh" #include "ellpack_page.h" #include "gradient_index.h" @@ -43,21 +44,19 @@ __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer const size_t* __restrict__ row_ptrs, // row offset of input data - const Entry* __restrict__ entries, // One batch of input data - const float* __restrict__ cuts, // HistogramCuts::cut_values_ - const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_ + const Entry* __restrict__ entries, // One batch of input data + const float* __restrict__ cuts, // HistogramCuts::cut_values_ + const uint32_t* __restrict__ cut_ptrs, // HistogramCuts::cut_ptrs_ common::Span feature_types, - size_t base_row, // batch_row_begin - size_t n_rows, - size_t row_stride, - unsigned int null_gidx_value) { + size_t base_row, // batch_row_begin + size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) { size_t irow = threadIdx.x + blockIdx.x * blockDim.x; int ifeature = threadIdx.y + blockIdx.y * blockDim.y; if (irow >= n_rows || ifeature >= row_stride) { return; } int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); - unsigned int bin = null_gidx_value; + std::uint32_t bin = null_gidx_value; if (ifeature < row_length) { Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature]; int feature = entry.index; @@ -89,25 +88,23 @@ __global__ void CompressBinEllpackKernel( } // Construct an ELLPACK matrix with the given number of empty rows. 
-EllpackPageImpl::EllpackPageImpl(DeviceOrd device, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) : is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); - monitor_.Start("InitCompressedData"); - this->InitCompressedData(device); - monitor_.Stop("InitCompressedData"); + this->InitCompressedData(ctx); } -EllpackPageImpl::EllpackPageImpl(DeviceOrd device, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) : cuts_(std::move(cuts)), is_dense(is_dense), n_rows(page.Size()), row_stride(row_stride) { - this->InitCompressedData(device); - this->CreateHistIndices(device, page, feature_types); + this->InitCompressedData(ctx); + this->CreateHistIndices(ctx->Device(), page, feature_types); } // Construct an ELLPACK matrix in memory. @@ -129,9 +126,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP } monitor_.Stop("Quantiles"); - monitor_.Start("InitCompressedData"); - this->InitCompressedData(ctx->Device()); - monitor_.Stop("InitCompressedData"); + this->InitCompressedData(ctx); dmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = dmat->Info().feature_types.ConstDeviceSpan(); @@ -234,7 +229,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::SpanGetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + auto d_compressed_buffer = dst->gidx_buffer.data(); // We redirect the scan output into this functor to do the actual writing WriteCompressedEllpackFunctor functor( @@ -275,7 +270,7 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::SpanGetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); - auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); + auto d_compressed_buffer = dst->gidx_buffer.data(); auto row_stride = dst->row_stride; dh::LaunchN(row_stride * dst->n_rows, [=] __device__(size_t idx) { // For some reason this variable got captured as const @@ -290,20 +285,20 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span -EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, - common::Span row_counts_span, +EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, + bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, std::shared_ptr cuts) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); - *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows); - CopyDataToEllpack(batch, feature_types, this, device, missing); - WriteNullValues(this, device, row_counts_span); + *this = EllpackPageImpl(ctx, cuts, is_dense, row_stride, n_rows); + CopyDataToEllpack(batch, feature_types, this, ctx->Device(), missing); + WriteNullValues(this, ctx->Device(), row_counts_span); } #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ template EllpackPageImpl::EllpackPageImpl( \ - __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \ + Context const* ctx, __BATCH_T batch, float missing, bool is_dense, \ common::Span row_counts_span, 
       common::Span<FeatureType const> feature_types,                      \
       size_t row_stride, size_t n_rows, std::shared_ptr<common::HistogramCuts> cuts);
@@ -365,12 +360,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());

   CHECK(ctx->IsCUDA());
-  monitor_.Start("InitCompressedData");
-  InitCompressedData(ctx->Device());
-  monitor_.Stop("InitCompressedData");
+  InitCompressedData(ctx);

   // copy gidx
-  common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
+  common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
   dh::device_vector<size_t> row_ptr(page.row_ptr.size());
   auto d_row_ptr = dh::ToSpan(row_ptr);
   dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
@@ -389,20 +382,20 @@ struct CopyPage {
   // The number of elements to skip.
   size_t offset;

-  CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset)
-      : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()},
-        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
+  CopyPage(EllpackPageImpl* dst, EllpackPageImpl const* src, size_t offset)
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
         offset(offset) {}

   __device__ void operator()(size_t element_id) {
-    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
-                          element_id + offset);
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
   }
 };

 // Copy the data from the given EllpackPage to the current page.
-size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
-  monitor_.Start("Copy");
+bst_idx_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) {
+  monitor_.Start(__func__);
   bst_idx_t num_elements = page->n_rows * page->row_stride;
   CHECK_EQ(row_stride, page->row_stride);
   CHECK_EQ(NumSymbols(), page->NumSymbols());
@@ -411,10 +404,8 @@ size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size
     LOG(FATAL) << "Concatenating the same Ellpack.";
     return this->n_rows * this->row_stride;
   }
-  gidx_buffer.SetDevice(device);
-  page->gidx_buffer.SetDevice(device);
-  dh::LaunchN(num_elements, CopyPage(this, page, offset));
-  monitor_.Stop("Copy");
+  dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset});
+  monitor_.Stop(__func__);
   return num_elements;
 }
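The AtomicWriteSymbol calls above write symbols, not bytes: each bin index is packed with the minimum number of bits needed to represent NumSymbols() distinct values, which is why concurrent writers must go through an atomic path. A back-of-the-envelope model of the packing arithmetic follows; CalculateBufferSize here is simplified relative to the real implementation, which rounds and pads differently.

```cpp
#include <cstddef>
#include <iostream>

// Smallest bit width that can distinguish num_symbols values.
std::size_t SymbolBits(std::size_t num_symbols) {
  std::size_t bits = 1;
  while ((std::size_t{1} << bits) < num_symbols) {
    ++bits;
  }
  return bits;
}

// Required bytes for n_elements packed symbols, rounded up to whole bytes.
std::size_t CalculateBufferSize(std::size_t n_elements, std::size_t num_symbols) {
  return (n_elements * SymbolBits(num_symbols) + 7) / 8;
}

int main() {
  // 256 bins plus one "missing" symbol -> 9 bits per entry instead of 32.
  std::cout << CalculateBufferSize(1000, 257) << " bytes\n";  // 1125 bytes
}
```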
@@ -423,8 +414,8 @@ struct CompactPage {
   common::CompressedBufferWriter cbw;
   common::CompressedByteT* dst_data_d;
   common::CompressedIterator<uint32_t> src_iterator_d;
-  /*! \brief An array that maps the rows from the full DMatrix to the compacted
-   * page.
+  /**
+   * @brief An array that maps the rows from the full DMatrix to the compacted page.
    *
    * The total size is the number of rows in the original, uncompacted DMatrix.
    * Elements are the row ids in the compacted page. Rows not needed are set to SIZE_MAX.
@@ -438,24 +429,24 @@ struct CompactPage {
   size_t base_rowid;
   size_t row_stride;

-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src,
-              common::Span<size_t> row_indexes)
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, common::Span<size_t> row_indexes)
       : cbw{dst->NumSymbols()},
-        dst_data_d{dst->gidx_buffer.DevicePointer()},
-        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
         row_indexes(row_indexes),
         base_rowid{src->base_rowid},
         row_stride{src->row_stride} {}

-  __device__ void operator()(size_t row_id) {
+  __device__ void operator()(bst_idx_t row_id) {
     size_t src_row = base_rowid + row_id;
     size_t dst_row = row_indexes[src_row];
-    if (dst_row == SIZE_MAX) return;
+    if (dst_row == SIZE_MAX) {
+      return;
+    }
     size_t dst_offset = dst_row * row_stride;
     size_t src_offset = row_id * row_stride;
     for (size_t j = 0; j < row_stride; j++) {
-      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
-                            dst_offset + j);
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
     }
   }
 };

@@ -467,28 +458,22 @@ void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page,
   CHECK_EQ(row_stride, page->row_stride);
   CHECK_EQ(NumSymbols(), page->NumSymbols());
   CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
-  gidx_buffer.SetDevice(ctx->Device());
-  page->gidx_buffer.SetDevice(ctx->Device());
   auto cuctx = ctx->CUDACtx();
-  dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage(this, page, row_indexes));
+  dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes});
   monitor_.Stop(__func__);
 }

 // Initialize the buffer to store compressed features.
-void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
-  size_t num_symbols = NumSymbols();
+void EllpackPageImpl::InitCompressedData(Context const* ctx) {
+  monitor_.Start(__func__);
+  auto num_symbols = NumSymbols();
   // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes =
+  std::size_t compressed_size_bytes =
       common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
-  gidx_buffer.SetDevice(device);
-  // Don't call fill unnecessarily
-  if (gidx_buffer.Size() == 0) {
-    gidx_buffer.Resize(compressed_size_bytes, 0);
-  } else {
-    gidx_buffer.Resize(compressed_size_bytes, 0);
-    thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
-  }
+  auto init = static_cast<common::CompressedByteT>(0);
+  gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(ctx, compressed_size_bytes, init);
+  monitor_.Stop(__func__);
 }

 // Compress a CSR page into ELLPACK.
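The rewritten InitCompressedData drops the resize-or-fill branching on HostDeviceVector in favor of a single fixed-size device allocation that is zero-initialized exactly once. A rough approximation of what a MakeFixedVecWithCudaMalloc-style helper amounts to, assuming plain cudaMalloc plus cudaMemset stand in for the real resource-backed RefResourceView:

```cpp
#include <cuda_runtime_api.h>

#include <cstddef>
#include <cstdio>

// Allocate a fixed-size device buffer and fill it once; a stand-in for the
// RefResourceView + MakeFixedVecWithCudaMalloc pair used in the hunk above.
unsigned char* MakeFixedDeviceBuffer(std::size_t n_bytes, int init) {
  unsigned char* ptr = nullptr;
  if (cudaMalloc(reinterpret_cast<void**>(&ptr), n_bytes) != cudaSuccess) {
    return nullptr;  // the real helper reports the error through safe_cuda instead
  }
  cudaMemset(ptr, init, n_bytes);  // single fill at construction; no later resizing
  return ptr;
}

int main() {
  std::size_t compressed_size_bytes = std::size_t{1} << 20;  // stand-in for CalculateBufferSize()
  unsigned char* gidx = MakeFixedDeviceBuffer(compressed_size_bytes, 0);
  std::printf("allocated %p\n", static_cast<void*>(gidx));
  cudaFree(gidx);
}
```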
@@ -496,7 +481,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
                                         const SparsePage& row_batch,
                                         common::Span<FeatureType const> feature_types) {
   if (row_batch.Size() == 0) return;
-  unsigned int null_gidx_value = NumSymbols() - 1;
+  std::uint32_t null_gidx_value = NumSymbols() - 1;

   const auto& offset_vec = row_batch.offset.ConstHostVector();

@@ -541,13 +526,11 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
     const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
                      common::DivRoundUp(row_stride, block3.y), 1);
     auto device_accessor = GetDeviceAccessor(device);
-    dh::LaunchKernel {grid3, block3}(
-        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
-        gidx_buffer.DevicePointer(), row_ptrs.data().get(),
-        entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
-        device_accessor.feature_segments.data(), feature_types,
-        batch_row_begin, batch_nrows, row_stride,
-        null_gidx_value);
+    dh::LaunchKernel{grid3, block3}(  // NOLINT
+        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(),
+        row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
+        device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows,
+        row_stride, null_gidx_value);
   }
 }

@@ -566,26 +549,31 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,

 EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
     DeviceOrd device, common::Span<FeatureType const> feature_types) const {
-  gidx_buffer.SetDevice(device);
   return {device, cuts_, is_dense, row_stride, base_rowid, n_rows,
-          common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
-                                               NumSymbols()),
+          common::CompressedIterator<uint32_t>(gidx_buffer.data(), NumSymbols()),
           feature_types};
 }
+
 EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
+    Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
     common::Span<FeatureType const> feature_types) const {
+  h_gidx_buffer->resize(gidx_buffer.size());
+  CHECK_EQ(h_gidx_buffer->size(), gidx_buffer.size());
+  CHECK_NE(gidx_buffer.size(), 0);
+  dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(),
+                                gidx_buffer.size_bytes(), cudaMemcpyDefault, dh::DefaultStream()));
   return {DeviceOrd::CPU(), cuts_, is_dense, row_stride, base_rowid, n_rows,
-          common::CompressedIterator<uint32_t>(gidx_buffer.ConstHostPointer(), NumSymbols()),
+          common::CompressedIterator<uint32_t>(h_gidx_buffer->data(), NumSymbols()),
           feature_types};
 }
 }  // namespace xgboost
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index d1f9472df4c4..18b9384afbd7 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -1,23 +1,25 @@
 /**
- * Copyright 2019-2023, XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
-
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
 #define XGBOOST_DATA_ELLPACK_PAGE_CUH_

 #include
-#include

 #include "../common/categorical.h"
 #include "../common/compressed_iterator.h"
 #include "../common/device_helpers.cuh"
 #include "../common/hist_util.h"
+#include "../common/ref_resource_view.h"  // for RefResourceView
 #include "ellpack_page.h"
+#include "xgboost/data.h"

 namespace xgboost {
-/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
- * device. Does not own underlying memory and may be trivially copied into
- * kernels.*/
+/**
+ * @brief Struct for accessing and manipulating an ELLPACK matrix on the device.
+ *
+ * Does not own underlying memory and may be trivially copied into kernels.
+ */
 struct EllpackDeviceAccessor {
   /*! \brief Whether or not the matrix is dense.
 */
   bool is_dense;
@@ -128,31 +130,31 @@ class GHistIndexMatrix;

 class EllpackPageImpl {
  public:
-  /*!
-   * \brief Default constructor.
+  /**
+   * @brief Default constructor.
    *
    * This is used in the external memory case. An empty ELLPACK page is constructed with its content
    * set later by the reader.
    */
   EllpackPageImpl() = default;

-  /*!
-   * \brief Constructor from an existing EllpackInfo.
+  /**
+   * @brief Constructor from an existing EllpackInfo.
    *
-   * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
-   * and the given number of rows.
+   * This is used in the sampling case. The ELLPACK page is constructed from an existing
+   * Ellpack page and the given number of rows.
    */
-  EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts> cuts,
+  EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts> cuts,
                   bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows);

-  /*!
-   * \brief Constructor used for external memory.
+  /**
+   * @brief Constructor used for external memory.
    */
-  EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts> cuts,
+  EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts> cuts,
                   const SparsePage& page, bool is_dense, size_t row_stride,
                   common::Span<FeatureType const> feature_types);

-  /*!
-   * \brief Constructor from an existing DMatrix.
+  /**
+   * @brief Constructor from an existing DMatrix.
    *
    * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
    * in CSR format.
@@ -160,37 +162,39 @@ class EllpackPageImpl {
   explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);

   template <typename AdapterBatch>
-  explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
+  explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense,
                            common::Span<size_t> row_counts_span,
                            common::Span<FeatureType const> feature_types, size_t row_stride,
                            size_t n_rows, std::shared_ptr<common::HistogramCuts> cuts);

   /**
-   * \brief Constructor from an existing CPU gradient index.
+   * @brief Constructor from an existing CPU gradient index.
    */
   explicit EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page,
                            common::Span<FeatureType const> ft);

-  /*! \brief Copy the elements of the given ELLPACK page into this page.
+  /**
+   * @brief Copy the elements of the given ELLPACK page into this page.
    *
-   * @param device The GPU device to use.
+   * @param ctx The GPU context.
    * @param page The ELLPACK page to copy from.
    * @param offset The number of elements to skip before copying.
    * @returns The number of elements copied.
    */
-  size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
+  bst_idx_t Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset);

-  /*! \brief Compact the given ELLPACK page into the current page.
+  /**
+   * @brief Compact the given ELLPACK page into the current page.
    *
-   * @param context The GPU context.
+   * @param ctx The GPU context.
    * @param page The ELLPACK page to compact from.
    * @param row_indexes Row indexes for the compacted page.
    */
   void Compact(Context const* ctx, EllpackPageImpl const* page,
                common::Span<size_t> row_indexes);

-  /*! \return Number of instances in the page. */
+  /** @return Number of instances in the page. */
   [[nodiscard]] bst_idx_t Size() const;

-  /*! \brief Set the base row id for this page. */
+  /** @brief Set the base row id for this page. */
   void SetBaseRowId(std::size_t row_id) { base_rowid = row_id; }

@@ -199,43 +203,54 @@ class EllpackPageImpl {
   [[nodiscard]] std::shared_ptr<common::HistogramCuts> CutsShared() const { return cuts_; }
   void SetCuts(std::shared_ptr<common::HistogramCuts> cuts) { cuts_ = cuts; }

-  /*! \return Estimation of memory cost of this page. */
+  /** @return Estimation of memory cost of this page. */
   static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts& cuts);

-  /*! \brief Return the total number of symbols (total number of bins plus 1 for
-   * not found). */
+  /**
+   * @brief Return the total number of symbols (total number of bins plus 1 for not
+   * found).
+   */
   [[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
-
+  /**
+   * @brief Get an accessor that can be passed into CUDA kernels.
+   */
   [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
       DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
+  /**
+   * @brief Get an accessor for host code.
+   */
   [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
+      Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
       common::Span<FeatureType const> feature_types = {}) const;

  private:
-  /*!
-   * \brief Compress a single page of CSR data into ELLPACK.
+  /**
+   * @brief Compress a single page of CSR data into ELLPACK.
    *
    * @param device The GPU device to use.
    * @param row_batch The CSR page.
    */
-  void CreateHistIndices(DeviceOrd device,
-                         const SparsePage& row_batch,
+  void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
                          common::Span<FeatureType const> feature_types);
-  /*!
-   * \brief Initialize the buffer to store compressed features.
+  /**
+   * @brief Initialize the buffer to store compressed features.
    */
-  void InitCompressedData(DeviceOrd device);
+  void InitCompressedData(Context const* ctx);

  public:
-  /*! \brief Whether or not if the matrix is dense. */
+  /** @brief Whether or not the matrix is dense. */
   bool is_dense;
-  /*! \brief Row length for ELLPACK. */
+  /** @brief Row length for ELLPACK. */
   bst_idx_t row_stride;
   bst_idx_t base_rowid{0};
-  bst_idx_t n_rows{};
-  /*! \brief global index of histogram, which is stored in ELLPACK format. */
-  HostDeviceVector<common::CompressedByteT> gidx_buffer;
+  bst_idx_t n_rows{0};
+  /**
+   * @brief Index of the gradient histogram, which is stored in ELLPACK format.
+   *
+   * This can be backed by various storage types.
+   */
+  common::RefResourceView<common::CompressedByteT> gidx_buffer;

 private:
  std::shared_ptr<common::HistogramCuts> cuts_;
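Since gidx_buffer is now a device-only RefResourceView rather than a mirrored HostDeviceVector, host-side readers must stage the bytes themselves, which is exactly what the h_gidx_buffer parameter of the new GetHostAccessor overload is for. A minimal sketch of that staging step, using illustrative names rather than the actual XGBoost API:

```cpp
#include <cuda_runtime_api.h>

#include <cstddef>
#include <vector>

using CompressedByteT = unsigned char;

// Stage a device-resident compressed buffer into a caller-owned host vector,
// mirroring GetHostAccessor: resize, async copy, then sync before reading.
CompressedByteT const* StageOnHost(CompressedByteT const* d_buf, std::size_t n_bytes,
                                   std::vector<CompressedByteT>* h_staging) {
  h_staging->resize(n_bytes);
  cudaMemcpyAsync(h_staging->data(), d_buf, n_bytes, cudaMemcpyDefault);
  cudaStreamSynchronize(nullptr);  // wait on the (legacy) default stream
  return h_staging->data();
}

int main() {
  CompressedByteT* d_buf = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&d_buf), 64);
  std::vector<CompressedByteT> h_staging;
  StageOnHost(d_buf, 64, &h_staging);  // h_staging now holds a host copy
  cudaFree(d_buf);
}
```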
diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu
index 059dd9f213a5..86d1ac6da7eb 100644
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -4,11 +4,13 @@
 #include
 #include <cstddef>  // for size_t
-#include <cstdint>  // for uint64_t
+#include <vector>   // for vector

-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
-#include "ellpack_page.cuh"               // for EllpackPage
+#include "../common/cuda_rt_utils.h"
+#include "../common/io.h"                   // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
+#include "../common/ref_resource_view.h"    // for ReadVec, WriteVec
+#include "ellpack_page.cuh"                 // for EllpackPage
 #include "ellpack_page_raw_format.h"
 #include "ellpack_page_source.h"

@@ -16,8 +18,12 @@ namespace xgboost::data {
 DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);

 namespace {
+// Function to support systems without HMM or ATS
 template <typename T>
-[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, HostDeviceVector<T>* vec) {
+[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
+                                 common::RefResourceView<T>* vec) {
+  xgboost_NVTX_FN_RANGE();
+
   std::uint64_t n{0};
   if (!fi->Read(&n)) {
     return false;
@@ -33,82 +39,82 @@ template <typename T>
     return false;
   }

-  vec->Resize(n);
-  auto d_vec = vec->DeviceSpan();
-  dh::safe_cuda(
-      cudaMemcpyAsync(d_vec.data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
+  auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  *vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
+  dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
   return true;
 }
 }  // namespace

+#define RET_IF_NOT(expr) \
+  if (!(expr)) {         \
+    return false;        \
+  }
+
 [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
                                               common::AlignedResourceReadStream* fi) {
+  xgboost_NVTX_FN_RANGE();
   auto* impl = page->Impl();
+
   impl->SetCuts(this->cuts_);
-  if (!fi->Read(&impl->n_rows)) {
-    return false;
-  }
-  if (!fi->Read(&impl->is_dense)) {
-    return false;
-  }
-  if (!fi->Read(&impl->row_stride)) {
-    return false;
-  }
-  impl->gidx_buffer.SetDevice(device_);
-  if (!ReadDeviceVec(fi, &impl->gidx_buffer)) {
-    return false;
-  }
-  if (!fi->Read(&impl->base_rowid)) {
-    return false;
+  RET_IF_NOT(fi->Read(&impl->n_rows));
+  RET_IF_NOT(fi->Read(&impl->is_dense));
+  RET_IF_NOT(fi->Read(&impl->row_stride));
+
+  if (has_hmm_ats_) {
+    RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
+  } else {
+    RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
   }
+  RET_IF_NOT(fi->Read(&impl->base_rowid));
+  dh::DefaultStream().Sync();
   return true;
 }

 [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
                                                       common::AlignedFileWriteStream* fo) {
+  xgboost_NVTX_FN_RANGE();
+
   std::size_t bytes{0};
   auto* impl = page.Impl();
   bytes += fo->Write(impl->n_rows);
   bytes += fo->Write(impl->is_dense);
   bytes += fo->Write(impl->row_stride);
-  CHECK(!impl->gidx_buffer.ConstHostVector().empty());
-  bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
+  bytes += common::WriteVec(fo, h_gidx_buffer);
   bytes += fo->Write(impl->base_rowid);
   dh::DefaultStream().Sync();
   return bytes;
 }

 [[nodiscard]]
bool EllpackPageRawFormat::Read(EllpackPage* page, EllpackHostCacheStream* fi) const { + xgboost_NVTX_FN_RANGE(); + auto* impl = page->Impl(); CHECK(this->cuts_->cut_values_.DeviceCanRead()); impl->SetCuts(this->cuts_); - if (!fi->Read(&impl->n_rows)) { - return false; - } - if (!fi->Read(&impl->is_dense)) { - return false; - } - if (!fi->Read(&impl->row_stride)) { - return false; - } - // Read vec - bst_idx_t n{0}; - if (!fi->Read(&n)) { - return false; - } - if (n != 0) { - impl->gidx_buffer.SetDevice(device_); - impl->gidx_buffer.Resize(n); - auto span = impl->gidx_buffer.DeviceSpan(); - if (!fi->Read(span.data(), span.size_bytes())) { - return false; + // Read vector + Context ctx = Context{}.MakeCUDA(common::CurrentDevice()); + auto read_vec = [&] { + common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}}; + bst_idx_t n{0}; + RET_IF_NOT(fi->Read(&n)); + if (n == 0) { + return true; } - } + impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc(&ctx, n); + RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes())); + return true; + }; + RET_IF_NOT(read_vec()); - if (!fi->Read(&impl->base_rowid)) { - return false; - } + RET_IF_NOT(fi->Read(&impl->n_rows)); + RET_IF_NOT(fi->Read(&impl->is_dense)); + RET_IF_NOT(fi->Read(&impl->row_stride)); + RET_IF_NOT(fi->Read(&impl->base_rowid)); dh::DefaultStream().Sync(); return true; @@ -116,23 +122,32 @@ template [[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page, EllpackHostCacheStream* fo) const { + xgboost_NVTX_FN_RANGE(); + bst_idx_t bytes{0}; auto* impl = page.Impl(); - bytes += fo->Write(impl->n_rows); - bytes += fo->Write(impl->is_dense); - bytes += fo->Write(impl->row_stride); // Write vector - bst_idx_t n = impl->gidx_buffer.Size(); - bytes += fo->Write(n); + auto write_vec = [&] { + common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}}; + bst_idx_t n = impl->gidx_buffer.size(); + bytes += fo->Write(n); - if (!impl->gidx_buffer.Empty()) { - auto span = impl->gidx_buffer.ConstDeviceSpan(); - bytes += fo->Write(span.data(), span.size_bytes()); - } + if (!impl->gidx_buffer.empty()) { + bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()); + } + }; + + write_vec(); + + bytes += fo->Write(impl->n_rows); + bytes += fo->Write(impl->is_dense); + bytes += fo->Write(impl->row_stride); bytes += fo->Write(impl->base_rowid); dh::DefaultStream().Sync(); return bytes; } + +#undef RET_IF_NOT } // namespace xgboost::data diff --git a/src/data/ellpack_page_raw_format.h b/src/data/ellpack_page_raw_format.h index 8c3f89f0c0b3..e2761c73f27c 100644 --- a/src/data/ellpack_page_raw_format.h +++ b/src/data/ellpack_page_raw_format.h @@ -26,10 +26,13 @@ class EllpackHostCacheStream; class EllpackPageRawFormat : public SparsePageFormat { std::shared_ptr cuts_; DeviceOrd device_; + // Supports CUDA HMM or ATS + bool has_hmm_ats_{false}; public: - explicit EllpackPageRawFormat(std::shared_ptr cuts, DeviceOrd device) - : cuts_{std::move(cuts)}, device_{device} {} + explicit EllpackPageRawFormat(std::shared_ptr cuts, DeviceOrd device, + bool has_hmm_ats) + : cuts_{std::move(cuts)}, device_{device}, has_hmm_ats_{has_hmm_ats} {} [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override; [[nodiscard]] std::size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override; diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 
f53ae3ef1e2f..a70d9150ca47 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -11,6 +11,7 @@ #include "../common/common.h" // for safe_cuda #include "../common/cuda_pinned_allocator.h" // for pinned_allocator #include "../common/device_helpers.cuh" // for CUDAStreamView, DefaultStream +#include "../common/resource.cuh" // for PrivateCudaMmapConstStream #include "ellpack_page.cuh" // for EllpackPageImpl #include "ellpack_page.h" // for EllpackPage #include "ellpack_page_source.h" @@ -86,16 +87,16 @@ void EllpackHostCacheStream::Seek(bst_idx_t offset_bytes) { this->p_impl_->Seek( void EllpackHostCacheStream::Bound(bst_idx_t offset_bytes) { this->p_impl_->Bound(offset_bytes); } /** - * EllpackFormatType + * EllpackCacheStreamPolicy */ template typename F> -EllpackFormatStreamPolicy::EllpackFormatStreamPolicy() +EllpackCacheStreamPolicy::EllpackCacheStreamPolicy() : p_cache_{std::make_shared()} {} template typename F> -[[nodiscard]] std::unique_ptr::WriterT> -EllpackFormatStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { +[[nodiscard]] std::unique_ptr::WriterT> +EllpackCacheStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { auto fo = std::make_unique(this->p_cache_); if (iter == 0) { CHECK(this->p_cache_->cache.empty()); @@ -106,9 +107,8 @@ EllpackFormatStreamPolicy::CreateWriter(StringView, std::uint32_t iter) { } template typename F> -[[nodiscard]] std::unique_ptr::ReaderT> -EllpackFormatStreamPolicy::CreateReader(StringView, bst_idx_t offset, - bst_idx_t length) const { +[[nodiscard]] std::unique_ptr::ReaderT> +EllpackCacheStreamPolicy::CreateReader(StringView, bst_idx_t offset, bst_idx_t length) const { auto fi = std::make_unique(this->p_cache_); fi->Seek(offset); fi->Bound(offset + length); @@ -117,18 +117,40 @@ EllpackFormatStreamPolicy::CreateReader(StringView, bst_idx_t offset, } // Instantiation -template EllpackFormatStreamPolicy::EllpackFormatStreamPolicy(); +template EllpackCacheStreamPolicy::EllpackCacheStreamPolicy(); template std::unique_ptr< - typename EllpackFormatStreamPolicy::WriterT> -EllpackFormatStreamPolicy::CreateWriter(StringView name, - std::uint32_t iter); + typename EllpackCacheStreamPolicy::WriterT> +EllpackCacheStreamPolicy::CreateWriter(StringView name, + std::uint32_t iter); template std::unique_ptr< - typename EllpackFormatStreamPolicy::ReaderT> -EllpackFormatStreamPolicy::CreateReader( + typename EllpackCacheStreamPolicy::ReaderT> +EllpackCacheStreamPolicy::CreateReader( StringView name, std::uint64_t offset, std::uint64_t length) const; +/** + * EllpackMmapStreamPolicy + */ + +template typename F> +[[nodiscard]] std::unique_ptr::ReaderT> +EllpackMmapStreamPolicy::CreateReader(StringView name, bst_idx_t offset, + bst_idx_t length) const { + if (has_hmm_) { + return std::make_unique(name, offset, length); + } else { + return std::make_unique(name, offset, length); + } +} + +// Instantiation +template std::unique_ptr< + typename EllpackMmapStreamPolicy::ReaderT> +EllpackMmapStreamPolicy::CreateReader(StringView name, + bst_idx_t offset, + bst_idx_t length) const; + /** * EllpackPageSourceImpl */ @@ -146,8 +168,8 @@ void EllpackPageSourceImpl::Fetch() { auto const& csr = this->source_->Page(); this->page_.reset(new EllpackPage{}); auto* impl = this->page_->Impl(); - *impl = EllpackPageImpl{this->Device(), this->GetCuts(), *csr, - is_dense_, row_stride_, feature_types_}; + Context ctx = Context{}.MakeCUDA(this->Device().ordinal); + *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, 
feature_types_}; this->page_->SetBaseRowId(csr->base_rowid); this->WriteCache(); } @@ -157,5 +179,7 @@ void EllpackPageSourceImpl::Fetch() { template void EllpackPageSourceImpl>::Fetch(); template void -EllpackPageSourceImpl>::Fetch(); +EllpackPageSourceImpl>::Fetch(); +template void +EllpackPageSourceImpl>::Fetch(); } // namespace xgboost::data diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 7f50899b974d..1436f9151cf1 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -9,6 +9,7 @@ #include // for shared_ptr #include // for move +#include "../common/cuda_rt_utils.h" // for SupportsPageableMem #include "../common/hist_util.h" // for HistogramCuts #include "ellpack_page.h" // for EllpackPage #include "ellpack_page_raw_format.h" // for EllpackPageRawFormat @@ -59,14 +60,19 @@ template class EllpackFormatPolicy { std::shared_ptr cuts_{nullptr}; DeviceOrd device_; + bool has_hmm_{common::SupportsPageableMem()}; public: using FormatT = EllpackPageRawFormat; public: + EllpackFormatPolicy() = default; + // For testing with the HMM flag. + explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {} + [[nodiscard]] auto CreatePageFormat() const { CHECK_EQ(cuts_->cut_values_.Device(), device_); - std::unique_ptr fmt{new EllpackPageRawFormat{cuts_, device_}}; + std::unique_ptr fmt{new EllpackPageRawFormat{cuts_, device_, has_hmm_}}; return fmt; } @@ -83,7 +89,7 @@ class EllpackFormatPolicy { }; template typename F> -class EllpackFormatStreamPolicy : public F { +class EllpackCacheStreamPolicy : public F { std::shared_ptr p_cache_; public: @@ -91,13 +97,42 @@ class EllpackFormatStreamPolicy : public F { using ReaderT = EllpackHostCacheStream; public: - EllpackFormatStreamPolicy(); + EllpackCacheStreamPolicy(); [[nodiscard]] std::unique_ptr CreateWriter(StringView name, std::uint32_t iter); [[nodiscard]] std::unique_ptr CreateReader(StringView name, bst_idx_t offset, bst_idx_t length) const; }; +template typename F> +class EllpackMmapStreamPolicy : public F { + bool has_hmm_{common::SupportsPageableMem()}; + + public: + using WriterT = common::AlignedFileWriteStream; + using ReaderT = common::AlignedResourceReadStream; + + public: + EllpackMmapStreamPolicy() = default; + // For testing with the HMM flag. 
+ template < + typename std::enable_if_t, EllpackFormatPolicy>>* = nullptr> + explicit EllpackMmapStreamPolicy(bool has_hmm) : F{has_hmm}, has_hmm_{has_hmm} {} + + [[nodiscard]] std::unique_ptr CreateWriter(StringView name, std::uint32_t iter) { + std::unique_ptr fo; + if (iter == 0) { + fo = std::make_unique(name, "wb"); + } else { + fo = std::make_unique(name, "ab"); + } + return fo; + } + + [[nodiscard]] std::unique_ptr CreateReader(StringView name, bst_idx_t offset, + bst_idx_t length) const; +}; + template class EllpackPageSourceImpl : public PageSourceIncMixIn { using Super = PageSourceIncMixIn; @@ -128,11 +163,11 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn { // Cache to host using EllpackPageHostSource = - EllpackPageSourceImpl>; + EllpackPageSourceImpl>; // Cache to disk using EllpackPageSource = - EllpackPageSourceImpl>; + EllpackPageSourceImpl>; #if !defined(XGBOOST_USE_CUDA) template diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu index 42018eab47e3..f8c8f8d48970 100644 --- a/src/data/gradient_index.cu +++ b/src/data/gradient_index.cu @@ -16,7 +16,8 @@ template void SetIndexData(Context const* ctx, EllpackPageImpl const* page, std::vector* p_hit_count_tloc, CompressOffset&& get_offset, GHistIndexMatrix* out) { - auto accessor = page->GetHostAccessor(); + std::vector h_gidx_buffer; + auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); common::Span index_data_span = {out->index.data(), out->index.Size()}; @@ -47,7 +48,8 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, if (page->is_dense) { std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride); } else { - auto accessor = page->GetHostAccessor(); + std::vector h_gidx_buffer; + auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { diff --git a/src/data/histogram_cut_format.h b/src/data/histogram_cut_format.h deleted file mode 100644 index d4eb81ad2849..000000000000 --- a/src/data/histogram_cut_format.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright 2021-2024, XGBoost contributors - */ -#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ -#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ - -#include // for Stream - -#include // for size_t - -#include "../common/hist_util.h" // for HistogramCuts -#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream -#include "../common/ref_resource_view.h" // for WriteVec, ReadVec - -namespace xgboost::data { -inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) { - if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) { - return false; - } - if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) { - return false; - } - if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) { - return false; - } - bool has_cat{false}; - if (!fi->Read(&has_cat)) { - return false; - } - decltype(cuts->MaxCategory()) max_cat{0}; - if (!fi->Read(&max_cat)) { - return false; - } - cuts->SetCategorical(has_cat, max_cat); - return true; -} - -inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts, - common::AlignedFileWriteStream *fo) { - std::size_t bytes = 0; - bytes += common::WriteVec(fo, cuts.Values()); - bytes += common::WriteVec(fo, cuts.Ptrs()); - bytes += common::WriteVec(fo, cuts.MinValues()); - bytes += fo->Write(cuts.HasCategorical()); - bytes += 
fo->Write(cuts.MaxCategory()); - return bytes; -} -} // namespace xgboost::data -#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 2e8da2c7e7ed..0cb32c5aa107 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -5,6 +5,7 @@ #include #include "../collective/allreduce.h" +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/hist_util.cuh" #include "batch_utils.h" // for RegenGHist #include "device_adapter.cuh" @@ -45,11 +46,17 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(¤t_device)); + auto get_ctx = [&]() { + Context d_ctx = (ctx->IsCUDA()) ? *ctx : Context{}.MakeCUDA(current_device); + CHECK(!d_ctx.IsCPU()); + return d_ctx; + }; auto get_device = [&]() { auto d = (ctx->IsCUDA()) ? ctx->Device() : DeviceOrd::CUDA(current_device); CHECK(!d.IsCPU()); return d; }; + fmat_ctx_ = get_ctx(); /** * Generate quantiles @@ -118,7 +125,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, // that case device id is invalid. ellpack_.reset(new EllpackPage); *(ellpack_->Impl()) = - EllpackPageImpl(get_device(), cuts, this->IsDense(), row_stride, accumulated_rows); + EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), row_stride, accumulated_rows); } }; @@ -142,10 +149,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, proxy->Info().feature_types.SetDevice(get_device()); auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan(); auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) { - return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span, - d_feature_types, row_stride, rows, cuts); + return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types, + row_stride, rows, cuts); }); - size_t num_elements = ellpack_->Impl()->Copy(get_device(), &new_impl, offset); + std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset); offset += num_elements; proxy->Info().num_row_ = num_rows(); diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 550631b72dc5..62b39886ed62 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -226,7 +226,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol } // An heuristic for number of pre-fetched batches. We can make it part of BatchParam // to let user adjust number of pre-fetched batches when needed. 
- std::int32_t kPrefetches = 3; + std::int32_t constexpr kPrefetches = 3; std::int32_t n_prefetches = std::min(nthreads_, kPrefetches); n_prefetches = std::max(n_prefetches, 1); std::int32_t n_prefetch_batches = std::min(static_cast(n_prefetches), n_batches_); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 71905debcb3a..2d288fa9d025 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -10,12 +10,12 @@ #include #include -#include #include #include #include "../common/common.h" -#include "../common/error_msg.h" // NoCategorical, DeprecatedFunc +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs +#include "../common/error_msg.h" // NoCategorical, DeprecatedFunc #include "../common/threading_utils.h" #include "../common/timer.h" #include "gblinear_model.h" diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9ff4abb4d00a..26c768fafea7 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by Contributors + * Copyright 2014-2024, XGBoost Contributors * \file gbtree.cc * \brief gradient boosted tree implementation. * \author Tianqi Chen @@ -10,14 +10,14 @@ #include #include // for equal -#include // for uint32_t -#include +#include // for uint32_t #include #include #include #include #include "../common/common.h" +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy #include "../common/random.h" #include "../common/threading_utils.h" @@ -347,7 +347,7 @@ void GBTree::LoadConfig(Json const& in) { // This would cause all trees to be pushed to trees_to_update // e.g. updating a model, then saving and loading it would result in an empty model tparam_.process_type = TreeProcessType::kDefault; - std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs(); + std::int32_t const n_gpus = common::AllVisibleGPUs(); auto msg = StringView{ R"( diff --git a/src/learner.cc b/src/learner.cc index 8c2e92cebae9..46abdf0bf365 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1317,7 +1317,7 @@ class LearnerImpl : public LearnerIO { std::ostringstream os; os.precision(std::numeric_limits::max_digits10); os << '[' << iter << ']' << std::setiosflags(std::ios::fixed); - if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) { + if (metrics_.empty() && !tparam_.disable_default_eval_metric) { metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_)); auto config = obj_->DefaultMetricConfig(); if (!IsA(config)) { diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 29fb6bb6a162..fe46e19ec63b 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -16,6 +16,7 @@ #include "../common/categorical.h" #include "../common/common.h" #include "../common/cuda_context.cuh" // for CUDAContext +#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../common/device_helpers.cuh" #include "../common/error_msg.h" // for InplacePredictProxy #include "../data/device_adapter.cuh" diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index dd71465df1cc..4f1f994a6f38 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -3,9 +3,6 @@ * * @brief Utilities for estimating initial score. 
*/ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) #include // cuda::par #include // thrust::make_counting_iterator diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h index 2af779f77c46..ab947a659464 100644 --- a/src/tree/fit_stump.h +++ b/src/tree/fit_stump.h @@ -1,5 +1,5 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors * * \brief Utilities for estimating initial score. */ @@ -7,18 +7,12 @@ #ifndef XGBOOST_TREE_FIT_STUMP_H_ #define XGBOOST_TREE_FIT_STUMP_H_ -#if !defined(NOMINMAX) && defined(_WIN32) -#define NOMINMAX -#endif // !defined(NOMINMAX) - #include // std::max -#include "../common/common.h" // AssertGPUSupport -#include "xgboost/base.h" // GradientPair -#include "xgboost/context.h" // Context -#include "xgboost/data.h" // MetaInfo -#include "xgboost/host_device_vector.h" // HostDeviceVector -#include "xgboost/linalg.h" // TensorView +#include "xgboost/base.h" // GradientPair +#include "xgboost/context.h" // Context +#include "xgboost/data.h" // MetaInfo +#include "xgboost/linalg.h" // TensorView namespace xgboost { namespace tree { diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index d2031ca21898..3235e9ec3ec1 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -163,14 +163,14 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx, if (!page_concatenated_) { // Concatenate all the external memory ELLPACK pages into a single in-memory page. page_.reset(nullptr); - size_t offset = 0; + bst_idx_t offset = 0; for (auto& batch : dmat->GetBatches(ctx, batch_param_)) { auto page = batch.Impl(); if (!page_) { - page_ = std::make_unique(ctx->Device(), page->CutsShared(), page->is_dense, + page_ = std::make_unique(ctx, page->CutsShared(), page->is_dense, page->row_stride, dmat->Info().num_row_); } - size_t num_elements = page_->Copy(ctx->Device(), page, offset); + bst_idx_t num_elements = page_->Copy(ctx, page, offset); offset += num_elements; } page_concatenated_ = true; @@ -228,11 +228,11 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx, first_page->CutsShared(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. - thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); + thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_)); } @@ -283,10 +283,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c // Perform Poisson sampling in place. thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), thrust::counting_iterator(0), dh::tbegin(gpair), - PoissonSampling(dh::ToSpan(threshold_), threshold_index, - RandomWeight(common::GlobalRandom()()))); + PoissonSampling{dh::ToSpan(threshold_), threshold_index, + RandomWeight(common::GlobalRandom()())}); // Count the sampled rows. 
- size_t sample_rows = + bst_idx_t sample_rows = thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero()); // Compact gradient pairs. gpair_.resize(sample_rows); @@ -302,10 +302,10 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->Device(), first_page->CutsShared(), dmat->IsDense(), - first_page->row_stride, sample_rows)); + page_.reset(new EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), + first_page->row_stride, sample_rows}); // Compact the ELLPACK pages into the single sample page. - thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); + thrust::fill(cuctx->CTP(), page_->gidx_buffer.begin(), page_->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { page_->Compact(ctx, batch.Impl(), dh::ToSpan(sample_row_index_)); } diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh index 5a57e2ae8ef8..79008b1ae572 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cuh +++ b/src/tree/gpu_hist/gradient_based_sampler.cuh @@ -1,20 +1,19 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #pragma once -#include -#include -#include +#include // for size_t -#include "../../common/device_helpers.cuh" -#include "../../data/ellpack_page.cuh" - -namespace xgboost { -namespace tree { +#include "../../common/device_vector.cuh" // for device_vector, caching_device_vector +#include "../../data/ellpack_page.cuh" // for EllpackPageImpl +#include "xgboost/base.h" // for GradientPair +#include "xgboost/data.h" // for BatchParam +#include "xgboost/span.h" // for Span +namespace xgboost::tree { struct GradientBasedSample { /*!\brief Number of sampled rows. */ - size_t sample_rows; + std::size_t sample_rows; /*!\brief Sampled rows in ELLPACK format. */ EllpackPageImpl const* page; /*!\brief Gradient pairs for the sampled rows. */ @@ -137,5 +136,4 @@ class GradientBasedSampler { common::Monitor monitor_; std::unique_ptr strategy_; }; -}; // namespace tree -}; // namespace xgboost +}; // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 862821b00b63..87c60a8bfdbc 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -5,12 +5,14 @@ #define HISTOGRAM_CUH_ #include // for unique_ptr -#include "../../common/cuda_context.cuh" // for CUDAContext -#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor -#include "feature_groups.cuh" // for FeatureGroupsAccessor -#include "xgboost/base.h" // for GradientPair, GradientPairInt64 -#include "xgboost/context.h" // for Context -#include "xgboost/span.h" // for Span +#include "../../common/cuda_context.cuh" // for CUDAContext +#include "../../common/device_helpers.cuh" // for LaunchN +#include "../../common/device_vector.cuh" // for device_vector +#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor +#include "feature_groups.cuh" // for FeatureGroupsAccessor +#include "xgboost/base.h" // for GradientPair, GradientPairInt64 +#include "xgboost/context.h" // for Context +#include "xgboost/span.h" // for Span namespace xgboost::tree { /** @@ -60,6 +62,111 @@ class GradientQuantiser { } }; +/** + * @brief Data storage for node histograms on device. 
Automatically expands. + * + * @tparam kStopGrowingSize Do not grow beyond this size + * + * @author Rory + * @date 28/07/2018 + */ +template +class DeviceHistogramStorage { + private: + using GradientSumT = GradientPairInt64; + /** @brief Map nidx to starting index of its histogram. */ + std::map nidx_map_; + // Large buffer of zeroed memory, caches histograms + dh::device_vector data_; + // If we run out of storage allocate one histogram at a time + // in overflow. Not cached, overwritten when a new histogram + // is requested + dh::device_vector overflow_; + std::map overflow_nidx_map_; + int n_bins_; + DeviceOrd device_id_; + static constexpr size_t kNumItemsInGradientSum = + sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); + static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); + + public: + // Start with about 16mb + DeviceHistogramStorage() { data_.reserve(1 << 22); } + void Init(DeviceOrd device_id, int n_bins) { + this->n_bins_ = n_bins; + this->device_id_ = device_id; + } + + void Reset(Context const* ctx) { + auto d_data = data_.data().get(); + dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(), + [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); + nidx_map_.clear(); + overflow_nidx_map_.clear(); + } + [[nodiscard]] bool HistogramExists(int nidx) const { + return nidx_map_.find(nidx) != nidx_map_.cend() || + overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); + } + [[nodiscard]] int Bins() const { return n_bins_; } + [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } + dh::device_vector& Data() { return data_; } + + void AllocateHistograms(Context const* ctx, const std::vector& new_nidxs) { + for (int nidx : new_nidxs) { + CHECK(!HistogramExists(nidx)); + } + // Number of items currently used in data + const size_t used_size = nidx_map_.size() * HistogramSize(); + const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); + if (used_size >= kStopGrowingSize) { + // Use overflow + // Delete previous entries + overflow_nidx_map_.clear(); + overflow_.resize(HistogramSize() * new_nidxs.size()); + // Zero memory + auto d_data = overflow_.data().get(); + dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(), + [=] __device__(size_t idx) { d_data[idx] = 0.0; }); + // Append new histograms + for (int nidx : new_nidxs) { + overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); + } + } else { + CHECK_GE(data_.size(), used_size); + // Expand if necessary + if (data_.size() < new_used_size) { + data_.resize(std::max(data_.size() * 2, new_used_size)); + } + // Append new histograms + for (int nidx : new_nidxs) { + nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); + } + } + + CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); + } + + /** + * \summary Return pointer to histogram memory for a given node. + * \param nidx Tree node index. + * \return hist pointer. 
+ */ + common::Span GetNodeHistogram(int nidx) { + CHECK(this->HistogramExists(nidx)); + + if (nidx_map_.find(nidx) != nidx_map_.cend()) { + // Fetch from normal cache + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return {reinterpret_cast(ptr), static_cast(n_bins_)}; + } else { + // Fetch from overflow + auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); + return {reinterpret_cast(ptr), static_cast(n_bins_)}; + } + } +}; + class DeviceHistogramBuilderImpl; class DeviceHistogramBuilder { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 19957857218d..83f84ec1f4a5 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -16,7 +16,8 @@ #include "../collective/broadcast.h" #include "../common/bitfield.h" #include "../common/categorical.h" -#include "../common/cuda_context.cuh" // CUDAContext +#include "../common/cuda_context.cuh" // for CUDAContext +#include "../common/cuda_rt_utils.h" // for CheckComputeCapability #include "../common/device_helpers.cuh" #include "../common/hist_util.h" #include "../common/random.h" // for ColumnSampler, GlobalRandom @@ -48,113 +49,6 @@ namespace xgboost::tree { DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); #endif // !defined(GTEST_TEST) -/** - * \struct DeviceHistogramStorage - * - * \summary Data storage for node histograms on device. Automatically expands. - * - * \tparam GradientSumT histogram entry type. - * \tparam kStopGrowingSize Do not grow beyond this size - * - * \author Rory - * \date 28/07/2018 - */ -template -class DeviceHistogramStorage { - private: - using GradientSumT = GradientPairInt64; - /*! \brief Map nidx to starting index of its histogram. */ - std::map nidx_map_; - // Large buffer of zeroed memory, caches histograms - dh::device_vector data_; - // If we run out of storage allocate one histogram at a time - // in overflow. 
Not cached, overwritten when a new histogram - // is requested - dh::device_vector overflow_; - std::map overflow_nidx_map_; - int n_bins_; - DeviceOrd device_id_; - static constexpr size_t kNumItemsInGradientSum = - sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); - static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); - - public: - // Start with about 16mb - DeviceHistogramStorage() { data_.reserve(1 << 22); } - void Init(DeviceOrd device_id, int n_bins) { - this->n_bins_ = n_bins; - this->device_id_ = device_id; - } - - void Reset() { - auto d_data = data_.data().get(); - dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); - nidx_map_.clear(); - overflow_nidx_map_.clear(); - } - [[nodiscard]] bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend() || - overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); - } - [[nodiscard]] int Bins() const { return n_bins_; } - [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } - dh::device_vector& Data() { return data_; } - - void AllocateHistograms(const std::vector& new_nidxs) { - for (int nidx : new_nidxs) { - CHECK(!HistogramExists(nidx)); - } - // Number of items currently used in data - const size_t used_size = nidx_map_.size() * HistogramSize(); - const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); - if (used_size >= kStopGrowingSize) { - // Use overflow - // Delete previous entries - overflow_nidx_map_.clear(); - overflow_.resize(HistogramSize() * new_nidxs.size()); - // Zero memory - auto d_data = overflow_.data().get(); - dh::LaunchN(overflow_.size(), - [=] __device__(size_t idx) { d_data[idx] = 0.0; }); - // Append new histograms - for (int nidx : new_nidxs) { - overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); - } - } else { - CHECK_GE(data_.size(), used_size); - // Expand if necessary - if (data_.size() < new_used_size) { - data_.resize(std::max(data_.size() * 2, new_used_size)); - } - // Append new histograms - for (int nidx : new_nidxs) { - nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); - } - } - - CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); - } - - /** - * \summary Return pointer to histogram memory for a given node. - * \param nidx Tree node index. - * \return hist pointer. 
- */ - common::Span GetNodeHistogram(int nidx) { - CHECK(this->HistogramExists(nidx)); - - if (nidx_map_.find(nidx) != nidx_map_.cend()) { - // Fetch from normal cache - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return {reinterpret_cast(ptr), static_cast(n_bins_)}; - } else { - // Fetch from overflow - auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); - return {reinterpret_cast(ptr), static_cast(n_bins_)}; - } - } -}; - // Manage memory for a single GPU struct GPUHistMakerDevice { private: @@ -257,7 +151,7 @@ struct GPUHistMakerDevice { // Init histogram hist.Init(ctx_->Device(), page->Cuts().TotalBins()); - hist.Reset(); + hist.Reset(ctx_); this->InitFeatureGroupsOnce(); @@ -656,7 +550,7 @@ struct GPUHistMakerDevice { all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); // Allocate the histograms // Guaranteed contiguous memory - hist.AllocateHistograms(all_new); + hist.AllocateHistograms(ctx_, all_new); for (auto nidx : hist_nidx) { this->BuildHist(nidx); @@ -747,7 +641,7 @@ struct GPUHistMakerDevice { ctx_, info_, linalg::MakeVec(reinterpret_cast(&root_sum_quantised), 2)); collective::SafeColl(rc); - hist.AllocateHistograms({kRootNIdx}); + hist.AllocateHistograms(ctx_, {kRootNIdx}); this->BuildHist(kRootNIdx); this->AllReduceHist(kRootNIdx, 1); @@ -826,7 +720,7 @@ class GPUHistMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Hist]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); - dh::CheckComputeCapability(); + common::CheckComputeCapability(); initialised_ = false; monitor_.Init("updater_gpu_hist"); @@ -852,17 +746,13 @@ class GPUHistMaker : public TreeUpdater { CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); auto gpair_hdv = gpair->Data(); // build tree - try { - std::size_t t_idx{0}; - for (xgboost::RegTree* tree : trees) { - this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); - ++t_idx; - } - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception& e) { - LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl; + std::size_t t_idx{0}; + for (xgboost::RegTree* tree : trees) { + this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); + ++t_idx; } + dh::safe_cuda(cudaGetLastError()); monitor_.Stop("Update"); } @@ -958,7 +848,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) { LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU."; } - dh::CheckComputeCapability(); + common::CheckComputeCapability(); initialised_ = false; monitor_.Init(this->Name()); diff --git a/tests/buildkite/build-manylinux2014-aarch64.sh b/tests/buildkite/build-manylinux2014-aarch64.sh deleted file mode 100644 index 802db3f66aaf..000000000000 --- a/tests/buildkite/build-manylinux2014-aarch64.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -WHEEL_TAG=manylinux2014_aarch64 -command_wrapper="tests/ci_build/ci_build.sh manylinux2014_aarch64" -python_bin="/opt/python/cp310-cp310/bin/python" - -echo "--- Build binary wheel for ${WHEEL_TAG}" -# Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -vvv . 
--wheel-dir dist/" -git checkout python-package/xgboost/core.py # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -rf python-package/dist/ -mkdir python-package/dist/ -mv -v wheelhouse/*.whl python-package/dist/ - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-manylinux2014-x86_64.sh b/tests/buildkite/build-manylinux2014-x86_64.sh deleted file mode 100644 index b00616315b8d..000000000000 --- a/tests/buildkite/build-manylinux2014-x86_64.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -WHEEL_TAG=manylinux2014_x86_64 -command_wrapper="tests/ci_build/ci_build.sh manylinux2014_x86_64" -python_bin="/opt/python/cp310-cp310/bin/python" - -echo "--- Build binary wheel for ${WHEEL_TAG}" -# Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -vvv . --wheel-dir dist/" -git checkout python-package/xgboost/core.py # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -rf python-package/dist/ -mkdir python-package/dist/ -mv -v wheelhouse/*.whl python-package/dist/ - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-manylinux2014.sh b/tests/buildkite/build-manylinux2014.sh new file mode 100755 index 000000000000..426d32b5c361 --- /dev/null +++ b/tests/buildkite/build-manylinux2014.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -euo pipefail + +if [ $# -ne 1 ]; then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch=$1 + +source tests/buildkite/conftest.sh + +WHEEL_TAG="manylinux2014_${arch}" +command_wrapper="tests/ci_build/ci_build.sh ${WHEEL_TAG}" +python_bin="/opt/python/cp310-cp310/bin/python" + +echo "--- Build binary wheel for ${WHEEL_TAG}" +# Patch to add warning about manylinux2014 variant +patch -p0 < tests/buildkite/remove_nccl_dep.patch +patch -p0 < tests/buildkite/manylinux2014_warning.patch +$command_wrapper bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . 
--wheel-dir dist/" +git checkout python-package/pyproject.toml python-package/xgboost/core.py # discard the patch + +$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${BUILDKITE_COMMIT} \ + --platform-tag ${WHEEL_TAG} +rm -rf python-package/dist/ +mkdir python-package/dist/ +mv -v wheelhouse/*.whl python-package/dist/ + +echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" +# Patch to rename pkg to xgboost-cpu +patch -p0 < tests/buildkite/remove_nccl_dep.patch +patch -p0 < tests/buildkite/cpu_only_pypkg.patch +$command_wrapper bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" +git checkout python-package/pyproject.toml # discard the patch + +$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl +$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ + --wheel-path wheelhouse/xgboost_cpu-*.whl \ + --commit-hash ${BUILDKITE_COMMIT} \ + --platform-tag ${WHEEL_TAG} +rm -v python-package/dist/xgboost_cpu-*.whl +mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ + +echo "--- Upload Python wheel" +for wheel in python-package/dist/*.whl +do + buildkite-agent artifact upload "${wheel}" +done +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + for wheel in python-package/dist/*.whl + do + aws s3 cp "${wheel}" s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress + done +fi diff --git a/tests/buildkite/cpu_only_pypkg.patch b/tests/buildkite/cpu_only_pypkg.patch new file mode 100644 index 000000000000..765ac5c098d0 --- /dev/null +++ b/tests/buildkite/cpu_only_pypkg.patch @@ -0,0 +1,55 @@ +diff --git python-package/README.rst python-package/README.rst +index 1fc0bb5a0..f1c68470b 100644 +--- python-package/README.rst ++++ python-package/README.rst +@@ -1,20 +1,15 @@ +-====================== +-XGBoost Python Package +-====================== ++================================= ++XGBoost Python Package (CPU only) ++================================= + + |PyPI version| + +-Installation +-============ ++The ``xgboost-cpu`` package provides for a minimal installation, with no support for the GPU algorithms ++or federated learning. It is provided to allow XGBoost to be installed in a space-constrained ++environments. + +-From `PyPI `_ +---------------------------------------------------- ++Note. ``xgboost-cpu`` package is only provided for x86_64 (amd64) Linux and Windows platforms. ++For other platforms, please install ``xgboost`` from https://pypi.org/project/xgboost/. + +-For a stable version, install using ``pip``:: +- +- pip install xgboost +- +-.. |PyPI version| image:: https://badge.fury.io/py/xgboost.svg +- :target: http://badge.fury.io/py/xgboost +- +-For building from source, see `build `_. ++Note. ``xgboost-cpu`` does not provide an sdist (source distribution). You may install sdist ++from https://pypi.org/project/xgboost/. 
+diff --git python-package/pyproject.toml python-package/pyproject.toml +index 46c1451c2..c5dc908d9 100644 +--- python-package/pyproject.toml ++++ python-package/pyproject.toml +@@ -6,7 +6,7 @@ backend-path = ["."] + build-backend = "packager.pep517" + + [project] +-name = "xgboost" ++name = "xgboost-cpu" + description = "XGBoost Python Package" + readme = { file = "README.rst", content-type = "text/x-rst" } + authors = [ +@@ -82,3 +82,6 @@ class-attribute-naming-style = "snake_case" + + # Allow single-letter variables + variable-rgx = "[a-zA-Z_][a-z0-9_]{0,30}$" ++ ++[tool.hatch.build.targets.wheel] ++packages = ["xgboost/"] diff --git a/tests/buildkite/manylinux2014_warning.patch b/tests/buildkite/manylinux2014_warning.patch index 692a92672d2f..679205988b7a 100644 --- a/tests/buildkite/manylinux2014_warning.patch +++ b/tests/buildkite/manylinux2014_warning.patch @@ -1,17 +1,3 @@ -diff --git python-package/pyproject.toml python-package/pyproject.toml -index a273d8c13..dee49686a 100644 ---- python-package/pyproject.toml -+++ python-package/pyproject.toml -@@ -30,8 +30,7 @@ classifiers = [ - ] - dependencies = [ - "numpy", -- "scipy", -- "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'" -+ "scipy" - ] - - [project.urls] diff --git python-package/xgboost/core.py python-package/xgboost/core.py index e8bc735e6..030972ef2 100644 --- python-package/xgboost/core.py diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index acdb71dba529..ee9637b8bd25 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -72,12 +72,12 @@ steps: agents: queue: linux-amd64-cpu - label: ":console: Build manylinux2014_x86_64 wheel" - command: "tests/buildkite/build-manylinux2014-x86_64.sh" + command: "tests/buildkite/build-manylinux2014.sh x86_64" key: build-manylinux2014-x86_64 agents: queue: linux-amd64-cpu - label: ":console: Build manylinux2014_aarch64 wheel" - command: "tests/buildkite/build-manylinux2014-aarch64.sh" + command: "tests/buildkite/build-manylinux2014.sh aarch64" key: build-manylinux2014-aarch64 agents: queue: linux-arm64-cpu diff --git a/tests/buildkite/remove_nccl_dep.patch b/tests/buildkite/remove_nccl_dep.patch new file mode 100644 index 000000000000..a2a4a5c88289 --- /dev/null +++ b/tests/buildkite/remove_nccl_dep.patch @@ -0,0 +1,14 @@ +diff --git python-package/pyproject.toml python-package/pyproject.toml +index 8835def25..46c1451c2 100644 +--- python-package/pyproject.toml ++++ python-package/pyproject.toml +@@ -30,8 +30,7 @@ classifiers = [ + ] + dependencies = [ + "numpy", +- "scipy", +- "nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'" ++ "scipy" + ] + + [project.urls] diff --git a/tests/ci_build/Dockerfile.i386 b/tests/ci_build/Dockerfile.i386 index d7c133e2aee4..a582a54020bb 100644 --- a/tests/ci_build/Dockerfile.i386 +++ b/tests/ci_build/Dockerfile.i386 @@ -1,7 +1,7 @@ FROM i386/debian:sid ENV DEBIAN_FRONTEND noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell +SHELL ["/bin/bash", "-c"] RUN \ apt-get update && \ diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/tests/ci_build/conda_env/macos_cpu_test.yml index ce9ca4b1ba7c..5bca323af5f4 100644 --- a/tests/ci_build/conda_env/macos_cpu_test.yml +++ b/tests/ci_build/conda_env/macos_cpu_test.yml @@ -37,5 +37,5 @@ dependencies: - pyspark>=3.4.0 - cloudpickle - pip: + - setuptools - sphinx_rtd_theme - - py-ubjson diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index 
66c6ce9bf24e..2430911904e8 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -15,7 +15,7 @@ #include "../../../src/collective/comm.h" #include "../../../src/collective/communicator-inl.h" // for Init, Finalize #include "../../../src/collective/tracker.h" // for GetHostAddress -#include "../../../src/common/common.h" // for AllVisibleGPUs +#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../helpers.h" // for FileExists #if defined(XGBOOST_USE_FEDERATED) diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index 95da4ef3f167..c6a8c0ab95ce 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -12,7 +12,7 @@ TEST(DeviceUVector, Basic) { std::int32_t verbosity{3}; std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); DeviceUVector<float> uvec; - uvec.Resize(12); + uvec.resize(12); auto peak = GlobalMemoryLogger().PeakMemory(); auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); ASSERT_EQ(peak, n_bytes); diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index a0aa5fa11fce..c730390c37d8 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -4,9 +4,10 @@ #include #include #include +#include +#include "../../../src/common/cuda_rt_utils.h" // for SetDevice #include "../../../src/common/device_helpers.cuh" -#include namespace xgboost::common { namespace { diff --git a/tests/cpp/common/test_ref_resource_view.cc b/tests/cpp/common/test_ref_resource_view.cc index 9ae55fdec7f4..b201f69139d3 100644 --- a/tests/cpp/common/test_ref_resource_view.cc +++ b/tests/cpp/common/test_ref_resource_view.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include @@ -16,17 +16,16 @@ TEST(RefResourceView, Basic) { std::size_t n_bytes = 1024; auto mem = std::make_shared<MallocResource>(n_bytes); { - RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem}; + RefResourceView view{static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem}; - RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float), - mem}; + RefResourceView kview{static_cast<float const*>(mem->Data()), mem->Size() / sizeof(float), mem}; ASSERT_EQ(mem.use_count(), 3); ASSERT_EQ(view.size(), n_bytes / sizeof(1024)); ASSERT_EQ(kview.size(), n_bytes / sizeof(1024)); } { - RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem, - 1.5f}; + RefResourceView view{static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem}; + std::fill_n(static_cast<float*>(mem->Data()), mem->Size() / sizeof(float), 1.5f); for (auto v : view) { ASSERT_EQ(v, 1.5f); } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 9d9687dda81b..8aab51b7202e 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -27,15 +27,15 @@ TEST(EllpackPage, EmptyDMatrix) { auto impl = page.Impl(); ASSERT_EQ(impl->row_stride, 0); ASSERT_EQ(impl->Cuts().TotalBins(), 0); - ASSERT_EQ(impl->gidx_buffer.Size(), 4); + ASSERT_EQ(impl->gidx_buffer.size(), 4); } TEST(EllpackPage, BuildGidxDense) { int constexpr kNRows = 16, kNCols = 8; - auto page = BuildEllpackPage(kNRows, kNCols); - - std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), page->NumSymbols()); + auto ctx = 
MakeCUDACtx(0); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols); + std::vector<common::CompressedByteT> h_gidx_buffer; + auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); ASSERT_EQ(page->row_stride, kNCols); @@ -58,16 +58,17 @@ 1, 4, 7, 10, 14, 16, 19, 21, }; for (size_t i = 0; i < kNRows * kNCols; ++i) { - ASSERT_EQ(solution[i], gidx[i]); + ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); } } TEST(EllpackPage, BuildGidxSparse) { int constexpr kNRows = 16, kNCols = 8; - auto page = BuildEllpackPage(kNRows, kNCols, 0.9f); + auto ctx = MakeCUDACtx(0); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f); - std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25); + std::vector<common::CompressedByteT> h_gidx_buffer; + auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); ASSERT_LE(page->row_stride, 3); @@ -78,7 +79,7 @@ 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 }; for (size_t i = 0; i < kNRows * page->row_stride; ++i) { - ASSERT_EQ(solution[i], gidx[i]); + ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); } } @@ -94,7 +95,7 @@ TEST(EllpackPage, FromCategoricalBasic) { Context ctx{MakeCUDACtx(0)}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto ellpack = EllpackPage(&ctx, m.get(), p); - auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU()); + auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); ASSERT_EQ(kCats, accessor.NumBins()); auto x_copy = x; @@ -110,13 +111,11 @@ ASSERT_EQ(h_cuts_ptr.size(), 2); ASSERT_EQ(h_cuts_values.size(), kCats); - std::vector<common::CompressedByteT> const &h_gidx_buffer = - ellpack.Impl()->gidx_buffer.HostVector(); - auto h_gidx_iter = common::CompressedIterator<uint32_t>( - h_gidx_buffer.data(), accessor.NumSymbols()); + std::vector<common::CompressedByteT> h_gidx_buffer; + auto h_accessor = ellpack.Impl()->GetHostAccessor(&ctx, &h_gidx_buffer); for (size_t i = 0; i < x.size(); ++i) { - auto bin = h_gidx_iter[i]; + auto bin = h_accessor.gidx_iter[i]; auto bin_value = h_cuts_values.at(bin); ASSERT_EQ(AsCat(x[i]), AsCat(bin_value)); } @@ -152,12 +151,12 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(FstCU(), page->CutsShared(), page->is_dense, page->row_stride, kRows); + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows); // Copy batch pages into the result page. 
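// [editor's note: added commentary, not part of the original patch] The loop below relies
// on the new Context-based copy API; judging only from these call sites, its shape is roughly
//   std::size_t Copy(Context const* ctx, EllpackPageImpl const* page, std::size_t offset);
// i.e. it returns the number of elements written, so `offset` can advance batch by batch
// into the pre-allocated `result` page. The exact signature is an assumption.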
size_t offset = 0; for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) { - size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset); + size_t num_elements = result.Copy(&ctx, batch.Impl(), offset); offset += num_elements; } @@ -171,11 +170,11 @@ EXPECT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row, + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); @@ -200,7 +199,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(ctx.Device(), page->CutsShared(), page->is_dense, page->row_stride, + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kCompactedRows); // Compact batch pages into the result page. @@ -229,14 +228,13 @@ continue; } - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), - current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, + row_d.data().get())); dh::safe_cuda(cudaDeviceSynchronize()); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, - ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row, - row_result_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row, + row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); EXPECT_EQ(row, row_result); @@ -269,16 +267,13 @@ class EllpackPageTest : public testing::TestWithParam<float> { ASSERT_EQ(from_sparse_page->base_rowid, 0); ASSERT_EQ(from_sparse_page->base_rowid, from_ghist->base_rowid); ASSERT_EQ(from_sparse_page->n_rows, from_ghist->n_rows); - ASSERT_EQ(from_sparse_page->gidx_buffer.Size(), from_ghist->gidx_buffer.Size()); - auto const& h_gidx_from_sparse = from_sparse_page->gidx_buffer.HostVector(); - auto const& h_gidx_from_ghist = from_ghist->gidx_buffer.HostVector(); + ASSERT_EQ(from_sparse_page->gidx_buffer.size(), from_ghist->gidx_buffer.size()); + std::vector<common::CompressedByteT> h_gidx_from_sparse, h_gidx_from_ghist; + auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist); + auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse); ASSERT_EQ(from_sparse_page->NumSymbols(), from_ghist->NumSymbols()); - common::CompressedIterator<uint32_t> from_ghist_it(h_gidx_from_ghist.data(), - from_ghist->NumSymbols()); - common::CompressedIterator<uint32_t> from_sparse_it(h_gidx_from_sparse.data(), - from_sparse_page->NumSymbols()); for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) { - EXPECT_EQ(from_ghist_it[i], from_sparse_it[i]); + EXPECT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); } } } diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index d5ff721f869a..b7bb5f902c6c 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -14,9 +14,8 @@ namespace 
xgboost::data { namespace { template <typename FormatStreamPolicy> -void TestEllpackPageRawFormat() { - FormatStreamPolicy policy; - +void TestEllpackPageRawFormat(FormatStreamPolicy *p_policy) { + auto &policy = *p_policy; Context ctx{MakeCUDACtx(0)}; auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; @@ -55,16 +54,30 @@ ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values()); ASSERT_EQ(loaded->base_rowid, orig->base_rowid); ASSERT_EQ(loaded->row_stride, orig->row_stride); - ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector()); + std::vector<common::CompressedByteT> h_loaded, h_orig; + [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded); + [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig); + ASSERT_EQ(h_loaded, h_orig); } } } // anonymous namespace TEST(EllpackPageRawFormat, DiskIO) { - TestEllpackPageRawFormat<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>(); + EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{false}; + TestEllpackPageRawFormat(&policy); +} + +TEST(EllpackPageRawFormat, DiskIOHmm) { + if (common::SupportsPageableMem()) { + EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true}; + TestEllpackPageRawFormat(&policy); + } else { + GTEST_SKIP_("HMM is not supported."); + } } TEST(EllpackPageRawFormat, HostIO) { - TestEllpackPageRawFormat<EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>>(); + EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy> policy; + TestEllpackPageRawFormat(&policy); } } // namespace xgboost::data diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 503cb76965e6..5fb90a5c1526 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost contributors + * Copyright 2020-2024, XGBoost contributors */ #include @@ -21,10 +21,10 @@ void TestEquivalent(float sparsity) { std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); std::unique_ptr<EllpackPageImpl> page_concatenated{new EllpackPageImpl( - ctx.Device(), first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; + &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) { auto page = batch.Impl(); - size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset); + size_t num_elements = page_concatenated->Copy(&ctx, page, offset); offset += num_elements; } auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device()); @@ -66,18 +66,15 @@ ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]); } - auto const& buffer_from_iter = page_concatenated->gidx_buffer; - auto const& buffer_from_data = ellpack.Impl()->gidx_buffer; - ASSERT_NE(buffer_from_data.Size(), 0); - - common::CompressedIterator<uint32_t> data_buf{ - buffer_from_data.ConstHostPointer(), from_data.NumSymbols()}; - common::CompressedIterator<uint32_t> data_iter{ - buffer_from_iter.ConstHostPointer(), from_iter.NumSymbols()}; + std::vector<common::CompressedByteT> buffer_from_iter, buffer_from_data; + auto data_iter = page_concatenated->GetHostAccessor(&ctx, &buffer_from_iter); + auto data_buf = ellpack.Impl()->GetHostAccessor(&ctx, &buffer_from_data); + ASSERT_NE(buffer_from_data.size(), 0); + ASSERT_NE(buffer_from_iter.size(), 0); CHECK_EQ(from_data.NumSymbols(), from_iter.NumSymbols()); CHECK_EQ(from_data.n_rows * from_data.row_stride, from_data.n_rows * from_iter.row_stride); for (size_t i = 0; i < from_data.n_rows * from_data.row_stride; ++i) { - CHECK_EQ(data_buf[i], data_iter[i]); + CHECK_EQ(data_buf.gidx_iter[i], data_iter.gidx_iter[i]); } } } @@ -97,8 +94,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) { 
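// [editor's note: added commentary, not part of the original patch] The hunks below apply
// the same migration as the EllpackPage tests: instead of reading
// page->gidx_buffer.HostVector() and hand-building a common::CompressedIterator, the test
// calls GetHostAccessor(&ctx, &h_gidx), which fills the host-side byte buffer and returns
// an accessor whose gidx_iter decodes the compressed symbols. The byte vector owns the
// storage, so it must outlive the accessor.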
for (auto& ellpack : m.GetBatches<EllpackPage>(&ctx, {})) { n_batches ++; auto impl = ellpack.Impl(); - common::CompressedIterator<uint32_t> iterator( - impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); + std::vector<common::CompressedByteT> h_gidx; + auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); auto cols = CudaArrayIterForTest::Cols(); auto rows = CudaArrayIterForTest::Rows(); @@ -111,7 +108,7 @@ for(auto i = 0ull; i < rows * cols; i++) { int column_idx = i % cols; - EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), iterator[i]); + EXPECT_EQ(impl->Cuts().SearchBin(h_data[i], column_idx), h_accessor.gidx_iter[i]); } EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); @@ -147,12 +144,12 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { *m.GetBatches<EllpackPage>(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = ellpack.Impl(); - common::CompressedIterator<uint32_t> iterator( - impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); - EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); - EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + std::vector<common::CompressedByteT> h_gidx; + auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); + EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); // null values get placed after valid values in a row - EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 327f2ba635fd..046c4eed4d80 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -154,13 +154,18 @@ TEST(SparsePageDMatrix, RetainEllpackPage) { for (auto it = begin; it != end; ++it) { iterators.push_back(it.Page()); gidx_buffers.emplace_back(); - gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size()); - gidx_buffers.back().Copy((*it).Impl()->gidx_buffer); + gidx_buffers.back().SetDevice(ctx.Device()); + gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.size()); + auto d_dst = gidx_buffers.back().DevicePointer(); + auto const& d_src = (*it).Impl()->gidx_buffer; + dh::safe_cuda(cudaMemcpyAsync(d_dst, d_src.data(), d_src.size_bytes(), cudaMemcpyDefault)); } ASSERT_GE(iterators.size(), 2); for (size_t i = 0; i < iterators.size(); ++i) { - ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector()); + std::vector<common::CompressedByteT> h_buf; + [[maybe_unused]] auto h_acc = (*iterators[i]).Impl()->GetHostAccessor(&ctx, &h_buf); + ASSERT_EQ(h_buf, gidx_buffers.at(i).HostVector()); ASSERT_EQ(iterators[i].use_count(), 1); } @@ -210,11 +215,11 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetBatches(&ctx, param)) { if (!impl_ext) { - impl_ext = std::make_unique<EllpackPageImpl>( - batch.Impl()->gidx_buffer.Device(), batch.Impl()->CutsShared(), batch.Impl()->is_dense, - batch.Impl()->row_stride, kRows); + impl_ext = std::make_unique<EllpackPageImpl>(&ctx, batch.Impl()->CutsShared(), + batch.Impl()->is_dense, + batch.Impl()->row_stride, kRows); } - auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset); + auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset); offset += n_elems; } 
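// [editor's note: added commentary, not part of the original patch] With the Context-based
// constructor above, the empty destination page is built from the Context plus the first
// batch's cut matrix, rather than borrowing a device ordinal off the batch's gidx_buffer;
// the assertions below then check the concatenated external-memory page against the
// in-core page through the host accessors.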
ASSERT_EQ(impl_ext->base_rowid, 0); @@ -223,8 +228,10 @@ class TestEllpackPageExt : public ::testing::TestWithParamrow_stride, 2); ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4); - std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.HostVector()); - std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.HostVector()); + std::vector<common::CompressedByteT> buffer; + [[maybe_unused]] auto h_acc = impl->GetHostAccessor(&ctx, &buffer); + std::vector<common::CompressedByteT> buffer_ext; + [[maybe_unused]] auto h_ext_acc = impl_ext->GetHostAccessor(&ctx, &buffer_ext); ASSERT_EQ(buffer, buffer_ext); } }; diff --git a/tests/cpp/filesystem.h b/tests/cpp/filesystem.h index c8d144291b0f..fafc8c7d1bf9 100644 --- a/tests/cpp/filesystem.h +++ b/tests/cpp/filesystem.h @@ -1,13 +1,10 @@ -/*! - * Copyright (c) 2022 by XGBoost Contributors +/** + * Copyright 2022-2024, XGBoost Contributors */ #ifndef XGBOOST_TESTS_CPP_FILESYSTEM_H #define XGBOOST_TESTS_CPP_FILESYSTEM_H -// A macro used inside `windows.h` to avoid conflicts with `winsock2.h` -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif // WIN32_LEAN_AND_MEAN +#include <xgboost/windefs.h> #include "dmlc/filesystem.h" diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 9b988f9605bd..eebbaf8ef795 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -763,4 +763,6 @@ void DeleteRMMResource(RMMAllocator*) {} RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; } #endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 + +std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); } } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 2211b2d00cb2..2821a11380c8 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -21,14 +21,11 @@ #if defined(__CUDACC__) #include "../../src/collective/communicator-inl.h" // for GetRank -#include "../../src/common/common.h" // for AllVisibleGPUs +#include "../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #endif // defined(__CUDACC__) #include "filesystem.h" // dmlc::TemporaryDirectory #include "xgboost/linalg.h" -#if !defined(_OPENMP) -#include -#endif #if defined(__CUDACC__) #define DeclareUnifiedTest(name) GPU ## name @@ -529,6 +526,9 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); } inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); } +// GPU device ordinal for distributed tests +std::int32_t DistGpuIdx(); + inline auto GMockThrow(StringView msg) { return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg)); } diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index a33d6958ae3f..ff021e819821 100644 --- a/tests/cpp/histogram_helpers.h +++ b/tests/cpp/histogram_helpers.h @@ -23,7 +23,7 @@ class HistogramCutsWrapper : public common::HistogramCuts { }; } // namespace detail -inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols, +inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(Context const* ctx, int n_rows, int n_cols, bst_float sparsity = 0) { auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix(); const SparsePage& batch = *dmat->GetBatches<SparsePage>().begin(); @@ -48,7 +48,7 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols, } auto page = std::unique_ptr<EllpackPageImpl>( - new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {})); + new EllpackPageImpl(ctx, cmat, batch, dmat->IsDense(), row_stride, {})); return page; } diff --git a/tests/cpp/objective/test_aft_obj.cc b/tests/cpp/objective/test_aft_obj.cc index 972dfc53f58e..f31debb21af9 100644 --- 
a/tests/cpp/objective/test_aft_obj.cc +++ b/tests/cpp/objective/test_aft_obj.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include @@ -10,7 +10,6 @@ #include "xgboost/objective.h" #include "xgboost/logging.h" #include "../helpers.h" -#include "../../../src/common/survival_util.h" namespace xgboost::common { TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) { diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu index f3b9066133cc..31760a97f1fe 100644 --- a/tests/cpp/plugin/federated/test_federated_coll.cu +++ b/tests/cpp/plugin/federated/test_federated_coll.cu @@ -6,7 +6,7 @@ #include <xgboost/collective/result.h> // for Result #include "../../../../src/collective/allreduce.h" -#include "../../../../src/common/common.h" // for AllVisibleGPUs +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../../../../src/common/device_helpers.cuh" // for device_vector #include "../../../../src/common/type.h" // for EraseType #include "../../collective/test_worker.h" // for SocketTest diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cc b/tests/cpp/plugin/federated/test_federated_comm_group.cc index 9bfbdd3ae1b4..511b3d8d11a8 100644 --- a/tests/cpp/plugin/federated/test_federated_comm_group.cc +++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc @@ -1,11 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include <xgboost/json.h> // for Json #include "../../../../src/collective/comm_group.h" -#include "../../helpers.h" +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "test_worker.h" namespace xgboost::collective { diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cu b/tests/cpp/plugin/federated/test_federated_comm_group.cu index 747adb6fd87e..c6fd8921c0bb 100644 --- a/tests/cpp/plugin/federated/test_federated_comm_group.cu +++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu @@ -1,10 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include <xgboost/json.h> // for Json #include "../../../../src/collective/comm_group.h" +#include "../../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs #include "../../helpers.h" #include "test_worker.h" diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu index 7684ff4672cd..0776980353f1 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include <xgboost/base.h> // for Args @@ -8,7 +8,7 @@ #include <string> // for string, to_string -#include "../../src/common/common.h" // for AllVisibleGPUs +#include "../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs namespace xgboost { namespace { diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 9a0304f87d58..85bea39c5f5c 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include @@ -102,19 +102,17 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer()); EXPECT_EQ(sampled_page->n_rows, kRows); - std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.HostVector()); - 
common::CompressedIterator<common::CompressedByteT> - ci(buffer.data(), sampled_page->NumSymbols()); + std::vector<common::CompressedByteT> h_gidx_buffer; + auto h_accessor = sampled_page->GetHostAccessor(&ctx, &h_gidx_buffer); - size_t offset = 0; + std::size_t offset = 0; for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) { auto page = batch.Impl(); - std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector()); - common::CompressedIterator<common::CompressedByteT> - page_ci(page_buffer.data(), page->NumSymbols()); + std::vector<common::CompressedByteT> h_page_gidx_buffer; + auto page_accessor = page->GetHostAccessor(&ctx, &h_page_gidx_buffer); size_t num_elements = page->n_rows * page->row_stride; for (size_t i = 0; i < num_elements; i++) { - EXPECT_EQ(ci[i + offset], page_ci[i]); + EXPECT_EQ(h_accessor.gidx_iter[i + offset], page_accessor.gidx_iter[i]); } offset += num_elements; } diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index d1128446617b..c9320f616983 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -14,6 +14,46 @@ #include "../../helpers.h" namespace xgboost::tree { +TEST(Histogram, DeviceHistogramStorage) { + // Ensures that node allocates correctly after reaching `kStopGrowingSize`. + auto ctx = MakeCUDACtx(0); + constexpr size_t kNBins = 128; + constexpr int kNNodes = 4; + constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; + DeviceHistogramStorage<kStopGrowing> histogram; + histogram.Init(FstCU(), kNBins); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms(&ctx, {i}); + } + histogram.Reset(&ctx); + ASSERT_EQ(histogram.Data().size(), kStopGrowing); + + // Use allocated memory but do not erase nidx_map. + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms(&ctx, {i}); + } + for (int i = 0; i < kNNodes; ++i) { + ASSERT_TRUE(histogram.HistogramExists(i)); + } + + // Add two new nodes + histogram.AllocateHistograms(&ctx, {kNNodes}); + histogram.AllocateHistograms(&ctx, {kNNodes + 1}); + + // Old cached nodes should still exist + for (int i = 0; i < kNNodes; ++i) { + ASSERT_TRUE(histogram.HistogramExists(i)); + } + + // Should be deleted + ASSERT_FALSE(histogram.HistogramExists(kNNodes)); + // Most recent node should exist + ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1)); + + // Add same node again - should fail + EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1});); +} + void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; @@ -328,8 +368,7 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamGetBatches()) { concat.Push(page); } - EllpackPageImpl page{ - ctx.Device(), cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; + EllpackPageImpl page{&ctx, cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; auto ridx = partitioner.GetRows(0); auto d_histogram = dh::ToSpan(single_hist); DeviceHistogramBuilder builder; diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index 8f28bfa218c8..83e9243a2fa4 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -4,10 +4,12 @@ #include #include "../../../src/tree/common_row_partitioner.h" +#include "../../../src/tree/param.h" // for TrainParam #include "../collective/test_worker.h" // for TestDistributedGlobal #include "../helpers.h" #include "test_column_split.h" // for TestColumnSplit #include "test_partitioner.h" +#include "xgboost/tree_model.h" // for RegTree namespace 
xgboost::tree { namespace { @@ -76,6 +78,53 @@ TEST(Approx, Partitioner) { } } +TEST(Approx, InteractionConstraint) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.6f, false); + Context ctx; + + linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); + + ObjInfo task{ObjInfo::kRegression}; + { + // With constraints + RegTree tree{1, kCols}; + + std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; + TrainParam param; + param.UpdateAllowUnknown( + Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}}); + std::vector<HostDeviceVector<bst_node_t>> position(1); + updater->Configure(Args{}); + updater->Update(&param, &gpair, p_dmat.get(), position, {&tree}); + + ASSERT_EQ(tree.NumExtraNodes(), 4); + ASSERT_EQ(tree[0].SplitIndex(), 1); + + ASSERT_EQ(tree[tree[0].LeftChild()].SplitIndex(), 0); + ASSERT_EQ(tree[tree[0].RightChild()].SplitIndex(), 0); + } + { + // Without constraints + RegTree tree{1u, kCols}; + + std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; + std::vector<HostDeviceVector<bst_node_t>> position(1); + TrainParam param; + param.Init(Args{}); + updater->Configure(Args{}); + updater->Update(&param, &gpair, p_dmat.get(), position, {&tree}); + + ASSERT_EQ(tree.NumExtraNodes(), 10); + ASSERT_EQ(tree[0].SplitIndex(), 1); + + ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0); + ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0); + } +} + namespace { void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared_ptr<DMatrix> Xy, std::vector<float>* hess, float min_value, float mid_value, diff --git a/tests/cpp/tree/test_column_split.h b/tests/cpp/tree/test_column_split.h index b03597f38681..eba452a15a1c 100644 --- a/tests/cpp/tree/test_column_split.h +++ b/tests/cpp/tree/test_column_split.h @@ -23,9 +23,13 @@ inline std::shared_ptr<DMatrix> GenerateCatDMatrix(std::size_t rows, std::size_t for (size_t i = 0; i < ft.size(); ++i) { ft[i] = (i % 3 == 0) ? 
FeatureType::kNumerical : FeatureType::kCategorical; } - return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix(); + return RandomDataGenerator(rows, cols, sparsity) + .Seed(3) + .Type(ft) + .MaxCategory(17) + .GenerateDMatrix(); } else { - return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix(); + return RandomDataGenerator{rows, cols, sparsity}.Seed(3).GenerateDMatrix(); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 200fb39fb4e9..edd129353bdf 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -6,14 +6,14 @@ #include #include -#include #include #include "../../../src/common/common.h" -#include "../../../src/data/ellpack_page.cuh" // for EllpackPageImpl -#include "../../../src/data/ellpack_page.h" // for EllpackPage -#include "../../../src/tree/param.h" // for TrainParam +#include "../../../src/data/ellpack_page.cuh" // for EllpackPageImpl +#include "../../../src/data/ellpack_page.h" // for EllpackPage +#include "../../../src/tree/gpu_hist/histogram.cuh" // for DeviceHistogramStorage +#include "../../../src/tree/param.h" // for TrainParam #include "../../../src/tree/updater_gpu_hist.cu" #include "../collective/test_worker.h" // for BaseMGPUTest #include "../filesystem.h" // dmlc::TemporaryDirectory @@ -23,46 +23,6 @@ #include "xgboost/json.h" namespace xgboost::tree { -TEST(GpuHist, DeviceHistogramStorage) { - // Ensures that node allocates correctly after reaching `kStopGrowingSize`. - dh::safe_cuda(cudaSetDevice(0)); - constexpr size_t kNBins = 128; - constexpr int kNNodes = 4; - constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogramStorage<kStopGrowing> histogram; - histogram.Init(FstCU(), kNBins); - for (int i = 0; i < kNNodes; ++i) { - histogram.AllocateHistograms({i}); - } - histogram.Reset(); - ASSERT_EQ(histogram.Data().size(), kStopGrowing); - - // Use allocated memory but do not erase nidx_map. - for (int i = 0; i < kNNodes; ++i) { - histogram.AllocateHistograms({i}); - } - for (int i = 0; i < kNNodes; ++i) { - ASSERT_TRUE(histogram.HistogramExists(i)); - } - - // Add two new nodes - histogram.AllocateHistograms({kNNodes}); - histogram.AllocateHistograms({kNNodes + 1}); - - // Old cached nodes should still exist - for (int i = 0; i < kNNodes; ++i) { - ASSERT_TRUE(histogram.HistogramExists(i)); - } - - // Should be deleted - ASSERT_FALSE(histogram.HistogramExists(kNNodes)); - // Most recent node should exist - ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1)); - - // Add same node again - should fail - EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1});); -} - std::vector<GradientPairPrecise> GetHostHistGpair() { // 24 bins, 3 bins for each feature (column). 
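// [editor's note: added commentary, not part of the original patch] Layout implied by the
// comment above: kNCols = 8 features times 3 bins each, flattened feature-major, so entry
// [f * 3 + b] holds the accumulated (grad, hess) pair for bin b of feature f.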
std::vector<GradientPairPrecise> hist_gpair = { @@ -81,6 +41,7 @@ template <typename GradientSumT> void TestBuildHist(bool use_shared_memory_histograms) { int const kNRows = 16, kNCols = 8; + Context ctx{MakeCUDACtx(0)}; TrainParam param; Args args{ @@ -89,9 +50,8 @@ }; param.Init(args); - auto page = BuildEllpackPage(kNRows, kNCols); + auto page = BuildEllpackPage(&ctx, kNRows, kNCols); BatchParam batch_param{}; - Context ctx{MakeCUDACtx(0)}; auto cs = std::make_shared<common::ColumnSampler>(0); GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols, batch_param, MetaInfo()); @@ -105,11 +65,10 @@ } gpair.SetDevice(ctx.Device()); - thrust::host_vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector()); maker.row_partitioner = std::make_unique<RowPartitioner>(&ctx, kNRows, 0); maker.hist.Init(ctx.Device(), page->Cuts().TotalBins()); - maker.hist.AllocateHistograms({0}); + maker.hist.AllocateHistograms(&ctx, {0}); maker.gpair = gpair.DeviceSpan(); maker.quantiser = std::make_unique<GradientQuantiser>(&ctx, maker.gpair, MetaInfo()); @@ -198,14 +157,12 @@ void TestHistogramIndexImpl() { auto grad = GenerateRandomGradients(kNRows); grad.SetDevice(DeviceOrd::CUDA(0)); maker->Reset(&grad, hist_maker_dmat.get(), kNCols); - std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector()); const auto &maker_ext = hist_maker_ext.maker; maker_ext->Reset(&grad, hist_maker_ext_dmat.get(), kNCols); - std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector()); ASSERT_EQ(maker->page->Cuts().TotalBins(), maker_ext->page->Cuts().TotalBins()); - ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size()); + ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size()); } TEST(GpuHist, TestHistogramIndex) { @@ -428,8 +385,8 @@ TEST(GpuHist, MaxDepth) { namespace { RegTree GetHistTree(Context const* ctx, DMatrix* dmat) { ObjInfo task{ObjInfo::kRegression}; - GPUHistMaker hist_maker{ctx, &task}; - hist_maker.Configure(Args{}); + std::unique_ptr<TreeUpdater> hist_maker{TreeUpdater::Create("grow_gpu_hist", ctx, &task)}; + hist_maker->Configure(Args{}); TrainParam param; param.UpdateAllowUnknown(Args{}); @@ -439,8 +396,8 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) { std::vector<HostDeviceVector<bst_node_t>> position(1); RegTree tree; - hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position}, - {&tree}); + hist_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position}, + {&tree}); return tree; } @@ -479,8 +436,8 @@ TEST_F(MGPUHistTest, HistColumnSplit) { namespace { RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { ObjInfo task{ObjInfo::kRegression}; - GPUGlobalApproxMaker approx_maker{ctx, &task}; - approx_maker.Configure(Args{}); + std::unique_ptr<TreeUpdater> approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)}; + approx_maker->Configure(Args{}); TrainParam param; param.UpdateAllowUnknown(Args{}); @@ -490,13 +447,13 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { std::vector<HostDeviceVector<bst_node_t>> position(1); RegTree tree; - approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position}, - {&tree}); + approx_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position}, + {&tree}); return tree; } void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) { - Context ctx(MakeCUDACtx(GPUIDX)); + auto ctx = MakeCUDACtx(DistGpuIdx()); auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); auto const world_size = collective::GetWorldSize(); diff --git 
a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc deleted file mode 100644 index 888790aa7c3c..000000000000 --- a/tests/cpp/tree/test_histmaker.cc +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Copyright 2019-2024, XGBoost Contributors - */ -#include -#include -#include - -#include "../../../src/tree/param.h" // for TrainParam -#include "../helpers.h" -#include "test_column_split.h" // for GenerateCatDMatrix - -namespace xgboost::tree { -TEST(GrowHistMaker, InteractionConstraint) { - auto constexpr kRows = 32; - auto constexpr kCols = 16; - auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.0, false); - Context ctx; - - linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); - - ObjInfo task{ObjInfo::kRegression}; - { - // With constraints - RegTree tree{1, kCols}; - - std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; - TrainParam param; - param.UpdateAllowUnknown( - Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}}); - std::vector<HostDeviceVector<bst_node_t>> position(1); - updater->Configure(Args{}); - updater->Update(&param, &gpair, p_dmat.get(), position, {&tree}); - - ASSERT_EQ(tree.NumExtraNodes(), 4); - ASSERT_EQ(tree[0].SplitIndex(), 1); - - ASSERT_EQ(tree[tree[0].LeftChild()].SplitIndex(), 0); - ASSERT_EQ(tree[tree[0].RightChild()].SplitIndex(), 0); - } - { - // Without constraints - RegTree tree{1u, kCols}; - - std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; - std::vector<HostDeviceVector<bst_node_t>> position(1); - TrainParam param; - param.Init(Args{}); - updater->Configure(Args{}); - updater->Update(&param, &gpair, p_dmat.get(), position, {&tree}); - - ASSERT_EQ(tree.NumExtraNodes(), 10); - ASSERT_EQ(tree[0].SplitIndex(), 1); - - ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0); - ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0); - } -} -} // namespace xgboost::tree
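Editor's note on the test migration that recurs throughout the hunks above: direct reads of page->gidx_buffer.HostVector() are replaced by EllpackPageImpl::GetHostAccessor, which owns the device-to-host copy of the compressed index. A minimal usage sketch, assuming only what the call sites above show (the common::CompressedByteT buffer type, the accessor's gidx_iter member, and a CUDA Context); this is illustrative, not a definitive API reference:

    // Sketch of the post-change host-access pattern used by the tests above.
    Context ctx = MakeCUDACtx(0);
    std::vector<common::CompressedByteT> h_gidx;  // backing bytes; must outlive the accessor
    auto acc = page->GetHostAccessor(&ctx, &h_gidx);
    for (std::size_t i = 0; i < page->n_rows * page->row_stride; ++i) {
      auto bin = acc.gidx_iter[i];  // decodes one compressed gidx symbol
      // ... compare `bin` against the expected bin index ...
    }

The design point carried by these hunks is that the byte vector passed to GetHostAccessor provides the storage, so the accessor stays valid only as long as that vector does; the old pattern instead aliased the page's own HostVector, which no longer exists once gidx_buffer became a device-only buffer.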