diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index 330c037d7024..79aac0f0b538 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -51,14 +51,14 @@ jobs:
id: extract_branch
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
- (matrix.os == 'windows-latest' || matrix.os == 'macos-11')
+ matrix.os == 'windows-latest'
- name: Publish artifact xgboost4j.dll to S3
run: |
cd lib/
Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
dir
- python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
+ python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read
if: |
(github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
matrix.os == 'windows-latest'
@@ -66,19 +66,6 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- - name: Publish artifact libxgboost4j.dylib to S3
- run: |
- cd lib/
- mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
- ls
- python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
- if: |
- (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
- matrix.os == 'macos-11'
- env:
- AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
- AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
-
- name: Test XGBoost4J (Core, Spark, Examples)
run: |
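For anyone reproducing the Windows upload step above outside CI: a rough boto3 equivalent is sketched below. The `sha` and `branch` values are illustrative stand-ins for `${{ github.sha }}` and the `extract_branch` output; the workflow itself shells out to `python -m awscli`, with credentials supplied via the `AWS_*` secrets.

```python
# Hedged sketch: rename the artifact to embed the commit SHA, then upload it
# to the nightly bucket with a public-read ACL, mirroring the workflow step.
import os

import boto3

sha = "0123abcd"   # stand-in for ${{ github.sha }}
branch = "master"  # stand-in for steps.extract_branch.outputs.branch
artifact = f"xgboost4j_{sha}.dll"
os.rename("xgboost4j.dll", artifact)

s3 = boto3.client("s3")  # reads credentials from AWS_* environment variables
s3.upload_file(
    Filename=artifact,
    Bucket="xgboost-nightly-builds",
    Key=f"{branch}/{artifact}",  # the libxgboost4j/ prefix is dropped above
    ExtraArgs={"ACL": "public-read"},
)
```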
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 561d327568a8..e524d2aaf7f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 2.0.1)
+project(xgboost LANGUAGES CXX C VERSION 2.0.0)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
@@ -237,11 +237,6 @@ endif (RABIT_BUILD_MPI)
add_subdirectory(${xgboost_SOURCE_DIR}/src)
target_link_libraries(objxgboost PUBLIC dmlc)
-# Link -lstdc++fs for GCC 8.x
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0")
- target_link_libraries(objxgboost PUBLIC stdc++fs)
-endif()
-
# Exports some R specific definitions and objects
if (R_LIB)
add_subdirectory(${xgboost_SOURCE_DIR}/R-package)
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index d60ff28165bc..9f8934da8e34 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
-Version: 2.0.1.1
-Date: 2023-10-12
+Version: 2.0.0.1
+Date: 2023-09-11
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),
diff --git a/R-package/configure b/R-package/configure
index 4017953347fd..19ea48a91234 100755
--- a/R-package/configure
+++ b/R-package/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for xgboost 2.0.1.
+# Generated by GNU Autoconf 2.71 for xgboost 2.0.0.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost'
-PACKAGE_VERSION='2.0.1'
-PACKAGE_STRING='xgboost 2.0.1'
+PACKAGE_VERSION='2.0.0'
+PACKAGE_STRING='xgboost 2.0.0'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures xgboost 2.0.1 to adapt to many kinds of systems.
+\`configure' configures xgboost 2.0.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of xgboost 2.0.1:";;
+ short | recursive ) echo "Configuration of xgboost 2.0.0:";;
esac
cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-xgboost configure 2.0.1
+xgboost configure 2.0.0
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by xgboost $as_me 2.0.1, which was
+It was created by xgboost $as_me 2.0.0, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by xgboost $as_me 2.0.1, which was
+This file was extended by xgboost $as_me 2.0.0, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
-xgboost config.status 2.0.1
+xgboost config.status 2.0.0
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"
diff --git a/R-package/configure.ac b/R-package/configure.ac
index 1998b4f5a6d6..1fb6ea35acc4 100644
--- a/R-package/configure.ac
+++ b/R-package/configure.ac
@@ -2,7 +2,7 @@
AC_PREREQ(2.69)
-AC_INIT([xgboost],[2.0.1],[],[xgboost],[])
+AC_INIT([xgboost],[2.0.0],[],[xgboost],[])
: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then
diff --git a/include/xgboost/version_config.h b/include/xgboost/version_config.h
index fc29fd7a52a6..8005b83919c7 100644
--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 2 /* NOLINT */
#define XGBOOST_VER_MINOR 0 /* NOLINT */
-#define XGBOOST_VER_PATCH 1 /* NOLINT */
+#define XGBOOST_VER_PATCH 0 /* NOLINT */
#endif // XGBOOST_VERSION_CONFIG_H_
diff --git a/jvm-packages/CMakeLists.txt b/jvm-packages/CMakeLists.txt
index e1d0e94e285a..247c443789c3 100644
--- a/jvm-packages/CMakeLists.txt
+++ b/jvm-packages/CMakeLists.txt
@@ -25,3 +25,4 @@ target_include_directories(xgboost4j
${PROJECT_SOURCE_DIR}/rabit/include)
set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
+target_link_libraries(xgboost4j PRIVATE ${JAVA_JVM_LIBRARY})
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 0faf52b8ee2d..80caa1320100 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@
    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <packaging>pom</packaging>
    <name>XGBoost JVM Package</name>
    <description>JVM Package for XGBoost</description>
diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
index f428f7f7f335..0ea55a462d7b 100644
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,11 +6,11 @@
    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <name>xgboost4j-example</name>
    <artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <packaging>jar</packaging>
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index 1071bf669adc..ed5ab0ce772c 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,12 +6,12 @@
    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <name>xgboost4j-flink</name>
    <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <flink-ml.version>2.2.0</flink-ml.version>
diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index b9ff1590c1fc..a51d777bd068 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -6,11 +6,11 @@
    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
    <name>xgboost4j-gpu</name>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <packaging>jar</packaging>
diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml
index bc0bf46dd252..a15f08d27c5a 100644
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -6,7 +6,7 @@
    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <name>xgboost4j-spark-gpu</name>
    <artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml
index 92e0e93d5d0f..03ef19cca8d8 100644
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <name>xgboost4j-spark</name>
    <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index 764c7f4cc90f..8d4f2c051d63 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -6,11 +6,11 @@
    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <name>xgboost4j</name>
    <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-   <version>2.0.1</version>
+   <version>2.0.0</version>
    <packaging>jar</packaging>
diff --git a/python-package/packager/nativelib.py b/python-package/packager/nativelib.py
index 9d3fec2bcc01..ff38fa11d01c 100644
--- a/python-package/packager/nativelib.py
+++ b/python-package/packager/nativelib.py
@@ -132,28 +132,16 @@ def locate_or_build_libxgboost(
if build_config.use_system_libxgboost:
# Find libxgboost from system prefix
- sys_prefix = pathlib.Path(sys.base_prefix)
- sys_prefix_candidates = [
- sys_prefix / "lib",
- # Paths possibly used on Windows
- sys_prefix / "bin",
- sys_prefix / "Library",
- sys_prefix / "Library" / "bin",
- sys_prefix / "Library" / "lib",
- ]
- sys_prefix_candidates = [
- p.expanduser().resolve() for p in sys_prefix_candidates
- ]
- for candidate_dir in sys_prefix_candidates:
- libtreelite_sys = candidate_dir / _lib_name()
- if libtreelite_sys.exists():
- logger.info("Using system XGBoost: %s", str(libtreelite_sys))
- return libtreelite_sys
- raise RuntimeError(
- f"use_system_libxgboost was specified but {_lib_name()} is "
- f"not found. Paths searched (in order): \n"
- + "\n".join([f"* {str(p)}" for p in sys_prefix_candidates])
- )
+ sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
+ libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
+ if not libxgboost_sys.exists():
+ raise RuntimeError(
+ f"use_system_libxgboost was specified but {_lib_name()} is "
+ f"not found in {libxgboost_sys.parent}"
+ )
+
+ logger.info("Using system XGBoost: %s", str(libxgboost_sys))
+ return libxgboost_sys
libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
if libxgboost is not None:
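The hunk above narrows the system-library lookup from a list of Conda-style candidate directories to a single `<sys.base_prefix>/lib` location. A minimal standalone sketch of the restored behavior, with `lib_name` as a hypothetical stand-in for the packager's private `_lib_name()` helper:

```python
# Sketch of the restored lookup, assuming per-platform library names.
import pathlib
import sys


def lib_name() -> str:
    """Hypothetical equivalent of the packager's _lib_name() helper."""
    if sys.platform == "win32":
        return "xgboost.dll"
    if sys.platform == "darwin":
        return "libxgboost.dylib"
    return "libxgboost.so"


def locate_system_libxgboost() -> pathlib.Path:
    """Look for libxgboost only under <sys.base_prefix>/lib, as in 2.0.0."""
    base = pathlib.Path(sys.base_prefix).absolute().resolve()
    libxgboost = base / "lib" / lib_name()
    if not libxgboost.exists():
        raise RuntimeError(f"{lib_name()} not found in {libxgboost.parent}")
    return libxgboost
```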
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
index 88f8823c31d6..b12eb5c7a3bd 100644
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "packager.pep517"
[project]
name = "xgboost"
-version = "2.0.1"
+version = "2.0.0"
authors = [
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION
index 38f77a65b301..227cea215648 100644
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-2.0.1
+2.0.0
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 0022a17d4299..0317fd91ada0 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -317,6 +317,7 @@ def pandas_feature_info(
) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
"""Handle feature info for pandas dataframe."""
import pandas as pd
+ from pandas.api.types import is_categorical_dtype, is_sparse
# handle feature names
if feature_names is None and meta is None:
@@ -331,10 +332,10 @@ def pandas_feature_info(
if feature_types is None and meta is None:
feature_types = []
for dtype in data.dtypes:
- if is_pd_sparse_dtype(dtype):
+ if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif (
- is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+ is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
) and enable_categorical:
feature_types.append(CAT_T)
else:
@@ -344,13 +345,18 @@ def pandas_feature_info(
def is_nullable_dtype(dtype: PandasDType) -> bool:
"""Whether dtype is a pandas nullable type."""
- from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype
+ from pandas.api.types import (
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_float_dtype,
+ is_integer_dtype,
+ )
is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
- return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)
+ return is_int or is_bool or is_float or is_categorical_dtype(dtype)
def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -365,48 +371,17 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
)
-def is_pd_cat_dtype(dtype: PandasDType) -> bool:
- """Wrapper for testing pandas category type."""
- import pandas as pd
-
- if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
- Version = pd.util.version.Version
- if Version(pd.__version__) >= Version("2.1.0"):
- from pandas import CategoricalDtype
-
- return isinstance(dtype, CategoricalDtype)
-
- from pandas.api.types import is_categorical_dtype
-
- return is_categorical_dtype(dtype)
-
-
-def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
- """Wrapper for testing pandas sparse type."""
- import pandas as pd
-
- if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
- Version = pd.util.version.Version
- if Version(pd.__version__) >= Version("2.1.0"):
- from pandas import SparseDtype
-
- return isinstance(dtype, SparseDtype)
-
- from pandas.api.types import is_sparse
-
- return is_sparse(dtype)
-
-
def pandas_cat_null(data: DataFrame) -> DataFrame:
"""Handle categorical dtype and nullable extension types from pandas."""
import pandas as pd
+ from pandas.api.types import is_categorical_dtype
# handle category codes and nullable.
cat_columns = []
nul_columns = []
# avoid an unnecessary conversion if possible
for col, dtype in zip(data.columns, data.dtypes):
- if is_pd_cat_dtype(dtype):
+ if is_categorical_dtype(dtype):
cat_columns.append(col)
elif is_pa_ext_categorical_dtype(dtype):
raise ValueError(
@@ -423,7 +398,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
transformed = data
def cat_codes(ser: pd.Series) -> pd.Series:
- if is_pd_cat_dtype(ser.dtype):
+ if is_categorical_dtype(ser.dtype):
return ser.cat.codes
assert is_pa_ext_categorical_dtype(ser.dtype)
# Not yet supported, the index is not ordered for some reason. Alternately:
@@ -479,12 +454,14 @@ def _transform_pandas_df(
meta: Optional[str] = None,
meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
+ from pandas.api.types import is_categorical_dtype, is_sparse
+
pyarrow_extension = False
for dtype in data.dtypes:
if not (
(dtype.name in _pandas_dtype_mapper)
- or is_pd_sparse_dtype(dtype)
- or (is_pd_cat_dtype(dtype) and enable_categorical)
+ or is_sparse(dtype)
+ or (is_categorical_dtype(dtype) and enable_categorical)
or is_pa_ext_dtype(dtype)
):
_invalid_dataframe_dtype(data)
@@ -538,8 +515,9 @@ def _meta_from_pandas_series(
) -> None:
"""Help transform pandas series for meta data like labels"""
data = data.values.astype("float")
+ from pandas.api.types import is_sparse
- if is_pd_sparse_dtype(getattr(data, "dtype", data)):
+ if is_sparse(data):
data = data.to_dense() # type: ignore
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
_meta_from_numpy(data, name, dtype, handle)
@@ -561,11 +539,13 @@ def _from_pandas_series(
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
+ from pandas.api.types import is_categorical_dtype
+
if (data.dtype.name not in _pandas_dtype_mapper) and not (
- is_pd_cat_dtype(data.dtype) and enable_categorical
+ is_categorical_dtype(data.dtype) and enable_categorical
):
_invalid_dataframe_dtype(data)
- if enable_categorical and is_pd_cat_dtype(data.dtype):
+ if enable_categorical and is_categorical_dtype(data.dtype):
data = data.cat.codes
return _from_numpy_array(
data.values.reshape(data.shape[0], 1).astype("float"),
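These data.py hunks replace the version-aware wrappers with direct `pandas.api.types` predicates, which pandas 2.1+ deprecates. For reference, the removed wrapper logic reassembled as a self-contained sketch (the wrappers preferred `isinstance` checks on new pandas and fell back to the old predicates otherwise):

```python
# Reassembled from the removed is_pd_cat_dtype/is_pd_sparse_dtype wrappers.
from typing import Any

import pandas as pd


def is_pd_cat_dtype(dtype: Any) -> bool:
    """Test for a pandas categorical dtype across pandas versions."""
    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
        Version = pd.util.version.Version
        if Version(pd.__version__) >= Version("2.1.0"):
            return isinstance(dtype, pd.CategoricalDtype)
    from pandas.api.types import is_categorical_dtype  # deprecated in 2.1+

    return is_categorical_dtype(dtype)


def is_pd_sparse_dtype(dtype: Any) -> bool:
    """Test for a pandas sparse dtype across pandas versions."""
    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
        Version = pd.util.version.Version
        if Version(pd.__version__) >= Version("2.1.0"):
            return isinstance(dtype, pd.SparseDtype)
    from pandas.api.types import is_sparse  # deprecated in 2.1+

    return is_sparse(dtype)
```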
diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py
index 58c78df090ae..0437f3a4ca0f 100644
--- a/python-package/xgboost/libpath.py
+++ b/python-package/xgboost/libpath.py
@@ -31,15 +31,16 @@ def find_lib_path() -> List[str]:
]
if sys.platform == "win32":
- # On Windows, Conda may install libs in different paths
- dll_path.extend(
- [
- os.path.join(sys.base_prefix, "bin"),
- os.path.join(sys.base_prefix, "Library"),
- os.path.join(sys.base_prefix, "Library", "bin"),
- os.path.join(sys.base_prefix, "Library", "lib"),
- ]
- )
+ if platform.architecture()[0] == "64bit":
+ dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/"))
+ # hack for pip installation when copy all parent source
+ # directory here
+ dll_path.append(os.path.join(curr_path, "./windows/x64/Release/"))
+ else:
+ dll_path.append(os.path.join(curr_path, "../../windows/Release/"))
+ # hack for pip installation when copy all parent source
+ # directory here
+ dll_path.append(os.path.join(curr_path, "./windows/Release/"))
dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path]
elif sys.platform.startswith(("linux", "freebsd", "emscripten")):
dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path]
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 9fe73005a073..6b1d2faaacd1 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -22,7 +22,7 @@
import numpy as np
import pandas as pd
-from pyspark import RDD, SparkContext, cloudpickle
+from pyspark import SparkContext, cloudpickle
from pyspark.ml import Estimator, Model
from pyspark.ml.functions import array_to_vector, vector_to_array
from pyspark.ml.linalg import VectorUDT
@@ -44,7 +44,6 @@
MLWritable,
MLWriter,
)
-from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
from pyspark.sql.types import (
@@ -89,7 +88,6 @@
_get_rabit_args,
_get_spark_session,
_is_local,
- _is_standalone_or_localcluster,
deserialize_booster,
deserialize_xgb_model,
get_class_name,
@@ -344,54 +342,6 @@ def _gen_predict_params_dict(self) -> Dict[str, Any]:
predict_params[param.name] = self.getOrDefault(param)
return predict_params
- def _validate_gpu_params(self) -> None:
- """Validate the gpu parameters and gpu configurations"""
-
- if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
- ss = _get_spark_session()
- sc = ss.sparkContext
-
- if _is_local(sc):
- # Support GPU training in Spark local mode is just for debugging
- # purposes, so it's okay for printing the below warning instead of
- # checking the real gpu numbers and raising the exception.
- get_logger(self.__class__.__name__).warning(
- "You have enabled GPU in spark local mode. Please make sure your"
- " local node has at least %d GPUs",
- self.getOrDefault(self.num_workers),
- )
- else:
- executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
- if executor_gpus is None:
- raise ValueError(
- "The `spark.executor.resource.gpu.amount` is required for training"
- " on GPU."
- )
-
- if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)):
- # We will enable stage-level scheduling in spark 3.4.0+ which doesn't
- # require spark.task.resource.gpu.amount to be set explicitly
- gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
- if gpu_per_task is not None:
- if float(gpu_per_task) < 1.0:
- raise ValueError(
- "XGBoost doesn't support GPU fractional configurations. "
- "Please set `spark.task.resource.gpu.amount=spark.executor"
- ".resource.gpu.amount`"
- )
-
- if float(gpu_per_task) > 1.0:
- get_logger(self.__class__.__name__).warning(
- "%s GPUs for each Spark task is configured, but each "
- "XGBoost training task uses only 1 GPU.",
- gpu_per_task,
- )
- else:
- raise ValueError(
- "The `spark.task.resource.gpu.amount` is required for training"
- " on GPU."
- )
-
def _validate_params(self) -> None:
# pylint: disable=too-many-branches
init_model = self.getOrDefault("xgb_model")
@@ -471,7 +421,53 @@ def _validate_params(self) -> None:
"`pyspark.ml.linalg.Vector` type."
)
- self._validate_gpu_params()
+ if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
+ gpu_per_task = (
+ _get_spark_session()
+ .sparkContext.getConf()
+ .get("spark.task.resource.gpu.amount")
+ )
+
+ is_local = _is_local(_get_spark_session().sparkContext)
+
+ if is_local:
+ # checking spark local mode.
+ if gpu_per_task is not None:
+ raise RuntimeError(
+ "The spark local mode does not support gpu configuration."
+ "Please remove spark.executor.resource.gpu.amount and "
+ "spark.task.resource.gpu.amount"
+ )
+
+ # Support GPU training in Spark local mode is just for debugging
+ # purposes, so it's okay for printing the below warning instead of
+ # checking the real gpu numbers and raising the exception.
+ get_logger(self.__class__.__name__).warning(
+ "You have enabled GPU in spark local mode. Please make sure your"
+ " local node has at least %d GPUs",
+ self.getOrDefault(self.num_workers),
+ )
+ else:
+ # checking spark non-local mode.
+ if gpu_per_task is not None:
+ if float(gpu_per_task) < 1.0:
+ raise ValueError(
+ "XGBoost doesn't support GPU fractional configurations. "
+ "Please set `spark.task.resource.gpu.amount=spark.executor"
+ ".resource.gpu.amount`"
+ )
+
+ if float(gpu_per_task) > 1.0:
+ get_logger(self.__class__.__name__).warning(
+ "%s GPUs for each Spark task is configured, but each "
+ "XGBoost training task uses only 1 GPU.",
+ gpu_per_task,
+ )
+ else:
+ raise ValueError(
+ "The `spark.task.resource.gpu.amount` is required for training"
+ " on GPU."
+ )
def _validate_and_convert_feature_col_as_float_col_list(
@@ -596,8 +592,6 @@ def __init__(self) -> None:
arbitrary_params_dict={},
)
- self.logger = get_logger(self.__class__.__name__)
-
def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name
"""
Set params for the estimator.
@@ -900,116 +894,6 @@ def _get_xgb_parameters(
return booster_params, train_call_kwargs_params, dmatrix_kwargs
- def _skip_stage_level_scheduling(self) -> bool:
- # pylint: disable=too-many-return-statements
- """Check if stage-level scheduling is not needed,
- return true to skip stage-level scheduling"""
-
- if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
- ss = _get_spark_session()
- sc = ss.sparkContext
-
- if ss.version < "3.4.0":
- self.logger.info(
- "Stage-level scheduling in xgboost requires spark version 3.4.0+"
- )
- return True
-
- if not _is_standalone_or_localcluster(sc):
- self.logger.info(
- "Stage-level scheduling in xgboost requires spark standalone or "
- "local-cluster mode"
- )
- return True
-
- executor_cores = sc.getConf().get("spark.executor.cores")
- executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
- if executor_cores is None or executor_gpus is None:
- self.logger.info(
- "Stage-level scheduling in xgboost requires spark.executor.cores, "
- "spark.executor.resource.gpu.amount to be set."
- )
- return True
-
- if int(executor_cores) == 1:
- # there will be only 1 task running at any time.
- self.logger.info(
- "Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
- )
- return True
-
- if int(executor_gpus) > 1:
- # For spark.executor.resource.gpu.amount > 1, we suppose user knows how to configure
- # to make xgboost run successfully.
- #
- self.logger.info(
- "Stage-level scheduling in xgboost will not work "
- "when spark.executor.resource.gpu.amount>1"
- )
- return True
-
- task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
-
- if task_gpu_amount is None:
- # The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set,
- # but with stage-level scheduling, we can make training task grab the gpu.
- return False
-
- if float(task_gpu_amount) == float(executor_gpus):
- # spark.executor.resource.gpu.amount=spark.task.resource.gpu.amount "
- # results in only 1 task running at a time, which may cause perf issue.
- return True
-
- # We can enable stage-level scheduling
- return False
-
- # CPU training doesn't require stage-level scheduling
- return True
-
- def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
- """Try to enable stage-level scheduling"""
-
- if self._skip_stage_level_scheduling():
- return rdd
-
- ss = _get_spark_session()
-
- # executor_cores will not be None
- executor_cores = ss.sparkContext.getConf().get("spark.executor.cores")
- assert executor_cores is not None
-
- # Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
- # If spark-rapids is enabled, to avoid GPU OOM, we don't allow other
- # ETL gpu tasks running alongside training tasks.
- spark_plugins = ss.conf.get("spark.plugins", " ")
- assert spark_plugins is not None
- spark_rapids_sql_enabled = ss.conf.get("spark.rapids.sql.enabled", "true")
- assert spark_rapids_sql_enabled is not None
-
- task_cores = (
- int(executor_cores)
- if "com.nvidia.spark.SQLPlugin" in spark_plugins
- and "true" == spark_rapids_sql_enabled.lower()
- else (int(executor_cores) // 2) + 1
- )
-
- # Each training task requires cpu cores > total executor cores//2 + 1 which can
- # make sure the tasks be sent to different executors.
- #
- # Please note that we can't use GPU to limit the concurrent tasks because of
- # https://issues.apache.org/jira/browse/SPARK-45527.
-
- task_gpus = 1.0
- treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
- rp = ResourceProfileBuilder().require(treqs).build
-
- self.logger.info(
- "XGBoost training tasks require the resource(cores=%s, gpu=%s).",
- task_cores,
- task_gpus,
- )
- return rdd.withResources(rp)
-
def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
# pylint: disable=too-many-statements, too-many-locals
self._validate_params()
@@ -1110,16 +994,14 @@ def _train_booster(
)
def _run_job() -> Tuple[str, str]:
- rdd = (
+ ret = (
dataset.mapInPandas(
- _train_booster, # type: ignore
- schema="config string, booster string",
+ _train_booster, schema="config string, booster string" # type: ignore
)
.rdd.barrier()
.mapPartitions(lambda x: x)
+ .collect()[0]
)
- rdd_with_resource = self._try_stage_level_scheduling(rdd)
- ret = rdd_with_resource.collect()[0]
return ret[0], ret[1]
get_logger("XGBoost-PySpark").info(
diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py
index 395865386191..66d7ca4548ca 100644
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -129,13 +129,6 @@ def _is_local(spark_context: SparkContext) -> bool:
return spark_context._jsc.sc().isLocal()
-def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
- master = spark_context.getConf().get("spark.master")
- return master is not None and (
- master.startswith("spark://") or master.startswith("local-cluster")
- )
-
-
def _get_gpu_id(task_context: TaskContext) -> int:
"""Get the gpu id from the task resources"""
if task_context is None:
diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h
index d4f82e615c6f..0fadf846dd5e 100644
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
- return Span<value_type const>{data(), size()}.size_bytes();
+ return Span{data(), size()}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
diff --git a/src/common/threading_utils.cc b/src/common/threading_utils.cc
index 5e730e96d34e..349cc0ba7348 100644
--- a/src/common/threading_utils.cc
+++ b/src/common/threading_utils.cc
@@ -3,23 +3,14 @@
*/
#include "threading_utils.h"
-#include <algorithm>   // for max
-#include <exception>   // for exception
-#include <filesystem>  // for path, exists
-#include <fstream>     // for ifstream
-#include <string>      // for string
+#include <algorithm>
+#include <fstream>

-#include "common.h"  // for DivRoundUp
+#include "xgboost/logging.h"
-namespace xgboost::common {
-/**
- * Modified from
- * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
- *
- * MIT License: Copyright (c) 2016 Domagoj Šarić
- */
-std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
- std::filesystem::path const& peroid_path) {
+namespace xgboost {
+namespace common {
+int32_t GetCfsCPUCount() noexcept {
#if defined(__linux__)
// https://bugs.openjdk.java.net/browse/JDK-8146115
// http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@@ -40,8 +31,8 @@ std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
}
};
// complete fair scheduler from Linux
- auto const cfs_quota(read_int(quota_path.c_str()));
- auto const cfs_period(read_int(peroid_path.c_str()));
+ auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"));
+ auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us"));
if ((cfs_quota > 0) && (cfs_period > 0)) {
return std::max(cfs_quota / cfs_period, 1);
}
@@ -49,47 +40,6 @@ std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
return -1;
}
-std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
- std::int32_t cnt{-1};
-#if defined(__linux__)
- namespace fs = std::filesystem;
-
- std::int32_t a{0}, b{0};
-
- auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
- try {
- std::ifstream fin{bandwidth_path, std::ios::in};
- fin >> a;
- fin >> b;
- } catch (std::exception const&) {
- warn();
- return cnt;
- }
- if (a > 0 && b > 0) {
- cnt = std::max(common::DivRoundUp(a, b), 1);
- }
-#endif // defined(__linux__)
- return cnt;
-}
-
-std::int32_t GetCfsCPUCount() noexcept {
- namespace fs = std::filesystem;
- fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
- auto has_v2 = fs::exists(bandwidth_path);
- if (has_v2) {
- return GetCGroupV2Count(bandwidth_path);
- }
-
- fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
- fs::path const peroid_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
- auto has_v1 = fs::exists(quota_path) && fs::exists(peroid_path);
- if (has_v1) {
- return GetCGroupV1Count(quota_path, peroid_path);
- }
-
- return -1;
-}
-
std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
// Don't use parallel if we are in a parallel region.
if (omp_in_parallel()) {
@@ -104,4 +54,5 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
n_threads = std::max(n_threads, 1);
return n_threads;
}
-} // namespace xgboost::common
+} // namespace common
+} // namespace xgboost
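The removed C++ extended the CFS limit detection to cgroup v2 (a single `/sys/fs/cgroup/cpu.max` file holding `quota period`), falling back to the v1 `cfs_quota_us`/`cfs_period_us` pair; the restored 2.0.0 code reads only the v1 files. A rough Python transcription of the combined logic, for reference:

```python
# Sketch of the cgroup v2 + v1 CPU-quota detection that the revert drops.
import math
import os


def cfs_cpu_count() -> int:
    """Return the CFS CPU limit, or -1 when no quota applies."""
    v2_path = "/sys/fs/cgroup/cpu.max"
    if os.path.exists(v2_path):
        with open(v2_path) as fin:
            quota_s, period_s = fin.read().split()
        if quota_s == "max":  # "max" means no quota in cgroup v2
            return -1
        quota, period = int(quota_s), int(period_s)
        if quota > 0 and period > 0:
            return max(math.ceil(quota / period), 1)  # DivRoundUp in the C++
        return -1

    quota_path = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
    period_path = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
    if os.path.exists(quota_path) and os.path.exists(period_path):
        with open(quota_path) as fin:
            quota = int(fin.read())  # -1 when unlimited
        with open(period_path) as fin:
            period = int(fin.read())
        if quota > 0 and period > 0:
            return max(quota // period, 1)
    return -1
```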
diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h
index ac71190353a7..4ca4ca0707d9 100644
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -253,6 +253,11 @@ inline std::int32_t OmpGetThreadLimit() {
* \brief Get thread limit from CFS.
*
* This function has non-trivial overhead and should not be called repeatedly.
+ *
+ * Modified from
+ * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
+ *
+ * MIT License: Copyright (c) 2016 Domagoj Šarić
*/
std::int32_t GetCfsCPUCount() noexcept;
diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7
index 98a0a70333cb..6134d49aad66 100644
--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
-FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
ARG RAPIDS_VERSION_ARG
diff --git a/tests/ci_build/Dockerfile.gpu_build_r_centos7 b/tests/ci_build/Dockerfile.gpu_build_r_centos7
index b73cf5adb0b7..6cfd30fe5f2e 100644
--- a/tests/ci_build/Dockerfile.gpu_build_r_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_r_centos7
@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
-FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
# Install all basic requirements
diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build
index 86ce7e72a4b2..d4a580495ea0 100644
--- a/tests/ci_build/Dockerfile.jvm_gpu_build
+++ b/tests/ci_build/Dockerfile.jvm_gpu_build
@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
-FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
+FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc
index f8aa9fd73ad1..8bc12698bd9d 100644
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -148,8 +148,7 @@ TEST(IO, Resource) {
fout << 1.0 << std::endl;
fout.close();
- auto resource = std::shared_ptr<MmapResource>{
-     new MmapResource{path, 0, sizeof(double)}};
+ auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
ASSERT_EQ(resource->Size(), sizeof(double));
ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
ASSERT_EQ(resource->DataAs<double>()[0], val);
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index a23a66b63a55..f8a21b6ab923 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -211,7 +211,7 @@ def test_pandas_weight(self):
y = np.random.randn(kRows)
w = np.random.uniform(size=kRows).astype(np.float32)
w_pd = pd.DataFrame(w)
- data = xgb.DMatrix(X, y, weight=w_pd)
+ data = xgb.DMatrix(X, y, w_pd)
assert data.num_row() == kRows
assert data.num_col() == kCols
@@ -301,14 +301,14 @@ def test_cv_as_pandas(self):
@pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
def test_nullable_type(self, DMatrixT) -> None:
- from xgboost.data import is_pd_cat_dtype
+ from pandas.api.types import is_categorical_dtype
for orig, df in pd_dtypes():
if hasattr(df.dtypes, "__iter__"):
- enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
+ enable_categorical = any(is_categorical_dtype(dtype) for dtype in df.dtypes)
else:
# series
- enable_categorical = is_pd_cat_dtype(df.dtype)
+ enable_categorical = is_categorical_dtype(df.dtype)
f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df