diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml
new file mode 100644
index 000000000000..4a4d65b25b61
--- /dev/null
+++ b/.github/workflows/i386.yml
@@ -0,0 +1,39 @@
+name: XGBoost-i386-test
+
+on: [push, pull_request]
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+jobs:
+ build-32bit:
+ name: Build 32-bit
+ runs-on: ubuntu-latest
+ services:
+ registry:
+ image: registry:2
+ ports:
+ - 5000:5000
+ steps:
+ - uses: actions/checkout@v2.5.0
+ with:
+ submodules: 'true'
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+ with:
+ driver-opts: network=host
+ - name: Build and push container
+ uses: docker/build-push-action@v5
+ with:
+ context: .
+ file: tests/ci_build/Dockerfile.i386
+ push: true
+ tags: localhost:5000/xgboost/build-32bit:latest
+ cache-from: type=gha
+ cache-to: type=gha,mode=max
+ - name: Build XGBoost
+ run: |
+ docker run --rm -v $PWD:/workspace -w /workspace \
+ -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \
+ localhost:5000/xgboost/build-32bit:latest \
+ tests/ci_build/build_via_cmake.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f240e806282..dbfa1cdc225b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,9 +39,6 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(FATAL_ERROR "Need Clang 9.0 or newer to build XGBoost")
endif()
endif()
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
- message(FATAL_ERROR "XGBoost does not support 32-bit archs. Please use 64-bit arch instead.")
-endif()
include(${xgboost_SOURCE_DIR}/cmake/PrefetchIntrinsics.cmake)
find_prefetch_intrinsics()
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 49f93bb57274..580d1f87325f 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -15,7 +15,6 @@ S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
S3method(setinfo,xgb.Booster)
S3method(setinfo,xgb.DMatrix)
-S3method(slice,xgb.DMatrix)
S3method(variable.names,xgb.Booster)
export("xgb.attr<-")
export("xgb.attributes<-")
@@ -30,13 +29,12 @@ export(cb.reset.parameters)
export(cb.save.model)
export(getinfo)
export(setinfo)
-export(slice)
export(xgb.DMatrix)
export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save)
+export(xgb.DataBatch)
export(xgb.DataIter)
export(xgb.ExternalDMatrix)
-export(xgb.ProxyDMatrix)
export(xgb.QuantileDMatrix)
export(xgb.QuantileDMatrix.from_iterator)
export(xgb.attr)
@@ -70,6 +68,7 @@ export(xgb.save)
export(xgb.save.raw)
export(xgb.set.config)
export(xgb.slice.Booster)
+export(xgb.slice.DMatrix)
export(xgb.train)
export(xgboost)
import(methods)
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 7613c9152d14..febefb757129 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -111,6 +111,21 @@ xgb.get.handle <- function(object) {
#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
#' type and shape of predictions are invariant to the model type.
+#' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names
+#' match (only applicable when both `object` and `newdata` have feature names).
+#'
+#' If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
+#' the columns in `newdata` to match with the booster's.
+#'
+#' If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
+#' will additionally verify that categorical columns are of the correct type in `newdata`,
+#' throwing an error if they do not match.
+#'
+#' If passing `FALSE`, it is assumed that the feature names and types are the same,
+#' and come in the same order as in the training data.
+#'
+#' Note that this check might add sizable latency to the predictions, so it's
+#' recommended to disable it for performance-sensitive applications.
#' @param ... Not used.
#'
#' @details
@@ -271,7 +286,11 @@ xgb.get.handle <- function(object) {
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
- reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
+ reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE,
+ validate_features = FALSE, ...) {
+ if (validate_features) {
+ newdata <- validate.features(object, newdata)
+ }
if (!inherits(newdata, "xgb.DMatrix")) {
nthread <- xgb.nthread(object)
newdata <- xgb.DMatrix(
@@ -418,6 +437,85 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
return(arr)
}
+validate.features <- function(bst, newdata) {
+ if (is.character(newdata)) {
+ # this will be encountered when passing file paths
+ return(newdata)
+ }
+ if (inherits(newdata, "sparseVector")) {
+ # in this case, newdata won't have metadata
+ return(newdata)
+ }
+ if (is.vector(newdata)) {
+ newdata <- as.matrix(newdata)
+ }
+
+ booster_names <- getinfo(bst, "feature_name")
+ checked_names <- FALSE
+ if (NROW(booster_names)) {
+
+ try_reorder <- FALSE
+ if (inherits(newdata, "xgb.DMatrix")) {
+ curr_names <- getinfo(newdata, "feature_name")
+ } else {
+ curr_names <- colnames(newdata)
+ try_reorder <- TRUE
+ }
+
+ if (NROW(curr_names)) {
+ checked_names <- TRUE
+
+ if (length(curr_names) != length(booster_names) || any(curr_names != booster_names)) {
+
+ if (!try_reorder) {
+ stop("Feature names in 'newdata' do not match with booster's.")
+ } else {
+ if (inherits(newdata, "data.table")) {
+ newdata <- newdata[, booster_names, with = FALSE]
+ } else {
+ newdata <- newdata[, booster_names, drop = FALSE]
+ }
+ }
+
+ }
+
+ } # if (NROW(curr_names)) {
+
+ } # if (NROW(booster_names)) {
+
+ if (inherits(newdata, c("data.frame", "xgb.DMatrix"))) {
+
+ booster_types <- getinfo(bst, "feature_type")
+ if (!NROW(booster_types)) {
+ # Note: types in the booster are optional. Other interfaces
+ # might not even save it as booster attributes for example,
+ # even if the model uses categorical features.
+ return(newdata)
+ }
+ if (inherits(newdata, "xgb.DMatrix")) {
+ curr_types <- getinfo(newdata, "feature_type")
+ if (length(curr_types) != length(booster_types) || any(curr_types != booster_types)) {
+ stop("Feature types in 'newdata' do not match with booster's.")
+ }
+ }
+ if (inherits(newdata, "data.frame")) {
+ is_factor <- sapply(newdata, is.factor)
+ if (any(is_factor != (booster_types == "c"))) {
+ stop(
+ paste0(
+ "Feature types in 'newdata' do not match with booster's for same columns (by ",
+ ifelse(checked_names, "name", "position"),
+ ")."
+ )
+ )
+ }
+ }
+
+ }
+
+ return(newdata)
+}
+
#' @title Accessors for serializable attributes of a model
#'
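
Note (illustrative sketch, not part of the patch): a minimal example of the new
`validate_features` argument to `predict.xgb.Booster`, assuming the bundled
agaricus.train data; for non-DMatrix inputs, a reversed column order is
reconciled automatically against the booster's feature names.

    library(xgboost)
    data(agaricus.train, package = "xgboost")
    dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
    model <- xgb.train(
      params = list(objective = "binary:logistic", nthread = 1),
      data = dtrain,
      nrounds = 3
    )

    # Pass columns deliberately in reverse order; with validate_features = TRUE,
    # predict() reorders them to match the booster's feature names before scoring.
    x_rev <- agaricus.train$data[, rev(colnames(agaricus.train$data))]
    p1 <- predict(model, x_rev, validate_features = TRUE)
    p2 <- predict(model, agaricus.train$data)
    all.equal(p1, p2)  # expected: TRUE
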
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index da036b952b83..edbc267c1067 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -16,10 +16,6 @@
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
#'
-#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical.
-#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error
-#' will be thrown.
-#'
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
#' encoding) will be converted inside the function call. Be aware that the encoding used for `factor`
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
@@ -32,12 +28,30 @@
#' 'xgb.QuantileDMatrix'.
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
#' as a single row (only when making predictions from a fitted model).
-#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
-#' supported for xgb.QuantileDMatrix'.
-#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
-#' \bold{not} supported for xgb.QuantileDMatrix'.
+#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
+#' the file, with an optional format specifier.
+#'
+#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
+#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
+#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
+#' `?format=libsvm` at the end of the file path. It will be the default format if not
+#' otherwise specified.
+#' \item CSV files (comma-separated values). This format can be specified by adding suffix
+#' `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
+#' }
+#'
+#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
+#' XGBoost will not look at the extension or file contents to determine that it contains comma-separated values.
+#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
+#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
+#' corresponds to the labels).
+#'
+#' For more information about passing text files as input, see the articles
+#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
+#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
#' }
-#' @param label Label of the training data.
+#' @param label Label of the training data. For classification problems, the labels should be passed
+#' encoded as integers numbered starting at zero.
#' @param weight Weight for each instance.
#'
#' Note that, for ranking task, weights are per-group. In ranking task, one weight
@@ -59,7 +73,7 @@
#' must be the same as in the DMatrix construction, regardless of the column names.
#' @param feature_types Set types for features.
#'
-#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced
+#' If `data` is a `data.frame` and `feature_types` is not supplied, the feature types will be deduced
#' automatically from the column types.
#'
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
@@ -73,24 +87,24 @@
#' Note that, while categorical types are treated differently from the rest for model fitting
#' purposes, the other types do not influence the generated model, but have effects in other
#' functionalities such as feature importances.
+#'
+#' \bold{Important}: categorical features, if specified manually through `feature_types`, must
+#' be encoded as integers numbered starting at zero, and the same encoding needs to be
+#' applied when passing data to `predict`. Even if passing `factor` types, the encoding will
+#' not be saved, so make sure that `factor` columns passed to `predict` have the same `levels`.
#' @param nthread Number of threads used for creating DMatrix.
#' @param group Group size for all ranking group.
#' @param qid Query ID for data samples, used for ranking.
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
-#' @param enable_categorical Experimental support of specializing for categorical features.
-#'
-#' If passing 'TRUE' and 'data' is a data frame,
-#' columns of categorical types will automatically
-#' be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-#'
-#' If passing 'FALSE' and 'data' is a data frame with categorical columns,
-#' it will result in an error being thrown.
+#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
+#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
#'
-#' If 'data' is not a data frame, this argument is ignored.
+#' In distributed mode, the file is split accordingly; otherwise, this is only an indicator of
+#' how the file was split beforehand. Defaults to `"row"`.
#'
-#' JSON/UBJSON serialization format is required for this.
+#' This is not used when `data` is not a URI.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#' subclass 'xgb.QuantileDMatrix'.
#'
@@ -128,7 +142,7 @@ xgb.DMatrix <- function(
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
- enable_categorical = FALSE
+ data_split_mode = "row"
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
@@ -142,7 +156,14 @@ xgb.DMatrix <- function(
)
}
data <- path.expand(data)
- handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
+ if (data_split_mode == "row") {
+ data_split_mode <- 0L
+ } else if (data_split_mode == "col") {
+ data_split_mode <- 1L
+ } else {
+ stop("Passed invalid 'data_split_mode': ", data_split_mode)
+ }
+ handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
} else if (is.matrix(data)) {
handle <- .Call(
XGDMatrixCreateFromMat_R, data, missing, nthread
@@ -180,7 +201,7 @@ xgb.DMatrix <- function(
nthread
)
} else if (is.data.frame(data)) {
- tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types)
+ tmp <- .process.df.for.dmatrix(data, feature_types)
feature_types <- tmp$feature_types
handle <- .Call(
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
@@ -212,7 +233,7 @@ xgb.DMatrix <- function(
return(dmat)
}
-.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) {
+.process.df.for.dmatrix <- function(df, feature_types) {
if (!nrow(df) || !ncol(df)) {
stop("'data' is an empty data.frame.")
}
@@ -225,12 +246,6 @@ xgb.DMatrix <- function(
} else {
feature_types <- sapply(df, function(col) {
if (is.factor(col)) {
- if (!enable_categorical) {
- stop(
- "When factor type is used, the parameter `enable_categorical`",
- " must be set to TRUE."
- )
- }
return("c")
} else if (is.integer(col)) {
return("int")
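
Note (illustrative sketch, not part of the patch): with `enable_categorical`
removed, type deduction for data frames is now unconditional; a small sketch of
the deduced types, mirroring the updated tests.

    df <- data.frame(
      x = c(0.5, 1.5, 2.5),           # numeric -> "float"
      i = c(1L, 2L, 3L),              # integer -> "int"
      f = factor(c("a", "b", "a"))    # factor  -> "c" (categorical)
    )
    dm <- xgb.DMatrix(df, nthread = 1)
    getinfo(dm, "feature_type")  # expected: "float" "int" "c"
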
@@ -326,7 +341,6 @@ xgb.QuantileDMatrix <- function(
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
- enable_categorical = FALSE,
ref = NULL,
max_bin = NULL
) {
@@ -357,8 +371,7 @@ xgb.QuantileDMatrix <- function(
qid = qid,
label_lower_bound = label_lower_bound,
label_upper_bound = label_upper_bound,
- feature_weights = feature_weights,
- enable_categorical = enable_categorical
+ feature_weights = feature_weights
)
)
data_iterator <- .single.data.iterator(iterator_env)
@@ -373,7 +386,7 @@ xgb.QuantileDMatrix <- function(
.Call(XGDMatrixFree_R, proxy_handle)
})
iterator_next <- function() {
- return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
iterator_reset <- function() {
return(data_iterator$f_reset(iterator_env))
@@ -416,12 +429,12 @@ xgb.QuantileDMatrix <- function(
#' to know which part of the data to pass next.
#' @param f_next `function(env)` which is responsible for:\itemize{
#' \item Accessing or retrieving the next batch of data in the iterator.
-#' \item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
+#' \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
#' \item Keeping track of where in the iterator batch it is or will go next, which can for example
#' be done by modifying variables in the `env` variable that is passed here.
#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
-#' calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
+#' calling \link{xgb.DataBatch} when there are more batches left to be consumed.
#' }
#' @param f_reset `function(env)` which is responsible for resetting the data iterator
#' (i.e. taking it back to the first batch, called before and after the sequence of batches
@@ -431,7 +444,7 @@ xgb.QuantileDMatrix <- function(
#' (and in the same order) must be passed in subsequent iterations.
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
#' be passed to \link{xgb.ExternalDMatrix}.
-#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
+#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
#' @export
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
if (!is.function(f_next)) {
@@ -459,7 +472,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
env[["iter"]] <- curr_iter + 1L
})
return(
- xgb.ProxyDMatrix(
+ xgb.DataBatch(
data = env[["data"]],
label = env[["label"]],
weight = env[["weight"]],
@@ -470,8 +483,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
qid = env[["qid"]],
label_lower_bound = env[["label_lower_bound"]],
label_upper_bound = env[["label_upper_bound"]],
- feature_weights = env[["feature_weights"]],
- enable_categorical = env[["enable_categorical"]]
+ feature_weights = env[["feature_weights"]]
)
)
}
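
Note (illustrative sketch, not part of the patch): the iterator contract after
the rename in compact form - `f_next` must return either NULL (stream exhausted)
or the result of `xgb.DataBatch()`. The two-batch split of mtcars is arbitrary,
and passing the iterator as the first argument to `xgb.ExternalDMatrix` is an
assumption based on the examples elsewhere in this patch.

    data(mtcars)
    x <- as.matrix(mtcars[, -1])
    y <- mtcars$mpg
    batches <- list(1:16, 17:32)  # two illustrative batches

    it_env <- as.environment(list(iter = 0))
    f_next <- function(env) {
      if (env$iter >= length(batches)) return(NULL)  # signals end of stream
      env$iter <- env$iter + 1
      rows <- batches[[env$iter]]
      xgb.DataBatch(data = x[rows, , drop = FALSE], label = y[rows])
    }
    f_reset <- function(env) env$iter <- 0

    it <- xgb.DataIter(env = it_env, f_next = f_next, f_reset = f_reset)
    dm <- xgb.ExternalDMatrix(it, nthread = 1)
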
@@ -490,13 +502,13 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
.make.proxy.handle <- function() {
out <- .Call(XGProxyDMatrixCreate_R)
attributes(out) <- list(
- class = c("xgb.DMatrix", "xgb.ProxyDMatrixHandle"),
+ class = c("xgb.DMatrix", "xgb.ProxyDMatrix"),
fields = new.env()
)
return(out)
}
-#' @title Proxy DMatrix Updater
+#' @title Structure for Data Batches
#' @description Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
#' or through \link{xgb.QuantileDMatrix.from_iterator}.
@@ -506,8 +518,8 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' when constructing a DMatrix through external memory - otherwise, one should call
#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
#'
-#' The object that results from calling this function directly is \bold{not} like the other
-#' `xgb.DMatrix` variants - i.e. cannot be used to train a model, nor to get predictions - only
+#' The object that results from calling this function directly is \bold{not} like
+#' an `xgb.DMatrix` - i.e. it cannot be used to train a model, nor to get predictions - its only
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
#'
#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
@@ -525,11 +537,11 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' \link{xgb.DMatrix} for details on it.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' }
-#' @return An object of class `xgb.ProxyDMatrix`, which is just a list containing the
+#' @return An object of class `xgb.DataBatch`, which is just a list containing the
#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
#' @export
-xgb.ProxyDMatrix <- function(
+xgb.DataBatch <- function(
data,
label = NULL,
weight = NULL,
@@ -540,8 +552,7 @@ xgb.ProxyDMatrix <- function(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
- feature_weights = NULL,
- enable_categorical = FALSE
+ feature_weights = NULL
) {
stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
out <- list(
@@ -555,27 +566,27 @@ xgb.ProxyDMatrix <- function(
qid = qid,
label_lower_bound = label_lower_bound,
label_upper_bound = label_upper_bound,
- feature_weights = feature_weights,
- enable_categorical = enable_categorical
+ feature_weights = feature_weights
)
- class(out) <- "xgb.ProxyDMatrix"
+ class(out) <- "xgb.DataBatch"
return(out)
}
-xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
+# This is only for internal usage; the class is not exposed to the user.
+xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
lst <- data_iterator$f_next(data_iterator$env)
if (is.null(lst)) {
return(0L)
}
- if (!inherits(lst, "xgb.ProxyDMatrix")) {
- stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.ProxyDMatrix'.")
+ if (!inherits(lst, "xgb.DataBatch")) {
+ stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.")
}
if (!is.null(lst$group) && !is.null(lst$qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
if (is.data.frame(lst$data)) {
- tmp <- .process.df.for.dmatrix(lst$data, lst$enable_categorical, lst$feature_types)
+ tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
lst$feature_types <- tmp$feature_types
.Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
rm(tmp)
@@ -634,7 +645,7 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
#' This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
#' held internally but accessed through the iterator when needed.
-#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
+#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
#' @examples
#' library(xgboost)
#' data(mtcars)
@@ -674,10 +685,10 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
#' iterator_env[["iter"]] <- curr_iter + 1
#' })
#'
-#' # Function 'xgb.ProxyDMatrix' must be called manually
+#' # Function 'xgb.DataBatch' must be called manually
#' # at each batch with all the appropriate attributes,
#' # such as feature names and feature types.
-#' return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+#' return(xgb.DataBatch(data = x_batch, label = y_batch))
#' }
#'
#' # This moves the iterator back to its beginning
@@ -721,7 +732,7 @@ xgb.ExternalDMatrix <- function(
.Call(XGDMatrixFree_R, proxy_handle)
})
iterator_next <- function() {
- return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env))
@@ -764,7 +775,7 @@ xgb.ExternalDMatrix <- function(
#' @inheritParams xgb.ExternalDMatrix
#' @inheritParams xgb.QuantileDMatrix
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
-#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
+#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
#' \link{xgb.QuantileDMatrix}
#' @export
xgb.QuantileDMatrix.from_iterator <- function( # nolint
@@ -786,7 +797,7 @@ xgb.QuantileDMatrix.from_iterator <- function( # nolint
.Call(XGDMatrixFree_R, proxy_handle)
})
iterator_next <- function() {
- return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env))
@@ -1256,19 +1267,15 @@ xgb.get.DMatrix.data <- function(dmat) {
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
-#' dsub <- slice(dtrain, 1:42)
+#' dsub <- xgb.slice.DMatrix(dtrain, 1:42)
#' labels1 <- getinfo(dsub, 'label')
#' dsub <- dtrain[1:42, ]
#' labels2 <- getinfo(dsub, 'label')
#' all.equal(labels1, labels2)
#'
-#' @rdname slice.xgb.DMatrix
-#' @export
-slice <- function(object, idxset) UseMethod("slice")
-
-#' @rdname slice.xgb.DMatrix
+#' @rdname xgb.slice.DMatrix
#' @export
-slice.xgb.DMatrix <- function(object, idxset) {
+xgb.slice.DMatrix <- function(object, idxset) {
if (!inherits(object, "xgb.DMatrix")) {
stop("object must be xgb.DMatrix")
}
@@ -1292,10 +1299,10 @@ slice.xgb.DMatrix <- function(object, idxset) {
return(structure(ret, class = "xgb.DMatrix"))
}
-#' @rdname slice.xgb.DMatrix
+#' @rdname xgb.slice.DMatrix
#' @export
`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
- slice(object, idxset)
+ xgb.slice.DMatrix(object, idxset)
}
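
Note (illustrative sketch, not part of the patch): the URI-based loading path
that replaces XGDMatrixCreateFromFile_R, using the format-specifier convention
documented above; file names are illustrative.

    # XGBoost's own binary format needs no format specifier
    data(agaricus.train, package = "xgboost")
    dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
    fname <- file.path(tempdir(), "dtrain.buffer")
    xgb.DMatrix.save(dtrain, fname)
    dtrain2 <- xgb.DMatrix(fname)

    # CSV must be announced explicitly through the URI; 'label_column'
    # marks which column holds the labels.
    csv <- file.path(tempdir(), "data.csv")
    writeLines(c("1,2,3", "-1,3,2"), csv)
    dm <- xgb.DMatrix(paste0(csv, "?format=csv&label_column=0"), silent = TRUE)
    getinfo(dm, "label")  # expected: 1 -1
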
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index eb0495631d6e..29bddb57f3e2 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -197,12 +197,12 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
nthread = params$nthread
)
bst_folds <- lapply(seq_along(folds), function(k) {
- dtest <- slice(dall, folds[[k]])
+ dtest <- xgb.slice.DMatrix(dall, folds[[k]])
# code originally contributed by @RolandASc on stackoverflow
if (is.null(train_folds))
- dtrain <- slice(dall, unlist(folds[-k]))
+ dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]))
else
- dtrain <- slice(dall, train_folds[[k]])
+ dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]])
bst <- xgb.Booster(
params = params,
cachelist = list(dtrain, dtest),
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index f0f2332b58c3..44cde2e7a843 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -178,6 +178,11 @@
#' Number of threads can also be manually specified via the \code{nthread}
#' parameter.
#'
+#' While the random seed defaults to zero in other interfaces, in R, if a `seed` parameter
+#' is not manually supplied, a random seed will be generated through R's own random number
+#' generator, whose seed in turn is controllable through `set.seed`. If `seed` is passed,
+#' it takes precedence over R's RNG.
+#'
#' The evaluation metric is chosen automatically by XGBoost (according to the objective)
#' when the \code{eval_metric} parameter is not provided.
#' User may set one or several \code{eval_metric} parameters.
@@ -363,8 +368,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
# Sort the callbacks into categories
cb <- categorize.callbacks(callbacks)
params['validate_parameters'] <- TRUE
- if (!is.null(params[['seed']])) {
- warning("xgb.train: `seed` is ignored in R package. Use `set.seed()` instead.")
+ if (!("seed" %in% names(params))) {
+ params[["seed"]] <- sample(.Machine$integer.max, size = 1)
}
# The tree updating process would need slightly different handling
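
Note (illustrative sketch, not part of the patch): the new seeding behavior in a
nutshell - reproducibility flows through R's set.seed() unless params$seed is
given, in which case the explicit seed takes precedence (as exercised by the new
tests below in this patch).

    data(agaricus.train, package = "xgboost")
    dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
    params <- list(objective = "binary:logistic", subsample = 0.5, nthread = 1)

    set.seed(2024)
    m1 <- xgb.train(params = params, data = dtrain, nrounds = 3)
    set.seed(2024)
    m2 <- xgb.train(params = params, data = dtrain, nrounds = 3)
    # Same R seed -> same internally sampled 'seed' parameter -> identical models
    identical(
      xgb.save.raw(m1, raw_format = "json"),
      xgb.save.raw(m2, raw_format = "json")
    )  # expected: TRUE

    # An explicit 'seed' in params bypasses R's RNG entirely
    m3 <- xgb.train(params = c(params, list(seed = 111L)), data = dtrain, nrounds = 3)
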
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 7a6dd6c1306b..95e7a51fd043 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -17,6 +17,7 @@
training = FALSE,
iterationrange = NULL,
strict_shape = FALSE,
+ validate_features = FALSE,
...
)
}
@@ -66,6 +67,23 @@ base-1 indexing, and inclusive of both ends).
\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
type and shape of predictions are invariant to the model type.}
+\item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's feature_names
+match (only applicable when both \code{object} and \code{newdata} have feature names).
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{ If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
+ the columns in `newdata` to match with the booster's.
+
+ If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
+ will additionally verify that categorical columns are of the correct type in `newdata`,
+ throwing an error if they do not match.
+
+ If passing `FALSE`, it is assumed that the feature names and types are the same,
+ and come in the same order as in the training data.
+
+ Note that this check might add sizable latency to the predictions, so it's
+ recommended to disable it for performance-sensitive applications.
+}\if{html}{\out{</div>}}}
+
\item{...}{Not used.}
}
\value{
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index ceb60dc42906..5f764ed45380 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -20,7 +20,7 @@ xgb.DMatrix(
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
- enable_categorical = FALSE
+ data_split_mode = "row"
)
xgb.QuantileDMatrix(
@@ -37,7 +37,6 @@ xgb.QuantileDMatrix(
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
- enable_categorical = FALSE,
ref = NULL,
max_bin = NULL
)
@@ -50,10 +49,6 @@ Supported input types are as follows:\itemize{
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
-If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
-Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
-will be thrown.
-
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor}
types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
@@ -66,13 +61,31 @@ Other column types are not supported.
'xgb.QuantileDMatrix'.
\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
as a single row (only when making predictions from a fitted model).
-\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
-supported for xgb.QuantileDMatrix'.
-\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
-\bold{not} supported for xgb.QuantileDMatrix'.
+\item Text files in a supported format, passed as a \code{character} variable containing the URI path to
+the file, with an optional format specifier.
+
+These are \bold{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{
+\item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
+\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
+\code{?format=libsvm} at the end of the file path. It will be the default format if not
+otherwise specified.
+\item CSV files (comma-separated values). This format can be specified by adding suffix
+\code{?format=csv} at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
+}
+
+Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
+XGBoost will not look at the extension or file contents to determine that it contains comma-separated values.
+Instead, the format must be specified following the URI format, so the input to \code{data} should be passed
+like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column
+corresponds to the labels).
+
+For more information about passing text files as input, see the articles
+\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
+\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
}}
-\item{label}{Label of the training data.}
+\item{label}{Label of the training data. For classification problems, the labels should be passed
+encoded as integers numbered starting at zero.}
\item{weight}{Weight for each instance.
@@ -102,7 +115,7 @@ frame and matrix.
\item{feature_types}{Set types for features.
-If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, the feature types will be deduced
automatically from the column types.
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@@ -115,7 +128,12 @@ with the following possible values:\itemize{
Note that, while categorical types are treated differently from the rest for model fitting
purposes, the other types do not influence the generated model, but have effects in other
-functionalities such as feature importances.}
+functionalities such as feature importances.
+
+\bold{Important}: categorical features, if specified manually through \code{feature_types}, must
+be encoded as integers numbered starting at zero, and the same encoding needs to be
+applied when passing data to \code{predict}. Even if passing \code{factor} types, the encoding will
+not be saved, so make sure that \code{factor} columns passed to \code{predict} have the same \code{levels}.}
\item{nthread}{Number of threads used for creating DMatrix.}
@@ -129,19 +147,13 @@ functionalities such as feature importances.}
\item{feature_weights}{Set feature weights for column sampling.}
-\item{enable_categorical}{Experimental support of specializing for categorical features.
-
-\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
- columns of categorical types will automatically
- be set to be of categorical type (feature_type='c') in the resulting DMatrix.
+\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals
+whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}.
- If passing 'FALSE' and 'data' is a data frame with categorical columns,
- it will result in an error being thrown.
+In distributed mode, the file is split accordingly; otherwise, this is only an indicator of
+how the file was split beforehand. Defaults to \code{"row"}.
- If 'data' is not a data frame, this argument is ignored.
-
- JSON/UBJSON serialization format is required for this.
-}\if{html}{\out{</div>}}}
+This is not used when \code{data} is not a URI.}
\item{ref}{The training dataset that provides quantile information, needed when creating
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
diff --git a/R-package/man/xgb.ProxyDMatrix.Rd b/R-package/man/xgb.DataBatch.Rd
similarity index 75%
rename from R-package/man/xgb.ProxyDMatrix.Rd
rename to R-package/man/xgb.DataBatch.Rd
index 5a9b6251af40..063b82b031cc 100644
--- a/R-package/man/xgb.ProxyDMatrix.Rd
+++ b/R-package/man/xgb.DataBatch.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
-\name{xgb.ProxyDMatrix}
-\alias{xgb.ProxyDMatrix}
-\title{Proxy DMatrix Updater}
+\name{xgb.DataBatch}
+\alias{xgb.DataBatch}
+\title{Structure for Data Batches}
\usage{
-xgb.ProxyDMatrix(
+xgb.DataBatch(
data,
label = NULL,
weight = NULL,
@@ -15,8 +15,7 @@ xgb.ProxyDMatrix(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
- feature_weights = NULL,
- enable_categorical = FALSE
+ feature_weights = NULL
)
}
\arguments{
@@ -34,7 +33,8 @@ conversions applied to it. See the documentation for parameter \code{data} in
\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
}}
-\item{label}{Label of the training data.}
+\item{label}{Label of the training data. For classification problems, the labels should be passed
+encoded as integers numbered starting at zero.}
\item{weight}{Weight for each instance.
@@ -57,7 +57,7 @@ frame and matrix.
\item{feature_types}{Set types for features.
-If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, the feature types will be deduced
automatically from the column types.
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@@ -70,7 +70,12 @@ with the following possible values:\itemize{
Note that, while categorical types are treated differently from the rest for model fitting
purposes, the other types do not influence the generated model, but have effects in other
-functionalities such as feature importances.}
+functionalities such as feature importances.
+
+\bold{Important}: categorical features, if specified manually through \code{feature_types}, must
+be encoded as integers numbered starting at zero, and the same encoding needs to be
+applied when passing data to \code{predict}. Even if passing \code{factor} types, the encoding will
+not be saved, so make sure that \code{factor} columns passed to \code{predict} have the same \code{levels}.}
\item{group}{Group size for all ranking group.}
@@ -81,23 +86,9 @@ functionalities such as feature importances.}
\item{label_upper_bound}{Upper bound for survival training.}
\item{feature_weights}{Set feature weights for column sampling.}
-
-\item{enable_categorical}{Experimental support of specializing for categorical features.
-
-\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
- columns of categorical types will automatically
- be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-
- If passing 'FALSE' and 'data' is a data frame with categorical columns,
- it will result in an error being thrown.
-
- If 'data' is not a data frame, this argument is ignored.
-
- JSON/UBJSON serialization format is required for this.
-}\if{html}{\out{</div>}}}
}
\value{
-An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
+An object of class \code{xgb.DataBatch}, which is just a list containing the
data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
}
\description{
@@ -110,8 +101,8 @@ is passed as argument to function \link{xgb.DataIter} to construct a data iterat
when constructing a DMatrix through external memory - otherwise, one should call
\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
-The object that results from calling this function directly is \bold{not} like the other
-\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only
+The object that results from calling this function directly is \bold{not} like
+an \code{xgb.DMatrix} - i.e. it cannot be used to train a model, nor to get predictions - its only
possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
diff --git a/R-package/man/xgb.DataIter.Rd b/R-package/man/xgb.DataIter.Rd
index 29cf5acc9cf4..2bd68ce5108c 100644
--- a/R-package/man/xgb.DataIter.Rd
+++ b/R-package/man/xgb.DataIter.Rd
@@ -15,12 +15,12 @@ to know which part of the data to pass next.}
\item{f_next}{\verb{function(env)} which is responsible for:\itemize{
\item Accessing or retrieving the next batch of data in the iterator.
-\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
+\item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
\item Keeping track of where in the iterator batch it is or will go next, which can for example
be done by modifying variables in the \code{env} variable that is passed here.
\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
when the stream of data ends (all batches in the iterator have been consumed), or the result from
-calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
+calling \link{xgb.DataBatch} when there are more batches left to be consumed.
}}
\item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator
@@ -47,5 +47,5 @@ which will consume the data and create a DMatrix from it by executing the callba
For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
}
\seealso{
-\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
+\link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
}
diff --git a/R-package/man/xgb.ExternalDMatrix.Rd b/R-package/man/xgb.ExternalDMatrix.Rd
index 3e7844990b50..14a872cb585c 100644
--- a/R-package/man/xgb.ExternalDMatrix.Rd
+++ b/R-package/man/xgb.ExternalDMatrix.Rd
@@ -87,10 +87,10 @@ iterator_next <- function(iterator_env) {
iterator_env[["iter"]] <- curr_iter + 1
})
- # Function 'xgb.ProxyDMatrix' must be called manually
+ # Function 'xgb.DataBatch' must be called manually
# at each batch with all the appropriate attributes,
# such as feature names and feature types.
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
# This moves the iterator back to its beginning
@@ -118,5 +118,5 @@ pred_dm <- predict(model, dm)
pred_mat <- predict(model, as.matrix(mtcars[, -1]))
}
\seealso{
-\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
+\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
}
diff --git a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
index 21f24576dcb1..791b5576e653 100644
--- a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
+++ b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
@@ -60,6 +60,6 @@ For more information, see the guide 'Using XGBoost External Memory Version':
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
}
\seealso{
-\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
+\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
\link{xgb.QuantileDMatrix}
}
diff --git a/R-package/man/slice.xgb.DMatrix.Rd b/R-package/man/xgb.slice.DMatrix.Rd
similarity index 84%
rename from R-package/man/slice.xgb.DMatrix.Rd
rename to R-package/man/xgb.slice.DMatrix.Rd
index a2dfb699bb0f..c9695996b66f 100644
--- a/R-package/man/slice.xgb.DMatrix.Rd
+++ b/R-package/man/xgb.slice.DMatrix.Rd
@@ -1,15 +1,12 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
-\name{slice}
-\alias{slice}
-\alias{slice.xgb.DMatrix}
+\name{xgb.slice.DMatrix}
+\alias{xgb.slice.DMatrix}
\alias{[.xgb.DMatrix}
\title{Get a new DMatrix containing the specified rows of
original xgb.DMatrix object}
\usage{
-slice(object, idxset)
-
-\method{slice}{xgb.DMatrix}(object, idxset)
+xgb.slice.DMatrix(object, idxset)
\method{[}{xgb.DMatrix}(object, idxset, colset = NULL)
}
@@ -28,7 +25,7 @@ original xgb.DMatrix object
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
-dsub <- slice(dtrain, 1:42)
+dsub <- xgb.slice.DMatrix(dtrain, 1:42)
labels1 <- getinfo(dsub, 'label')
dsub <- dtrain[1:42, ]
labels2 <- getinfo(dsub, 'label')
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 0421b9c4a38a..21c5fe7eebe4 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -241,6 +241,11 @@ Parallelization is automatically enabled if \code{OpenMP} is present.
Number of threads can also be manually specified via the \code{nthread}
parameter.
+While the random seed defaults to zero in other interfaces, in R, if a \code{seed} parameter
+is not manually supplied, a random seed will be generated through R's own random number
+generator, whose seed in turn is controllable through \code{set.seed}. If \code{seed} is passed,
+it takes precedence over R's RNG.
+
The evaluation metric is chosen automatically by XGBoost (according to the objective)
when the \code{eval_metric} parameter is not provided.
User may set one or several \code{eval_metric} parameters.
diff --git a/R-package/src/init.c b/R-package/src/init.c
index a9f3f3e380c2..36f3e8953639 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -46,7 +46,7 @@ extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP);
extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromURI_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
@@ -105,7 +105,7 @@ static const R_CallMethodDef CallEntries[] = {
{"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2},
{"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6},
{"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6},
- {"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2},
+ {"XGDMatrixCreateFromURI_R", (DL_FUNC) &XGDMatrixCreateFromURI_R, 3},
{"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
{"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2},
{"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2},
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index c91fb94c447c..4192f82fbaa6 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -365,15 +365,22 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() {
return mkString(json_str);
}
-XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
- SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
+XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode) {
+ SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
+ SEXP uri_char = Rf_protect(Rf_asChar(uri));
+ const char *uri_ptr = CHAR(uri_char);
R_API_BEGIN();
+ xgboost::Json jconfig{xgboost::Object{}};
+ jconfig["uri"] = std::string(uri_ptr);
+ jconfig["silent"] = Rf_asLogical(silent);
+ jconfig["data_split_mode"] = Rf_asInteger(data_split_mode);
+ const std::string sconfig = xgboost::Json::Dump(jconfig);
DMatrixHandle handle;
- CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
+ CHECK_CALL(XGDMatrixCreateFromURI(sconfig.c_str(), &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();
- UNPROTECT(1);
+ Rf_unprotect(2);
return ret;
}
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index d2e0ae82855d..652345e52b64 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -53,12 +53,13 @@ XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str);
XGB_DLL SEXP XGBGetGlobalConfig_R();
/*!
- * \brief load a data matrix
- * \param fname name of the content
+ * \brief load a data matrix from URI
+ * \param uri URI to the source file to read data from
 * \param silent whether to print messages
+ * \param data_split_mode Data split mode (0=rows, 1=columns)
* \return a loaded data matrix
*/
-XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
+XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode);
/*!
* \brief create matrix content from dense matrix
diff --git a/R-package/src/xgboost_custom.cc b/R-package/src/xgboost_custom.cc
index 6aaa3696a7c1..fdd444e5d5fe 100644
--- a/R-package/src/xgboost_custom.cc
+++ b/R-package/src/xgboost_custom.cc
@@ -41,16 +41,6 @@ double LogGamma(double v) {
return lgammafn(v);
}
#endif // !defined(XGBOOST_USE_CUDA)
-// customize random engine.
-void CustomGlobalRandomEngine::seed(CustomGlobalRandomEngine::result_type val) {
- // ignore the seed
-}
-// use R's PRNG to replacd
-CustomGlobalRandomEngine::result_type
-CustomGlobalRandomEngine::operator()() {
- return static_cast<CustomGlobalRandomEngine::result_type>(
- std::floor(unif_rand() * CustomGlobalRandomEngine::max()));
-}
} // namespace common
} // namespace xgboost
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 03a8ddbe124d..fb3162e423ce 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -778,3 +778,66 @@ test_that("DMatrix field are set to booster when training", {
expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q"))
expect_equal(getinfo(model_both, "feature_type"), c("q", "c", "q"))
})
+
+test_that("Seed in params override PRNG from R", {
+ set.seed(123)
+ model1 <- xgb.train(
+ data = xgb.DMatrix(
+ agaricus.train$data,
+ label = agaricus.train$label, nthread = 1L
+ ),
+ params = list(
+ objective = "binary:logistic",
+ max_depth = 3L,
+ subsample = 0.1,
+ colsample_bytree = 0.1,
+ seed = 111L
+ ),
+ nrounds = 3L
+ )
+
+ set.seed(456)
+ model2 <- xgb.train(
+ data = xgb.DMatrix(
+ agaricus.train$data,
+ label = agaricus.train$label, nthread = 1L
+ ),
+ params = list(
+ objective = "binary:logistic",
+ max_depth = 3L,
+ subsample = 0.1,
+ colsample_bytree = 0.1,
+ seed = 111L
+ ),
+ nrounds = 3L
+ )
+
+ expect_equal(
+ xgb.save.raw(model1, raw_format = "json"),
+ xgb.save.raw(model2, raw_format = "json")
+ )
+
+ set.seed(123)
+ model3 <- xgb.train(
+ data = xgb.DMatrix(
+ agaricus.train$data,
+ label = agaricus.train$label, nthread = 1L
+ ),
+ params = list(
+ objective = "binary:logistic",
+ max_depth = 3L,
+ subsample = 0.1,
+ colsample_bytree = 0.1,
+ seed = 222L
+ ),
+ nrounds = 3L
+ )
+ expect_false(
+ isTRUE(
+ all.equal(
+ xgb.save.raw(model1, raw_format = "json"),
+ xgb.save.raw(model3, raw_format = "json")
+ )
+ )
+ )
+})
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 65374240df00..45bcac08d479 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -166,7 +166,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
test_that("xgb.DMatrix: slice, dim", {
dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
expect_equal(dim(dtest), dim(test_data))
- dsub1 <- slice(dtest, 1:42)
+ dsub1 <- xgb.slice.DMatrix(dtest, 1:42)
expect_equal(nrow(dsub1), 42)
expect_equal(ncol(dsub1), ncol(test_data))
@@ -182,12 +182,12 @@ test_that("xgb.DMatrix: slice, trailing empty rows", {
dtrain <- xgb.DMatrix(
data = train_data, label = train_label, nthread = n_threads
)
- slice(dtrain, 6513L)
+ xgb.slice.DMatrix(dtrain, 6513L)
train_data[6513, ] <- 0
dtrain <- xgb.DMatrix(
data = train_data, label = train_label, nthread = n_threads
)
- slice(dtrain, 6513L)
+ xgb.slice.DMatrix(dtrain, 6513L)
expect_equal(nrow(dtrain), 6513)
})
@@ -338,19 +338,18 @@ test_that("xgb.DMatrix: data.frame", {
stringsAsFactors = TRUE
)
- m <- xgb.DMatrix(df, enable_categorical = TRUE)
+ m <- xgb.DMatrix(df)
expect_equal(colnames(m), colnames(df))
expect_equal(
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
)
- expect_error(xgb.DMatrix(df, enable_categorical = FALSE))
df <- data.frame(
missing = c("a", "b", "d", NA),
valid = c("a", "b", "d", "c"),
stringsAsFactors = TRUE
)
- m <- xgb.DMatrix(df, enable_categorical = TRUE)
+ m <- xgb.DMatrix(df)
expect_equal(getinfo(m, "feature_type"), c("c", "c"))
})
@@ -473,7 +472,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa
y = mtcars[, 1]
)
)
- iterator_next <- function(iterator_env, proxy_handle) {
+ iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) {
return(NULL)
@@ -488,7 +487,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa
on.exit({
iterator_env[["iter"]] <- curr_iter + 1
})
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0
@@ -547,7 +546,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
y = mtcars[, 1]
)
)
- iterator_next <- function(iterator_env, proxy_handle) {
+ iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) {
return(NULL)
@@ -562,7 +561,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
on.exit({
iterator_env[["iter"]] <- curr_iter + 1
})
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0
@@ -605,7 +604,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u
y = mtcars[, 1]
)
)
- iterator_next <- function(iterator_env, proxy_handle) {
+ iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) {
return(0)
@@ -619,7 +618,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u
on.exit({
iterator_env[["iter"]] <- curr_iter + 1
})
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0
@@ -693,3 +692,20 @@ test_that("xgb.DMatrix: quantile cuts look correct", {
}
)
})
+
+test_that("xgb.DMatrix: can read CSV", {
+ txt <- paste(
+ "1,2,3",
+ "-1,3,2",
+ sep = "\n"
+ )
+ fname <- file.path(tempdir(), "data.csv")
+ writeChar(txt, fname)
+ uri <- paste0(fname, "?format=csv&label_column=0")
+ dm <- xgb.DMatrix(uri, silent = TRUE)
+ expect_equal(getinfo(dm, "label"), c(1, -1))
+ expect_equal(
+ as.matrix(xgb.get.DMatrix.data(dm)),
+ matrix(c(2, 3, 3, 2), nrow = 2, byrow = TRUE)
+ )
+})
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index badac0213292..38b5ca0667bf 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -511,3 +511,82 @@ test_that('convert.labels works', {
expect_equal(class(res), 'numeric')
}
})
+
+test_that("validate.features works as expected", {
+ data(mtcars)
+ y <- mtcars$mpg
+ x <- as.matrix(mtcars[, -1])
+ dm <- xgb.DMatrix(x, label = y, nthread = 1)
+ model <- xgb.train(
+ params = list(nthread = 1),
+ data = dm,
+ nrounds = 3
+ )
+
+ # result is output as-is when needed
+ res <- validate.features(model, x)
+ expect_equal(res, x)
+ res <- validate.features(model, dm)
+ expect_identical(res, dm)
+ res <- validate.features(model, as(x[1, ], "dsparseVector"))
+ expect_equal(as.numeric(res), unname(x[1, ]))
+ res <- validate.features(model, "file.txt")
+ expect_equal(res, "file.txt")
+
+ # columns are reordered
+ res <- validate.features(model, mtcars[, rev(names(mtcars))])
+ expect_equal(names(res), colnames(x))
+ expect_equal(as.matrix(res), x)
+ res <- validate.features(model, as.matrix(mtcars[, rev(names(mtcars))]))
+ expect_equal(colnames(res), colnames(x))
+ expect_equal(res, x)
+ res <- validate.features(model, mtcars[1, rev(names(mtcars)), drop = FALSE])
+ expect_equal(names(res), colnames(x))
+ expect_equal(unname(as.matrix(res)), unname(x[1, , drop = FALSE]))
+ res <- validate.features(model, as.data.table(mtcars[, rev(names(mtcars))]))
+ expect_equal(names(res), colnames(x))
+ expect_equal(unname(as.matrix(res)), unname(x))
+
+ # error when columns are missing
+ expect_error({
+ validate.features(model, mtcars[, 1:3])
+ })
+ expect_error({
+ validate.features(model, as.matrix(mtcars[, 1:ncol(x)])) # nolint
+ })
+ expect_error({
+ validate.features(model, xgb.DMatrix(mtcars[, 1:3]))
+ })
+ expect_error({
+ validate.features(model, as(x[, 1:3], "CsparseMatrix"))
+ })
+
+ # error when it cannot reorder or subset
+ expect_error({
+ validate.features(model, xgb.DMatrix(mtcars))
+ }, "Feature names")
+ expect_error({
+ validate.features(model, xgb.DMatrix(x[, rev(colnames(x))]))
+ }, "Feature names")
+
+ # no error about types if the booster doesn't have types
+ expect_error({
+ validate.features(model, xgb.DMatrix(x, feature_types = c(rep("q", 5), rep("c", 5))))
+ }, NA)
+ tmp <- mtcars
+ tmp[["vs"]] <- factor(tmp[["vs"]])
+ expect_error({
+ validate.features(model, tmp)
+ }, NA)
+
+ # error when types do not match
+ setinfo(model, "feature_type", rep("q", 10))
+ expect_error({
+ validate.features(model, xgb.DMatrix(x, feature_types = c(rep("q", 5), rep("c", 5))))
+ }, "Feature types")
+ tmp <- mtcars
+ tmp[["vs"]] <- factor(tmp[["vs"]])
+ expect_error({
+ validate.features(model, tmp)
+ }, "Feature types")
+})
diff --git a/doc/R-package/index.rst b/doc/R-package/index.rst
index ebd49bb9cc2c..bf9c1f8d9007 100644
--- a/doc/R-package/index.rst
+++ b/doc/R-package/index.rst
@@ -26,3 +26,13 @@ Tutorials
Introduction to XGBoost in R
Understanding your dataset with XGBoost
+
+************
+Other topics
+************
+
+.. toctree::
+ :maxdepth: 2
+ :titlesonly:
+
+ Handling of indexable elements
diff --git a/doc/R-package/index_base.rst b/doc/R-package/index_base.rst
new file mode 100644
index 000000000000..495b2e7602dc
--- /dev/null
+++ b/doc/R-package/index_base.rst
@@ -0,0 +1,29 @@
+.. _index_base:
+
+Handling of indexable elements
+==============================
+
+There are many functionalities in XGBoost which refer to indexable elements in a countable set, such as boosting rounds / iterations / trees in a model (which can be referred to by number), classes, categories / levels in categorical features, among others.
+
+XGBoost, being written in C++, uses base-0 indexing and considers ranges / sequences to be inclusive of the left end but not the right one - for example, a range (0, 3) would include the first three elements, numbered 0, 1, and 2.
+
+The Python interface uses this same logic, since this is also the way that indexing in Python works, but other languages like R have different logic. In R, indexing is base-1 and ranges / sequences are inclusive of both ends - for example, to refer to the first three elements in a sequence, the interval would be written as (1, 3), and the elements numbered 1, 2, and 3.
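+
+As a quick comparison, R's ``seq`` is inclusive of both ends, while the equivalent C-side range is half-open and base-0:
+
+.. code-block:: r
+
+    seq(1, 3)
+    # [1] 1 2 3
+    # the C interface refers to these same three elements as the half-open range [0, 3)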
+
+In order to provide a more idiomatic R interface, XGBoost adjusts its user-facing R functions to follow these and similar R conventions, but internally it needs to convert all such numbers to the format that the C interface uses. This is further complicated by the fact that models are meant to be serializable and loadable from other interfaces, which have different indexing logic.
+
+The following adjustments are made in the R interface:
+
+- The slicing method for DMatrix, which takes an array of integers, converts them to base-0 indexing by subtracting 1 from each element. Note that this conversion is done in the C-level wrapper function for R, unlike all other conversions, which are done in R before being passed to C.
+- The slicing method for Booster takes a sequence defined by start, end, and step. From the user's point of view, the R interface behaves the same way as R's ``seq``, so it always adjusts the left end by subtracting one and, depending on whether the step lands exactly on the right end, also adjusts the right end so that it becomes non-inclusive under C indexing (see the sketch at the end of this page).
+- Parameter ``iterationrange`` in ``predict`` also behaves the same way as R's ``seq``. Since it doesn't take a step size, adjusting the left end by subtracting 1 suffices here.
+- ``best_iteration``, depending on the context, might be stored both as a C-level booster attribute and as an R attribute. Since the C-level attributes are shared across interfaces and used in prediction methods, for compatibility the R interface leaves this C-level attribute in base-0 indexing, while the R attribute, if present, is adjusted to base-1 indexing. Note that the ``predict`` method in R and other interfaces looks only at the C-level attribute.
+- Other references to iteration numbers or boosting rounds, such as when printing metrics or saving model snapshots, also follow base-1 indexing. These other references are coded entirely in R, as the C-level functions do not handle such functionalities.
+- Terminal leaf / node numbers are returned in base-0 indexing, just like they come from the C interface.
+- Tree numbers in plots follow base-1 indexing. Note that these are only displayed when producing these plots through the R interface's own handling of DiagrammeR objects, but not when using the C-level GraphViz 'dot' format generator for plots.
+- Feature numbers in feature importances, JSON dumps, trees-to-tables, and SHAP outputs all follow base-0 indexing.
+- Categorical features are defined in R as a ``factor`` type, which encodes with base-1 indexing. When categorical features are passed as R ``factor`` types, the conversion to base-0 indexing is done automatically, but if the user wishes to manually supply categorical features as already-encoded integers, those integers need to already be in base-0 encoding.
+- Categorical levels (categories) in outputs such as plots, JSON dumps, and trees-to-tables are also referred to using base-0 indexing, regardless of whether they went into the model as integers or as ``factor``-typed columns.
+- Categorical labels for DMatrices do not undergo any extra processing - the user must supply base-0 encoded labels.
+- The function to retrieve class-specific coefficients when using the linear coefficients history callback takes a class index parameter, which also does not undergo any conversion (i.e. the user must pass a base-0 index), in order to match the label logic - that is, the same class index refers to the class encoded with that number in the DMatrix ``label`` field.
+
+New additions to the R interface that deal with indexable elements should be mindful of these conventions and try to mimic R's behavior as much as possible.
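+
+As an illustrative sketch of the two slicing rules above - assuming the argument names shown here for ``xgb.slice.DMatrix`` and ``xgb.slice.Booster`` - base-1, both-ends-inclusive arguments on the R side map to base-0, half-open ranges at the C level:
+
+.. code-block:: r
+
+    library(xgboost)
+    data(agaricus.train, package = "xgboost")
+    dm <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+    model <- xgb.train(params = list(nthread = 1), data = dm, nrounds = 5)
+
+    # rows 1..3 in R indexing; the C-level wrapper subtracts 1 from each index
+    dm_head <- xgb.slice.DMatrix(dm, 1:3)
+
+    # trees 1..3 in R indexing; the C interface sees the half-open range [0, 3)
+    # (argument names here are illustrative)
+    model_head <- xgb.slice.Booster(model, start = 1, end = 3)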
diff --git a/doc/parameter.rst b/doc/parameter.rst
index a7d8203b0aae..7898bb363549 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -450,7 +450,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
* ``seed`` [default=0]
- - Random number seed. This parameter is ignored in R package, use `set.seed()` instead.
+ - Random number seed. In the R package, if not specified, the seed is drawn from R's own RNG engine instead of defaulting to seed 'zero'.
* ``seed_per_iteration`` [default= ``false``]
diff --git a/doc/python/sklearn_estimator.rst b/doc/python/sklearn_estimator.rst
index a4835dcacd14..207b9fa30920 100644
--- a/doc/python/sklearn_estimator.rst
+++ b/doc/python/sklearn_estimator.rst
@@ -104,7 +104,7 @@ using cross validation with early stopping, here is a snippet to begin with:
clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)
- resutls = {}
+ results = {}
for train, test in cv.split(X, y):
X_train = X[train]
@@ -114,7 +114,7 @@ using cross validation with early stopping, here is a snippet to begin with:
est, train_score, test_score = fit_and_score(
clone(clf), X_train, X_test, y_train, y_test
)
- resutls[est] = (train_score, test_score)
+ results[est] = (train_score, test_score)
***********************************
diff --git a/doc/requirements.txt b/doc/requirements.txt
index 667ef268ffb3..ddff9be928b6 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -7,7 +7,9 @@ sh
matplotlib
graphviz
numpy
+scipy
myst-parser
+ray[train]
xgboost_ray
sphinx-gallery
pyspark
diff --git a/doc/xgboost_doc.yml b/doc/xgboost_doc.yml
index 90b877e735ca..177e8758fad9 100644
--- a/doc/xgboost_doc.yml
+++ b/doc/xgboost_doc.yml
@@ -1,15 +1,23 @@
name: xgboost_docs
dependencies:
- - python
+ - python=3.10
- pip
- pygraphviz
- sphinx
+ - sphinx-gallery
- recommonmark
- mock
- sh
- matplotlib
+ - numpy
+ - scipy
+ - scikit-learn
+ - myst-parser
+ - pyspark
- pip:
- breathe
- sphinx_rtd_theme
- pydot-ng
- graphviz
+ - ray[train]
+ - xgboost_ray
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index dec306f0cbb7..1f94c9b2fd1d 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -37,7 +37,7 @@
* \brief Whether to customize global PRNG.
*/
#ifndef XGBOOST_CUSTOMIZE_GLOBAL_PRNG
-#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG XGBOOST_STRICT_R_MODE
+#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG 0
#endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
/*!
diff --git a/plugin/sycl/common/partition_builder.h b/plugin/sycl/common/partition_builder.h
new file mode 100644
index 000000000000..37d1af241ab1
--- /dev/null
+++ b/plugin/sycl/common/partition_builder.h
@@ -0,0 +1,101 @@
+/*!
+ * Copyright 2017-2024 XGBoost contributors
+ */
+#ifndef PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
+#define PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include <xgboost/data.h>
+#pragma GCC diagnostic pop
+#include <xgboost/tree_model.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#include "../../../src/common/column_matrix.h"
+#pragma GCC diagnostic pop
+
+#include "../data.h"
+
+#include <CL/sycl.hpp>
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+// The builder is required for partitioning samples into left and right children for a set of nodes
+class PartitionBuilder {
+ public:
+ template <typename Func>
+ void Init(::sycl::queue* qu, size_t n_nodes, Func funcNTask) {
+ qu_ = qu;
+ nodes_offsets_.resize(n_nodes+1);
+ result_rows_.resize(2 * n_nodes);
+ n_nodes_ = n_nodes;
+
+ nodes_offsets_[0] = 0;
+ for (size_t i = 1; i < n_nodes+1; ++i) {
+ nodes_offsets_[i] = nodes_offsets_[i-1] + funcNTask(i-1);
+ }
+
+ if (data_.Size() < nodes_offsets_[n_nodes]) {
+ data_.Resize(qu, nodes_offsets_[n_nodes]);
+ }
+ }
+
+ size_t GetNLeftElems(int nid) const {
+ return result_rows_[2 * nid];
+ }
+
+
+ size_t GetNRightElems(int nid) const {
+ return result_rows_[2 * nid + 1];
+ }
+
+ // For test purposes only
+ void SetNLeftElems(int nid, size_t val) {
+ result_rows_[2 * nid] = val;
+ }
+
+ // For test purposes only
+ void SetNRightElems(int nid, size_t val) {
+ result_rows_[2 * nid + 1] = val;
+ }
+
+ xgboost::common::Span<size_t> GetData(int nid) {
+ return { data_.Data() + nodes_offsets_[nid], nodes_offsets_[nid + 1] - nodes_offsets_[nid] };
+ }
+
+ void MergeToArray(size_t nid,
+ size_t* data_result,
+ ::sycl::event event) {
+ size_t n_nodes_total = GetNLeftElems(nid) + GetNRightElems(nid);
+ if (n_nodes_total > 0) {
+ const size_t* data = data_.Data() + nodes_offsets_[nid];
+ qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, event);
+ }
+ }
+
+ protected:
+ std::vector<size_t> nodes_offsets_;
+ std::vector<size_t> result_rows_;
+ size_t n_nodes_;
+
+ USMVector<size_t> parts_size_;
+ USMVector<size_t> data_;
+
+ ::sycl::queue* qu_;
+};
+
+} // namespace common
+} // namespace sycl
+} // namespace xgboost
+
+
+#endif // PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index d4cc217d1ee8..0f4748bfec27 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2014-2023 by XGBoost Contributors
+ * Copyright 2014-2024 by XGBoost Contributors
*/
#include "xgboost/c_api.h"
@@ -991,8 +991,8 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs
auto *learner = static_cast<Learner*>(handle);
auto ctx = learner->Ctx()->MakeCPU();
- auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, len}, len);
- auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, len}, len);
+ auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, static_cast<std::size_t>(len)}, len);
+ auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, static_cast<std::size_t>(len)}, len);
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index 0862c21ad1fd..440f3c0a87c8 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023, XGBoost Contributors
+ * Copyright 2017-2024, XGBoost Contributors
* \file column_matrix.h
* \brief Utility for fast column-wise access
* \author Philip Cho
@@ -176,7 +176,7 @@ class ColumnMatrix {
void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
/** @brief assign the storage to the view. */
void InitView() {
- missing = LBitField32{Span{storage.data(), storage.size()}};
+ missing = LBitField32{Span{storage.data(), static_cast<std::size_t>(storage.size())}};
}
void GrowTo(std::size_t n_elements, bool init) {
@@ -318,8 +318,8 @@ class ColumnMatrix {
common::Span<const BinIdxType> bin_index = {
reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
column_size};
- return std::move(DenseColumnIter<BinIdxType, any_missing>{
- bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
+ return DenseColumnIter<BinIdxType, any_missing>{
+ bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset};
}
// all columns are dense column and has no missing value
@@ -332,7 +332,7 @@ class ColumnMatrix {
DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
auto column_index = Span{reinterpret_cast<const ColumnBinT*>(index_.data()),
- index_.size() / sizeof(ColumnBinT)};
+ static_cast<std::size_t>(index_.size() / sizeof(ColumnBinT))};
ParallelFor(n_samples, n_threads, [&](auto rid) {
rid += base_rowid;
const size_t ibegin = rid * n_features;
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index fbbd15b49fb5..e829752dae3d 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023 by XGBoost Contributors
+ * Copyright 2017-2024 by XGBoost Contributors
* \file hist_util.h
* \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen
@@ -113,8 +113,8 @@ class HistogramCuts {
auto end = ptrs[column_id + 1];
auto beg = ptrs[column_id];
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
- auto idx = it - values.cbegin();
- idx -= !!(idx == end);
+ auto idx = static_cast<bst_bin_t>(it - values.cbegin());
+ idx -= !!(idx == static_cast<bst_bin_t>(end));
return idx;
}
@@ -136,8 +136,8 @@ class HistogramCuts {
auto beg = ptrs[fidx] + vals.cbegin();
// Truncates the value in case it's not perfectly rounded.
auto v = static_cast<float>(common::AsCat(value));
- auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin();
- if (bin_idx == ptrs.at(fidx + 1)) {
+ auto bin_idx = static_cast<bst_bin_t>(std::lower_bound(beg, end, v) - vals.cbegin());
+ if (bin_idx == static_cast<bst_bin_t>(ptrs.at(fidx + 1))) {
bin_idx -= 1;
}
return bin_idx;
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index eaed4b12f3dc..c3b0d431c35c 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -371,7 +371,6 @@ void AddCutPointSecure(typename SketchType::SummaryContainer const &summary, int
HistogramCuts *cuts) {
// For secure vertical pipeline, we fill the cut values corresponding to empty columns
// with a vector of minimum value
- const float mval = 1e-5f;
size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
// make a copy of required_cuts for mode selection
size_t required_cuts_original = required_cuts;
diff --git a/src/common/random.h b/src/common/random.h
index 2a94123a3f11..ece6fa46f16c 100644
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -31,7 +31,7 @@ namespace xgboost::common {
*/
using RandomEngine = std::mt19937;
-#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
+#if defined(XGBOOST_CUSTOMIZE_GLOBAL_PRNG) && XGBOOST_CUSTOMIZE_GLOBAL_PRNG == 1
/*!
* \brief An customized random engine, used to be plugged in PRNG from other systems.
* The implementation of this library is not provided by xgboost core library.
diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h
index d4f82e615c6f..61adfdb7bea8 100644
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
@@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
- return Span{data(), size()}.size_bytes();
+ return Span{data(), static_cast<std::size_t>(size())}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 1d3faf94532e..88a38d5cce74 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023, XGBoost Contributors
+ * Copyright 2017-2024, XGBoost Contributors
* \brief Data type for fast histogram aggregation.
*/
#include "gradient_index.h"
@@ -148,7 +148,8 @@ void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
}
this->data = std::move(new_vec);
- this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
+ this->index = common::Index{common::Span{data.data(), static_cast<std::size_t>(data.size())},
+ t_size};
};
if ((MaxNumBinPerFeat() - 1 <= static_cast<bst_bin_t>(std::numeric_limits<std::uint8_t>::max())) &&
diff --git a/src/data/gradient_index_format.cc b/src/data/gradient_index_format.cc
index fa8f492ed12a..542d3aaebda7 100644
--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2021-2023 XGBoost contributors
+ * Copyright 2021-2024 XGBoost contributors
*/
#include <cstddef>  // for size_t
#include <cstdint>  // for uint8_t
@@ -40,7 +40,9 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
return false;
}
// - index
- page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
+ page->index =
+ common::Index{common::Span{page->data.data(), static_cast<std::size_t>(page->data.size())},
+ size_type};
// hit count
if (!common::ReadVec(fi, &page->hit_count)) {
diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc
index aad33c272dc7..019804eda31c 100644
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023 by Contributors
+ * Copyright 2017-2024 by Contributors
*/
#include "xgboost/predictor.h"
@@ -46,7 +46,7 @@ void ValidateBaseMarginShape(linalg::Tensor const& margin, bst_row_t n
void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_float>* out_preds,
const gbm::GBTreeModel& model) const {
CHECK_NE(model.learner_model_param->num_output_group, 0);
- std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
+ auto n = static_cast<std::size_t>(model.learner_model_param->OutputLength() * info.num_row_);
const HostDeviceVector<float>* base_margin = info.base_margin_.Data();
if (ctx_->Device().IsCUDA()) {
diff --git a/src/tree/hist/hist_cache.h b/src/tree/hist/hist_cache.h
index 8a2ba193af0c..715e1d73e60c 100644
--- a/src/tree/hist/hist_cache.h
+++ b/src/tree/hist/hist_cache.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2023 by XGBoost Contributors
+ * Copyright 2023-2024 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
@@ -48,11 +48,13 @@ class BoundedHistCollection {
BoundedHistCollection() = default;
common::GHistRow operator[](std::size_t idx) {
auto offset = node_map_.at(idx);
- return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
+ return common::Span{data_->data(), static_cast<std::size_t>(data_->size())}.subspan(
+ offset, n_total_bins_);
}
common::ConstGHistRow operator[](std::size_t idx) const {
auto offset = node_map_.at(idx);
- return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
+ return common::Span{data_->data(), static_cast<std::size_t>(data_->size())}.subspan(
+ offset, n_total_bins_);
}
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
n_total_bins_ = n_total_bins;
diff --git a/tests/ci_build/Dockerfile.i386 b/tests/ci_build/Dockerfile.i386
new file mode 100644
index 000000000000..d7c133e2aee4
--- /dev/null
+++ b/tests/ci_build/Dockerfile.i386
@@ -0,0 +1,8 @@
+FROM i386/debian:sid
+
+ENV DEBIAN_FRONTEND noninteractive
+SHELL ["/bin/bash", "-c"] # Use Bash as shell
+
+RUN \
+ apt-get update && \
+ apt-get install -y tar unzip wget git build-essential ninja-build cmake
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 08862feee79a..20923519ac49 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -14,8 +14,38 @@ if(USE_CUDA)
endif()
file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
-if(NOT PLUGIN_SYCL)
- list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES})
+list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES})
+
+if(PLUGIN_SYCL)
+ set(CMAKE_CXX_COMPILER "icpx")
+ file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
+ add_library(plugin_sycl_test OBJECT ${SYCL_TEST_SOURCES})
+
+ target_include_directories(plugin_sycl_test
+ PRIVATE
+ ${gtest_SOURCE_DIR}/include
+ ${xgboost_SOURCE_DIR}/include
+ ${xgboost_SOURCE_DIR}/dmlc-core/include
+ ${xgboost_SOURCE_DIR}/rabit/include)
+
+ target_compile_definitions(plugin_sycl_test PUBLIC -DXGBOOST_USE_SYCL=1)
+
+ target_link_libraries(plugin_sycl_test PUBLIC -fsycl)
+
+ set_target_properties(plugin_sycl_test PROPERTIES
+ COMPILE_FLAGS -fsycl
+ CXX_STANDARD 17
+ CXX_STANDARD_REQUIRED ON
+ POSITION_INDEPENDENT_CODE ON)
+ if(USE_OPENMP)
+ find_package(OpenMP REQUIRED)
+ set_target_properties(plugin_sycl_test PROPERTIES
+ COMPILE_FLAGS "-fsycl -qopenmp")
+ endif()
+ # Get compilation and link flags of plugin_sycl and propagate to testxgboost
+ target_link_libraries(testxgboost PUBLIC plugin_sycl_test)
+ # Add all objects of plugin_sycl to testxgboost
+ target_sources(testxgboost INTERFACE $<TARGET_OBJECTS:plugin_sycl_test>)
endif()
if(PLUGIN_FEDERATED)
diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc
index 4491dee92de5..c4c1f0c45f42 100644
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2023 XGBoost contributors
+ * Copyright 2019-2024 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/c_api.h>
@@ -212,8 +212,8 @@ TEST(CAPI, JsonModelIO) {
bst_ulong saved_len{0};
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &saved_len, &saved);
ASSERT_EQ(len, saved_len);
- auto l = StringView{data, len};
- auto r = StringView{saved, saved_len};
+ auto l = StringView{data, static_cast<std::size_t>(len)};
+ auto r = StringView{saved, static_cast<std::size_t>(saved_len)};
ASSERT_EQ(l.size(), r.size());
ASSERT_EQ(l, r);
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 97db9dbd87fc..6ce362f46763 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2016-2023 by XGBoost contributors
+ * Copyright 2016-2024 by XGBoost contributors
*/
#include "helpers.h"
@@ -216,7 +216,7 @@ SimpleLCG::StateType SimpleLCG::Max() const { return max(); }
static_assert(SimpleLCG::max() - SimpleLCG::min());
void RandomDataGenerator::GenerateLabels(std::shared_ptr p_fmat) const {
- RandomDataGenerator{p_fmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+ RandomDataGenerator{static_cast<bst_row_t>(p_fmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense(
p_fmat->Info().labels.Data());
CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
@@ -458,7 +458,7 @@ void RandomDataGenerator::GenerateCSR(
EXPECT_EQ(row_count, dmat->Info().num_row_);
if (with_label) {
- RandomDataGenerator{dmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+ RandomDataGenerator{static_cast<bst_row_t>(dmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense(
dmat->Info().labels.Data());
CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_);
dmat->Info().labels.Reshape(this->rows_, this->n_targets_);
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 9adda8aedfad..d603685eb073 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2016-2023 by XGBoost contributors
+ * Copyright 2016-2024 by XGBoost contributors
*/
#pragma once
@@ -238,7 +238,7 @@ class RandomDataGenerator {
bst_bin_t bins_{0};
std::vector<FeatureType> ft_;
- bst_cat_t max_cat_;
+ bst_cat_t max_cat_{32};
Json ArrayInterfaceImpl(HostDeviceVector<float>* storage, size_t rows, size_t cols) const;
diff --git a/tests/cpp/plugin/test_sycl_multiclass_obj.cc b/tests/cpp/plugin/test_sycl_multiclass_obj.cc
index d809ecad3fc1..d306337ac599 100644
--- a/tests/cpp/plugin/test_sycl_multiclass_obj.cc
+++ b/tests/cpp/plugin/test_sycl_multiclass_obj.cc
@@ -2,7 +2,11 @@
* Copyright 2018-2023 XGBoost contributors
*/
#include <gtest/gtest.h>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
#include <xgboost/context.h>
+#pragma GCC diagnostic pop
#include "../objective/test_multiclass_obj.h"
diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc
new file mode 100644
index 000000000000..90bc757eb1b0
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_partition_builder.cc
@@ -0,0 +1,91 @@
+/**
+ * Copyright 2020-2024 by XGBoost contributors
+ */
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+#include "../../../plugin/sycl/common/partition_builder.h"
+#include "../../../plugin/sycl/device_manager.h"
+#include "../helpers.h"
+
+namespace xgboost::sycl::common {
+
+TEST(SyclPartitionBuilder, BasicTest) {
+ constexpr size_t kNodes = 5;
+ // Number of rows for each node
+ std::vector<size_t> rows = { 5, 5, 10, 1, 2 };
+
+ DeviceManager device_manager;
+ auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault());
+ PartitionBuilder builder;
+ builder.Init(&qu, kNodes, [&](size_t i) {
+ return rows[i];
+ });
+
+ // We test here only the basics, thus a synthetic partition is adopted
+ // Number of rows to go left for each node.
+ std::vector<size_t> rows_for_left_node = { 2, 0, 7, 1, 2 };
+
+ size_t first_row_id = 0;
+ for(size_t nid = 0; nid < kNodes; ++nid) {
+ size_t n_rows_nodes = rows[nid];
+
+ auto rid_buff = builder.GetData(nid);
+ size_t rid_buff_size = rid_buff.size();
+ auto* rid_buff_ptr = rid_buff.data();
+
+ size_t n_left = rows_for_left_node[nid];
+ size_t n_right = rows[nid] - n_left;
+
+ qu.submit([&](::sycl::handler& cgh) {
+ cgh.parallel_for<>(::sycl::range<1>(n_left), [=](::sycl::id<1> pid) {
+ int row_id = first_row_id + pid[0];
+ rid_buff_ptr[pid[0]] = row_id;
+ });
+ });
+ qu.wait();
+ first_row_id += n_left;
+
+ // We are storing indexes for the right side in the tail of the array to save some memory
+ qu.submit([&](::sycl::handler& cgh) {
+ cgh.parallel_for<>(::sycl::range<1>(n_right), [=](::sycl::id<1> pid) {
+ int row_id = first_row_id + pid[0];
+ rid_buff_ptr[rid_buff_size - pid[0] - 1] = row_id;
+ });
+ });
+ qu.wait();
+ first_row_id += n_right;
+
+ builder.SetNLeftElems(nid, n_left);
+ builder.SetNRightElems(nid, n_right);
+ }
+
+ ::sycl::event event;
+ std::vector<size_t> v(*std::max_element(rows.begin(), rows.end()));
+ size_t row_id = 0;
+ for(size_t nid = 0; nid < kNodes; ++nid) {
+ builder.MergeToArray(nid, v.data(), event);
+ qu.wait();
+
+ // Check that row_id for left side are correct
+ for(size_t j = 0; j < rows_for_left_node[nid]; ++j) {
+ ASSERT_EQ(v[j], row_id++);
+ }
+
+ // Check that row_id for right side are correct
+ for(size_t j = 0; j < rows[nid] - rows_for_left_node[nid]; ++j) {
+ ASSERT_EQ(v[rows[nid] - j - 1], row_id++);
+ }
+
+ // Check that number of left/right rows are correct
+ size_t n_left = builder.GetNLeftElems(nid);
+ size_t n_right = builder.GetNRightElems(nid);
+ ASSERT_EQ(n_left, rows_for_left_node[nid]);
+ ASSERT_EQ(n_right, (rows[nid] - rows_for_left_node[nid]));
+ }
+}
+
+}  // namespace xgboost::sycl::common
diff --git a/tests/cpp/plugin/test_sycl_predictor.cc b/tests/cpp/plugin/test_sycl_predictor.cc
index f82a9f33d5f8..d5b3a5e5cd9a 100755
--- a/tests/cpp/plugin/test_sycl_predictor.cc
+++ b/tests/cpp/plugin/test_sycl_predictor.cc
@@ -2,11 +2,19 @@
* Copyright 2017-2023 XGBoost contributors
*/
#include <gtest/gtest.h>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
#include <xgboost/context.h>
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#include "../../../src/data/adapter.h"
-#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/gbm/gbtree.h"
+#pragma GCC diagnostic pop
+
+#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/gbm/gbtree_model.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
diff --git a/tests/cpp/plugin/test_sycl_regression_obj.cc b/tests/cpp/plugin/test_sycl_regression_obj.cc
index 66b4ea508477..349415390268 100644
--- a/tests/cpp/plugin/test_sycl_regression_obj.cc
+++ b/tests/cpp/plugin/test_sycl_regression_obj.cc
@@ -2,7 +2,11 @@
* Copyright 2017-2019 XGBoost contributors
*/
#include <gtest/gtest.h>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
#include <xgboost/context.h>
+#pragma GCC diagnostic pop
#include <xgboost/objective.h>
#include "../helpers.h"
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 25a800367c49..76428d1d83b4 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -68,8 +68,8 @@ void TestAddHistRows(bool is_distributed) {
HistMakerTrainParam hist_param;
HistogramBuilder histogram_builder;
- histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed, false,
- &hist_param);
+ histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed,
+ false, false, &hist_param);
histogram_builder.AddHistRows(&tree, &nodes_to_build, &nodes_to_sub, false);
for (bst_node_t const &nidx : nodes_to_build) {
@@ -102,7 +102,7 @@ void TestSyncHist(bool is_distributed) {
HistogramBuilder histogram;
uint32_t total_bins = gmat.cut.Ptrs().back();
HistMakerTrainParam hist_param;
- histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, &hist_param);
+ histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, false, &hist_param);
common::RowSetCollection row_set_collection;
{
@@ -222,13 +222,13 @@ TEST(CPUHistogram, SyncHist) {
TestSyncHist(false);
}
-void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) {
+void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split, bool is_secure) {
size_t constexpr kNRows = 8, kNCols = 16;
int32_t constexpr kMaxBins = 4;
Context ctx;
auto p_fmat =
RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
- if (is_col_split) {
+ if (is_col_split && !is_secure) {
p_fmat = std::shared_ptr{
p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
}
@@ -244,7 +244,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
bst_node_t nid = 0;
HistogramBuilder histogram;
HistMakerTrainParam hist_param;
- histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param);
+ histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, is_secure, &hist_param);
RegTree tree;
@@ -286,22 +286,41 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
// Now validate the computed histogram returned by BuildHist
for (size_t i = 0; i < histogram.Histogram()[nid].size(); ++i) {
GradientPairPrecise sol = histogram_expected[i];
- ASSERT_NEAR(sol.GetGrad(), histogram.Histogram()[nid][i].GetGrad(), kEps);
- ASSERT_NEAR(sol.GetHess(), histogram.Histogram()[nid][i].GetHess(), kEps);
+ double grad = sol.GetGrad();
+ double hess = sol.GetHess();
+ if (is_distributed && (!is_col_split || (is_secure && is_col_split))) {
+ // the solution also needs to be allreduced
+ collective::Allreduce<collective::Operation::kSum>(&grad, 1);
+ collective::Allreduce<collective::Operation::kSum>(&hess, 1);
+ }
+ ASSERT_NEAR(grad, histogram.Histogram()[nid][i].GetGrad(), kEps);
+ ASSERT_NEAR(hess, histogram.Histogram()[nid][i].GetHess(), kEps);
}
}
TEST(CPUHistogram, BuildHist) {
- TestBuildHistogram(true, false, false);
- TestBuildHistogram(false, false, false);
- TestBuildHistogram(true, true, false);
- TestBuildHistogram(false, true, false);
+ TestBuildHistogram(true, false, false, false);
+ TestBuildHistogram(false, false, false, false);
+ TestBuildHistogram(true, true, false, false);
+ TestBuildHistogram(false, true, false, false);
+}
+
+TEST(CPUHistogram, BuildHistDist) {
+ auto constexpr kWorkers = 4;
+ RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, false, false);
+ RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, false, false);
+}
+
+TEST(CPUHistogram, BuildHistDistColSplit) {
+ auto constexpr kWorkers = 4;
+ RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true, false);
+ RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true, false);
}
-TEST(CPUHistogram, BuildHistColSplit) {
+TEST(CPUHistogram, BuildHistDistColSplitSecure) {
auto constexpr kWorkers = 4;
- RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true);
- RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true);
+ RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true, true);
+ RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true, true);
}
namespace {
@@ -360,7 +379,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
HistogramBuilder cat_hist;
for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
auto total_bins = gidx.cut.TotalBins();
- cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+ cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, false, &hist_param);
cat_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
cat_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
@@ -376,7 +395,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
HistogramBuilder onehot_hist;
for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
auto total_bins = gidx.cut.TotalBins();
- onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+ onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, false, &hist_param);
onehot_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
onehot_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
@@ -442,7 +461,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
}
ASSERT_EQ(n_samples, m->Info().num_row_);
- multi_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
+ multi_build.Reset(ctx, total_bins, batch_param, false, false, false, &hist_param);
multi_build.AddHistRows(&tree, &nodes, &dummy_sub, false);
std::size_t page_idx{0};
for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
@@ -465,7 +484,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
common::RowSetCollection row_set_collection;
InitRowPartitionForTest(&row_set_collection, n_samples);
- single_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
+ single_build.Reset(ctx, total_bins, batch_param, false, false, false, &hist_param);
SparsePage concat;
std::vector<float> hess(m->Info().num_row_, 1.0f);
for (auto const &page : m->GetBatches<SparsePage>()) {
@@ -542,7 +561,7 @@ class OverflowTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {
CHECK_EQ(Xy->Info().IsColumnSplit(), is_col_split);
hist_builder.Reset(&ctx, n_total_bins, tree.NumTargets(), batch, is_distributed,
- Xy->Info().IsColumnSplit(), &hist_param);
+ Xy->Info().IsColumnSplit(), Xy->Info().IsSecure(), &hist_param);
std::vector<CommonRowPartitioner> partitioners;
partitioners.emplace_back(&ctx, Xy->Info().num_row_, /*base_rowid=*/0,