diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml
new file mode 100644
index 000000000000..4a4d65b25b61
--- /dev/null
+++ b/.github/workflows/i386.yml
@@ -0,0 +1,39 @@
+name: XGBoost-i386-test
+
+on: [push, pull_request]
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+jobs:
+ build-32bit:
+ name: Build 32-bit
+ runs-on: ubuntu-latest
+ services:
+ registry:
+ image: registry:2
+ ports:
+ - 5000:5000
+ steps:
+ - uses: actions/checkout@v2.5.0
+ with:
+ submodules: 'true'
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+ with:
+ driver-opts: network=host
+ - name: Build and push container
+ uses: docker/build-push-action@v5
+ with:
+ context: .
+ file: tests/ci_build/Dockerfile.i386
+ push: true
+ tags: localhost:5000/xgboost/build-32bit:latest
+ cache-from: type=gha
+ cache-to: type=gha,mode=max
+ - name: Build XGBoost
+ run: |
+ docker run --rm -v $PWD:/workspace -w /workspace \
+ -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \
+ localhost:5000/xgboost/build-32bit:latest \
+ tests/ci_build/build_via_cmake.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f240e806282..dbfa1cdc225b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,9 +39,6 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(FATAL_ERROR "Need Clang 9.0 or newer to build XGBoost")
endif()
endif()
-if(CMAKE_SIZE_OF_VOID_P EQUAL 4)
- message(FATAL_ERROR "XGBoost does not support 32-bit archs. Please use 64-bit arch instead.")
-endif()
include(${xgboost_SOURCE_DIR}/cmake/PrefetchIntrinsics.cmake)
find_prefetch_intrinsics()
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 49f93bb57274..580d1f87325f 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -15,7 +15,6 @@ S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
S3method(setinfo,xgb.Booster)
S3method(setinfo,xgb.DMatrix)
-S3method(slice,xgb.DMatrix)
S3method(variable.names,xgb.Booster)
export("xgb.attr<-")
export("xgb.attributes<-")
@@ -30,13 +29,12 @@ export(cb.reset.parameters)
export(cb.save.model)
export(getinfo)
export(setinfo)
-export(slice)
export(xgb.DMatrix)
export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save)
+export(xgb.DataBatch)
export(xgb.DataIter)
export(xgb.ExternalDMatrix)
-export(xgb.ProxyDMatrix)
export(xgb.QuantileDMatrix)
export(xgb.QuantileDMatrix.from_iterator)
export(xgb.attr)
@@ -70,6 +68,7 @@ export(xgb.save)
export(xgb.save.raw)
export(xgb.set.config)
export(xgb.slice.Booster)
+export(xgb.slice.DMatrix)
export(xgb.train)
export(xgboost)
import(methods)
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 7613c9152d14..febefb757129 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -111,6 +111,21 @@ xgb.get.handle <- function(object) {
#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
#' type and shape of predictions are invariant to the model type.
+#' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names
+#' match (only applicable when both `object` and `newdata` have feature names).
+#'
+#' If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
+#' the columns in `newdata` to match with the booster's.
+#'
+#' If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
+#' will additionally verify that categorical columns are of the correct type in `newdata`,
+#' throwing an error if they do not match.
+#'
+#' If passing `FALSE`, it is assumed that the feature names and types are the same,
+#' and come in the same order as in the training data.
+#'
+#' Note that this check might add some sizable latency to the predictions, so it's
+#' recommended to disable it for performance-sensitive applications.
#' @param ... Not used.
#'
#' @details
@@ -271,7 +286,11 @@ xgb.get.handle <- function(object) {
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
- reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
+ reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE,
+ validate_features = FALSE, ...) {
+ if (validate_features) {
+ newdata <- validate.features(object, newdata)
+ }
if (!inherits(newdata, "xgb.DMatrix")) {
nthread <- xgb.nthread(object)
newdata <- xgb.DMatrix(
@@ -418,6 +437,85 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
return(arr)
}
+validate.features <- function(bst, newdata) {
+ if (is.character(newdata)) {
+ # this will be encountered when passing file paths
+ return(newdata)
+ }
+ if (inherits(newdata, "sparseVector")) {
+ # in this case, newdata won't have metadata
+ return(newdata)
+ }
+ if (is.vector(newdata)) {
+ newdata <- as.matrix(newdata)
+ }
+
+ booster_names <- getinfo(bst, "feature_name")
+ checked_names <- FALSE
+ if (NROW(booster_names)) {
+
+ try_reorder <- FALSE
+ if (inherits(newdata, "xgb.DMatrix")) {
+ curr_names <- getinfo(newdata, "feature_name")
+ } else {
+ curr_names <- colnames(newdata)
+ try_reorder <- TRUE
+ }
+
+ if (NROW(curr_names)) {
+ checked_names <- TRUE
+
+ if (length(curr_names) != length(booster_names) || any(curr_names != booster_names)) {
+
+ if (!try_reorder) {
+ stop("Feature names in 'newdata' do not match with booster's.")
+ } else {
+ if (inherits(newdata, "data.table")) {
+ newdata <- newdata[, booster_names, with = FALSE]
+ } else {
+ newdata <- newdata[, booster_names, drop = FALSE]
+ }
+ }
+
+ }
+
+ } # if (NROW(curr_names)) {
+
+ } # if (NROW(booster_names)) {
+
+ if (inherits(newdata, c("data.frame", "xgb.DMatrix"))) {
+
+ booster_types <- getinfo(bst, "feature_type")
+ if (!NROW(booster_types)) {
+ # Note: types in the booster are optional. Other interfaces
+ # might not even save it as booster attributes for example,
+ # even if the model uses categorical features.
+ return(newdata)
+ }
+ if (inherits(newdata, "xgb.DMatrix")) {
+ curr_types <- getinfo(newdata, "feature_type")
+ if (length(curr_types) != length(booster_types) || any(curr_types != booster_types)) {
+ stop("Feature types in 'newdata' do not match with booster's.")
+ }
+ }
+ if (inherits(newdata, "data.frame")) {
+ is_factor <- sapply(newdata, is.factor)
+ if (any(is_factor != (booster_types == "c"))) {
+ stop(
+ paste0(
+ "Feature types in 'newdata' do not match with booster's for same columns (by ",
+ ifelse(checked_names, "name", "position"),
+ ")."
+ )
+ )
+ }
+ }
+
+ }
+
+ return(newdata)
+}
+
#' @title Accessors for serializable attributes of a model
#'
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index 2a664090d2dd..ba0686cf9b04 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -16,10 +16,6 @@
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
#'
-#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical.
-#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error
-#' will be thrown.
-#'
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
#' encoding') will be converted inside the function call. Be aware that the encoding used for `factor`
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
@@ -60,7 +56,7 @@
#' must be the same as in the DMatrix construction, regardless of the column names.
#' @param feature_types Set types for features.
#'
-#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced
+#' If `data` is a `data.frame` and `feature_types` is not supplied, the feature types will be deduced
#' automatically from the column types.
#'
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
@@ -85,18 +81,6 @@
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
-#' @param enable_categorical Experimental support of specializing for categorical features.
-#'
-#' If passing 'TRUE' and 'data' is a data frame,
-#' columns of categorical types will automatically
-#' be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-#'
-#' If passing 'FALSE' and 'data' is a data frame with categorical columns,
-#' it will result in an error being thrown.
-#'
-#' If 'data' is not a data frame, this argument is ignored.
-#'
-#' JSON/UBJSON serialization format is required for this.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#' subclass 'xgb.QuantileDMatrix'.
#'
@@ -133,8 +117,7 @@ xgb.DMatrix <- function(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
- feature_weights = NULL,
- enable_categorical = FALSE
+ feature_weights = NULL
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
@@ -186,7 +169,7 @@ xgb.DMatrix <- function(
nthread
)
} else if (is.data.frame(data)) {
- tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types)
+ tmp <- .process.df.for.dmatrix(data, feature_types)
feature_types <- tmp$feature_types
handle <- .Call(
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
@@ -218,7 +201,7 @@ xgb.DMatrix <- function(
return(dmat)
}
-.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) {
+.process.df.for.dmatrix <- function(df, feature_types) {
if (!nrow(df) || !ncol(df)) {
stop("'data' is an empty data.frame.")
}
@@ -231,12 +214,6 @@ xgb.DMatrix <- function(
} else {
feature_types <- sapply(df, function(col) {
if (is.factor(col)) {
- if (!enable_categorical) {
- stop(
- "When factor type is used, the parameter `enable_categorical`",
- " must be set to TRUE."
- )
- }
return("c")
} else if (is.integer(col)) {
return("int")
@@ -332,7 +309,6 @@ xgb.QuantileDMatrix <- function(
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
- enable_categorical = FALSE,
ref = NULL,
max_bin = NULL
) {
@@ -363,8 +339,7 @@ xgb.QuantileDMatrix <- function(
qid = qid,
label_lower_bound = label_lower_bound,
label_upper_bound = label_upper_bound,
- feature_weights = feature_weights,
- enable_categorical = enable_categorical
+ feature_weights = feature_weights
)
)
data_iterator <- .single.data.iterator(iterator_env)
@@ -379,7 +354,7 @@ xgb.QuantileDMatrix <- function(
.Call(XGDMatrixFree_R, proxy_handle)
})
iterator_next <- function() {
- return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
iterator_reset <- function() {
return(data_iterator$f_reset(iterator_env))
@@ -422,12 +397,12 @@ xgb.QuantileDMatrix <- function(
#' to know which part of the data to pass next.
#' @param f_next `function(env)` which is responsible for:\itemize{
#' \item Accessing or retrieving the next batch of data in the iterator.
-#' \item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
+#' \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
#' \item Keeping track of where in the iterator batch it is or will go next, which can for example
#' be done by modifiying variables in the `env` variable that is passed here.
#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
-#' calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
+#' calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
#' }
#' @param f_reset `function(env)` which is responsible for reseting the data iterator
#' (i.e. taking it back to the first batch, called before and after the sequence of batches
@@ -437,7 +412,7 @@ xgb.QuantileDMatrix <- function(
#' (and in the same order) must be passed in subsequent iterations.
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
#' be passed to \link{xgb.ExternalDMatrix}.
-#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
+#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
#' @export
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
if (!is.function(f_next)) {
@@ -465,7 +440,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
env[["iter"]] <- curr_iter + 1L
})
return(
- xgb.ProxyDMatrix(
+ xgb.DataBatch(
data = env[["data"]],
label = env[["label"]],
weight = env[["weight"]],
@@ -476,8 +451,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
qid = env[["qid"]],
label_lower_bound = env[["label_lower_bound"]],
label_upper_bound = env[["label_upper_bound"]],
- feature_weights = env[["feature_weights"]],
- enable_categorical = env[["enable_categorical"]]
+ feature_weights = env[["feature_weights"]]
)
)
}
@@ -496,13 +470,13 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
.make.proxy.handle <- function() {
out <- .Call(XGProxyDMatrixCreate_R)
attributes(out) <- list(
- class = c("xgb.DMatrix", "xgb.ProxyDMatrixHandle"),
+ class = c("xgb.DMatrix", "xgb.ProxyDMatrix"),
fields = new.env()
)
return(out)
}
-#' @title Proxy DMatrix Updater
+#' @title Structure for Data Batches
#' @description Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
#' or through \link{xgb.QuantileDMatrix.from_iterator}.
@@ -512,8 +486,8 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' when constructing a DMatrix through external memory - otherwise, one should call
#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
#'
-#' The object that results from calling this function directly is \bold{not} like the other
-#' `xgb.DMatrix` variants - i.e. cannot be used to train a model, nor to get predictions - only
+#' The object that results from calling this function directly is \bold{not} like
+#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
#'
#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
@@ -531,11 +505,11 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' \link{xgb.DMatrix} for details on it.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' }
-#' @return An object of class `xgb.ProxyDMatrix`, which is just a list containing the
+#' @return An object of class `xgb.DataBatch`, which is just a list containing the
#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
#' @export
-xgb.ProxyDMatrix <- function(
+xgb.DataBatch <- function(
data,
label = NULL,
weight = NULL,
@@ -546,8 +520,7 @@ xgb.ProxyDMatrix <- function(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
- feature_weights = NULL,
- enable_categorical = FALSE
+ feature_weights = NULL
) {
stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
out <- list(
@@ -561,27 +534,27 @@ xgb.ProxyDMatrix <- function(
qid = qid,
label_lower_bound = label_lower_bound,
label_upper_bound = label_upper_bound,
- feature_weights = feature_weights,
- enable_categorical = enable_categorical
+ feature_weights = feature_weights
)
- class(out) <- "xgb.ProxyDMatrix"
+ class(out) <- "xgb.DataBatch"
return(out)
}
-xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
+# This is only for internal usage; the class is not exposed to the user.
+xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
lst <- data_iterator$f_next(data_iterator$env)
if (is.null(lst)) {
return(0L)
}
- if (!inherits(lst, "xgb.ProxyDMatrix")) {
- stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.ProxyDMatrix'.")
+ if (!inherits(lst, "xgb.DataBatch")) {
+ stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.")
}
if (!is.null(lst$group) && !is.null(lst$qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
if (is.data.frame(lst$data)) {
- tmp <- .process.df.for.dmatrix(lst$data, lst$enable_categorical, lst$feature_types)
+ tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
lst$feature_types <- tmp$feature_types
.Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
rm(tmp)
@@ -640,7 +613,7 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
#' This should not pose any problem for `numeric` types, since they do have an inheret NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
#' held internally but accessed through the iterator when needed.
-#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
+#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
#' @examples
#' library(xgboost)
#' data(mtcars)
@@ -680,10 +653,10 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
#' iterator_env[["iter"]] <- curr_iter + 1
#' })
#'
-#' # Function 'xgb.ProxyDMatrix' must be called manually
+#' # Function 'xgb.DataBatch' must be called manually
#' # at each batch with all the appropriate attributes,
#' # such as feature names and feature types.
-#' return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+#' return(xgb.DataBatch(data = x_batch, label = y_batch))
#' }
#'
#' # This moves the iterator back to its beginning
@@ -727,7 +700,7 @@ xgb.ExternalDMatrix <- function(
.Call(XGDMatrixFree_R, proxy_handle)
})
iterator_next <- function() {
- return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env))
@@ -770,7 +743,7 @@ xgb.ExternalDMatrix <- function(
#' @inheritParams xgb.ExternalDMatrix
#' @inheritParams xgb.QuantileDMatrix
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
-#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
+#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
#' \link{xgb.QuantileDMatrix}
#' @export
xgb.QuantileDMatrix.from_iterator <- function( # nolint
@@ -792,7 +765,7 @@ xgb.QuantileDMatrix.from_iterator <- function( # nolint
.Call(XGDMatrixFree_R, proxy_handle)
})
iterator_next <- function() {
- return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator))
+ return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env))
@@ -1262,19 +1235,15 @@ xgb.get.DMatrix.data <- function(dmat) {
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
-#' dsub <- slice(dtrain, 1:42)
+#' dsub <- xgb.slice.DMatrix(dtrain, 1:42)
#' labels1 <- getinfo(dsub, 'label')
#' dsub <- dtrain[1:42, ]
#' labels2 <- getinfo(dsub, 'label')
#' all.equal(labels1, labels2)
#'
-#' @rdname slice.xgb.DMatrix
-#' @export
-slice <- function(object, idxset) UseMethod("slice")
-
-#' @rdname slice.xgb.DMatrix
+#' @rdname xgb.slice.DMatrix
#' @export
-slice.xgb.DMatrix <- function(object, idxset) {
+xgb.slice.DMatrix <- function(object, idxset) {
if (!inherits(object, "xgb.DMatrix")) {
stop("object must be xgb.DMatrix")
}
@@ -1298,10 +1267,10 @@ slice.xgb.DMatrix <- function(object, idxset) {
return(structure(ret, class = "xgb.DMatrix"))
}
-#' @rdname slice.xgb.DMatrix
+#' @rdname xgb.slice.DMatrix
#' @export
`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
- slice(object, idxset)
+ xgb.slice.DMatrix(object, idxset)
}
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index eb0495631d6e..29bddb57f3e2 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -197,12 +197,12 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
nthread = params$nthread
)
bst_folds <- lapply(seq_along(folds), function(k) {
- dtest <- slice(dall, folds[[k]])
+ dtest <- xgb.slice.DMatrix(dall, folds[[k]])
# code originally contributed by @RolandASc on stackoverflow
if (is.null(train_folds))
- dtrain <- slice(dall, unlist(folds[-k]))
+ dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]))
else
- dtrain <- slice(dall, train_folds[[k]])
+ dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]])
bst <- xgb.Booster(
params = params,
cachelist = list(dtrain, dtest),
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 7a6dd6c1306b..95e7a51fd043 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -17,6 +17,7 @@
training = FALSE,
iterationrange = NULL,
strict_shape = FALSE,
+ validate_features = FALSE,
...
)
}
@@ -66,6 +67,23 @@ base-1 indexing, and inclusive of both ends).
\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
type and shape of predictions are invariant to the model type.}
+\item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's feature_names
+match (only applicable when both \code{object} and \code{newdata} have feature names).
+
+\if{html}{\out{
}}\preformatted{ If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
+ the columns in `newdata` to match with the booster's.
+
+ If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
+ will additionally verify that categorical columns are of the correct type in `newdata`,
+ throwing an error if they do not match.
+
+ If passing `FALSE`, it is assumed that the feature names and types are the same,
+ and come in the same order as in the training data.
+
+ Note that this check might add some sizable latency to the predictions, so it's
+ recommended to disable it for performance-sensitive applications.
+}\if{html}{\out{
}}}
+
\item{...}{Not used.}
}
\value{
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index 0e17a79f9cea..d182707332df 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -19,8 +19,7 @@ xgb.DMatrix(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
- feature_weights = NULL,
- enable_categorical = FALSE
+ feature_weights = NULL
)
xgb.QuantileDMatrix(
@@ -37,7 +36,6 @@ xgb.QuantileDMatrix(
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
- enable_categorical = FALSE,
ref = NULL,
max_bin = NULL
)
@@ -50,10 +48,6 @@ Supported input types are as follows:\itemize{
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
-If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
-Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
-will be thrown.
-
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
encoding') will be converted inside the function call. Be aware that the encoding used for \code{factor}
types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
@@ -103,7 +97,7 @@ frame and matrix.
\item{feature_types}{Set types for features.
-If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, the feature types will be deduced
automatically from the column types.
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@@ -135,20 +129,6 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h
\item{feature_weights}{Set feature weights for column sampling.}
-\item{enable_categorical}{Experimental support of specializing for categorical features.
-
-\if{html}{\out{}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
- columns of categorical types will automatically
- be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-
- If passing 'FALSE' and 'data' is a data frame with categorical columns,
- it will result in an error being thrown.
-
- If 'data' is not a data frame, this argument is ignored.
-
- JSON/UBJSON serialization format is required for this.
-}\if{html}{\out{
}}}
-
\item{ref}{The training dataset that provides quantile information, needed when creating
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
as a reference means that the same quantisation applied to the training data is
diff --git a/R-package/man/xgb.ProxyDMatrix.Rd b/R-package/man/xgb.DataBatch.Rd
similarity index 78%
rename from R-package/man/xgb.ProxyDMatrix.Rd
rename to R-package/man/xgb.DataBatch.Rd
index cf90024cec79..063b82b031cc 100644
--- a/R-package/man/xgb.ProxyDMatrix.Rd
+++ b/R-package/man/xgb.DataBatch.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
-\name{xgb.ProxyDMatrix}
-\alias{xgb.ProxyDMatrix}
-\title{Proxy DMatrix Updater}
+\name{xgb.DataBatch}
+\alias{xgb.DataBatch}
+\title{Structure for Data Batches}
\usage{
-xgb.ProxyDMatrix(
+xgb.DataBatch(
data,
label = NULL,
weight = NULL,
@@ -15,8 +15,7 @@ xgb.ProxyDMatrix(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
- feature_weights = NULL,
- enable_categorical = FALSE
+ feature_weights = NULL
)
}
\arguments{
@@ -58,7 +57,7 @@ frame and matrix.
\item{feature_types}{Set types for features.
-If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, the feature types will be deduced
automatically from the column types.
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@@ -87,23 +86,9 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h
\item{label_upper_bound}{Upper bound for survival training.}
\item{feature_weights}{Set feature weights for column sampling.}
-
-\item{enable_categorical}{Experimental support of specializing for categorical features.
-
-\if{html}{\out{}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
- columns of categorical types will automatically
- be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-
- If passing 'FALSE' and 'data' is a data frame with categorical columns,
- it will result in an error being thrown.
-
- If 'data' is not a data frame, this argument is ignored.
-
- JSON/UBJSON serialization format is required for this.
-}\if{html}{\out{
}}}
}
\value{
-An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
+An object of class \code{xgb.DataBatch}, which is just a list containing the
data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
}
\description{
@@ -116,8 +101,8 @@ is passed as argument to function \link{xgb.DataIter} to construct a data iterat
when constructing a DMatrix through external memory - otherwise, one should call
\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
-The object that results from calling this function directly is \bold{not} like the other
-\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only
+The object that results from calling this function directly is \bold{not} like
+an \code{xgb.DMatrix} - i.e. cannot be used to train a model, nor to get predictions - only
possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
diff --git a/R-package/man/xgb.DataIter.Rd b/R-package/man/xgb.DataIter.Rd
index 29cf5acc9cf4..2bd68ce5108c 100644
--- a/R-package/man/xgb.DataIter.Rd
+++ b/R-package/man/xgb.DataIter.Rd
@@ -15,12 +15,12 @@ to know which part of the data to pass next.}
\item{f_next}{\verb{function(env)} which is responsible for:\itemize{
\item Accessing or retrieving the next batch of data in the iterator.
-\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
+\item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
\item Keeping track of where in the iterator batch it is or will go next, which can for example
be done by modifiying variables in the \code{env} variable that is passed here.
\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
when the stream of data ends (all batches in the iterator have been consumed), or the result from
-calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
+calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
}}
\item{f_reset}{\verb{function(env)} which is responsible for reseting the data iterator
@@ -47,5 +47,5 @@ which will consume the data and create a DMatrix from it by executing the callba
For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
}
\seealso{
-\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
+\link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
}
diff --git a/R-package/man/xgb.ExternalDMatrix.Rd b/R-package/man/xgb.ExternalDMatrix.Rd
index 3e7844990b50..14a872cb585c 100644
--- a/R-package/man/xgb.ExternalDMatrix.Rd
+++ b/R-package/man/xgb.ExternalDMatrix.Rd
@@ -87,10 +87,10 @@ iterator_next <- function(iterator_env) {
iterator_env[["iter"]] <- curr_iter + 1
})
- # Function 'xgb.ProxyDMatrix' must be called manually
+ # Function 'xgb.DataBatch' must be called manually
# at each batch with all the appropriate attributes,
# such as feature names and feature types.
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
# This moves the iterator back to its beginning
@@ -118,5 +118,5 @@ pred_dm <- predict(model, dm)
pred_mat <- predict(model, as.matrix(mtcars[, -1]))
}
\seealso{
-\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
+\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
}
diff --git a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
index 21f24576dcb1..791b5576e653 100644
--- a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
+++ b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
@@ -60,6 +60,6 @@ For more information, see the guide 'Using XGBoost External Memory Version':
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
}
\seealso{
-\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
+\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
\link{xgb.QuantileDMatrix}
}
diff --git a/R-package/man/slice.xgb.DMatrix.Rd b/R-package/man/xgb.slice.DMatrix.Rd
similarity index 84%
rename from R-package/man/slice.xgb.DMatrix.Rd
rename to R-package/man/xgb.slice.DMatrix.Rd
index a2dfb699bb0f..c9695996b66f 100644
--- a/R-package/man/slice.xgb.DMatrix.Rd
+++ b/R-package/man/xgb.slice.DMatrix.Rd
@@ -1,15 +1,12 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
-\name{slice}
-\alias{slice}
-\alias{slice.xgb.DMatrix}
+\name{xgb.slice.DMatrix}
+\alias{xgb.slice.DMatrix}
\alias{[.xgb.DMatrix}
\title{Get a new DMatrix containing the specified rows of
original xgb.DMatrix object}
\usage{
-slice(object, idxset)
-
-\method{slice}{xgb.DMatrix}(object, idxset)
+xgb.slice.DMatrix(object, idxset)
\method{[}{xgb.DMatrix}(object, idxset, colset = NULL)
}
@@ -28,7 +25,7 @@ original xgb.DMatrix object
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
-dsub <- slice(dtrain, 1:42)
+dsub <- xgb.slice.DMatrix(dtrain, 1:42)
labels1 <- getinfo(dsub, 'label')
dsub <- dtrain[1:42, ]
labels2 <- getinfo(dsub, 'label')
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 65374240df00..50621f241c3e 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -166,7 +166,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
test_that("xgb.DMatrix: slice, dim", {
dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
expect_equal(dim(dtest), dim(test_data))
- dsub1 <- slice(dtest, 1:42)
+ dsub1 <- xgb.slice.DMatrix(dtest, 1:42)
expect_equal(nrow(dsub1), 42)
expect_equal(ncol(dsub1), ncol(test_data))
@@ -182,12 +182,12 @@ test_that("xgb.DMatrix: slice, trailing empty rows", {
dtrain <- xgb.DMatrix(
data = train_data, label = train_label, nthread = n_threads
)
- slice(dtrain, 6513L)
+ xgb.slice.DMatrix(dtrain, 6513L)
train_data[6513, ] <- 0
dtrain <- xgb.DMatrix(
data = train_data, label = train_label, nthread = n_threads
)
- slice(dtrain, 6513L)
+ xgb.slice.DMatrix(dtrain, 6513L)
expect_equal(nrow(dtrain), 6513)
})
@@ -338,19 +338,18 @@ test_that("xgb.DMatrix: data.frame", {
stringsAsFactors = TRUE
)
- m <- xgb.DMatrix(df, enable_categorical = TRUE)
+ m <- xgb.DMatrix(df)
expect_equal(colnames(m), colnames(df))
expect_equal(
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
)
- expect_error(xgb.DMatrix(df, enable_categorical = FALSE))
df <- data.frame(
missing = c("a", "b", "d", NA),
valid = c("a", "b", "d", "c"),
stringsAsFactors = TRUE
)
- m <- xgb.DMatrix(df, enable_categorical = TRUE)
+ m <- xgb.DMatrix(df)
expect_equal(getinfo(m, "feature_type"), c("c", "c"))
})
@@ -473,7 +472,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa
y = mtcars[, 1]
)
)
- iterator_next <- function(iterator_env, proxy_handle) {
+ iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) {
return(NULL)
@@ -488,7 +487,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa
on.exit({
iterator_env[["iter"]] <- curr_iter + 1
})
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0
@@ -547,7 +546,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
y = mtcars[, 1]
)
)
- iterator_next <- function(iterator_env, proxy_handle) {
+ iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) {
return(NULL)
@@ -562,7 +561,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
on.exit({
iterator_env[["iter"]] <- curr_iter + 1
})
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0
@@ -605,7 +604,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u
y = mtcars[, 1]
)
)
- iterator_next <- function(iterator_env, proxy_handle) {
+ iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) {
return(0)
@@ -619,7 +618,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u
on.exit({
iterator_env[["iter"]] <- curr_iter + 1
})
- return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+ return(xgb.DataBatch(data = x_batch, label = y_batch))
}
iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index badac0213292..38b5ca0667bf 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -511,3 +511,82 @@ test_that('convert.labels works', {
expect_equal(class(res), 'numeric')
}
})
+
+test_that("validate.features works as expected", {
+ data(mtcars)
+ y <- mtcars$mpg
+ x <- as.matrix(mtcars[, -1])
+ dm <- xgb.DMatrix(x, label = y, nthread = 1)
+ model <- xgb.train(
+ params = list(nthread = 1),
+ data = dm,
+ nrounds = 3
+ )
+
+ # result is output as-is when needed
+ res <- validate.features(model, x)
+ expect_equal(res, x)
+ res <- validate.features(model, dm)
+ expect_identical(res, dm)
+ res <- validate.features(model, as(x[1, ], "dsparseVector"))
+ expect_equal(as.numeric(res), unname(x[1, ]))
+ res <- validate.features(model, "file.txt")
+ expect_equal(res, "file.txt")
+
+ # columns are reordered
+ res <- validate.features(model, mtcars[, rev(names(mtcars))])
+ expect_equal(names(res), colnames(x))
+ expect_equal(as.matrix(res), x)
+ res <- validate.features(model, as.matrix(mtcars[, rev(names(mtcars))]))
+ expect_equal(colnames(res), colnames(x))
+ expect_equal(res, x)
+ res <- validate.features(model, mtcars[1, rev(names(mtcars)), drop = FALSE])
+ expect_equal(names(res), colnames(x))
+ expect_equal(unname(as.matrix(res)), unname(x[1, , drop = FALSE]))
+ res <- validate.features(model, as.data.table(mtcars[, rev(names(mtcars))]))
+ expect_equal(names(res), colnames(x))
+ expect_equal(unname(as.matrix(res)), unname(x))
+
+ # error when columns are missing
+ expect_error({
+ validate.features(model, mtcars[, 1:3])
+ })
+ expect_error({
+ validate.features(model, as.matrix(mtcars[, 1:ncol(x)])) # nolint
+ })
+ expect_error({
+ validate.features(model, xgb.DMatrix(mtcars[, 1:3]))
+ })
+ expect_error({
+ validate.features(model, as(x[, 1:3], "CsparseMatrix"))
+ })
+
+ # error when it cannot reorder or subset
+ expect_error({
+ validate.features(model, xgb.DMatrix(mtcars))
+ }, "Feature names")
+ expect_error({
+ validate.features(model, xgb.DMatrix(x[, rev(colnames(x))]))
+ }, "Feature names")
+
+ # no error about types if the booster doesn't have types
+ expect_error({
+ validate.features(model, xgb.DMatrix(x, feature_types = c(rep("q", 5), rep("c", 5))))
+ }, NA)
+ tmp <- mtcars
+ tmp[["vs"]] <- factor(tmp[["vs"]])
+ expect_error({
+ validate.features(model, tmp)
+ }, NA)
+
+ # error when types do not match
+ setinfo(model, "feature_type", rep("q", 10))
+ expect_error({
+ validate.features(model, xgb.DMatrix(x, feature_types = c(rep("q", 5), rep("c", 5))))
+ }, "Feature types")
+ tmp <- mtcars
+ tmp[["vs"]] <- factor(tmp[["vs"]])
+ expect_error({
+ validate.features(model, tmp)
+ }, "Feature types")
+})
diff --git a/doc/python/sklearn_estimator.rst b/doc/python/sklearn_estimator.rst
index a4835dcacd14..207b9fa30920 100644
--- a/doc/python/sklearn_estimator.rst
+++ b/doc/python/sklearn_estimator.rst
@@ -104,7 +104,7 @@ using cross validation with early stopping, here is a snippet to begin with:
clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)
- resutls = {}
+ results = {}
for train, test in cv.split(X, y):
X_train = X[train]
@@ -114,7 +114,7 @@ using cross validation with early stopping, here is a snippet to begin with:
est, train_score, test_score = fit_and_score(
clone(clf), X_train, X_test, y_train, y_test
)
- resutls[est] = (train_score, test_score)
+ results[est] = (train_score, test_score)
***********************************
diff --git a/plugin/sycl/common/partition_builder.h b/plugin/sycl/common/partition_builder.h
new file mode 100644
index 000000000000..37d1af241ab1
--- /dev/null
+++ b/plugin/sycl/common/partition_builder.h
@@ -0,0 +1,101 @@
+/*!
+ * Copyright 2017-2024 XGBoost contributors
+ */
+#ifndef PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
+#define PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include <xgboost/data.h>
+#pragma GCC diagnostic pop
+#include <xgboost/tree_model.h>
+
+#include <algorithm>
+#include <vector>
+#include <utility>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#include "../../../src/common/column_matrix.h"
+#pragma GCC diagnostic pop
+
+#include "../data.h"
+
+#include <CL/sycl.hpp>
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+// The builder is required for samples partition to left and rights children for set of nodes
+class PartitionBuilder {
+ public:
+  template <typename Func>
+ void Init(::sycl::queue* qu, size_t n_nodes, Func funcNTaks) {
+ qu_ = qu;
+ nodes_offsets_.resize(n_nodes+1);
+ result_rows_.resize(2 * n_nodes);
+ n_nodes_ = n_nodes;
+
+
+ nodes_offsets_[0] = 0;
+ for (size_t i = 1; i < n_nodes+1; ++i) {
+ nodes_offsets_[i] = nodes_offsets_[i-1] + funcNTaks(i-1);
+ }
+
+ if (data_.Size() < nodes_offsets_[n_nodes]) {
+ data_.Resize(qu, nodes_offsets_[n_nodes]);
+ }
+ }
+
+ size_t GetNLeftElems(int nid) const {
+ return result_rows_[2 * nid];
+ }
+
+
+ size_t GetNRightElems(int nid) const {
+ return result_rows_[2 * nid + 1];
+ }
+
+ // For test purposes only
+ void SetNLeftElems(int nid, size_t val) {
+ result_rows_[2 * nid] = val;
+ }
+
+ // For test purposes only
+ void SetNRightElems(int nid, size_t val) {
+ result_rows_[2 * nid + 1] = val;
+ }
+
+  xgboost::common::Span<size_t> GetData(int nid) {
+ return { data_.Data() + nodes_offsets_[nid], nodes_offsets_[nid + 1] - nodes_offsets_[nid] };
+ }
+
+ void MergeToArray(size_t nid,
+ size_t* data_result,
+ ::sycl::event event) {
+ size_t n_nodes_total = GetNLeftElems(nid) + GetNRightElems(nid);
+ if (n_nodes_total > 0) {
+ const size_t* data = data_.Data() + nodes_offsets_[nid];
+ qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, event);
+ }
+ }
+
+ protected:
+  std::vector<size_t> nodes_offsets_;
+  std::vector<size_t> result_rows_;
+ size_t n_nodes_;
+
+  USMVector<size_t> parts_size_;
+  USMVector<size_t> data_;
+
+ ::sycl::queue* qu_;
+};
+
+} // namespace common
+} // namespace sycl
+} // namespace xgboost
+
+
+#endif // PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index d4cc217d1ee8..0f4748bfec27 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2014-2023 by XGBoost Contributors
+ * Copyright 2014-2024 by XGBoost Contributors
*/
#include "xgboost/c_api.h"
@@ -991,8 +991,8 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs
auto *learner = static_cast<Learner*>(handle);
auto ctx = learner->Ctx()->MakeCPU();
- auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, len}, len);
- auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, len}, len);
+  auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, static_cast<std::size_t>(len)}, len);
+  auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, static_cast<std::size_t>(len)}, len);
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index 0862c21ad1fd..440f3c0a87c8 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023, XGBoost Contributors
+ * Copyright 2017-2024, XGBoost Contributors
* \file column_matrix.h
* \brief Utility for fast column-wise access
* \author Philip Cho
@@ -176,7 +176,7 @@ class ColumnMatrix {
void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
/** @brief assign the storage to the view. */
void InitView() {
- missing = LBitField32{Span{storage.data(), storage.size()}};
+ missing = LBitField32{Span{storage.data(), static_cast(storage.size())}};
}
void GrowTo(std::size_t n_elements, bool init) {
@@ -318,8 +318,8 @@ class ColumnMatrix {
common::Span bin_index = {
reinterpret_cast(&index_[feature_offset * bins_type_size_]),
column_size};
- return std::move(DenseColumnIter{
- bin_index, static_cast(index_base_[fidx]), missing_.missing, feature_offset});
+ return DenseColumnIter{
+ bin_index, static_cast(index_base_[fidx]), missing_.missing, feature_offset};
}
// all columns are dense column and has no missing value
@@ -332,7 +332,7 @@ class ColumnMatrix {
DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
auto column_index = Span{reinterpret_cast(index_.data()),
- index_.size() / sizeof(ColumnBinT)};
+ static_cast(index_.size() / sizeof(ColumnBinT))};
ParallelFor(n_samples, n_threads, [&](auto rid) {
rid += base_rowid;
const size_t ibegin = rid * n_features;
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index fbbd15b49fb5..e829752dae3d 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023 by XGBoost Contributors
+ * Copyright 2017-2024 by XGBoost Contributors
* \file hist_util.h
* \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen
@@ -113,8 +113,8 @@ class HistogramCuts {
auto end = ptrs[column_id + 1];
auto beg = ptrs[column_id];
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
- auto idx = it - values.cbegin();
- idx -= !!(idx == end);
+    auto idx = static_cast<bst_bin_t>(it - values.cbegin());
+    idx -= !!(idx == static_cast<bst_bin_t>(end));
return idx;
}
@@ -136,8 +136,8 @@ class HistogramCuts {
auto beg = ptrs[fidx] + vals.cbegin();
// Truncates the value in case it's not perfectly rounded.
auto v = static_cast(common::AsCat(value));
- auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin();
- if (bin_idx == ptrs.at(fidx + 1)) {
+    auto bin_idx = static_cast<bst_bin_t>(std::lower_bound(beg, end, v) - vals.cbegin());
+    if (bin_idx == static_cast<bst_bin_t>(ptrs.at(fidx + 1))) {
bin_idx -= 1;
}
return bin_idx;
diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h
index d4f82e615c6f..61adfdb7bea8 100644
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
@@ -76,7 +76,7 @@ class RefResourceView {
[[nodiscard]] size_type size() const { return size_; } // NOLINT
[[nodiscard]] size_type size_bytes() const { // NOLINT
- return Span{data(), size()}.size_bytes();
+ return Span{data(), static_cast(size())}.size_bytes();
}
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 1d3faf94532e..88a38d5cce74 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023, XGBoost Contributors
+ * Copyright 2017-2024, XGBoost Contributors
* \brief Data type for fast histogram aggregation.
*/
#include "gradient_index.h"
@@ -148,7 +148,8 @@ void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
}
this->data = std::move(new_vec);
- this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
+ this->index = common::Index{common::Span{data.data(), static_cast(data.size())},
+ t_size};
};
if ((MaxNumBinPerFeat() - 1 <= static_cast(std::numeric_limits::max())) &&
diff --git a/src/data/gradient_index_format.cc b/src/data/gradient_index_format.cc
index fa8f492ed12a..542d3aaebda7 100644
--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2021-2023 XGBoost contributors
+ * Copyright 2021-2024 XGBoost contributors
*/
#include // for size_t
#include // for uint8_t
@@ -40,7 +40,9 @@ class GHistIndexRawFormat : public SparsePageFormat {
return false;
}
// - index
- page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
+ page->index =
+ common::Index{common::Span{page->data.data(), static_cast(page->data.size())},
+ size_type};
// hit count
if (!common::ReadVec(fi, &page->hit_count)) {
diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc
index aad33c272dc7..019804eda31c 100644
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2017-2023 by Contributors
+ * Copyright 2017-2024 by Contributors
*/
#include "xgboost/predictor.h"
@@ -46,7 +46,7 @@ void ValidateBaseMarginShape(linalg::Tensor const& margin, bst_row_t n
void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector* out_preds,
const gbm::GBTreeModel& model) const {
CHECK_NE(model.learner_model_param->num_output_group, 0);
- std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
+  auto n = static_cast<std::size_t>(model.learner_model_param->OutputLength() * info.num_row_);
const HostDeviceVector* base_margin = info.base_margin_.Data();
if (ctx_->Device().IsCUDA()) {
diff --git a/src/tree/hist/hist_cache.h b/src/tree/hist/hist_cache.h
index 8a2ba193af0c..715e1d73e60c 100644
--- a/src/tree/hist/hist_cache.h
+++ b/src/tree/hist/hist_cache.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2023 by XGBoost Contributors
+ * Copyright 2023-2024 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
@@ -48,11 +48,13 @@ class BoundedHistCollection {
BoundedHistCollection() = default;
common::GHistRow operator[](std::size_t idx) {
auto offset = node_map_.at(idx);
- return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
+ return common::Span{data_->data(), static_cast(data_->size())}.subspan(
+ offset, n_total_bins_);
}
common::ConstGHistRow operator[](std::size_t idx) const {
auto offset = node_map_.at(idx);
- return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
+ return common::Span{data_->data(), static_cast(data_->size())}.subspan(
+ offset, n_total_bins_);
}
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
n_total_bins_ = n_total_bins;
diff --git a/tests/ci_build/Dockerfile.i386 b/tests/ci_build/Dockerfile.i386
new file mode 100644
index 000000000000..d7c133e2aee4
--- /dev/null
+++ b/tests/ci_build/Dockerfile.i386
@@ -0,0 +1,8 @@
+FROM i386/debian:sid
+
+ENV DEBIAN_FRONTEND noninteractive
+SHELL ["/bin/bash", "-c"] # Use Bash as shell
+
+RUN \
+ apt-get update && \
+ apt-get install -y tar unzip wget git build-essential ninja-build cmake
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 08862feee79a..20923519ac49 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -14,8 +14,38 @@ if(USE_CUDA)
endif()
file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
-if(NOT PLUGIN_SYCL)
- list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES})
+list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES})
+
+if(PLUGIN_SYCL)
+ set(CMAKE_CXX_COMPILER "icpx")
+ file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
+ add_library(plugin_sycl_test OBJECT ${SYCL_TEST_SOURCES})
+
+ target_include_directories(plugin_sycl_test
+ PRIVATE
+ ${gtest_SOURCE_DIR}/include
+ ${xgboost_SOURCE_DIR}/include
+ ${xgboost_SOURCE_DIR}/dmlc-core/include
+ ${xgboost_SOURCE_DIR}/rabit/include)
+
+ target_compile_definitions(plugin_sycl_test PUBLIC -DXGBOOST_USE_SYCL=1)
+
+ target_link_libraries(plugin_sycl_test PUBLIC -fsycl)
+
+ set_target_properties(plugin_sycl_test PROPERTIES
+ COMPILE_FLAGS -fsycl
+ CXX_STANDARD 17
+ CXX_STANDARD_REQUIRED ON
+ POSITION_INDEPENDENT_CODE ON)
+ if(USE_OPENMP)
+ find_package(OpenMP REQUIRED)
+ set_target_properties(plugin_sycl_test PROPERTIES
+ COMPILE_FLAGS "-fsycl -qopenmp")
+ endif()
+ # Get compilation and link flags of plugin_sycl and propagate to testxgboost
+ target_link_libraries(testxgboost PUBLIC plugin_sycl_test)
+ # Add all objects of plugin_sycl to testxgboost
+  target_sources(testxgboost INTERFACE $<TARGET_OBJECTS:plugin_sycl_test>)
endif()
if(PLUGIN_FEDERATED)
diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc
index 4491dee92de5..c4c1f0c45f42 100644
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2023 XGBoost contributors
+ * Copyright 2019-2024 XGBoost contributors
*/
#include
#include
@@ -212,8 +212,8 @@ TEST(CAPI, JsonModelIO) {
bst_ulong saved_len{0};
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &saved_len, &saved);
ASSERT_EQ(len, saved_len);
- auto l = StringView{data, len};
- auto r = StringView{saved, saved_len};
+  auto l = StringView{data, static_cast<std::size_t>(len)};
+  auto r = StringView{saved, static_cast<std::size_t>(saved_len)};
ASSERT_EQ(l.size(), r.size());
ASSERT_EQ(l, r);
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 97db9dbd87fc..6ce362f46763 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -1,5 +1,5 @@
/**
- * Copyright 2016-2023 by XGBoost contributors
+ * Copyright 2016-2024 by XGBoost contributors
*/
#include "helpers.h"
@@ -216,7 +216,7 @@ SimpleLCG::StateType SimpleLCG::Max() const { return max(); }
static_assert(SimpleLCG::max() - SimpleLCG::min());
void RandomDataGenerator::GenerateLabels(std::shared_ptr p_fmat) const {
- RandomDataGenerator{p_fmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+ RandomDataGenerator{static_cast(p_fmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense(
p_fmat->Info().labels.Data());
CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
@@ -458,7 +458,7 @@ void RandomDataGenerator::GenerateCSR(
EXPECT_EQ(row_count, dmat->Info().num_row_);
if (with_label) {
- RandomDataGenerator{dmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+ RandomDataGenerator{static_cast(dmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense(
dmat->Info().labels.Data());
CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_);
dmat->Info().labels.Reshape(this->rows_, this->n_targets_);
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 9adda8aedfad..d603685eb073 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -1,5 +1,5 @@
/**
- * Copyright 2016-2023 by XGBoost contributors
+ * Copyright 2016-2024 by XGBoost contributors
*/
#pragma once
@@ -238,7 +238,7 @@ class RandomDataGenerator {
bst_bin_t bins_{0};
std::vector ft_;
- bst_cat_t max_cat_;
+ bst_cat_t max_cat_{32};
Json ArrayInterfaceImpl(HostDeviceVector* storage, size_t rows, size_t cols) const;
diff --git a/tests/cpp/plugin/test_sycl_multiclass_obj.cc b/tests/cpp/plugin/test_sycl_multiclass_obj.cc
index d809ecad3fc1..d306337ac599 100644
--- a/tests/cpp/plugin/test_sycl_multiclass_obj.cc
+++ b/tests/cpp/plugin/test_sycl_multiclass_obj.cc
@@ -2,7 +2,11 @@
* Copyright 2018-2023 XGBoost contributors
*/
#include
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
#include
+#pragma GCC diagnostic pop
#include "../objective/test_multiclass_obj.h"
diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc
new file mode 100644
index 000000000000..90bc757eb1b0
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_partition_builder.cc
@@ -0,0 +1,91 @@
+/**
+ * Copyright 2020-2024 by XGBoost contributors
+ */
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "../../../plugin/sycl/common/partition_builder.h"
+#include "../../../plugin/sycl/device_manager.h"
+#include "../helpers.h"
+
+namespace xgboost::sycl::common {
+
+TEST(SyclPartitionBuilder, BasicTest) {
+ constexpr size_t kNodes = 5;
+ // Number of rows for each node
+  std::vector<size_t> rows = { 5, 5, 10, 1, 2 };
+
+ DeviceManager device_manager;
+ auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault());
+ PartitionBuilder builder;
+ builder.Init(&qu, kNodes, [&](size_t i) {
+ return rows[i];
+ });
+
+  // We test here only the basics, thus a synthetic partition builder is adopted
+  // Number of rows to go left for each node.
+  std::vector<size_t> rows_for_left_node = { 2, 0, 7, 1, 2 };
+
+ size_t first_row_id = 0;
+ for(size_t nid = 0; nid < kNodes; ++nid) {
+ size_t n_rows_nodes = rows[nid];
+
+ auto rid_buff = builder.GetData(nid);
+ size_t rid_buff_size = rid_buff.size();
+ auto* rid_buff_ptr = rid_buff.data();
+
+ size_t n_left = rows_for_left_node[nid];
+ size_t n_right = rows[nid] - n_left;
+
+ qu.submit([&](::sycl::handler& cgh) {
+ cgh.parallel_for<>(::sycl::range<1>(n_left), [=](::sycl::id<1> pid) {
+ int row_id = first_row_id + pid[0];
+ rid_buff_ptr[pid[0]] = row_id;
+ });
+ });
+ qu.wait();
+ first_row_id += n_left;
+
+ // We are storing indexes for the right side in the tail of the array to save some memory
+ qu.submit([&](::sycl::handler& cgh) {
+ cgh.parallel_for<>(::sycl::range<1>(n_right), [=](::sycl::id<1> pid) {
+ int row_id = first_row_id + pid[0];
+ rid_buff_ptr[rid_buff_size - pid[0] - 1] = row_id;
+ });
+ });
+ qu.wait();
+ first_row_id += n_right;
+
+ builder.SetNLeftElems(nid, n_left);
+ builder.SetNRightElems(nid, n_right);
+ }
+
+ ::sycl::event event;
+  std::vector<size_t> v(*std::max_element(rows.begin(), rows.end()));
+ size_t row_id = 0;
+ for(size_t nid = 0; nid < kNodes; ++nid) {
+ builder.MergeToArray(nid, v.data(), event);
+ qu.wait();
+
+ // Check that row_id for left side are correct
+ for(size_t j = 0; j < rows_for_left_node[nid]; ++j) {
+ ASSERT_EQ(v[j], row_id++);
+ }
+
+ // Check that row_id for right side are correct
+ for(size_t j = 0; j < rows[nid] - rows_for_left_node[nid]; ++j) {
+ ASSERT_EQ(v[rows[nid] - j - 1], row_id++);
+ }
+
+ // Check that number of left/right rows are correct
+ size_t n_left = builder.GetNLeftElems(nid);
+ size_t n_right = builder.GetNRightElems(nid);
+ ASSERT_EQ(n_left, rows_for_left_node[nid]);
+ ASSERT_EQ(n_right, (rows[nid] - rows_for_left_node[nid]));
+ }
+}
+
+}  // namespace xgboost::sycl::common
diff --git a/tests/cpp/plugin/test_sycl_predictor.cc b/tests/cpp/plugin/test_sycl_predictor.cc
index f82a9f33d5f8..d5b3a5e5cd9a 100755
--- a/tests/cpp/plugin/test_sycl_predictor.cc
+++ b/tests/cpp/plugin/test_sycl_predictor.cc
@@ -2,11 +2,19 @@
* Copyright 2017-2023 XGBoost contributors
*/
#include
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
#include
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#include "../../../src/data/adapter.h"
-#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/gbm/gbtree.h"
+#pragma GCC diagnostic pop
+
+#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/gbm/gbtree_model.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
diff --git a/tests/cpp/plugin/test_sycl_regression_obj.cc b/tests/cpp/plugin/test_sycl_regression_obj.cc
index 66b4ea508477..349415390268 100644
--- a/tests/cpp/plugin/test_sycl_regression_obj.cc
+++ b/tests/cpp/plugin/test_sycl_regression_obj.cc
@@ -2,7 +2,11 @@
* Copyright 2017-2019 XGBoost contributors
*/
#include
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
#include
+#pragma GCC diagnostic pop
#include
#include "../helpers.h"