From 97c3a80a346828165c57225929657461bcaeeda2 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 5 Oct 2022 09:52:15 +0800 Subject: [PATCH] Add C document to sphinx, fix arrow. (#8300) - Group C API. - Add C API sphinx doc. - Consistent use of `OptionalArg` and the parameter name `config`. - Remove call to deprecated functions in demo. - Fix some formatting errors. - Add links to c examples in the document (only visible with doxygen pages) - Fix arrow. --- .gitignore | 2 + .readthedocs.yaml | 7 + demo/c-api/basic/c-api-demo.c | 41 +- demo/c-api/external-memory/external_memory.c | 4 +- doc/Doxyfile.in | 14 +- doc/c.rst | 54 ++- doc/conf.py | 43 +- doc/contrib/docs.rst | 2 +- doc/tutorials/c_api_tutorial.rst | 2 +- include/xgboost/c_api.h | 475 +++++++++++-------- python-package/xgboost/core.py | 2 +- python-package/xgboost/data.py | 8 +- src/c_api/c_api.cc | 69 ++- src/c_api/c_api.cu | 12 +- src/c_api/c_api_utils.h | 8 +- src/data/adapter.h | 6 +- src/data/simple_dmatrix.cc | 6 +- 17 files changed, 458 insertions(+), 297 deletions(-) diff --git a/.gitignore b/.gitignore index 15503ad57bf4..121fa6688fbe 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,8 @@ Debug R-package.Rproj *.cache* .mypy_cache/ +doxygen + # java java/xgboost4j/target java/xgboost4j/tmp diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 924f516f95e6..80c2b8404382 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,6 +5,9 @@ # Required version: 2 +submodules: + include: all + # Set the version of Python and other tools you might need build: os: ubuntu-22.04 @@ -12,6 +15,10 @@ build: python: "3.8" apt_packages: - graphviz + - cmake + - g++ + - doxygen + - ninja-build # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/demo/c-api/basic/c-api-demo.c b/demo/c-api/basic/c-api-demo.c index 1c3d58de915f..aee6285f9484 100644 --- a/demo/c-api/basic/c-api-demo.c +++ b/demo/c-api/basic/c-api-demo.c @@ -18,7 +18,7 @@ if (err != 0) { \ } \ } -int main(int argc, 
char** argv) { +int main() { int silent = 0; int use_gpu = 0; // set to 1 to use the GPU for training @@ -67,10 +67,21 @@ int main(int argc, char** argv) { // predict bst_ulong out_len = 0; - const float* out_result = NULL; int n_print = 10; - safe_xgboost(XGBoosterPredict(booster, dtest, 0, 0, 0, &out_len, &out_result)); + /* Run prediction with DMatrix object. */ + char const config[] = + "{\"training\": false, \"type\": 0, " + "\"iteration_begin\": 0, \"iteration_end\": 0, \"strict_shape\": false}"; + /* Shape of output prediction */ + uint64_t const* out_shape; + /* Dimension of output prediction */ + uint64_t out_dim; + /* Pointer to a thread local contiguous array, assigned in prediction function. */ + float const* out_result = NULL; + safe_xgboost( + XGBoosterPredictFromDMatrix(booster, dtest, config, &out_shape, &out_dim, &out_result)); + printf("y_pred: "); for (int i = 0; i < n_print; ++i) { printf("%1.4f ", out_result[i]); @@ -98,12 +109,12 @@ int main(int argc, char** argv) { DMatrixHandle dmat; safe_xgboost(XGDMatrixCreateFromMat(values, 1, 127, 0.0, &dmat)); - bst_ulong out_len = 0; const float* out_result = NULL; - safe_xgboost(XGBoosterPredict(booster, dmat, 0, 0, 0, &out_len, - &out_result)); - assert(out_len == 1); + safe_xgboost( + XGBoosterPredictFromDMatrix(booster, dmat, config, &out_shape, &out_dim, &out_result)); + assert(out_dim == 1); + assert(out_shape[0] == 1); printf("%1.4f \n", out_result[0]); safe_xgboost(XGDMatrixFree(dmat)); @@ -122,12 +133,12 @@ int main(int argc, char** argv) { safe_xgboost(XGDMatrixCreateFromCSREx(indptr, indices, data, 2, 22, 127, &dmat)); - bst_ulong out_len = 0; const float* out_result = NULL; - safe_xgboost(XGBoosterPredict(booster, dmat, 0, 0, 0, &out_len, - &out_result)); - assert(out_len == 1); + safe_xgboost( + XGBoosterPredictFromDMatrix(booster, dmat, config, &out_shape, &out_dim, &out_result)); + assert(out_dim == 1); + assert(out_shape[0] == 1); printf("%1.4f \n", out_result[0]); 
safe_xgboost(XGDMatrixFree(dmat)); @@ -154,12 +165,12 @@ int main(int argc, char** argv) { safe_xgboost(XGDMatrixCreateFromCSCEx(col_ptr, indices, data, 128, 22, 1, &dmat)); - bst_ulong out_len = 0; const float* out_result = NULL; - safe_xgboost(XGBoosterPredict(booster, dmat, 0, 0, 0, &out_len, - &out_result)); - assert(out_len == 1); + safe_xgboost( + XGBoosterPredictFromDMatrix(booster, dmat, config, &out_shape, &out_dim, &out_result)); + assert(out_dim == 1); + assert(out_shape[0] == 1); printf("%1.4f \n", out_result[0]); safe_xgboost(XGDMatrixFree(dmat)); diff --git a/demo/c-api/external-memory/external_memory.c b/demo/c-api/external-memory/external_memory.c index 2718e8b6990e..d9c6123c148d 100644 --- a/demo/c-api/external-memory/external_memory.c +++ b/demo/c-api/external-memory/external_memory.c @@ -139,8 +139,8 @@ void TrainModel(DMatrix Xy) { Booster booster; DMatrix cache[] = {Xy}; safe_xgboost(XGBoosterCreate(cache, 1, &booster)); - /* Use approx for external memory training. */ - safe_xgboost(XGBoosterSetParam(booster, "tree_method", "approx")); + /* Use approx or hist for external memory training. */ + safe_xgboost(XGBoosterSetParam(booster, "tree_method", "hist")); safe_xgboost(XGBoosterSetParam(booster, "objective", "reg:squarederror")); /* Start training. */ diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 766034e4f818..b159ef1720e7 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -753,7 +753,7 @@ WARN_LOGFILE = # spaces. # Note: If this tag is empty the current directory is searched. -INPUT = @PROJECT_SOURCE_DIR@/include @PROJECT_SOURCE_DIR@/src/common +INPUT = @PROJECT_SOURCE_DIR@/include # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -822,7 +822,7 @@ EXCLUDE_SYMBOLS = # that contain example code fragments that are included (see the \include # command). 
-EXAMPLE_PATH = +EXAMPLE_PATH = @PROJECT_SOURCE_DIR@/demo/c-api/ # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -836,7 +836,7 @@ EXAMPLE_PATTERNS = # irrespective of the value of the RECURSIVE tag. # The default value is: NO. -EXAMPLE_RECURSIVE = NO +EXAMPLE_RECURSIVE = YES # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the @@ -1934,7 +1934,7 @@ ENABLE_PREPROCESSING = YES # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -1942,7 +1942,7 @@ MACRO_EXPANSION = NO # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_ONLY_PREDEF = NO +EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES the includes files in the # INCLUDE_PATH will be searched if a #include is found. @@ -1974,7 +1974,9 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DMLC_USE_CXX11 +PREDEFINED = DMLC_USE_CXX11 \ + "XGB_DLL=" \ + "XGB_EXTERN_C=" # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/doc/c.rst b/doc/c.rst index ee9dd86297ad..02581b874e2e 100644 --- a/doc/c.rst +++ b/doc/c.rst @@ -6,7 +6,59 @@ XGBoost implements a set of C API designed for various bindings, we maintain its and the CMake/make build interface. See :doc:`/tutorials/c_api_tutorial` for an introduction and ``demo/c-api/`` for related examples. 
Also one can generate doxygen document by providing ``-DBUILD_C_DOC=ON`` as parameter to ``CMake`` during build, or -simply look at function comments in ``include/xgboost/c_api.h``. +simply look at function comments in ``include/xgboost/c_api.h``. The reference is exported +to sphinx with the help of breathe, which doesn't contain links to examples but might be +easier to read. For the original doxygen pages please visit: * `C API documentation (latest master branch) `_ * `C API documentation (last stable release) `_ + +*************** +C API Reference +*************** + +.. contents:: + :backlinks: none + :local: + +Library +======= + +.. doxygengroup:: Library + :project: xgboost + +DMatrix +======= + +.. doxygengroup:: DMatrix + :project: xgboost + +Streaming +--------- + +.. doxygengroup:: Streaming + :project: xgboost + +Booster +======= + +.. doxygengroup:: Booster + :project: xgboost + +Prediction +---------- + +.. doxygengroup:: Prediction + :project: xgboost + +Serialization +------------- + +.. doxygengroup:: Serialization + :project: xgboost + +Collective +========== + +.. doxygengroup:: Collective + :project: xgboost diff --git a/doc/conf.py b/doc/conf.py index 7e1126331d7f..83ad3e3dbb26 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -57,22 +57,24 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -libpath = os.path.join(curr_path, '../python-package/') +CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir)) +libpath = os.path.join(PROJECT_ROOT, "python-package/") sys.path.insert(0, libpath) -sys.path.insert(0, curr_path) +sys.path.insert(0, CURR_PATH) # -- General configuration ------------------------------------------------ # General information about the project. -project = u'xgboost' -author = u'%s developers' % project -copyright = u'2021, %s' % author -github_doc_root = 'https://github.com/dmlc/xgboost/tree/master/doc/' +project = "xgboost" +author = "%s developers" % project +copyright = "2022, %s" % author +github_doc_root = "https://github.com/dmlc/xgboost/tree/master/doc/" -os.environ['XGBOOST_BUILD_DOC'] = '1' +os.environ["XGBOOST_BUILD_DOC"] = "1" # Version information. -import xgboost # NOQA +import xgboost # NOQA + version = xgboost.__version__ release = xgboost.__version__ @@ -105,7 +107,10 @@ plot_html_show_formats = False # Breathe extension variables -breathe_projects = {"xgboost": "doxyxml/"} +DOX_DIR = "doxygen" +breathe_projects = { + "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml") +} breathe_default_project = "xgboost" # Add any paths that contain templates here, relative to this directory. 
@@ -216,23 +221,29 @@ # hook for doxygen -def run_doxygen(folder): +def run_doxygen(): """Run the doxygen make command in the designated folder.""" + curdir = os.path.normpath(os.path.abspath(os.path.curdir)) try: - retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True) - if retcode < 0: - sys.stderr.write("doxygen terminated by signal %s" % (-retcode)) + os.chdir(PROJECT_ROOT) + if not os.path.exists(DOX_DIR): + os.mkdir(DOX_DIR) + os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR)) + subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"]) + subprocess.check_call(["ninja", "doc_doxygen"]) except OSError as e: sys.stderr.write("doxygen execution failed: %s" % e) + finally: + os.chdir(curdir) def generate_doxygen_xml(app): """Run the doxygen make commands if we're on the ReadTheDocs server""" read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' if read_the_docs_build: - run_doxygen('..') + run_doxygen() -# app.add_stylesheet() is deprecated. Use app.add_css_file() def setup(app): app.add_css_file('custom.css') + app.connect("builder-inited", generate_doxygen_xml) diff --git a/doc/contrib/docs.rst b/doc/contrib/docs.rst index 04de8d843e3f..c4249dbb7368 100644 --- a/doc/contrib/docs.rst +++ b/doc/contrib/docs.rst @@ -11,7 +11,7 @@ Documentation and Examples ********* Documents ********* -* Documentation is built using `Sphinx `_. +* Python and C documentation is built using `Sphinx `_. * Each document is written in `reStructuredText `_. * You can build document locally to see the effect, by running diff --git a/doc/tutorials/c_api_tutorial.rst b/doc/tutorials/c_api_tutorial.rst index fc7664c6d53d..5d4cb68cf7da 100644 --- a/doc/tutorials/c_api_tutorial.rst +++ b/doc/tutorials/c_api_tutorial.rst @@ -2,7 +2,7 @@ C API Tutorial ############## -In this tutorial, we are going to install XGBoost library & configure the CMakeLists.txt file of our C/C++ application to link XGBoost library with our application. 
Later on, we will see some useful tips for using C API and code snippets as examples to use various functions available in C API to perform basic task like loading, training model & predicting on test dataset. +In this tutorial, we are going to install XGBoost library & configure the CMakeLists.txt file of our C/C++ application to link XGBoost library with our application. Later on, we will see some useful tips for using C API and code snippets as examples to use various functions available in C API to perform basic task like loading, training model & predicting on test dataset. For API reference, please visit :doc:`/c` .. contents:: :backlinks: none diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 9cf9aa3bbeb8..2daa4203878b 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2015~2021 by Contributors + * Copyright (c) 2015~2022 by XGBoost Contributors * \file c_api.h * \author Tianqi Chen * \brief C API of XGBoost, used for interfacing to other languages. @@ -28,6 +28,24 @@ // manually define unsigned long typedef uint64_t bst_ulong; // NOLINT(*) +/** + * @mainpage + * + * \brief XGBoost C API reference. + * + * For the official document page see: + * XGBoost C Package. + */ + +/** + * @defgroup Library + * + * These functions are used to obtain general information about XGBoost including version, + * build info and current global configuration. + * + * @{ + */ + /*! \brief handle to DMatrix */ typedef void *DMatrixHandle; // NOLINT(*) /*! \brief handle to Booster */ @@ -63,7 +81,7 @@ XGB_DLL int XGBuildInfo(char const **out); * this function is thread safe and can be called by different thread * \return const char* error information */ -XGB_DLL const char *XGBGetLastError(void); +XGB_DLL const char *XGBGetLastError(); /*! 
* \brief register callback function for LOG(INFO) messages -- helpful messages @@ -78,18 +96,33 @@ XGB_DLL int XGBRegisterLogCallback(void (*callback)(const char*)); * \brief Set global configuration (collection of parameters that apply globally). This function * accepts the list of key-value pairs representing the global-scope parameters to be * configured. The list of key-value pairs are passed in as a JSON string. - * \param json_str a JSON string representing the list of key-value pairs. The JSON object shall + * \param config a JSON string representing the list of key-value pairs. The JSON object shall * be flat: no value can be a JSON object or an array. * \return 0 for success, -1 for failure */ -XGB_DLL int XGBSetGlobalConfig(const char* json_str); +XGB_DLL int XGBSetGlobalConfig(char const *config); /*! * \brief Get current global configuration (collection of parameters that apply globally). - * \param json_str pointer to received returned global configuration, represented as a JSON string. + * \param out_config pointer that receives the returned global configuration, represented as a JSON string. * \return 0 for success, -1 for failure */ -XGB_DLL int XGBGetGlobalConfig(const char** json_str); +XGB_DLL int XGBGetGlobalConfig(char const **out_config); + +/**@}*/ + +/** + * @defgroup DMatrix + * + * @brief DMatrix is the basic data storage for XGBoost used by all XGBoost algorithms + * including training, prediction and explanation. There are a few variants of + * `DMatrix` including normal `DMatrix`, which is a CSR matrix, `QuantileDMatrix`, + * which is used by histogram-based tree methods for saving memory, and lastly the + * experimental external-memory-based DMatrix, which reads data in batches during + * training. For the last two variants, see the @ref Streaming group. + * + * @{ + */ /*! 
* \brief load a data matrix @@ -98,9 +131,10 @@ XGB_DLL int XGBGetGlobalConfig(const char** json_str); * \param out a loaded data matrix * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixCreateFromFile(const char *fname, - int silent, - DMatrixHandle *out); +XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle *out); +/** + * @example c-api-demo.c + */ /*! * \brief create a matrix content from CSR format @@ -126,36 +160,26 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, * \param indptr JSON encoded __array_interface__ to row pointers in CSR. * \param indices JSON encoded __array_interface__ to column indices in CSR. * \param data JSON encoded __array_interface__ to values in CSR. - * \param num_col Number of columns. - * \param json_config JSON encoded configuration. Required values are: - * - * - missing - * - nthread - * + * \param ncol Number of columns. + * \param config JSON encoded configuration. Required values are: + * - missing: Which value to represent missing value. + * - nthread (optional): Number of threads used for initializing DMatrix. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, - char const *indices, char const *data, - bst_ulong ncol, - char const* json_config, - DMatrixHandle* out); - +XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char const *data, + bst_ulong ncol, char const *config, DMatrixHandle *out); /*! * \brief Create a matrix from dense array. - * \param data JSON encoded __array_interface__ to array values. - * \param json_config JSON encoded configuration. Required values are: - * - * - missing - * - nthread - * + * \param data JSON encoded __array_interface__ to array values. + * \param config JSON encoded configuration. Required values are: + * - missing: Which value to represent missing value. 
+ * - nthread (optional): Number of threads used for initializing DMatrix. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixCreateFromDense(char const *data, - char const *json_config, - DMatrixHandle *out); +XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatrixHandle *out); /*! * \brief create a matrix content from CSC format @@ -224,37 +248,33 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, /*! * \brief Create DMatrix from CUDA columnar format. (cuDF) * \param data Array of JSON encoded __cuda_array_interface__ for each column. - * \param json_config JSON encoded configuration. Required values are: - * - * - missing - * - nthread - * + * \param config JSON encoded configuration. Required values are: + * - missing: Which value to represent missing value. + * - nthread (optional): Number of threads used for initializing DMatrix. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, - char const* json_config, +XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, char const *config, DMatrixHandle *out); /*! * \brief Create DMatrix from CUDA array. * \param data JSON encoded __cuda_array_interface__ for array data. - * \param json_config JSON encoded configuration. Required values are: - * - * - missing - * - nthread - * + * \param config JSON encoded configuration. Required values are: + * - missing: Which value to represent missing value. + * - nthread (optional): Number of threads used for initializing DMatrix. 
* \param out created dmatrix * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, - char const* json_config, +XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, char const *config, DMatrixHandle *out); /** - * ========================== Begin data callback APIs ========================= + * @defgroup Streaming + * @ingroup DMatrix * - * Short notes for data callback + * @brief Quantile DMatrix and external memory DMatrix can be created from batches of + * data. * * There are 2 sets of data callbacks for DMatrix. The first one is currently exclusively * used by JVM packages. It uses `XGBoostBatchCSR` to accept batches for CSR formated @@ -266,20 +286,20 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, * * Another set is used by external data iterator. It accept foreign data iterators as * callbacks. There are 2 different senarios where users might want to pass in callbacks - * instead of raw data. First it's the Quantile DMatrix used by GPU Hist. For this case, - * the data is first compressed by quantile sketching then merged. This is particular - * useful for distributed setting as it eliminates 2 copies of data. 1 by a `concat` from - * external library to make the data into a blob for normal DMatrix initialization, - * another by the internal CSR copy of DMatrix. The second use case is external memory - * support where users can pass a custom data iterator into XGBoost for loading data in - * batches. There are short notes on each of the use case in respected DMatrix factory - * function. + * instead of raw data. First it's the Quantile DMatrix used by hist and GPU Hist. For + * this case, the data is first compressed by quantile sketching then merged. This is + * particular useful for distributed setting as it eliminates 2 copies of data. 
1 by a + * `concat` from external library to make the data into a blob for normal DMatrix + * initialization, another by the internal CSR copy of DMatrix. The second use case is + * external memory support where users can pass a custom data iterator into XGBoost for + * loading data in batches. There are short notes on each of the use cases in respected + * DMatrix factory function. * * Related functions are: * * # Factory functions * - \ref XGDMatrixCreateFromCallback for external memory - * - \ref XGDeviceQuantileDMatrixCreateFromCallback for quantile DMatrix + * - \ref XGQuantileDMatrixCreateFromCallback for quantile DMatrix * * # Proxy that callers can use to pass data to XGBoost * - \ref XGProxyDMatrixCreate @@ -290,6 +310,8 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, * - \ref XGProxyDMatrixSetDataDense * - \ref XGProxyDMatrixSetDataCSR * - ... (data setters) + * + * @{ */ /* ==== First set of callback functions, used exclusively by JVM packages. ==== */ @@ -396,30 +418,29 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN * Short note for how to use second set of callback for external memory data support: * * - Step 0: Define a data iterator with 2 methods `reset`, and `next`. - * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle. + * - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle. * - Step 2: Pass the iterator handle, proxy handle and 2 methods into * `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object. * - Step 3: Call appropriate data setters in `next` functions. * - * For example usage see demo/c-api/external-memory - * - * \param iter A handle to external data iterator. - * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`. - * \param reset Callback function resetting the iterator state. - * \param next Callback function yielding the next batch of data. 
- * \param c_json_config JSON encoded parameters for DMatrix construction. Accepted fields are: - * + * \param iter A handle to external data iterator. + * \param proxy A DMatrix proxy handle created by \ref XGProxyDMatrixCreate. + * \param reset Callback function resetting the iterator state. + * \param next Callback function yielding the next batch of data. + * \param config JSON encoded parameters for DMatrix construction. Accepted fields are: * - missing: Which value to represent missing value * - cache_prefix: The path of cache file, caller must initialize all the directories in this path. * - nthread (optional): Number of threads used for initializing DMatrix. - * * \param[out] out The created external memory DMatrix * * \return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, - char const *c_json_config, DMatrixHandle *out); + char const *config, DMatrixHandle *out); +/** + * @example external_memory.c + */ /*! * \brief Create a Quantile DMatrix with data iterator. @@ -427,7 +448,7 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy * Short note for how to use the second set of callback for (GPU)Hist tree method: * * - Step 0: Define a data iterator with 2 methods `reset`, and `next`. - * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle. + * - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle. * - Step 2: Pass the iterator handle, proxy handle and 2 methods into * `XGQuantileDMatrixCreateFromCallback`. * - Step 3: Call appropriate data setters in `next` functions. @@ -435,13 +456,14 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy * See test_iterative_dmatrix.cu or Python interface for examples. * * \param iter A handle to external data iterator. 
- * \param proxy A DMatrix proxy handle created by `XGProxyDMatrixCreate`. + * \param proxy A DMatrix proxy handle created by \ref XGProxyDMatrixCreate. * \param ref Reference DMatrix for providing quantile information. * \param reset Callback function resetting the iterator state. * \param next Callback function yielding the next batch of data. - * \param missing Which value to represent missing value - * \param nthread Number of threads to use, 0 for default. - * \param max_bin Maximum number of bins for building histogram. + * \param config JSON encoded parameters for DMatrix construction. Accepted fields are: + * - missing: Which value to represent missing value + * - nthread (optional): Number of threads used for initializing DMatrix. + * - max_bin (optional): Maximum number of bins for building histogram. * \param out The created Device Quantile DMatrix * * \return 0 when success, -1 when failure happens @@ -464,7 +486,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr /*! * \brief Set data on a DMatrix proxy. * - * \param handle A DMatrix proxy created by XGProxyDMatrixCreate + * \param handle A DMatrix proxy created by \ref XGProxyDMatrixCreate * \param c_interface_str Null terminated JSON document string representation of CUDA * array interface. * @@ -477,7 +499,7 @@ XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle, /*! * \brief Set data on a DMatrix proxy. * - * \param handle A DMatrix proxy created by XGProxyDMatrixCreate + * \param handle A DMatrix proxy created by \ref XGProxyDMatrixCreate * \param c_interface_str Null terminated JSON document string representation of CUDA * array interface, with an array of columns. * @@ -489,7 +511,7 @@ XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, /*! * \brief Set data on a DMatrix proxy. 
* - * \param handle A DMatrix proxy created by XGProxyDMatrixCreate + * \param handle A DMatrix proxy created by \ref XGProxyDMatrixCreate * \param c_interface_str Null terminated JSON document string representation of array * interface. * @@ -501,10 +523,11 @@ XGB_DLL int XGProxyDMatrixSetDataDense(DMatrixHandle handle, /*! * \brief Set data on a DMatrix proxy. * - * \param handle A DMatrix proxy created by XGProxyDMatrixCreate + * \param handle A DMatrix proxy created by \ref XGProxyDMatrixCreate * \param indptr JSON encoded __array_interface__ to row pointer in CSR. * \param indices JSON encoded __array_interface__ to column indices in CSR. - * \param values JSON encoded __array_interface__ to values in CSR.. + * \param data JSON encoded __array_interface__ to values in CSR. + * \param ncol The number of columns of the input CSR matrix. * * \return 0 when success, -1 when failure happens */ @@ -512,10 +535,7 @@ XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr, char const *indices, char const *data, bst_ulong ncol); -/* - * ==========================- End data callback APIs ========================== - */ - +/** @} */ // End of Streaming XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array, void *ptr_schema); @@ -523,17 +543,16 @@ XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array * \brief Construct DMatrix from arrow using callbacks. Arrow related C API is not stable * and subject to change in the future. * - * \param next Callback function for fetching arrow records. - * \param json_config JSON encoded configuration. Required values are: - * - * - missing - * - nthread - * + * \param next Callback function for fetching arrow records. + * \param config JSON encoded configuration. Required values are: + * - missing: Which value to represent missing value. + * - nbatch: Number of batches in arrow table. + * - nthread (optional): Number of threads used for initializing DMatrix. 
* \param out The created DMatrix. * * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *json_config, +XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out); /*! @@ -567,6 +586,10 @@ XGB_DLL int XGDMatrixSliceDMatrixEx(DMatrixHandle handle, * \return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixFree(DMatrixHandle handle); +/** + * @example c-api-demo.c inference.c external_memory.c + */ + /*! * \brief load a data matrix into binary file * \param handle a instance of data matrix @@ -699,12 +722,10 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, * \param size Size of the data, this is relative to size of type. (Meaning NOT number * of bytes.) * \param type Indicator of data type. This is defined in xgboost::DataType enum class. - * - * float = 1 - * double = 2 - * uint32_t = 3 - * uint64_t = 4 - * + * - float = 1 + * - double = 2 + * - uint32_t = 3 + * - uint64_t = 4 * \return 0 when success, -1 when failure happens */ XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, @@ -729,10 +750,12 @@ XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, * \param out_dptr pointer to the result * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle, - const char *field, - bst_ulong* out_len, +XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle, const char *field, bst_ulong *out_len, const float **out_dptr); +/** + * @example c-api-demo.c + */ + /*! * \brief get uint32 info vector from matrix * \param handle a instance of data matrix @@ -762,7 +785,6 @@ XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, bst_ulong *out); - /*! * \brief Get number of valid values from DMatrix. 
* @@ -794,7 +816,15 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out); XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config, bst_ulong *out_indptr, unsigned *out_indices, float *out_data); -// --- start XGBoost class +/** @} */ // End of DMatrix + +/** + * @defgroup Booster + * + * @brief The `Booster` class is the gradient-boosted model for XGBoost. + * @{ + */ + /*! * \brief create xgboost learner * \param dmats matrices that are set to be cached @@ -802,15 +832,20 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config * \param out handle to the result booster * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], - bst_ulong len, - BoosterHandle *out); +XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], bst_ulong len, BoosterHandle *out); +/** + * @example c-api-demo.c + */ + /*! * \brief free obj in handle * \param handle handle to be freed * \return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterFree(BoosterHandle handle); +/** + * @example c-api-demo.c inference.c external_memory.c + */ /*! * \brief Slice a model using boosting index. The slice m:n indicates taking all trees @@ -848,14 +883,20 @@ XGB_DLL int XGBoosterBoostedRounds(BoosterHandle handle, int* out); XGB_DLL int XGBoosterSetParam(BoosterHandle handle, const char *name, const char *value); +/** + * @example c-api-demo.c + */ /*! * \brief get number of features + * \param handle Handle to booster. * \param out number of features * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterGetNumFeature(BoosterHandle handle, - bst_ulong *out); +XGB_DLL int XGBoosterGetNumFeature(BoosterHandle handle, bst_ulong *out); +/** + * @example c-api-demo.c + */ /*! 
* \brief update the model in one round using dtrain @@ -864,9 +905,11 @@ XGB_DLL int XGBoosterGetNumFeature(BoosterHandle handle, * \param dtrain training data * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle, - int iter, - DMatrixHandle dtrain); +XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle, int iter, DMatrixHandle dtrain); +/** + * @example c-api-demo.c + */ + /*! * \brief update the model, by directly specify gradient and second order gradient, * this can be used to replace UpdateOneIter, to support customized loss function @@ -892,15 +935,26 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, * \param out_result the string containing evaluation statistics * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, - int iter, - DMatrixHandle dmats[], - const char *evnames[], - bst_ulong len, - const char **out_result); +XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle dmats[], + const char *evnames[], bst_ulong len, const char **out_result); +/** + * @example c-api-demo.c + */ + +/** + * @defgroup Prediction + * @ingroup Booster + * + * @brief These functions are used for running prediction and explanation algorithms. + * + * @{ + */ /*! - * \brief make prediction based on dmat (deprecated, use `XGBoosterPredictFromDMatrix` instead) + * \brief make prediction based on dmat (deprecated, use \ref XGBoosterPredictFromDMatrix instead) + * \deprecated + * \see XGBoosterPredictFromDMatrix() + * * \param handle handle * \param dmat data matrix * \param option_mask bit-mask of options taken in prediction, possible values @@ -929,13 +983,14 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, int training, bst_ulong *out_len, const float **out_result); + /*! - * \brief Make prediction from DMatrix, replacing `XGBoosterPredict`. + * \brief Make prediction from DMatrix, replacing \ref XGBoosterPredict. 
* * \param handle Booster handle * \param dmat DMatrix handle - * \param c_json_config String encoded predict configuration in JSON format, with - * following available fields in the JSON object: + * \param config String encoded predict configuration in JSON format, with following + * available fields in the JSON object: * * "type": [0, 6] * - 0: normal prediction @@ -972,10 +1027,10 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, * \code * { * "type": 0, - * "training": False, + * "training": false, * "iteration_begin": 0, * "iteration_end": 0, - * "strict_shape": true, + * "strict_shape": true * } * \endcode * @@ -984,41 +1039,41 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, * \param out_result Buffer storing prediction value (copy before use). * * \return 0 when success, -1 when failure happens + * + * \see XGBoosterPredictFromDense XGBoosterPredictFromCSR XGBoosterPredictFromCudaArray XGBoosterPredictFromCudaColumnar */ -XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, - DMatrixHandle dmat, - char const* c_json_config, - bst_ulong const **out_shape, - bst_ulong *out_dim, - float const **out_result); -/* +XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat, + char const *config, bst_ulong const **out_shape, + bst_ulong *out_dim, float const **out_result); +/** + * @example inference.c + */ + +/** * \brief Inplace prediction from CPU dense matrix. * * \param handle Booster handle. * \param values JSON encoded __array_interface__ to values. - * \param c_json_config See `XGBoosterPredictFromDMatrix` for more info. - * + * \param config See \ref XGBoosterPredictFromDMatrix for more info. * Additional fields for inplace prediction are: - * "missing": float - * + * - "missing": float * \param m An optional (NULL if not available) proxy DMatrix instance * storing meta info. * - * \param out_shape See `XGBoosterPredictFromDMatrix` for more info. - * \param out_dim See `XGBoosterPredictFromDMatrix` for more info. 
- * \param out_result See `XGBoosterPredictFromDMatrix` for more info. + * \param out_shape See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_dim See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_result See \ref XGBoosterPredictFromDMatrix for more info. * * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, - char const *values, - char const *c_json_config, - DMatrixHandle m, - bst_ulong const **out_shape, - bst_ulong *out_dim, - const float **out_result); +XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values, char const *config, + DMatrixHandle m, bst_ulong const **out_shape, + bst_ulong *out_dim, const float **out_result); +/** + * @example inference.c + */ -/* +/** * \brief Inplace prediction from CPU CSR matrix. * * \param handle Booster handle. @@ -1026,76 +1081,74 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, * \param indices JSON encoded __array_interface__ to column indices in CSR. * \param values JSON encoded __array_interface__ to values in CSR.. * \param ncol Number of features in data. - * \param c_json_config See `XGBoosterPredictFromDMatrix` for more info. + * \param config See \ref XGBoosterPredictFromDMatrix for more info. * Additional fields for inplace prediction are: - * "missing": float - * + * - "missing": float * \param m An optional (NULL if not available) proxy DMatrix instance * storing meta info. * - * \param out_shape See `XGBoosterPredictFromDMatrix` for more info. - * \param out_dim See `XGBoosterPredictFromDMatrix` for more info. - * \param out_result See `XGBoosterPredictFromDMatrix` for more info. + * \param out_shape See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_dim See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_result See \ref XGBoosterPredictFromDMatrix for more info. 
* * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, - char const *indices, char const *values, - bst_ulong ncol, - char const *c_json_config, DMatrixHandle m, - bst_ulong const **out_shape, - bst_ulong *out_dim, - const float **out_result); +XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, char const *indices, + char const *values, bst_ulong ncol, char const *config, + DMatrixHandle m, bst_ulong const **out_shape, + bst_ulong *out_dim, const float **out_result); -/* +/** * \brief Inplace prediction from CUDA Dense matrix (cupy in Python). * * \param handle Booster handle * \param values JSON encoded __cuda_array_interface__ to values. - * \param c_json_config See `XGBoosterPredictFromDMatrix` for more info. + * \param config See \ref XGBoosterPredictFromDMatrix for more info. * Additional fields for inplace prediction are: - * "missing": float - * + * - "missing": float * \param m An optional (NULL if not available) proxy DMatrix instance * storing meta info. - * \param out_shape See `XGBoosterPredictFromDMatrix` for more info. - * \param out_dim See `XGBoosterPredictFromDMatrix` for more info. - * \param out_result See `XGBoosterPredictFromDMatrix` for more info. + * \param out_shape See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_dim See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_result See \ref XGBoosterPredictFromDMatrix for more info. 
* * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterPredictFromCudaArray( - BoosterHandle handle, char const *values, char const *c_json_config, - DMatrixHandle m, bst_ulong const **out_shape, bst_ulong *out_dim, - const float **out_result); +XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *values, + char const *config, DMatrixHandle m, + bst_ulong const **out_shape, bst_ulong *out_dim, + const float **out_result); -/* +/** * \brief Inplace prediction from CUDA dense dataframe (cuDF in Python). * * \param handle Booster handle * \param values List of __cuda_array_interface__ for all columns encoded in JSON list. - * \param c_json_config See `XGBoosterPredictFromDMatrix` for more info. + * \param config See \ref XGBoosterPredictFromDMatrix for more info. * Additional fields for inplace prediction are: - * "missing": float - * + * - "missing": float * \param m An optional (NULL if not available) proxy DMatrix instance * storing meta info. - * \param out_shape See `XGBoosterPredictFromDMatrix` for more info. - * \param out_dim See `XGBoosterPredictFromDMatrix` for more info. - * \param out_result See `XGBoosterPredictFromDMatrix` for more info. + * \param out_shape See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_dim See \ref XGBoosterPredictFromDMatrix for more info. + * \param out_result See \ref XGBoosterPredictFromDMatrix for more info. 
* * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterPredictFromCudaColumnar( - BoosterHandle handle, char const *values, char const *c_json_config, - DMatrixHandle m, bst_ulong const **out_shape, bst_ulong *out_dim, - const float **out_result); +XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *values, + char const *config, DMatrixHandle m, + bst_ulong const **out_shape, bst_ulong *out_dim, + const float **out_result); +/**@}*/ // End of Prediction -/* - * ========================== Begin Serialization APIs ========================= - */ -/* + +/** + * @defgroup Serialization + * @ingroup Booster + * + * @brief There are multiple ways to serialize a Booster object depending on the use case. + * * Short note for serialization APIs. There are 3 different sets of serialization API. * * - Functions with the term "Model" handles saving/loading XGBoost model like trees or @@ -1113,18 +1166,22 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar( * situations like check-pointing, or continuing training task in distributed * environment. In these cases the task must be carried out without any user * intervention. + * + * @{ */ /*! * \brief Load model from existing file + * * \param handle handle * \param fname File URI or file name. -* \return 0 when success, -1 when failure happens + * \return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char *fname); /*! * \brief Save model into existing file + * * \param handle handle * \param fname File URI or file name. * \return 0 when success, -1 when failure happens @@ -1133,6 +1190,7 @@ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char *fname); /*! 
* \brief load model from in memory buffer + * * \param handle handle * \param buf pointer to the buffer * \param len the length of the buffer @@ -1147,8 +1205,8 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, * result out, before next xgboost call * * \param handle handle - * \param json_config JSON encoded string storing parameters for the function. Following - * keys are expected in the JSON document: + * \param config JSON encoded string storing parameters for the function. Following + * keys are expected in the JSON document: * * "format": str * - json: Output booster will be encoded as JSON. @@ -1161,11 +1219,14 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, * * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_config, - bst_ulong *out_len, char const **out_dptr); +XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *config, bst_ulong *out_len, + char const **out_dptr); /*! - * \brief Deprecated, use `XGBoosterSaveModelToBuffer` instead. + * \brief Save booster to a buffer with in binary format. + * + * \deprecated since 1.6.0 + * \see XGBoosterSaveModelToBuffer() */ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, bst_ulong *out_len, const char **out_dptr); @@ -1183,7 +1244,7 @@ XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, bst_ulong *out_len, const char **out_dptr); /*! * \brief Memory snapshot based serialization method. Loads the buffer returned - * from `XGBoosterSerializeToBuffer'. + * from \ref XGBoosterSerializeToBuffer. * * \param handle handle * \param buf pointer to the buffer @@ -1231,15 +1292,11 @@ XGB_DLL int XGBoosterSaveJsonConfig(BoosterHandle handle, bst_ulong *out_len, * notice. * * \param handle handle to Booster object. - * \param json_parameters string representation of a JSON document. + * \param config string representation of a JSON document. 
* \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterLoadJsonConfig(BoosterHandle handle, - char const *json_parameters); -/* - * =========================== End Serialization APIs ========================== - */ - +XGB_DLL int XGBoosterLoadJsonConfig(BoosterHandle handle, char const *config); +/**@}*/ // End of Serialization /*! * \brief dump model, return array of strings representing model dump @@ -1380,7 +1437,7 @@ XGB_DLL int XGBoosterSetStrFeatureInfo(BoosterHandle handle, const char *field, * * \param handle An instance of Booster * \param field Field name - * \param size Size of output pointer `features` (number of strings returned). + * \param len Size of output pointer `features` (number of strings returned). * \param out_features Address of a pointer to array of strings. Result is stored in * thread local memory. * @@ -1397,7 +1454,7 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field, * equal to out_n_scores and has multiple definitions of importance type. * * \param handle An instance of Booster - * \param json_config Parameters for computing scores. Accepted JSON keys are: + * \param config Parameters for computing scores encoded as JSON. Accepted JSON keys are: * - importance_type: A JSON string with following possible values: * * 'weight': the number of times a feature is used to split the data across all trees. * * 'gain': the average gain across all splits the feature is used in. 
@@ -1415,12 +1472,19 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field, * * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *json_config, - bst_ulong *out_n_features, - char const ***out_features, - bst_ulong *out_dim, - bst_ulong const **out_shape, +XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config, + bst_ulong *out_n_features, char const ***out_features, + bst_ulong *out_dim, bst_ulong const **out_shape, float const **out_scores); +/**@}*/ // End of Booster + +/** + * @defgroup Collective + * + * @brief Experimental support for exposing internal communicator in XGBoost. + * + * @{ + */ /*! * \brief Initialize the collective communicator. @@ -1433,7 +1497,7 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *json_config, * The additional configuration is not required. Usually the communicator will detect settings * from environment variables. * - * \param json_config JSON encoded configuration. Accepted JSON keys are: + * \param config JSON encoded configuration. Accepted JSON keys are: * - xgboost_communicator: The type of the communicator. Can be set as an environment variable. * * rabit: Use Rabit. This is the default if the type is unspecified. * * mpi: Use MPI. @@ -1470,7 +1534,7 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *json_config, * - federated_client_cert: Client certificate file path. Only needed for the SSL mode. * \return 0 for success, -1 for failure. */ -XGB_DLL int XGCommunicatorInit(char const* json_config); +XGB_DLL int XGCommunicatorInit(char const* config); /*! * \brief Finalize the collective communicator. @@ -1525,8 +1589,10 @@ XGB_DLL int XGCommunicatorGetProcessorName(const char** name_str); * \brief Broadcast a memory region to all others from root. This function is NOT thread-safe. 
* * Example: + * \code * int a = 1; * Broadcast(&a, sizeof(a), root); + * \endcode * * \param send_receive_buffer Pointer to the send or receive buffer. * \param size Size of the data. @@ -1539,10 +1605,13 @@ XGB_DLL int XGCommunicatorBroadcast(void *send_receive_buffer, size_t size, int * \brief Perform in-place allreduce. This function is NOT thread-safe. * * Example Usage: the following code gives sum of the result + * \code * vector data(10); * ... * Allreduce(&data[0], data.size(), DataType:kInt32, Op::kSum); * ... + * \endcode + * \param send_receive_buffer Buffer for both sending and receiving data. * \param count Number of elements to be reduced. * \param data_type Enumeration of data type, see xgboost::collective::DataType in communicator.h. @@ -1551,5 +1620,5 @@ XGB_DLL int XGCommunicatorBroadcast(void *send_receive_buffer, size_t size, int */ XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int data_type, int op); - +/**@}*/ // End of Collective #endif // XGBOOST_C_API_H_ diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index ce412aeeabfe..d183c2cb3780 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1020,7 +1020,7 @@ def get_data(self) -> scipy.sparse.csr_matrix: testing purposes. If this is a quantized DMatrix then quantized values are returned instead of input values. - .. versionadded:: 2.0.0 + .. 
versionadded:: 1.7.0 """ indptr = np.empty(self.num_row() + 1, dtype=np.uint64) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index bc1185809136..e486fb57a5f5 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -619,12 +619,14 @@ def _from_arrow( if enable_categorical: raise ValueError("categorical data in arrow is not supported yet.") - rb_iter = iter(data.to_batches()) + batches = data.to_batches() + rb_iter = iter(batches) it = record_batch_data_iter(rb_iter) next_callback = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)(it) handle = ctypes.c_void_p() - - config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8") + config = from_pystr_to_cstr( + json.dumps({"missing": missing, "nthread": nthread, "nbatch": len(batches)}) + ) _check_call( _LIB.XGDMatrixCreateFromArrowCallback( next_callback, diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index dc5e901efc38..15248f406d88 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -251,17 +251,13 @@ XGB_DLL int XGDMatrixCreateFromDataIter( } #ifndef XGBOOST_USE_CUDA -XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, - char const* c_json_config, - DMatrixHandle *out) { +XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *, char const *, DMatrixHandle *) { API_BEGIN(); common::AssertGPUSupport(); API_END(); } -XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, - char const* c_json_config, - DMatrixHandle *out) { +XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *, char const *, DMatrixHandle *) { API_BEGIN(); common::AssertGPUSupport(); API_END(); @@ -272,14 +268,14 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, // Create from data iterator XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, - char const *c_json_config, DMatrixHandle *out) { + char const *config, 
DMatrixHandle *out) { API_BEGIN(); - xgboost_CHECK_C_ARG_PTR(c_json_config); + xgboost_CHECK_C_ARG_PTR(config); - auto config = Json::Load(StringView{c_json_config}); - auto missing = GetMissing(config); - std::string cache = RequiredArg(config, "cache_prefix", __func__); - auto n_threads = OptionalArg(config, "nthread", common::OmpGetNumThreads(0)); + auto jconfig = Json::Load(StringView{config}); + auto missing = GetMissing(jconfig); + std::string cache = RequiredArg(jconfig, "cache_prefix", __func__); + auto n_threads = OptionalArg(jconfig, "nthread", common::OmpGetNumThreads(0)); xgboost_CHECK_C_ARG_PTR(next); xgboost_CHECK_C_ARG_PTR(reset); @@ -502,15 +498,16 @@ XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array API_END(); } -XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *json_config, +XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config, DMatrixHandle *out) { API_BEGIN(); - xgboost_CHECK_C_ARG_PTR(json_config); - auto config = Json::Load(StringView{json_config}); - auto missing = GetMissing(config); - int32_t n_threads = get(config["nthread"]); - n_threads = common::OmpGetNumThreads(n_threads); - data::RecordBatchesIterAdapter adapter(next, n_threads); + xgboost_CHECK_C_ARG_PTR(config); + auto jconfig = Json::Load(StringView{config}); + auto missing = GetMissing(jconfig); + auto n_batches = RequiredArg(jconfig, "nbatch", __func__); + auto n_threads = + OptionalArg(jconfig, "nthread", common::OmpGetNumThreads(0)); + data::RecordBatchesIterAdapter adapter(next, n_batches); xgboost_CHECK_C_ARG_PTR(out); *out = new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); API_END(); @@ -1055,20 +1052,18 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch } #if !defined(XGBOOST_USE_CUDA) -XGB_DLL int XGBoosterPredictFromCUDAArray( - BoosterHandle handle, char const *c_json_strs, char const *c_json_config, - 
DMatrixHandle m, xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim, - const float **out_result) { +XGB_DLL int XGBoosterPredictFromCUDAArray(BoosterHandle handle, char const *, char const *, + DMatrixHandle, xgboost::bst_ulong const **, + xgboost::bst_ulong *, const float **) { API_BEGIN(); CHECK_HANDLE(); common::AssertGPUSupport(); API_END(); } -XGB_DLL int XGBoosterPredictFromCUDAColumnar( - BoosterHandle handle, char const *c_json_strs, char const *c_json_config, - DMatrixHandle m, xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim, - const float **out_result) { +XGB_DLL int XGBoosterPredictFromCUDAColumnar(BoosterHandle handle, char const *, char const *, + DMatrixHandle, xgboost::bst_ulong const **, + xgboost::bst_ulong *, const float **) { API_BEGIN(); CHECK_HANDLE(); common::AssertGPUSupport(); @@ -1490,30 +1485,30 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field, API_END(); } -XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config, +XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *config, xgboost::bst_ulong *out_n_features, char const ***out_features, bst_ulong *out_dim, bst_ulong const **out_shape, float const **out_scores) { API_BEGIN(); CHECK_HANDLE(); auto *learner = static_cast(handle); - xgboost_CHECK_C_ARG_PTR(json_config); - auto config = Json::Load(StringView{json_config}); + xgboost_CHECK_C_ARG_PTR(config); + auto jconfig = Json::Load(StringView{config}); - auto importance = RequiredArg(config, "importance_type", __func__); + auto importance = RequiredArg(jconfig, "importance_type", __func__); std::string feature_map_uri; - if (!IsA(config["feature_map"])) { - feature_map_uri = get(config["feature_map"]); + if (!IsA(jconfig["feature_map"])) { + feature_map_uri = get(jconfig["feature_map"]); } FeatureMap feature_map = LoadFeatureMap(feature_map_uri); std::vector custom_feature_names; - if (!IsA(config["feature_names"])) { - 
custom_feature_names = get(config["feature_names"]); + if (!IsA(jconfig["feature_names"])) { + custom_feature_names = get(jconfig["feature_names"]); } std::vector tree_idx; - if (!IsA(config["tree_idx"])) { - auto j_tree_idx = get(config["tree_idx"]); + if (!IsA(jconfig["tree_idx"])) { + auto j_tree_idx = get(jconfig["tree_idx"]); for (auto const &idx : j_tree_idx) { tree_idx.push_back(get(idx)); } diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 65d1a9056167..09f133aee1f0 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -1,10 +1,12 @@ // Copyright (c) 2019-2022 by Contributors +#include "../common/threading_utils.h" #include "../data/device_adapter.cuh" #include "../data/proxy_dmatrix.h" #include "c_api_error.h" #include "c_api_utils.h" #include "xgboost/c_api.h" #include "xgboost/data.h" +#include "xgboost/json.h" #include "xgboost/learner.h" namespace xgboost { @@ -70,10 +72,11 @@ XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, auto config = Json::Load(StringView{c_json_config}); float missing = GetMissing(config); - auto nthread = get(config["nthread"]); + auto n_threads = + OptionalArg(config, "nthread", common::OmpGetNumThreads(0)); data::CudfAdapter adapter(json_str); *out = - new std::shared_ptr(DMatrix::Create(&adapter, missing, nthread)); + new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); API_END(); } @@ -84,10 +87,11 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, std::string json_str{data}; auto config = Json::Load(StringView{c_json_config}); float missing = GetMissing(config); - auto nthread = get(config["nthread"]); + auto n_threads = + OptionalArg(config, "nthread", common::OmpGetNumThreads(0)); data::CupyAdapter adapter(json_str); *out = - new std::shared_ptr(DMatrix::Create(&adapter, missing, nthread)); + new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); API_END(); } diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index 
c590407460fa..ba23f765e641 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -151,7 +151,13 @@ inline uint32_t GetIterationFromTreeLimit(uint32_t ntree_limit, Learner *learner inline float GetMissing(Json const &config) { float missing; - auto const& j_missing = config["missing"]; + auto const &obj = get(config); + auto it = obj.find("missing"); + if (it == obj.cend()) { + LOG(FATAL) << "Argument `missing` is required."; + } + + auto const &j_missing = it->second; if (IsA(j_missing)) { missing = get(j_missing); } else if (IsA(j_missing)) { diff --git a/src/data/adapter.h b/src/data/adapter.h index ddd7731ad89f..3945b1a4ec29 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -1078,10 +1078,8 @@ class ArrowColumnarBatch { using ArrowColumnarBatchVec = std::vector>; class RecordBatchesIterAdapter: public dmlc::DataIter { public: - RecordBatchesIterAdapter(XGDMatrixCallbackNext *next_callback, - int nthread) - : next_callback_{next_callback}, - nbatches_{nthread} {} + RecordBatchesIterAdapter(XGDMatrixCallbackNext* next_callback, int nbatch) + : next_callback_{next_callback}, nbatches_{nbatch} {} void BeforeFirst() override { CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter"; diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 467e774aa339..498189ebbaec 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -263,6 +263,8 @@ template SimpleDMatrix::SimpleDMatrix( template <> SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread) { + ctx_.nthread = nthread; + auto& offset_vec = sparse_page_->offset.HostVector(); auto& data_vec = sparse_page_->data.HostVector(); uint64_t total_batch_size = 0; @@ -275,7 +277,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i size_t num_elements = 0; size_t num_rows = 0; // Import Arrow RecordBatches -#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(nthread) 
+#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx_.Threads()) for (int i = 0; i < static_cast(batches.size()); ++i) { // NOLINT num_elements += batches[i]->Import(missing); num_rows += batches[i]->Size(); @@ -297,7 +299,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i data_vec.resize(total_elements); offset_vec.resize(total_batch_size + 1); // Copy data into DMatrix -#pragma omp parallel num_threads(nthread) +#pragma omp parallel num_threads(ctx_.Threads()) { #pragma omp for nowait for (int i = 0; i < static_cast(batches.size()); ++i) { // NOLINT