[breaking] Change internal model serialization to UBJSON. (#7556)
* Use typed array for models.
* Change the memory snapshot format.
* Add new C API for saving to raw format.
trivialfis authored Jan 15, 2022
1 parent 13b0fa4 commit a1bcd33
Showing 24 changed files with 566 additions and 255 deletions.
2 changes: 1 addition & 1 deletion doc/model.schema
@@ -97,7 +97,7 @@
"default_left": {
"type": "array",
"items": {
"type": "boolean"
"type": "integer"
}
},
"categories": {
26 changes: 22 additions & 4 deletions include/xgboost/c_api.h
@@ -1081,14 +1081,32 @@ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
const void *buf,
bst_ulong len);

/*!
* \brief save model into binary raw bytes, return header of the array
* user must copy the result out, before next xgboost call
* \brief Save model into raw bytes, return header of the array. User must copy the
* result out, before next xgboost call
*
* \param handle handle
* \param out_len the argument to hold the output length
* \param out_dptr the argument to hold the output data pointer
* \param json_config JSON encoded string storing parameters for the function. Following
* keys are expected in the JSON document:
*
* "format": str
* - json: Output booster will be encoded as JSON.
* - ubj: Output booster will be encoded as Universal Binary JSON.
* - deprecated: Output booster will be encoded as old custom binary format. Do not use
* this format except for compatibility reasons.
*
* \param out_len The argument to hold the output length
* \param out_dptr The argument to hold the output data pointer
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_config,
bst_ulong *out_len, char const **out_dptr);

/*!
* \brief Deprecated, use `XGBoosterSaveModelToBuffer` instead.
*/
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, bst_ulong *out_len,
const char **out_dptr);
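
The new `XGBoosterSaveModelToBuffer` entry point selects the output encoding through the `"format"` key of its JSON config. A minimal caller-side sketch (hedged: `booster` is assumed to be a valid, trained handle and the consuming code is omitted):

```cpp
// Sketch only: serialize a trained booster to an in-memory UBJSON buffer.
#include <xgboost/c_api.h>

#include <string>

std::string SaveToUbjBuffer(BoosterHandle booster) {
  char const *config = R"({"format": "ubj"})";  // "json" and "deprecated" are also accepted
  bst_ulong len = 0;
  char const *raw = nullptr;
  if (XGBoosterSaveModelToBuffer(booster, config, &len, &raw) != 0) {
    return {};
  }
  // The returned buffer is owned by XGBoost; copy it out before the next XGBoost call.
  return std::string(raw, len);
}
```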

7 changes: 2 additions & 5 deletions include/xgboost/data.h
@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2015-2021 by Contributors
* Copyright (c) 2015-2022 by Contributors
* \file data.h
* \brief The input data structure of xgboost.
* \author Tianqi Chen
@@ -36,10 +36,7 @@ enum class DataType : uint8_t {
kStr = 5
};

enum class FeatureType : uint8_t {
kNumerical,
kCategorical
};
enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };

/*!
* \brief Meta information about dataset, always sit in memory.
4 changes: 2 additions & 2 deletions include/xgboost/linalg.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2021 by XGBoost Contributors
* Copyright 2021-2022 by XGBoost Contributors
* \file linalg.h
* \brief Linear algebra related utilities.
*/
@@ -567,7 +567,7 @@ template <typename T, int32_t D>
Json ArrayInterface(TensorView<T const, D> const &t) {
Json array_interface{Object{}};
array_interface["data"] = std::vector<Json>(2);
array_interface["data"][0] = Integer(reinterpret_cast<int64_t>(t.Values().data()));
array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(t.Values().data())};
array_interface["data"][1] = Boolean{true};
if (t.DeviceIdx() >= 0) {
// Change this once we have different CUDA stream.
5 changes: 3 additions & 2 deletions include/xgboost/tree_model.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2019 by Contributors
* Copyright 2014-2022 by Contributors
* \file tree_model.h
* \brief model structure for tree
* \author Tianqi Chen
@@ -42,7 +42,7 @@ struct TreeParam : public dmlc::Parameter<TreeParam> {
/*! \brief maximum depth, this is a statistics of the tree */
int deprecated_max_depth;
/*! \brief number of features used for tree construction */
int num_feature;
bst_feature_t num_feature;
/*!
* \brief leaf vector size, used for vector tree
* used to store more than one dimensional information in tree
@@ -629,6 +629,7 @@ class RegTree : public Model {
}

private:
template <bool typed>
void LoadCategoricalSplit(Json const& in);
void SaveCategoricalSplit(Json* p_out) const;
// vector of nodes
145 changes: 101 additions & 44 deletions src/c_api/c_api.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2014-2021 by Contributors
// Copyright (c) 2014-2022 by Contributors
#include <rabit/rabit.h>
#include <rabit/c_api.h>

@@ -248,22 +248,16 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
#endif

// Create from data iterator
XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next,
char const* c_json_config,
DMatrixHandle *out) {
XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset, XGDMatrixCallbackNext *next,
char const *c_json_config, DMatrixHandle *out) {
API_BEGIN();
auto config = Json::Load(StringView{c_json_config});
float missing = get<Number const>(config["missing"]);
std::string cache = get<String const>(config["cache_prefix"]);
int32_t n_threads = omp_get_max_threads();
if (!IsA<Null>(config["nthread"])) {
n_threads = get<Integer const>(config["nthread"]);
}
*out = new std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Create(
iter, proxy, reset, next, missing, n_threads, cache)};
auto missing = GetMissing(config);
std::string cache = RequiredArg<String>(config, "cache_prefix", __func__);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
*out = new std::shared_ptr<xgboost::DMatrix>{
xgboost::DMatrix::Create(iter, proxy, reset, next, missing, n_threads, cache)};
API_END();
}

@@ -358,8 +352,8 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
StringView{data}, ncol);
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto nthread = get<Integer const>(config["nthread"]);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
API_END();
}

@@ -371,9 +365,9 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data,
xgboost::data::ArrayAdapter(StringView{data})};
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto nthread = get<Integer const>(config["nthread"]);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
*out =
new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
API_END();
}

@@ -765,11 +759,11 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
auto& entry = learner->GetThreadLocal().prediction_entry;
auto p_m = *static_cast<std::shared_ptr<DMatrix> *>(dmat);

auto const& j_config = get<Object const>(config);
auto type = PredictionType(get<Integer const>(j_config.at("type")));
auto iteration_begin = get<Integer const>(j_config.at("iteration_begin"));
auto iteration_end = get<Integer const>(j_config.at("iteration_end"));
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
auto iteration_begin = RequiredArg<Integer>(config, "iteration_begin", __func__);
auto iteration_end = RequiredArg<Integer>(config, "iteration_end", __func__);

auto const& j_config = get<Object const>(config);
auto ntree_limit_it = j_config.find("ntree_limit");
if (ntree_limit_it != j_config.cend() && !IsA<Null>(ntree_limit_it->second) &&
get<Integer const>(ntree_limit_it->second) != 0) {
@@ -785,7 +779,7 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
type == PredictionType::kApproxContribution;
bool interactions = type == PredictionType::kInteraction ||
type == PredictionType::kApproxInteraction;
bool training = get<Boolean const>(config["training"]);
bool training = RequiredArg<Boolean>(config, "training", __func__);
learner->Predict(p_m, type == PredictionType::kMargin, &entry.predictions,
iteration_begin, iteration_end, training,
type == PredictionType::kLeaf, contribs, approximate,
@@ -796,7 +790,7 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
auto rounds = iteration_end - iteration_begin;
rounds = rounds == 0 ? learner->BoostedRounds() : rounds;
// Determine shape
bool strict_shape = get<Boolean const>(config["strict_shape"]);
bool strict_shape = RequiredArg<Boolean>(config, "strict_shape", __func__);
CalcPredictShape(strict_shape, type, p_m->Info().num_row_,
p_m->Info().num_col_, chunksize, learner->Groups(), rounds,
&shape, out_dim);
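
`XGBoosterPredictFromDMatrix` now fetches its options through `RequiredArg`, so each of the keys read above has to be present in the JSON config passed by the caller. An illustrative, hedged sketch of such a config (the handles and surrounding setup are assumed; `"type": 0` requests plain prediction values):

```cpp
// Sketch only: a prediction config with every required key set explicitly.
#include <xgboost/c_api.h>

void PredictAllRows(BoosterHandle booster, DMatrixHandle dmat) {
  char const *config =
      R"({"type": 0, "iteration_begin": 0, "iteration_end": 0,)"
      R"( "training": false, "strict_shape": false})";
  bst_ulong const *shape = nullptr;
  bst_ulong dim = 0;
  float const *result = nullptr;
  // Omitting any of the keys above now fails with a descriptive error.
  XGBoosterPredictFromDMatrix(booster, dmat, config, &shape, &dim, &result);
}
```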
@@ -814,15 +808,15 @@ void InplacePredictImpl(std::shared_ptr<T> x, std::shared_ptr<DMatrix> p_m,
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";

HostDeviceVector<float>* p_predt { nullptr };
auto type = PredictionType(get<Integer const>(config["type"]));
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
float missing = GetMissing(config);
learner->InplacePredict(x, p_m, type, missing, &p_predt,
get<Integer const>(config["iteration_begin"]),
get<Integer const>(config["iteration_end"]));
RequiredArg<Integer>(config, "iteration_begin", __func__),
RequiredArg<Integer>(config, "iteration_end", __func__));
CHECK(p_predt);
auto &shape = learner->GetThreadLocal().prediction_shape;
auto chunksize = n_rows == 0 ? 0 : p_predt->Size() / n_rows;
bool strict_shape = get<Boolean const>(config["strict_shape"]);
bool strict_shape = RequiredArg<Boolean>(config, "strict_shape", __func__);
CalcPredictShape(strict_shape, type, n_rows, n_cols, chunksize, learner->Groups(),
learner->BoostedRounds(), &shape, out_dim);
*out_result = dmlc::BeginPtr(p_predt->HostVector());
@@ -900,45 +894,105 @@ XGB_DLL int XGBoosterPredictFromCUDAColumnar(
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
API_BEGIN();
CHECK_HANDLE();
if (common::FileExtension(fname) == "json") {
auto read_file = [&]() {
auto str = common::LoadSequentialFile(fname);
CHECK_GT(str.size(), 2);
CHECK_GE(str.size(), 3); // "{}\0"
CHECK_EQ(str[0], '{');
Json in { Json::Load({str.c_str(), str.size()}) };
CHECK_EQ(str[str.size() - 2], '}');
return str;
};
if (common::FileExtension(fname) == "json") {
auto str = read_file();
Json in{Json::Load(StringView{str})};
static_cast<Learner*>(handle)->LoadModel(in);
} else if (common::FileExtension(fname) == "ubj") {
auto str = read_file();
Json in = Json::Load(StringView{str}, std::ios::binary);
static_cast<Learner *>(handle)->LoadModel(in);
} else {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
static_cast<Learner*>(handle)->LoadModel(fi.get());
}
API_END();
}

XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char* c_fname) {
namespace {
void WarnOldModel() {
if (XGBOOST_VER_MAJOR >= 2) {
LOG(WARNING) << "Saving into deprecated binary model format, please consider using `json` or "
"`ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.";
}
}
} // anonymous namespace

XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char *c_fname) {
API_BEGIN();
CHECK_HANDLE();
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(c_fname, "w"));
auto *learner = static_cast<Learner *>(handle);
learner->Configure();
if (common::FileExtension(c_fname) == "json") {
Json out { Object() };
auto save_json = [&](std::ios::openmode mode) {
Json out{Object()};
learner->SaveModel(&out);
std::string str;
Json::Dump(out, &str);
fo->Write(str.c_str(), str.size());
std::vector<char> str;
Json::Dump(out, &str, mode);
fo->Write(str.data(), str.size());
};
if (common::FileExtension(c_fname) == "json") {
save_json(std::ios::out);
} else if (common::FileExtension(c_fname) == "ubj") {
save_json(std::ios::binary);
} else if (XGBOOST_VER_MAJOR == 2 && XGBOOST_VER_MINOR >= 2) {
LOG(WARNING) << "Saving model to JSON as default. You can use file extension `json`, `ubj` or "
"`deprecated` to choose between formats.";
save_json(std::ios::out);
} else {
auto *bst = static_cast<Learner*>(handle);
WarnOldModel();
auto *bst = static_cast<Learner *>(handle);
bst->SaveModel(fo.get());
}
API_END();
}
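
On the file-based path, the serialization format is now chosen from the extension of the supplied file name. A minimal sketch, assuming `booster` is a configured, trained handle and the paths are illustrative:

```cpp
// Sketch only: extension-driven model saving and loading.
#include <xgboost/c_api.h>

void RoundTrip(BoosterHandle booster) {
  XGBoosterSaveModel(booster, "model.ubj");   // Universal Binary JSON
  XGBoosterSaveModel(booster, "model.json");  // JSON text
  XGBoosterSaveModel(booster, "model.bin");   // any other extension falls back to the old binary format
  // Loading dispatches on the extension as well.
  XGBoosterLoadModel(booster, "model.ubj");
}
```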

XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
const void* buf,
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, const void *buf,
xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
common::MemoryFixSizeBuffer fs((void*)buf, len); // NOLINT(*)
static_cast<Learner*>(handle)->LoadModel(&fs);
common::MemoryFixSizeBuffer fs((void *)buf, len); // NOLINT(*)
static_cast<Learner *>(handle)->LoadModel(&fs);
API_END();
}

XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_config,
xgboost::bst_ulong *out_len, char const **out_dptr) {
API_BEGIN();
CHECK_HANDLE();
auto config = Json::Load(StringView{json_config});
auto format = RequiredArg<String>(config, "format", __func__);

auto *learner = static_cast<Learner *>(handle);
std::string &raw_str = learner->GetThreadLocal().ret_str;
raw_str.clear();

learner->Configure();
Json out{Object{}};
if (format == "json") {
learner->SaveModel(&out);
Json::Dump(out, &raw_str);
} else if (format == "ubj") {
learner->SaveModel(&out);
Json::Dump(out, &raw_str, std::ios::binary);
} else if (format == "deprecated") {
WarnOldModel();
common::MemoryBufferStream fo(&raw_str);
learner->SaveModel(&fo);
} else {
LOG(FATAL) << "Unknown format: `" << format << "`";
}

*out_dptr = dmlc::BeginPtr(raw_str);
*out_len = static_cast<xgboost::bst_ulong>(raw_str.length());

API_END();
}

@@ -952,6 +1006,8 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
raw_str.resize(0);

common::MemoryBufferStream fo(&raw_str);
LOG(WARNING) << "`" << __func__
<< "` is deprecated, please use `XGBoosterSaveModelToBuffer` instead.";

learner->Configure();
learner->SaveModel(&fo);
@@ -1208,7 +1264,8 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
CHECK_HANDLE();
auto *learner = static_cast<Learner *>(handle);
auto config = Json::Load(StringView{json_config});
auto importance = get<String const>(config["importance_type"]);

auto importance = RequiredArg<String>(config, "importance_type", __func__);
std::string feature_map_uri;
if (!IsA<Null>(config["feature_map"])) {
feature_map_uri = get<String const>(config["feature_map"]);
22 changes: 21 additions & 1 deletion src/c_api/c_api_utils.h
@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2021 by XGBoost Contributors
* Copyright (c) 2021-2022 by XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_
@@ -241,5 +241,25 @@ inline void GenerateFeatureMap(Learner const *learner,
}

void XGBBuildInfoDevice(Json* p_info);

template <typename JT>
auto const &RequiredArg(Json const &in, std::string const &key, StringView func) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it == obj.cend() || IsA<Null>(it->second)) {
LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`";
}
return get<std::remove_const_t<JT> const>(it->second);
}

template <typename JT, typename T>
auto const &OptionalArg(Json const &in, std::string const &key, T const &dft) {
auto const &obj = get<Object const>(in);
auto it = obj.find(key);
if (it != obj.cend()) {
return get<std::remove_const_t<JT> const>(it->second);
}
return dft;
}
} // namespace xgboost
#endif // XGBOOST_C_API_C_API_UTILS_H_
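
`RequiredArg` and `OptionalArg` centralize the JSON-config handling used throughout `c_api.cc`. An illustrative sketch of how they behave (the demo function, the literal default, and the include paths are assumptions for the example, not part of the diff):

```cpp
#include "c_api_utils.h"          // RequiredArg, OptionalArg (internal header under src/c_api/)
#include "xgboost/json.h"         // Json, String, Integer
#include "xgboost/string_view.h"  // StringView

namespace xgboost {
void DemoParseConfig() {
  auto config = Json::Load(StringView{R"({"format": "ubj"})"});
  // Keys read through RequiredArg abort with a descriptive error when missing or null.
  auto const &format = RequiredArg<String>(config, "format", __func__);
  // OptionalArg falls back to the supplied default when the key is absent.
  int64_t const dft{4};
  auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", dft);
  (void)format;
  (void)n_threads;
}
}  // namespace xgboost
```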
2 changes: 1 addition & 1 deletion src/common/config.h
@@ -111,7 +111,7 @@ class ConfigParser {
const auto last_char = str.find_last_not_of(" \t\n\r");
if (first_char == std::string::npos) {
// Every character in str is a whitespace
return std::string();
return {};
}
CHECK_NE(last_char, std::string::npos);
const auto substr_len = last_char + 1 - first_char;