Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[breaking] Change internal model serialization to UBJSON. #7556

Merged
merged 8 commits into from
Jan 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/model.schema
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
"default_left": {
"type": "array",
"items": {
"type": "boolean"
"type": "integer"
}
},
"categories": {
Expand Down
26 changes: 22 additions & 4 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1081,14 +1081,32 @@ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
const void *buf,
bst_ulong len);

/*!
* \brief save model into binary raw bytes, return header of the array
* user must copy the result out, before next xgboost call
* \brief Save model into raw bytes, return header of the array. User must copy the
* result out, before next xgboost call
*
* \param handle handle
* \param out_len the argument to hold the output length
* \param out_dptr the argument to hold the output data pointer
* \param json_config JSON encoded string storing parameters for the function. Following
* keys are expected in the JSON document:
*
* "format": str
* - json: Output booster will be encoded as JSON.
* - ubj: Output booster will be encoded as Universal Binary JSON.
* - deprecated: Output booster will be encoded as old custom binary format. Do not use
* this format except for compatibility reasons.
*
* \param out_len The argument to hold the output length
* \param out_dptr The argument to hold the output data pointer
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_config,
bst_ulong *out_len, char const **out_dptr);

/*!
* \brief Deprecated, use `XGBoosterSaveModelToBuffer` instead.
*/
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, bst_ulong *out_len,
const char **out_dptr);

Expand Down
7 changes: 2 additions & 5 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2015-2021 by Contributors
* Copyright (c) 2015-2022 by Contributors
* \file data.h
* \brief The input data structure of xgboost.
* \author Tianqi Chen
Expand Down Expand Up @@ -36,10 +36,7 @@ enum class DataType : uint8_t {
kStr = 5
};

enum class FeatureType : uint8_t {
kNumerical,
kCategorical
};
enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };

/*!
* \brief Meta information about the dataset; always sits in memory.
Expand Down
4 changes: 2 additions & 2 deletions include/xgboost/linalg.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*!
* Copyright 2021 by XGBoost Contributors
* Copyright 2021-2022 by XGBoost Contributors
* \file linalg.h
* \brief Linear algebra related utilities.
*/
Expand Down Expand Up @@ -567,7 +567,7 @@ template <typename T, int32_t D>
Json ArrayInterface(TensorView<T const, D> const &t) {
Json array_interface{Object{}};
array_interface["data"] = std::vector<Json>(2);
array_interface["data"][0] = Integer(reinterpret_cast<int64_t>(t.Values().data()));
array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(t.Values().data())};
array_interface["data"][1] = Boolean{true};
if (t.DeviceIdx() >= 0) {
// Change this once we have different CUDA stream.
Expand Down
5 changes: 3 additions & 2 deletions include/xgboost/tree_model.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2019 by Contributors
* Copyright 2014-2022 by Contributors
* \file tree_model.h
* \brief model structure for tree
* \author Tianqi Chen
Expand Down Expand Up @@ -42,7 +42,7 @@ struct TreeParam : public dmlc::Parameter<TreeParam> {
/*! \brief maximum depth, this is a statistics of the tree */
int deprecated_max_depth;
/*! \brief number of features used for tree construction */
int num_feature;
bst_feature_t num_feature;
/*!
* \brief leaf vector size, used for vector tree
* used to store more than one dimensional information in tree
Expand Down Expand Up @@ -629,6 +629,7 @@ class RegTree : public Model {
}

private:
template <bool typed>
void LoadCategoricalSplit(Json const& in);
void SaveCategoricalSplit(Json* p_out) const;
// vector of nodes
Expand Down
145 changes: 101 additions & 44 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2014-2021 by Contributors
// Copyright (c) 2014-2022 by Contributors
#include <rabit/rabit.h>
#include <rabit/c_api.h>

Expand Down Expand Up @@ -248,22 +248,16 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
#endif

// Create from data iterator
XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next,
char const* c_json_config,
DMatrixHandle *out) {
XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset, XGDMatrixCallbackNext *next,
char const *c_json_config, DMatrixHandle *out) {
API_BEGIN();
auto config = Json::Load(StringView{c_json_config});
float missing = get<Number const>(config["missing"]);
std::string cache = get<String const>(config["cache_prefix"]);
int32_t n_threads = omp_get_max_threads();
if (!IsA<Null>(config["nthread"])) {
n_threads = get<Integer const>(config["nthread"]);
}
*out = new std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Create(
iter, proxy, reset, next, missing, n_threads, cache)};
auto missing = GetMissing(config);
std::string cache = RequiredArg<String>(config, "cache_prefix", __func__);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
*out = new std::shared_ptr<xgboost::DMatrix>{
xgboost::DMatrix::Create(iter, proxy, reset, next, missing, n_threads, cache)};
API_END();
}

Expand Down Expand Up @@ -358,8 +352,8 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
StringView{data}, ncol);
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto nthread = get<Integer const>(config["nthread"]);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
API_END();
}

Expand All @@ -371,9 +365,9 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data,
xgboost::data::ArrayAdapter(StringView{data})};
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);
auto nthread = get<Integer const>(config["nthread"]);
auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
*out =
new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
API_END();
}

Expand Down Expand Up @@ -765,11 +759,11 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
auto& entry = learner->GetThreadLocal().prediction_entry;
auto p_m = *static_cast<std::shared_ptr<DMatrix> *>(dmat);

auto const& j_config = get<Object const>(config);
auto type = PredictionType(get<Integer const>(j_config.at("type")));
auto iteration_begin = get<Integer const>(j_config.at("iteration_begin"));
auto iteration_end = get<Integer const>(j_config.at("iteration_end"));
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
auto iteration_begin = RequiredArg<Integer>(config, "iteration_begin", __func__);
auto iteration_end = RequiredArg<Integer>(config, "iteration_end", __func__);

auto const& j_config = get<Object const>(config);
auto ntree_limit_it = j_config.find("ntree_limit");
if (ntree_limit_it != j_config.cend() && !IsA<Null>(ntree_limit_it->second) &&
get<Integer const>(ntree_limit_it->second) != 0) {
Expand All @@ -785,7 +779,7 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
type == PredictionType::kApproxContribution;
bool interactions = type == PredictionType::kInteraction ||
type == PredictionType::kApproxInteraction;
bool training = get<Boolean const>(config["training"]);
bool training = RequiredArg<Boolean>(config, "training", __func__);
learner->Predict(p_m, type == PredictionType::kMargin, &entry.predictions,
iteration_begin, iteration_end, training,
type == PredictionType::kLeaf, contribs, approximate,
Expand All @@ -796,7 +790,7 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
auto rounds = iteration_end - iteration_begin;
rounds = rounds == 0 ? learner->BoostedRounds() : rounds;
// Determine shape
bool strict_shape = get<Boolean const>(config["strict_shape"]);
bool strict_shape = RequiredArg<Boolean>(config, "strict_shape", __func__);
CalcPredictShape(strict_shape, type, p_m->Info().num_row_,
p_m->Info().num_col_, chunksize, learner->Groups(), rounds,
&shape, out_dim);
Expand All @@ -814,15 +808,15 @@ void InplacePredictImpl(std::shared_ptr<T> x, std::shared_ptr<DMatrix> p_m,
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";

HostDeviceVector<float>* p_predt { nullptr };
auto type = PredictionType(get<Integer const>(config["type"]));
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
float missing = GetMissing(config);
learner->InplacePredict(x, p_m, type, missing, &p_predt,
get<Integer const>(config["iteration_begin"]),
get<Integer const>(config["iteration_end"]));
RequiredArg<Integer>(config, "iteration_begin", __func__),
RequiredArg<Integer>(config, "iteration_end", __func__));
CHECK(p_predt);
auto &shape = learner->GetThreadLocal().prediction_shape;
auto chunksize = n_rows == 0 ? 0 : p_predt->Size() / n_rows;
bool strict_shape = get<Boolean const>(config["strict_shape"]);
bool strict_shape = RequiredArg<Boolean>(config, "strict_shape", __func__);
CalcPredictShape(strict_shape, type, n_rows, n_cols, chunksize, learner->Groups(),
learner->BoostedRounds(), &shape, out_dim);
*out_result = dmlc::BeginPtr(p_predt->HostVector());
Expand Down Expand Up @@ -900,45 +894,105 @@ XGB_DLL int XGBoosterPredictFromCUDAColumnar(
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
API_BEGIN();
CHECK_HANDLE();
if (common::FileExtension(fname) == "json") {
auto read_file = [&]() {
auto str = common::LoadSequentialFile(fname);
CHECK_GT(str.size(), 2);
CHECK_GE(str.size(), 3); // "{}\0"
CHECK_EQ(str[0], '{');
Json in { Json::Load({str.c_str(), str.size()}) };
CHECK_EQ(str[str.size() - 2], '}');
return str;
};
if (common::FileExtension(fname) == "json") {
auto str = read_file();
Json in{Json::Load(StringView{str})};
static_cast<Learner*>(handle)->LoadModel(in);
} else if (common::FileExtension(fname) == "ubj") {
auto str = read_file();
Json in = Json::Load(StringView{str}, std::ios::binary);
static_cast<Learner *>(handle)->LoadModel(in);
} else {
std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
static_cast<Learner*>(handle)->LoadModel(fi.get());
}
API_END();
}

XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char* c_fname) {
namespace {
/*!
 * \brief Log a warning that the deprecated binary model format is being written.
 *
 * The warning is only emitted when XGBOOST_VER_MAJOR is 2 or greater; for earlier
 * major versions this function does nothing.
 */
void WarnOldModel() {
  if (XGBOOST_VER_MAJOR < 2) {
    return;
  }
  LOG(WARNING) << "Saving into deprecated binary model format, please consider using `json` or "
                  "`ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.";
}
}  // anonymous namespace

XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char *c_fname) {
API_BEGIN();
CHECK_HANDLE();
std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(c_fname, "w"));
auto *learner = static_cast<Learner *>(handle);
learner->Configure();
if (common::FileExtension(c_fname) == "json") {
Json out { Object() };
auto save_json = [&](std::ios::openmode mode) {
Json out{Object()};
learner->SaveModel(&out);
std::string str;
Json::Dump(out, &str);
fo->Write(str.c_str(), str.size());
std::vector<char> str;
Json::Dump(out, &str, mode);
fo->Write(str.data(), str.size());
};
if (common::FileExtension(c_fname) == "json") {
save_json(std::ios::out);
} else if (common::FileExtension(c_fname) == "ubj") {
save_json(std::ios::binary);
} else if (XGBOOST_VER_MAJOR == 2 && XGBOOST_VER_MINOR >= 2) {
LOG(WARNING) << "Saving model to JSON as default. You can use file extension `json`, `ubj` or "
"`deprecated` to choose between formats.";
save_json(std::ios::out);
} else {
auto *bst = static_cast<Learner*>(handle);
WarnOldModel();
auto *bst = static_cast<Learner *>(handle);
bst->SaveModel(fo.get());
}
API_END();
}

XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
const void* buf,
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, const void *buf,
xgboost::bst_ulong len) {
API_BEGIN();
CHECK_HANDLE();
common::MemoryFixSizeBuffer fs((void*)buf, len); // NOLINT(*)
static_cast<Learner*>(handle)->LoadModel(&fs);
common::MemoryFixSizeBuffer fs((void *)buf, len); // NOLINT(*)
static_cast<Learner *>(handle)->LoadModel(&fs);
API_END();
}

/*
 * Serialize the booster behind `handle` into an in-memory buffer.
 *
 * `json_config` must be a JSON object holding a "format" entry, one of:
 *   - "json":       the model is dumped with the default Json::Dump mode,
 *   - "ubj":        the model is dumped with std::ios::binary,
 *   - "deprecated": the model is written through a MemoryBufferStream, after
 *                   WarnOldModel() is called.
 * Any other value is reported through LOG(FATAL).
 *
 * The bytes are stored in the learner's thread-local `ret_str`; `*out_dptr` points
 * into that string and `*out_len` receives its length.
 */
XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_config,
                                       xgboost::bst_ulong *out_len, char const **out_dptr) {
  API_BEGIN();
  CHECK_HANDLE();
  auto const args = Json::Load(StringView{json_config});
  auto const fmt = RequiredArg<String>(args, "format", __func__);

  auto *booster = static_cast<Learner *>(handle);
  // Reuse the thread-local return buffer; drop whatever a previous call left in it.
  std::string &buffer = booster->GetThreadLocal().ret_str;
  buffer.clear();

  booster->Configure();
  Json model{Object{}};

  bool const as_json = (fmt == "json");
  bool const as_ubj = (fmt == "ubj");
  if (as_json || as_ubj) {
    // Both JSON flavours share the same in-memory document and differ only in how
    // it is dumped.
    booster->SaveModel(&model);
    if (as_json) {
      Json::Dump(model, &buffer);
    } else {
      Json::Dump(model, &buffer, std::ios::binary);
    }
  } else if (fmt == "deprecated") {
    WarnOldModel();
    common::MemoryBufferStream fo(&buffer);
    booster->SaveModel(&fo);
  } else {
    LOG(FATAL) << "Unknown format: `" << fmt << "`";
  }

  *out_dptr = dmlc::BeginPtr(buffer);
  *out_len = static_cast<xgboost::bst_ulong>(buffer.size());

  API_END();
}

Expand All @@ -952,6 +1006,8 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
raw_str.resize(0);

common::MemoryBufferStream fo(&raw_str);
LOG(WARNING) << "`" << __func__
<< "` is deprecated, please use `XGBoosterSaveModelToBuffer` instead.";

learner->Configure();
learner->SaveModel(&fo);
Expand Down Expand Up @@ -1208,7 +1264,8 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
CHECK_HANDLE();
auto *learner = static_cast<Learner *>(handle);
auto config = Json::Load(StringView{json_config});
auto importance = get<String const>(config["importance_type"]);

auto importance = RequiredArg<String>(config, "importance_type", __func__);
std::string feature_map_uri;
if (!IsA<Null>(config["feature_map"])) {
feature_map_uri = get<String const>(config["feature_map"]);
Expand Down
22 changes: 21 additions & 1 deletion src/c_api/c_api_utils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2021 by XGBoost Contributors
* Copyright (c) 2021-2022 by XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_
Expand Down Expand Up @@ -241,5 +241,25 @@ inline void GenerateFeatureMap(Learner const *learner,
}

void XGBBuildInfoDevice(Json* p_info);

/*!
 * \brief Look up a mandatory entry in a JSON object of function arguments.
 *
 * \tparam JT  JSON value type the entry is expected to hold (e.g. String, Integer).
 *
 * \param in   JSON document; it is accessed as an Object.
 * \param key  Name of the entry to fetch.
 * \param func Name of the calling function, used only in the error message.
 *
 * \return Const reference to the value obtained from `get<JT const>` on the entry.
 *
 * A fatal error is logged when the key is absent or its value is JSON null.
 */
template <typename JT>
auto const &RequiredArg(Json const &in, std::string const &key, StringView func) {
  auto const &args = get<Object const>(in);
  auto const entry = args.find(key);
  // An explicit null is treated the same way as a key that is not present.
  bool const is_missing = entry == args.cend() || IsA<Null>(entry->second);
  if (is_missing) {
    LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`";
  }
  return get<std::remove_const_t<JT> const>(entry->second);
}

/*!
 * \brief Look up an optional entry in a JSON object of function arguments.
 *
 * \tparam JT  JSON value type the entry is expected to hold (e.g. Integer).
 * \tparam T   Type of the fallback value; the value obtained from `get<JT const>`
 *             must have this same type so that both return paths agree.
 *
 * \param in   JSON document; it is accessed as an Object.
 * \param key  Name of the entry to fetch.
 * \param dft  Value to fall back to when the entry is not usable.
 *
 * \return Const reference to the stored value when the key is present and not JSON
 *         null, otherwise a reference to `dft`.
 *
 * \note The result may alias `dft`. When a temporary is passed as the default, copy
 *       the result (e.g. `auto v = OptionalArg<...>(...)`) instead of binding it to
 *       a reference.
 */
template <typename JT, typename T>
auto const &OptionalArg(Json const &in, std::string const &key, T const &dft) {
  auto const &obj = get<Object const>(in);
  auto it = obj.find(key);
  // Treat an explicit null like a missing key, matching RequiredArg; otherwise a
  // caller passing `"nthread": null` would have the null handed to get<JT const>
  // instead of receiving the default.
  if (it != obj.cend() && !IsA<Null>(it->second)) {
    return get<std::remove_const_t<JT> const>(it->second);
  }
  return dft;
}
} // namespace xgboost
#endif // XGBOOST_C_API_C_API_UTILS_H_
2 changes: 1 addition & 1 deletion src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ class ConfigParser {
const auto last_char = str.find_last_not_of(" \t\n\r");
if (first_char == std::string::npos) {
// Every character in str is a whitespace
return std::string();
return {};
}
CHECK_NE(last_char, std::string::npos);
const auto substr_len = last_char + 1 - first_char;
Expand Down
Loading