Skip to content

Commit

Permalink
Move thread local entry into Learner.
Browse files Browse the repository at this point in the history
Extracted from dmlc#5389.

This is an attempt to work around a CUDA context issue with static variables,
where the CUDA context can be released before the device vector.

* Add PredictionEntry to thread local entry.

This eliminates one copy of the prediction vector.

* Don't define CUDA C API in a namespace.
  • Loading branch information
trivialfis committed Mar 6, 2020
1 parent 8d06878 commit 374bd9a
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 62 deletions.
19 changes: 19 additions & 0 deletions include/xgboost/learner.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <rabit/rabit.h>
#include <xgboost/base.h>
#include <xgboost/feature_map.h>
#include <xgboost/predictor.h>
#include <xgboost/generic_parameters.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/model.h>
Expand All @@ -29,6 +30,22 @@ class ObjFunction;
class DMatrix;
class Json;

/*! \brief Entry to easily hold returning information for the C API.
 *
 *  One instance exists per (thread, Learner) pair; see
 *  Learner::GetThreadLocal().  Keeping the buffers here lets C API calls
 *  hand out stable pointers without a global thread-local store.
 */
struct XGBAPIThreadLocalEntry {
/*! \brief result holder for returning string */
std::string ret_str;
/*! \brief result holder for returning strings */
std::vector<std::string> ret_vec_str;
/*! \brief result holder for returning string pointers; each element points into ret_vec_str */
std::vector<const char *> ret_vec_charp;
/*! \brief returning float vector. */
std::vector<bst_float> ret_vec_float;
/*! \brief temp variable of gradient pairs. */
std::vector<GradientPair> tmp_gpair;
/*! \brief Holds prediction results between C API calls, avoiding an extra copy. */
PredictionCacheEntry prediction_entry;
};


/*!
* \brief Learner class that does training and prediction.
* This is the user facing module of xgboost training.
Expand Down Expand Up @@ -167,6 +184,8 @@ class Learner : public Model, public Configurable, public rabit::Serializable {
virtual std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const = 0;

virtual XGBAPIThreadLocalEntry& GetThreadLocal() const = 0;
/*!
* \brief Create a new instance of learner.
* \param cache_data The matrix to cache the prediction.
Expand Down
94 changes: 38 additions & 56 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// Copyright (c) 2014-2020 by Contributors
#include <dmlc/thread_local.h>
#include <rabit/rabit.h>
#include <rabit/c_api.h>

Expand All @@ -26,20 +25,6 @@

using namespace xgboost; // NOLINT(*);

/*! \brief Entry to easily hold returning information for the C API. */
struct XGBAPIThreadLocalEntry {
/*! \brief result holder for returning string */
std::string ret_str;
/*! \brief result holder for returning strings */
std::vector<std::string> ret_vec_str;
/*! \brief result holder for returning string pointers; each element points into ret_vec_str */
std::vector<const char *> ret_vec_charp;
/*! \brief returning float vector. */
std::vector<bst_float> ret_vec_float;
/*! \brief temp variable of gradient pairs. */
std::vector<GradientPair> tmp_gpair;
};

XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
if (major) {
*major = XGBOOST_VER_MAJOR;
Expand All @@ -52,9 +37,6 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
}
}

// define the threadlocal store.
using XGBAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;

int XGBRegisterLogCallback(void (*callback)(const char*)) {
API_BEGIN();
LogCallbackRegistry* registry = LogCallbackRegistryStore::Get();
Expand Down Expand Up @@ -102,16 +84,16 @@ XGB_DLL int XGDMatrixCreateFromArrayInterfaceColumns(char const* c_json_strs,
int nthread,
DMatrixHandle* out) {
API_BEGIN();
LOG(FATAL) << "Xgboost not compiled with cuda";
LOG(FATAL) << "XGBoost not compiled with CUDA";
API_END();
}

XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
bst_float missing,
int nthread,
DMatrixHandle* out) {
bst_float missing,
int nthread,
DMatrixHandle* out) {
API_BEGIN();
LOG(FATAL) << "Xgboost not compiled with cuda";
LOG(FATAL) << "XGBoost not compiled with CUDA";
API_END();
}

Expand Down Expand Up @@ -375,7 +357,7 @@ XGB_DLL int XGBoosterSaveJsonConfig(BoosterHandle handle,
auto* learner = static_cast<Learner*>(handle);
learner->Configure();
learner->SaveConfig(&config);
std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
std::string& raw_str = learner->GetThreadLocal().ret_str;
Json::Dump(config, &raw_str);
*out_str = raw_str.c_str();
*out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
Expand Down Expand Up @@ -422,10 +404,11 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle,
const char* evnames[],
xgboost::bst_ulong len,
const char** out_str) {
std::string& eval_str = XGBAPIThreadLocalStore::Get()->ret_str;
API_BEGIN();
CHECK_HANDLE();
auto* bst = static_cast<Learner*>(handle);
std::string& eval_str = bst->GetThreadLocal().ret_str;

std::vector<std::shared_ptr<DMatrix>> data_sets;
std::vector<std::string> data_names;

Expand All @@ -446,24 +429,22 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
int32_t training,
xgboost::bst_ulong *len,
const bst_float **out_result) {
std::vector<bst_float>& preds =
XGBAPIThreadLocalStore::Get()->ret_vec_float;
API_BEGIN();
CHECK_HANDLE();
auto *bst = static_cast<Learner*>(handle);
auto *learner = static_cast<Learner*>(handle);
auto& entry = learner->GetThreadLocal().prediction_entry;
HostDeviceVector<bst_float> tmp_preds;
bst->Predict(
learner->Predict(
*static_cast<std::shared_ptr<DMatrix>*>(dmat),
(option_mask & 1) != 0,
&tmp_preds, ntree_limit,
&entry.predictions, ntree_limit,
static_cast<bool>(training),
(option_mask & 2) != 0,
(option_mask & 4) != 0,
(option_mask & 8) != 0,
(option_mask & 16) != 0);
preds = tmp_preds.HostVector();
*out_result = dmlc::BeginPtr(preds);
*len = static_cast<xgboost::bst_ulong>(preds.size());
*out_result = dmlc::BeginPtr(entry.predictions.ConstHostVector());
*len = static_cast<xgboost::bst_ulong>(entry.predictions.Size());
API_END();
}

Expand Down Expand Up @@ -515,13 +496,14 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
xgboost::bst_ulong* out_len,
const char** out_dptr) {
std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
raw_str.resize(0);

API_BEGIN();
CHECK_HANDLE();
common::MemoryBufferStream fo(&raw_str);
auto *learner = static_cast<Learner*>(handle);
std::string& raw_str = learner->GetThreadLocal().ret_str;
raw_str.resize(0);

common::MemoryBufferStream fo(&raw_str);

learner->Configure();
learner->SaveModel(&fo);
*out_dptr = dmlc::BeginPtr(raw_str);
Expand All @@ -534,13 +516,12 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle,
xgboost::bst_ulong *out_len,
const char **out_dptr) {
std::string &raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
raw_str.resize(0);

API_BEGIN();
CHECK_HANDLE();
common::MemoryBufferStream fo(&raw_str);
auto *learner = static_cast<Learner*>(handle);
std::string &raw_str = learner->GetThreadLocal().ret_str;
raw_str.resize(0);
common::MemoryBufferStream fo(&raw_str);
learner->Configure();
learner->Save(&fo);
*out_dptr = dmlc::BeginPtr(raw_str);
Expand Down Expand Up @@ -583,16 +564,13 @@ XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
API_END();
}

inline void XGBoostDumpModelImpl(
BoosterHandle handle,
const FeatureMap& fmap,
int with_stats,
const char *format,
xgboost::bst_ulong* len,
const char*** out_models) {
std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
inline void XGBoostDumpModelImpl(BoosterHandle handle, const FeatureMap &fmap,
int with_stats, const char *format,
xgboost::bst_ulong *len,
const char ***out_models) {
auto *bst = static_cast<Learner*>(handle);
std::vector<std::string>& str_vecs = bst->GetThreadLocal().ret_vec_str;
std::vector<const char*>& charp_vecs = bst->GetThreadLocal().ret_vec_charp;
bst->Configure();
str_vecs = bst->DumpModel(fmap, with_stats != 0, format);
charp_vecs.resize(str_vecs.size());
Expand All @@ -608,7 +586,10 @@ XGB_DLL int XGBoosterDumpModel(BoosterHandle handle,
int with_stats,
xgboost::bst_ulong* len,
const char*** out_models) {
API_BEGIN();
CHECK_HANDLE();
return XGBoosterDumpModelEx(handle, fmap, with_stats, "text", len, out_models);
API_END();
}

XGB_DLL int XGBoosterDumpModelEx(BoosterHandle handle,
Expand Down Expand Up @@ -664,7 +645,7 @@ XGB_DLL int XGBoosterGetAttr(BoosterHandle handle,
const char** out,
int* success) {
auto* bst = static_cast<Learner*>(handle);
std::string& ret_str = XGBAPIThreadLocalStore::Get()->ret_str;
std::string& ret_str = bst->GetThreadLocal().ret_str;
API_BEGIN();
CHECK_HANDLE();
if (bst->GetAttr(key, &ret_str)) {
Expand All @@ -680,9 +661,9 @@ XGB_DLL int XGBoosterGetAttr(BoosterHandle handle,
XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
const char* key,
const char* value) {
auto* bst = static_cast<Learner*>(handle);
API_BEGIN();
CHECK_HANDLE();
auto* bst = static_cast<Learner*>(handle);
if (value == nullptr) {
bst->DelAttr(key);
} else {
Expand All @@ -694,12 +675,13 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
xgboost::bst_ulong* out_len,
const char*** out) {
std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
auto *bst = static_cast<Learner*>(handle);
API_BEGIN();
CHECK_HANDLE();
str_vecs = bst->GetAttrNames();
auto *learner = static_cast<Learner *>(handle);
std::vector<std::string> &str_vecs = learner->GetThreadLocal().ret_vec_str;
std::vector<const char *> &charp_vecs =
learner->GetThreadLocal().ret_vec_charp;
str_vecs = learner->GetAttrNames();
charp_vecs.resize(str_vecs.size());
for (size_t i = 0; i < str_vecs.size(); ++i) {
charp_vecs[i] = str_vecs[i].c_str();
Expand Down
9 changes: 4 additions & 5 deletions src/c_api/c_api.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
// Copyright (c) 2014-2019 by Contributors

// Copyright (c) 2019-2020 by Contributors
#include "xgboost/data.h"
#include "xgboost/c_api.h"
#include "xgboost/learner.h"
#include "c_api_error.h"
#include "../data/device_adapter.cuh"

namespace xgboost {
using namespace xgboost; // NOLINT

XGB_DLL int XGDMatrixCreateFromArrayInterfaceColumns(char const* c_json_strs,
bst_float missing,
int nthread,
Expand All @@ -28,5 +29,3 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
API_END();
}

} // namespace xgboost
12 changes: 12 additions & 0 deletions src/common/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,17 @@ class Transform {
return Span<T const> {_vec->ConstHostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
// Recursive sync host: base case.  Calls ConstHostPointer() purely for its
// side effect of making the host copy of the vector valid before the OpenMP
// loop below reads it; the returned pointer is discarded.
template <typename T>
void SyncHost(const HostDeviceVector<T> *_vector) const {
_vector->ConstHostPointer();
}
// Recursive sync host: peel off the first vector, sync it, then recurse on
// the rest of the parameter pack (terminates at the single-vector overload).
template <typename Head, typename... Rest>
void SyncHost(const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->ConstHostPointer();
SyncHost(_vectors...);
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
Expand Down Expand Up @@ -154,6 +165,7 @@ class Transform {
void LaunchCPU(Functor func, HDV*... vectors) const {
omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
dmlc::OMPException omp_exc;
SyncHost(vectors...);
#pragma omp parallel for schedule(static)
for (omp_ulong idx = 0; idx < end; ++idx) {
omp_exc.Run(func, idx, UnpackHDV(vectors)...);
Expand Down
9 changes: 9 additions & 0 deletions src/learner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ class LearnerImpl : public Learner {
cache_.Cache(d, GenericParameter::kCpuId);
}
}
// Drop this learner's entry from the calling thread's local map so the map
// does not retain a key pointing at a destroyed object.
// NOTE(review): local_map is thread_local, so only the destroying thread's
// entry is erased here; entries created by other threads appear to live
// until those threads exit — confirm this is acceptable.
~LearnerImpl() override {
local_map.erase(this);
}
// Configuration before data is known.
void Configure() override {
if (!this->need_configuration_) { return; }
Expand Down Expand Up @@ -873,6 +876,9 @@ class LearnerImpl : public Learner {
}
}

// Returns the per-(thread, learner) API entry.  std::map::operator[]
// default-constructs the entry on first access for a given thread, so no
// explicit initialization is needed.
XGBAPIThreadLocalEntry& GetThreadLocal() const override {
return local_map[this];
}
const std::map<std::string, std::string>& GetConfigurationArguments() const override {
return cfg_;
}
Expand Down Expand Up @@ -1017,6 +1023,7 @@ class LearnerImpl : public Learner {
// gradient pairs
HostDeviceVector<GradientPair> gpair_;
bool need_configuration_;
static thread_local std::map<LearnerImpl const *, XGBAPIThreadLocalEntry> local_map;

private:
/*! \brief random number transformation seed. */
Expand All @@ -1037,6 +1044,8 @@ std::string const LearnerImpl::kEvalMetric {"eval_metric"}; // NOLINT

constexpr int32_t LearnerImpl::kRandSeedMagic;

thread_local std::map<LearnerImpl const *, XGBAPIThreadLocalEntry> LearnerImpl::local_map;

Learner* Learner::Create(
const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
return new LearnerImpl(cache_data);
Expand Down
2 changes: 1 addition & 1 deletion tests/python-gpu/test_from_columnar.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def test_cudf_training(self):
evals_result_cudf = {}
dtrain_cudf = xgb.DMatrix(df.from_pandas(X), df.from_pandas(y), weight=cudf_weights,
base_margin=cudf_base_margin)
params = {'gpu_id': 0, 'nthread': 1}
params = {'gpu_id': 0}
xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
evals_result=evals_result_cudf)
evals_result_np = {}
Expand Down

0 comments on commit 374bd9a

Please sign in to comment.