From f0e9bc5a6aed234544aa309ba1ced2fb2da25bde Mon Sep 17 00:00:00 2001
From: fis
Date: Tue, 30 Mar 2021 09:12:55 +0800
Subject: [PATCH 1/2] Optimize dart inplace predict perf.

---
 src/gbm/gbtree.cc                       | 67 +++++++++++++++++--------
 src/gbm/gbtree.cu                       | 10 ++++
 tests/python-gpu/test_gpu_prediction.py | 18 +++++--
 3 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index f170115a6489..9fc522e03180 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -575,6 +575,20 @@ void GPUDartPredictInc(common::Span<float> out_predts,
 }
 #endif
 
+void GPUDartInplacePredictInc(common::Span<float> out_predts,
+                              common::Span<float> predts, float tree_w,
+                              size_t n_rows, float base_score,
+                              bst_group_t n_groups,
+                              bst_group_t group)
+#if defined(XGBOOST_USE_CUDA)
+;  // NOLINT
+#else
+{
+  common::AssertGPUSupport();
+}
+#endif
+
+
 class Dart : public GBTree {
  public:
   explicit Dart(LearnerModelParam const* booster_config) :
@@ -728,13 +742,14 @@ class Dart : public GBTree {
       gpu_predictor_.get()
 #endif  // defined(XGBOOST_USE_CUDA)
     };
+    Predictor const * predictor {nullptr};
     MetaInfo info;
     StringView msg{"Unsupported data type for inplace predict."};
 
     int32_t device = GenericParameter::kCpuId;
+    PredictionCacheEntry predts;
     // Inplace predict is not used for training, so no need to drop tree.
     for (size_t i = tree_begin; i < tree_end; ++i) {
-      PredictionCacheEntry predts;
       if (tparam_.predictor == PredictorType::kAuto) {
         // Try both predictor implementations
         bool success = false;
@@ -742,6 +757,7 @@ class Dart : public GBTree {
           if (p && p->InplacePredict(x, nullptr, model_, missing, &predts, i,
                                      i + 1)) {
             success = true;
+            predictor = p;
 #if defined(XGBOOST_USE_CUDA)
             device = predts.predictions.DeviceIdx();
 #endif  // defined(XGBOOST_USE_CUDA)
@@ -750,46 +766,53 @@ class Dart : public GBTree {
         }
         CHECK(success) << msg;
       } else {
-        // No base margin for each tree
-        bool success = this->GetPredictor()->InplacePredict(
-            x, nullptr, model_, missing, &predts, i, i + 1);
+        // No base margin from meta info for each tree
+        predictor = this->GetPredictor().get();
+        bool success = predictor->InplacePredict(x, nullptr, model_, missing,
+                                                 &predts, i, i + 1);
         device = predts.predictions.DeviceIdx();
         CHECK(success) << msg;
       }
 
       auto w = this->weight_drop_.at(i);
-      auto &h_predts = predts.predictions.HostVector();
-      auto &h_out_predts = out_preds->predictions.HostVector();
+      size_t n_groups = model_.learner_model_param->num_output_group;
+      auto n_rows = predts.predictions.Size() / n_groups;
 
       if (i == tree_begin) {
-        auto n_rows =
-            h_predts.size() / model_.learner_model_param->num_output_group;
+        // base margin is added here.
         if (p_m) {
           p_m->Info().num_row_ = n_rows;
-          cpu_predictor_->InitOutPredictions(p_m->Info(),
-                                             &out_preds->predictions, model_);
+          predictor->InitOutPredictions(p_m->Info(), &out_preds->predictions,
+                                        model_);
         } else {
           info.num_row_ = n_rows;
-          cpu_predictor_->InitOutPredictions(info, &out_preds->predictions,
-                                             model_);
+          predictor->InitOutPredictions(info, &out_preds->predictions, model_);
         }
       }
 
       // Multiple the tree weight
-      CHECK_EQ(h_predts.size(), h_out_predts.size());
+      CHECK_EQ(predts.predictions.Size(), out_preds->predictions.Size());
+      auto group = model_.tree_info.at(i);
 
+      if (device == GenericParameter::kCpuId) {
+        auto &h_predts = predts.predictions.HostVector();
+        auto &h_out_predts = out_preds->predictions.HostVector();
 #pragma omp parallel for
-      for (omp_ulong i = 0; i < h_out_predts.size(); ++i) {
-        // Need to remove the base margin from indiviual tree.
-        h_out_predts[i] +=
-            (h_predts[i] - model_.learner_model_param->base_score) * w;
+        for (omp_ulong ridx = 0; ridx < n_rows; ++ridx) {
+          const size_t offset = ridx * n_groups + group;
+          // Need to remove the base margin from individual tree.
+          h_out_predts[offset] +=
+              (h_predts[offset] - model_.learner_model_param->base_score) * w;
+        }
+      } else {
+        out_preds->predictions.SetDevice(device);
+        predts.predictions.SetDevice(device);
+        GPUDartInplacePredictInc(out_preds->predictions.DeviceSpan(),
+                                 predts.predictions.DeviceSpan(), w, n_rows,
+                                 model_.learner_model_param->base_score,
+                                 n_groups, group);
+      }
     }
-
-    if (device != GenericParameter::kCpuId) {
-      out_preds->predictions.SetDevice(device);
-      out_preds->predictions.DeviceSpan();
-    }
   }
 
   void PredictInstance(const SparsePage::Inst &inst,
diff --git a/src/gbm/gbtree.cu b/src/gbm/gbtree.cu
index 29b26be06ec1..52ac90501912 100644
--- a/src/gbm/gbtree.cu
+++ b/src/gbm/gbtree.cu
@@ -14,5 +14,15 @@ void GPUDartPredictInc(common::Span<float> out_predts,
     out_predts[offset] += (predts[offset] * tree_w);
   });
 }
+
+void GPUDartInplacePredictInc(common::Span<float> out_predts,
+                              common::Span<float> predts, float tree_w,
+                              size_t n_rows, float base_score,
+                              bst_group_t n_groups, bst_group_t group) {
+  dh::LaunchN(dh::CurrentDevice(), n_rows, [=] XGBOOST_DEVICE(size_t ridx) {
+    const size_t offset = ridx * n_groups + group;
+    out_predts[offset] += (predts[offset] - base_score) * tree_w;
+  });
+}
 }  // namespace gbm
 }  // namespace xgboost
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index d915be24bf2d..3c37624cc019 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -332,18 +332,23 @@ def test_predict_categorical_split(self, df):
         rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
         np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5)
 
-    def test_predict_dart(self):
+    @pytest.mark.parametrize("n_classes", [2, 3])
+    def test_predict_dart(self, n_classes):
+        from sklearn.datasets import make_classification
         import cupy as cp
-        rng = cp.random.RandomState(1994)
         n_samples = 1000
-        X = rng.randn(n_samples, 10)
-        y = rng.randn(n_samples)
+        X_, y_ = make_classification(
+            n_samples=n_samples, n_informative=5, n_classes=n_classes
+        )
+        X, y = cp.array(X_), cp.array(y_)
+
         Xy = xgb.DMatrix(X, y)
         booster = xgb.train(
             {
                 "tree_method": "gpu_hist",
                 "booster": "dart",
                 "rate_drop": 0.5,
+                "num_class": n_classes
             },
             Xy,
             num_boost_round=32
@@ -351,8 +356,13 @@ def test_predict_dart(self, n_classes):
         # predictor=auto
         inplace = booster.inplace_predict(X)
         copied = booster.predict(Xy)
+        cpu_inplace = booster.inplace_predict(X_)
+        booster.set_param({"predictor": "cpu_predictor"})
+        cpu_copied = booster.predict(Xy)
 
         copied = cp.array(copied)
+        cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
+        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
         cp.testing.assert_allclose(inplace, copied, atol=1e-6)
 
         booster.set_param({"predictor": "gpu_predictor"})

From d1512742460f2ac95c49c165530fa8925a80ba1e Mon Sep 17 00:00:00 2001
From: fis
Date: Tue, 30 Mar 2021 09:40:56 +0800
Subject: [PATCH 2/2] Structure the test.

---
 tests/python-gpu/test_gpu_prediction.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index 3c37624cc019..71c27352e663 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -343,16 +343,23 @@ def test_predict_dart(self, n_classes):
         X, y = cp.array(X_), cp.array(y_)
 
         Xy = xgb.DMatrix(X, y)
-        booster = xgb.train(
-            {
+        if n_classes == 2:
+            params = {
                 "tree_method": "gpu_hist",
                 "booster": "dart",
                 "rate_drop": 0.5,
+                "objective": "binary:logistic"
+            }
+        else:
+            params = {
+                "tree_method": "gpu_hist",
+                "booster": "dart",
+                "rate_drop": 0.5,
+                "objective": "multi:softprob",
                 "num_class": n_classes
-            },
-            Xy,
-            num_boost_round=32
-        )
+            }
+
+        booster = xgb.train(params, Xy, num_boost_round=32)
         # predictor=auto
         inplace = booster.inplace_predict(X)
         copied = booster.predict(Xy)
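
Usage note (not part of the patch series): the sketch below is a minimal, CPU-only illustration of the DART in-place predict path these changes optimize, assuming an XGBoost build with this series applied plus NumPy and scikit-learn installed; the dataset shape, seed, and parameter values are arbitrary and simply mirror the test above.

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

# Small multi-class dataset; sizes and seed are arbitrary.
X, y = make_classification(n_samples=1000, n_informative=5, n_classes=3,
                           random_state=0)
Xy = xgb.DMatrix(X, y)

# DART keeps a per-tree weight, so prediction has to scale and accumulate
# every tree's contribution (the loop reworked in gbtree.cc above).
booster = xgb.train(
    {
        "booster": "dart",
        "rate_drop": 0.5,
        "objective": "multi:softprob",
        "num_class": 3,
    },
    Xy,
    num_boost_round=32,
)

# In-place predict consumes the raw array directly (no DMatrix needed) and
# should agree with the regular DMatrix-based prediction.
inplace = booster.inplace_predict(X)
copied = booster.predict(Xy)
np.testing.assert_allclose(inplace, copied, atol=1e-6)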