From b13fcfeb03609832dca32220056b8b55f097fe20 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 5 Aug 2020 12:26:19 +0800 Subject: [PATCH 1/4] Fix sklearn doc. (#5980) --- python-package/xgboost/sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 1f3033f2d29e..f533f7f3477d 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -77,7 +77,7 @@ def inner(preds, dmatrix): gamma : float Minimum loss reduction required to make a further partition on a leaf node of the tree. - min_child_weight : int + min_child_weight : float Minimum sum of instance weight(hessian) needed in a child. max_delta_step : int Maximum delta step we allow each tree's weight estimation to be. From fc06538f7d717a0c93fdedad99b7b5752cfd95bc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 5 Aug 2020 16:44:52 +0800 Subject: [PATCH 2/4] Enforce tree order in JSON. (#5974) * Make JSON model IO more future proof by using tree id in model loading. --- src/gbm/gbtree_model.cc | 24 +++++++++++++----------- tests/cpp/test_learner.cc | 11 ++++++++++- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc index a533467971af..8ebd8284c269 100644 --- a/src/gbm/gbtree_model.cc +++ b/src/gbm/gbtree_model.cc @@ -1,6 +1,8 @@ /*! - * Copyright 2019 by Contributors + * Copyright 2019-2020 by Contributors */ +#include + #include "xgboost/json.h" #include "xgboost/logging.h" #include "gbtree_model.h" @@ -41,15 +43,14 @@ void GBTreeModel::SaveModel(Json* p_out) const { auto& out = *p_out; CHECK_EQ(param.num_trees, static_cast(trees.size())); out["gbtree_model_param"] = ToJson(param); - std::vector trees_json; - size_t t = 0; - for (auto const& tree : trees) { + std::vector trees_json(trees.size()); + + for (size_t t = 0; t < trees.size(); ++t) { + auto const& tree = trees[t]; Json tree_json{Object()}; tree->SaveModel(&tree_json); - // The field is not used in XGBoost, but might be useful for external project. - tree_json["id"] = Integer(t); - trees_json.emplace_back(tree_json); - t++; + tree_json["id"] = Integer(static_cast(t)); + trees_json[t] = std::move(tree_json); } std::vector tree_info_json(tree_info.size()); @@ -70,9 +71,10 @@ void GBTreeModel::LoadModel(Json const& in) { auto const& trees_json = get(in["trees"]); trees.resize(trees_json.size()); - for (size_t t = 0; t < trees.size(); ++t) { - trees[t].reset( new RegTree() ); - trees[t]->LoadModel(trees_json[t]); + for (size_t t = 0; t < trees_json.size(); ++t) { // NOLINT + auto tree_id = get(trees_json[t]["id"]); + trees.at(tree_id).reset(new RegTree()); + trees.at(tree_id)->LoadModel(trees_json[t]); } tree_info.resize(param.num_trees); diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 7d473f00ca35..56e4a95ece42 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -148,7 +148,16 @@ TEST(Learner, JsonModelIO) { Json out { Object() }; learner->SaveModel(&out); - learner->LoadModel(out); + dmlc::TemporaryDirectory tmpdir; + + std::ofstream fout (tmpdir.path + "/model.json"); + fout << out; + fout.close(); + + auto loaded_str = common::LoadSequentialFile(tmpdir.path + "/model.json"); + Json loaded = Json::Load(StringView{loaded_str.c_str(), loaded_str.size()}); + + learner->LoadModel(loaded); learner->Configure(); Json new_in { Object() }; From 3bc5e576a0e5192dee26f5e5b78aab6c236f9a9d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Aug 2020 14:29:22 +0800 Subject: [PATCH 3/4] Fix dask predict shape infer. (#5989) --- python-package/xgboost/dask.py | 13 +++++---- tests/python/test_with_dask.py | 50 +++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index b97855cbe7aa..a890a6e617e7 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -738,7 +738,8 @@ def dispatched_predict(worker_id): predt = booster.predict(data=local_x, validate_features=local_x.num_row() != 0, *args) - ret = (delayed(predt), order) + columns = 1 if len(predt.shape) == 1 else predt.shape[1] + ret = ((delayed(predt), columns), order) predictions.append(ret) return predictions @@ -775,8 +776,10 @@ async def map_function(func): # See https://docs.dask.org/en/latest/array-creation.html arrays = [] for i, shape in enumerate(shapes): - arrays.append(da.from_delayed(results[i], shape=(shape[0], ), - dtype=numpy.float32)) + arrays.append(da.from_delayed( + results[i][0], shape=(shape[0],) + if results[i][1] == 1 else (shape[0], results[i][1]), + dtype=numpy.float32)) predictions = await da.concatenate(arrays, axis=0) return predictions @@ -978,6 +981,7 @@ def client(self): def client(self, clt): self._client = clt + @xgboost_model_doc("""Implementation of the Scikit-Learn API for XGBoost.""", ['estimators', 'model']) class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase): @@ -1032,9 +1036,6 @@ def predict(self, data): ['estimators', 'model'] ) class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase): - # pylint: disable=missing-docstring - _client = None - async def _fit_async(self, X, y, sample_weights=None, eval_set=None, diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index a0825b523fe6..66336c47ac3a 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -5,6 +5,7 @@ import numpy as np import json import asyncio +from sklearn.datasets import make_classification if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows", allow_module_level=True) @@ -36,7 +37,7 @@ def generate_array(): def test_from_dask_dataframe(): - with LocalCluster(n_workers=5) as cluster: + with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: X, y = generate_array() @@ -74,7 +75,7 @@ def test_from_dask_dataframe(): def test_from_dask_array(): - with LocalCluster(n_workers=5, threads_per_worker=5) as cluster: + with LocalCluster(n_workers=kWorkers, threads_per_worker=5) as cluster: with Client(cluster) as client: X, y = generate_array() dtrain = DaskDMatrix(client, X, y) @@ -104,8 +105,28 @@ def test_from_dask_array(): assert np.all(single_node_predt == from_arr.compute()) +def test_dask_predict_shape_infer(): + with LocalCluster(n_workers=kWorkers) as cluster: + with Client(cluster) as client: + X, y = make_classification(n_samples=1000, n_informative=5, + n_classes=3) + X_ = dd.from_array(X, chunksize=100) + y_ = dd.from_array(y, chunksize=100) + dtrain = xgb.dask.DaskDMatrix(client, data=X_, label=y_) + + model = xgb.dask.train( + client, + {"objective": "multi:softprob", "num_class": 3}, + dtrain=dtrain + ) + + preds = xgb.dask.predict(client, model, dtrain) + assert preds.shape[0] == preds.compute().shape[0] + assert preds.shape[1] == preds.compute().shape[1] + + def test_dask_missing_value_reg(): - with LocalCluster(n_workers=5) as cluster: + with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: X_0 = np.ones((20 // 2, kCols)) X_1 = np.zeros((20 // 2, kCols)) @@ -156,7 +177,7 @@ def test_dask_missing_value_cls(): def test_dask_regressor(): - with LocalCluster(n_workers=5) as cluster: + with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: X, y = generate_array() regressor = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2) @@ -178,7 +199,7 @@ def test_dask_regressor(): def test_dask_classifier(): - with LocalCluster(n_workers=5) as cluster: + with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: X, y = generate_array() y = (y * 10).astype(np.int32) @@ -188,7 +209,7 @@ def test_dask_classifier(): classifier.fit(X, y, eval_set=[(X, y)]) prediction = classifier.predict(X) - assert prediction.ndim == 1 + assert prediction.ndim == 2 assert prediction.shape[0] == kRows history = classifier.evals_result() @@ -211,14 +232,14 @@ def test_dask_classifier(): assert classifier.n_classes_ == 10 prediction = classifier.predict(X_d) - assert prediction.ndim == 1 + assert prediction.ndim == 2 assert prediction.shape[0] == kRows @pytest.mark.skipif(**tm.no_sklearn()) def test_sklearn_grid_search(): from sklearn.model_selection import GridSearchCV - with LocalCluster(n_workers=4) as cluster: + with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: X, y = generate_array() reg = xgb.dask.DaskXGBRegressor(learning_rate=0.1, @@ -292,7 +313,9 @@ def _check_outputs(out, predictions): evals=[(dtrain, 'validation')], num_boost_round=2) predictions = xgb.dask.predict(client=client, model=out, - data=dtrain).compute() + data=dtrain) + assert predictions.shape[1] == n_classes + predictions = predictions.compute() _check_outputs(out, predictions) # train has more rows than evals @@ -315,7 +338,7 @@ def _check_outputs(out, predictions): # environment and Exact doesn't support it. def test_empty_dmatrix_hist(): - with LocalCluster(n_workers=5) as cluster: + with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: parameters = {'tree_method': 'hist'} run_empty_dmatrix_reg(client, parameters) @@ -323,7 +346,7 @@ def test_empty_dmatrix_hist(): def test_empty_dmatrix_approx(): - with LocalCluster(n_workers=5) as cluster: + with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: parameters = {'tree_method': 'approx'} run_empty_dmatrix_reg(client, parameters) @@ -384,7 +407,7 @@ async def run_dask_classifier_asyncio(scheduler_address): await classifier.fit(X, y, eval_set=[(X, y)]) prediction = await classifier.predict(X) - assert prediction.ndim == 1 + assert prediction.ndim == 2 assert prediction.shape[0] == kRows history = classifier.evals_result() @@ -407,8 +430,9 @@ async def run_dask_classifier_asyncio(scheduler_address): assert classifier.n_classes_ == 10 prediction = await classifier.predict(X_d) - assert prediction.ndim == 1 + assert prediction.ndim == 2 assert prediction.shape[0] == kRows + assert prediction.shape[1] == 10 def test_with_asyncio(): From 2da94c06c5a242eba1465713c0748a7bdbb4b3de Mon Sep 17 00:00:00 2001 From: jameskrach <69264125+jameskrach@users.noreply.github.com> Date: Tue, 11 Aug 2020 04:11:28 -0400 Subject: [PATCH 4/4] [Breaking] Fix .predict() method and add .predict_proba() in xgboost.dask.DaskXGBClassifier (#5986) --- python-package/xgboost/dask.py | 23 ++++++++++++++++++++++- tests/python/test_with_dask.py | 32 ++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index a890a6e617e7..a08c21367a94 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -1079,13 +1079,34 @@ def fit(self, X, y, return self.client.sync(self._fit_async, X, y, sample_weights, eval_set, sample_weight_eval_set, verbose) - async def _predict_async(self, data): + async def _predict_proba_async(self, data): + _assert_dask_support() + test_dmatrix = await DaskDMatrix(client=self.client, data=data, missing=self.missing) pred_probs = await predict(client=self.client, model=self.get_booster(), data=test_dmatrix) return pred_probs + def predict_proba(self, data): # pylint: disable=arguments-differ,missing-docstring + _assert_dask_support() + return self.client.sync(self._predict_proba_async, data) + + async def _predict_async(self, data): + _assert_dask_support() + + test_dmatrix = await DaskDMatrix(client=self.client, data=data, + missing=self.missing) + pred_probs = await predict(client=self.client, + model=self.get_booster(), data=test_dmatrix) + + if self.n_classes_ == 2: + preds = (pred_probs > 0.5).astype(int) + else: + preds = da.argmax(pred_probs, axis=1) + + return preds + def predict(self, data): # pylint: disable=arguments-differ _assert_dask_support() return self.client.sync(self._predict_async, data) diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 66336c47ac3a..b4be33ed348f 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -165,12 +165,12 @@ def test_dask_missing_value_cls(): missing=0.0) cls.client = client cls.fit(X, y, eval_set=[(X, y)]) - dd_predt = cls.predict(X).compute() + dd_pred_proba = cls.predict_proba(X).compute() np_X = X.compute() - np_predt = cls.get_booster().predict( + np_pred_proba = cls.get_booster().predict( xgb.DMatrix(np_X, missing=0.0)) - np.testing.assert_allclose(np_predt, dd_predt) + np.testing.assert_allclose(np_pred_proba, dd_pred_proba) cls = xgb.dask.DaskXGBClassifier() assert hasattr(cls, 'missing') @@ -209,7 +209,7 @@ def test_dask_classifier(): classifier.fit(X, y, eval_set=[(X, y)]) prediction = classifier.predict(X) - assert prediction.ndim == 2 + assert prediction.ndim == 1 assert prediction.shape[0] == kRows history = classifier.evals_result() @@ -222,7 +222,18 @@ def test_dask_classifier(): assert len(list(history['validation_0'])) == 1 assert len(history['validation_0']['merror']) == 2 + # Test .predict_proba() + probas = classifier.predict_proba(X) assert classifier.n_classes_ == 10 + assert probas.ndim == 2 + assert probas.shape[0] == kRows + assert probas.shape[1] == 10 + + cls_booster = classifier.get_booster() + single_node_proba = cls_booster.inplace_predict(X.compute()) + + np.testing.assert_allclose(single_node_proba, + probas.compute()) # Test with dataframe. X_d = dd.from_dask_array(X) @@ -232,7 +243,7 @@ def test_dask_classifier(): assert classifier.n_classes_ == 10 prediction = classifier.predict(X_d) - assert prediction.ndim == 2 + assert prediction.ndim == 1 assert prediction.shape[0] == kRows @@ -407,7 +418,7 @@ async def run_dask_classifier_asyncio(scheduler_address): await classifier.fit(X, y, eval_set=[(X, y)]) prediction = await classifier.predict(X) - assert prediction.ndim == 2 + assert prediction.ndim == 1 assert prediction.shape[0] == kRows history = classifier.evals_result() @@ -420,7 +431,13 @@ async def run_dask_classifier_asyncio(scheduler_address): assert len(list(history['validation_0'])) == 1 assert len(history['validation_0']['merror']) == 2 + # Test .predict_proba() + probas = await classifier.predict_proba(X) assert classifier.n_classes_ == 10 + assert probas.ndim == 2 + assert probas.shape[0] == kRows + assert probas.shape[1] == 10 + # Test with dataframe. X_d = dd.from_dask_array(X) @@ -430,9 +447,8 @@ async def run_dask_classifier_asyncio(scheduler_address): assert classifier.n_classes_ == 10 prediction = await classifier.predict(X_d) - assert prediction.ndim == 2 + assert prediction.ndim == 1 assert prediction.shape[0] == kRows - assert prediction.shape[1] == 10 def test_with_asyncio():