Replace all uses of deprecated function sklearn.datasets.load_boston #7373

Merged · 9 commits · Jan 30, 2022
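For reference, a minimal sketch of the before/after pattern applied throughout this diff (note that unlike load_boston, fetch_california_housing downloads the dataset over the network on first use and caches it locally):

    # Before: deprecated in scikit-learn 1.0, removed in 1.2
    # from sklearn.datasets import load_boston
    # X, y = load_boston(return_X_y=True)

    # After: California Housing is fetched once, then read from the local cache
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)  # X: (20640, 8), y: (20640,)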
14 changes: 5 additions & 9 deletions demo/guide-python/sklearn_examples.py
@@ -12,7 +12,7 @@
 import numpy as np
 from sklearn.model_selection import KFold, train_test_split, GridSearchCV
 from sklearn.metrics import confusion_matrix, mean_squared_error
-from sklearn.datasets import load_iris, load_digits, load_boston
+from sklearn.datasets import load_iris, load_digits, fetch_california_housing
 
 rng = np.random.RandomState(31337)
 
@@ -38,10 +38,8 @@
     actuals = y[test_index]
     print(confusion_matrix(actuals, predictions))
 
-print("Boston Housing: regression")
-boston = load_boston()
-y = boston['target']
-X = boston['data']
+print("California Housing: regression")
+X, y = fetch_california_housing(return_X_y=True)
 kf = KFold(n_splits=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf.split(X):
     xgb_model = xgb.XGBRegressor(n_jobs=1).fit(X[train_index], y[train_index])
@@ -50,8 +48,6 @@
     print(mean_squared_error(actuals, predictions))
 
 print("Parameter optimization")
-y = boston['target']
-X = boston['data']
 xgb_model = xgb.XGBRegressor(n_jobs=1)
 clf = GridSearchCV(xgb_model,
                    {'max_depth': [2, 4, 6],
@@ -63,8 +59,8 @@
 # The sklearn API models are picklable
 print("Pickling sklearn API models")
 # must open in binary format to pickle
-pickle.dump(clf, open("best_boston.pkl", "wb"))
-clf2 = pickle.load(open("best_boston.pkl", "rb"))
+pickle.dump(clf, open("best_calif.pkl", "wb"))
+clf2 = pickle.load(open("best_calif.pkl", "rb"))
 print(np.allclose(clf.predict(X), clf2.predict(X)))
 
 # Early-stopping
7 changes: 2 additions & 5 deletions demo/guide-python/sklearn_parallel.py
@@ -3,16 +3,13 @@
 ===================================
 """
 from sklearn.model_selection import GridSearchCV
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 import xgboost as xgb
 import multiprocessing
 
 if __name__ == "__main__":
     print("Parallel Parameter optimization")
-    boston = load_boston()
-
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
     clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                    'n_estimators': [50, 100, 200]}, verbose=1,
4 changes: 2 additions & 2 deletions demo/guide-python/update_process.py
@@ -8,14 +8,14 @@
 """
 
 import xgboost as xgb
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 import numpy as np
 
 
 def main():
     n_rounds = 32
 
-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
 
     # Train a model first
     X_train = X[: X.shape[0] // 2]
6 changes: 3 additions & 3 deletions tests/python-gpu/test_gpu_linear.py
@@ -43,8 +43,8 @@ def test_gpu_coordinate(self, param, num_rounds, dataset):
     # We test a weaker condition that the loss has not increased between the first and last
     # iteration
     @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
     @settings(deadline=None)
     def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd):
         assume(len(dataset.y) > 0)
@@ -63,7 +63,7 @@ def test_gpu_coordinate_from_cupy(self):
         import cupy
         params = {'booster': 'gblinear', 'updater': 'gpu_coord_descent',
                   'n_estimators': 100}
-        X, y = tm.get_boston()
+        X, y = tm.get_california_housing()
         cpu_model = xgb.XGBRegressor(**params)
         cpu_model.fit(X, y)
         cpu_predt = cpu_model.predict(X)
2 changes: 1 addition & 1 deletion tests/python-gpu/test_gpu_with_sklearn.py
@@ -61,7 +61,7 @@ def test_boost_from_prediction_gpu_hist():
 
 
 def test_num_parallel_tree():
-    twskl.run_boston_housing_rf_regression("gpu_hist")
+    twskl.run_calif_housing_rf_regression("gpu_hist")
 
 
 @pytest.mark.skipif(**tm.no_pandas())
2 changes: 1 addition & 1 deletion tests/python/test_callback.py
@@ -384,7 +384,7 @@ def test_check_point(self):
                 os.path.join(tmpdir, 'model_' + str(i) + '.pkl'))
 
     def test_callback_list(self):
-        X, y = tm.get_boston()
+        X, y = tm.get_california_housing()
         m = xgb.DMatrix(X, y)
         callbacks = [xgb.callback.EarlyStopping(rounds=10)]
         for i in range(4):
4 changes: 2 additions & 2 deletions tests/python/test_demos.py
@@ -45,8 +45,8 @@ def test_sklearn_demo():
     script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')
     cmd = ['python', script]
     subprocess.check_call(cmd)
-    assert os.path.exists('best_boston.pkl')
-    os.remove('best_boston.pkl')
+    assert os.path.exists('best_calif.pkl')
+    os.remove('best_calif.pkl')
 
 
 @pytest.mark.skipif(**tm.no_sklearn())
8 changes: 4 additions & 4 deletions tests/python/test_linear.py
@@ -39,8 +39,8 @@ def test_coordinate(self, param, num_rounds, dataset, coord_param):
     # We test a weaker condition that the loss has not increased between the first and last
     # iteration
     @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
     @settings(deadline=None)
     def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
         param['updater'] = 'coord_descent'
@@ -69,8 +69,8 @@ def test_shotgun(self, param, num_rounds, dataset):
         assert tm.non_increasing(sampled_result)
 
     @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
     @settings(deadline=None)
     def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
         param['updater'] = 'shotgun'
4 changes: 2 additions & 2 deletions tests/python/test_predict.py
@@ -88,8 +88,8 @@ def test_predict_leaf():
 
 
 def test_predict_shape():
-    from sklearn.datasets import load_boston
-    X, y = load_boston(return_X_y=True)
+    from sklearn.datasets import fetch_california_housing
+    X, y = fetch_california_housing(return_X_y=True)
     reg = xgb.XGBRegressor(n_estimators=1)
     reg.fit(X, y)
     predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)
16 changes: 8 additions & 8 deletions tests/python/test_with_dask.py
@@ -304,8 +304,8 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
 
 
 def test_inplace_predict(client: "Client") -> None:
-    from sklearn.datasets import load_boston
-    X_, y_ = load_boston(return_X_y=True)
+    from sklearn.datasets import fetch_california_housing
+    X_, y_ = fetch_california_housing(return_X_y=True)
     X, y = dd.from_array(X_, chunksize=32), dd.from_array(y_, chunksize=32)
     reg = xgb.dask.DaskXGBRegressor(n_estimators=4).fit(X, y)
     booster = reg.get_booster()
@@ -1278,8 +1278,8 @@ def test_feature_weights(self, client: "Client") -> None:
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_custom_objective(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing
+        X, y = fetch_california_housing(return_X_y=True)
         X, y = da.from_array(X), da.from_array(y)
         rounds = 20
 
@@ -1423,8 +1423,8 @@ def run_shap_cls_sklearn(self, X: Any, y: Any, client: "Client") -> None:
         assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5)
 
     def test_shap(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston, load_digits
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing, load_digits
+        X, y = fetch_california_housing(return_X_y=True)
         params: Dict[str, Any] = {'objective': 'reg:squarederror'}
         self.run_shap(X, y, params, client)
 
@@ -1468,8 +1468,8 @@ def run_shap_interactions(
                             1e-5, 1e-5)
 
     def test_shap_interactions(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing
+        X, y = fetch_california_housing(return_X_y=True)
         params = {'objective': 'reg:squarederror'}
         self.run_shap_interactions(X, y, params, client)
 
3 changes: 2 additions & 1 deletion tests/python/test_with_shap.py
@@ -14,7 +14,8 @@
 # Check integration is not broken from xgboost side
 # Changes in binary format may cause problems
 def test_with_shap():
-    X, y = shap.datasets.boston()
+    from sklearn.datasets import fetch_california_housing
+    X, y = fetch_california_housing(return_X_y=True)
     dtrain = xgb.DMatrix(X, label=y)
     model = xgb.train({"learning_rate": 0.01}, dtrain, 10)
     explainer = shap.TreeExplainer(model)
44 changes: 19 additions & 25 deletions tests/python/test_with_sklearn.py
@@ -327,16 +327,16 @@ def test_select_feature():
 
 
 def test_num_parallel_tree():
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4,
                            tree_method='hist')
-    boston = load_boston()
-    bst = reg.fit(X=boston['data'], y=boston['target'])
+    X, y = fetch_california_housing(return_X_y=True)
+    bst = reg.fit(X=X, y=y)
     dump = bst.get_booster().get_dump(dump_format='json')
     assert len(dump) == 16
 
     reg = xgb.XGBRFRegressor(n_estimators=4)
-    bst = reg.fit(X=boston['data'], y=boston['target'])
+    bst = reg.fit(X=X, y=y)
     dump = bst.get_booster().get_dump(dump_format='json')
     assert len(dump) == 4
 
@@ -345,14 +345,12 @@ def test_num_parallel_tree():
                 'num_parallel_tree']) == 4
 
 
-def test_boston_housing_regression():
+def test_calif_housing_regression():
     from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import KFold
 
-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
@@ -376,12 +374,12 @@ def test_boston_housing_regression():
     xgb_model.feature_names_in_
 
 
-def run_boston_housing_rf_regression(tree_method):
+def run_calif_housing_rf_regression(tree_method):
     from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import KFold
 
-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRFRegressor(random_state=42, tree_method=tree_method).fit(
@@ -396,39 +394,35 @@ def run_boston_housing_rf_regression(tree_method):
         rfreg.fit(X, y, early_stopping_rounds=10)
 
 
-def test_boston_housing_rf_regression():
-    run_boston_housing_rf_regression("hist")
+def test_calif_housing_rf_regression():
+    run_calif_housing_rf_regression("hist")
 
 
 def test_parameter_tuning():
     from sklearn.model_selection import GridSearchCV
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
 
-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     xgb_model = xgb.XGBRegressor(learning_rate=0.1)
     clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                    'n_estimators': [50, 100, 200]},
                        cv=3, verbose=1)
     clf.fit(X, y)
     assert clf.best_score_ < 0.7
-    assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
+    assert clf.best_params_ == {'n_estimators': 200, 'max_depth': 4}
 
 
 def test_regression_with_custom_objective():
     from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import KFold
 
     def objective_ls(y_true, y_pred):
         grad = (y_pred - y_true)
         hess = np.ones(len(y_true))
         return grad, hess
 
-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
@@ -840,13 +834,13 @@ def test_save_load_model():
 
 
 def test_RFECV():
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.datasets import load_breast_cancer
     from sklearn.datasets import load_iris
     from sklearn.feature_selection import RFECV
 
     # Regression
-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
     bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                            n_estimators=10,
                            objective='reg:squarederror',
8 changes: 5 additions & 3 deletions tests/python/testing.py
@@ -229,8 +229,8 @@ def __repr__(self):
 
 
 @memory.cache
-def get_boston():
-    data = datasets.load_boston()
+def get_california_housing():
+    data = datasets.fetch_california_housing()
     return data.data, data.target
 
 
@@ -315,7 +315,9 @@ def make_categorical(
 
 _unweighted_datasets_strategy = strategies.sampled_from(
     [
-        TestDataset("boston", get_boston, "reg:squarederror", "rmse"),
+        TestDataset(
+            "calif_housing", get_california_housing, "reg:squarederror", "rmse"
+        ),
         TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
         TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
         TestDataset(