Replace all uses of deprecated function sklearn.datasets.load_boston #7373

Merged · 9 commits · Jan 30, 2022
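For reference, a minimal sketch of the before/after pattern applied throughout this diff (note that unlike load_boston, fetch_california_housing downloads the dataset over the network on first use and caches it locally):

    # Before: deprecated in scikit-learn 1.0, removed in 1.2
    # from sklearn.datasets import load_boston
    # X, y = load_boston(return_X_y=True)

    # After: California Housing is fetched once, then read from the local cache
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)  # X: (20640, 8), y: (20640,)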
14 changes: 5 additions & 9 deletions demo/guide-python/sklearn_examples.py
@@ -12,7 +12,7 @@
 import numpy as np
 from sklearn.model_selection import KFold, train_test_split, GridSearchCV
 from sklearn.metrics import confusion_matrix, mean_squared_error
-from sklearn.datasets import load_iris, load_digits, load_boston
+from sklearn.datasets import load_iris, load_digits, fetch_california_housing
 
 rng = np.random.RandomState(31337)
 
@@ -38,10 +38,8 @@
     actuals = y[test_index]
     print(confusion_matrix(actuals, predictions))
 
-print("Boston Housing: regression")
-boston = load_boston()
-y = boston['target']
-X = boston['data']
+print("California Housing: regression")
+X, y = fetch_california_housing(return_X_y=True)
 kf = KFold(n_splits=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf.split(X):
     xgb_model = xgb.XGBRegressor(n_jobs=1).fit(X[train_index], y[train_index])
@@ -50,8 +48,6 @@
     print(mean_squared_error(actuals, predictions))
 
 print("Parameter optimization")
-y = boston['target']
-X = boston['data']
 xgb_model = xgb.XGBRegressor(n_jobs=1)
 clf = GridSearchCV(xgb_model,
                    {'max_depth': [2, 4, 6],
@@ -63,8 +59,8 @@
 # The sklearn API models are picklable
 print("Pickling sklearn API models")
 # must open in binary format to pickle
-pickle.dump(clf, open("best_boston.pkl", "wb"))
-clf2 = pickle.load(open("best_boston.pkl", "rb"))
+pickle.dump(clf, open("best_calif.pkl", "wb"))
+clf2 = pickle.load(open("best_calif.pkl", "rb"))
 print(np.allclose(clf.predict(X), clf2.predict(X)))
 
 # Early-stopping
7 changes: 2 additions & 5 deletions demo/guide-python/sklearn_parallel.py
@@ -3,16 +3,13 @@
 ===================================
 """
 from sklearn.model_selection import GridSearchCV
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 import xgboost as xgb
 import multiprocessing
 
 if __name__ == "__main__":
     print("Parallel Parameter optimization")
-    boston = load_boston()
-
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2)
     clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                    'n_estimators': [50, 100, 200]}, verbose=1,
4 changes: 2 additions & 2 deletions demo/guide-python/update_process.py
@@ -8,14 +8,14 @@
 """
 
 import xgboost as xgb
-from sklearn.datasets import load_boston
+from sklearn.datasets import fetch_california_housing
 import numpy as np
 
 
 def main():
     n_rounds = 32
 
-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
 
     # Train a model first
     X_train = X[: X.shape[0] // 2]
6 changes: 3 additions & 3 deletions tests/python-gpu/test_gpu_linear.py
@@ -43,8 +43,8 @@ def test_gpu_coordinate(self, param, num_rounds, dataset):
     # We test a weaker condition that the loss has not increased between the first and last
     # iteration
     @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
     @settings(deadline=None)
     def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd):
         assume(len(dataset.y) > 0)
@@ -63,7 +63,7 @@ def test_gpu_coordinate_from_cupy(self):
         import cupy
         params = {'booster': 'gblinear', 'updater': 'gpu_coord_descent',
                   'n_estimators': 100}
-        X, y = tm.get_boston()
+        X, y = tm.get_california_housing()
         cpu_model = xgb.XGBRegressor(**params)
         cpu_model.fit(X, y)
         cpu_predt = cpu_model.predict(X)
2 changes: 1 addition & 1 deletion tests/python-gpu/test_gpu_with_sklearn.py
@@ -61,7 +61,7 @@ def test_boost_from_prediction_gpu_hist():
 
 
 def test_num_parallel_tree():
-    twskl.run_boston_housing_rf_regression("gpu_hist")
+    twskl.run_calif_housing_rf_regression("gpu_hist")
 
 
 @pytest.mark.skipif(**tm.no_pandas())
2 changes: 1 addition & 1 deletion tests/python/test_callback.py
@@ -384,7 +384,7 @@ def test_check_point(self):
                 os.path.join(tmpdir, 'model_' + str(i) + '.pkl'))
 
     def test_callback_list(self):
-        X, y = tm.get_boston()
+        X, y = tm.get_california_housing()
         m = xgb.DMatrix(X, y)
         callbacks = [xgb.callback.EarlyStopping(rounds=10)]
         for i in range(4):
4 changes: 2 additions & 2 deletions tests/python/test_demos.py
@@ -45,8 +45,8 @@ def test_sklearn_demo():
     script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')
     cmd = ['python', script]
     subprocess.check_call(cmd)
-    assert os.path.exists('best_boston.pkl')
-    os.remove('best_boston.pkl')
+    assert os.path.exists('best_calif.pkl')
+    os.remove('best_calif.pkl')
 
 
 @pytest.mark.skipif(**tm.no_sklearn())
8 changes: 4 additions & 4 deletions tests/python/test_linear.py
@@ -39,8 +39,8 @@ def test_coordinate(self, param, num_rounds, dataset, coord_param):
     # We test a weaker condition that the loss has not increased between the first and last
     # iteration
     @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
     @settings(deadline=None)
     def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
         param['updater'] = 'coord_descent'
@@ -69,8 +69,8 @@ def test_shotgun(self, param, num_rounds, dataset):
         assert tm.non_increasing(sampled_result)
 
     @given(parameter_strategy, strategies.integers(10, 50),
-           tm.dataset_strategy, strategies.floats(1e-5, 2.0),
-           strategies.floats(1e-5, 2.0))
+           tm.dataset_strategy, strategies.floats(1e-5, 1.0),
+           strategies.floats(1e-5, 1.0))
     @settings(deadline=None)
     def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
         param['updater'] = 'shotgun'
4 changes: 2 additions & 2 deletions tests/python/test_predict.py
@@ -88,8 +88,8 @@ def test_predict_leaf():
 
 
 def test_predict_shape():
-    from sklearn.datasets import load_boston
-    X, y = load_boston(return_X_y=True)
+    from sklearn.datasets import fetch_california_housing
+    X, y = fetch_california_housing(return_X_y=True)
     reg = xgb.XGBRegressor(n_estimators=1)
     reg.fit(X, y)
     predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)
16 changes: 8 additions & 8 deletions tests/python/test_with_dask.py
@@ -304,8 +304,8 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
 
 
 def test_inplace_predict(client: "Client") -> None:
-    from sklearn.datasets import load_boston
-    X_, y_ = load_boston(return_X_y=True)
+    from sklearn.datasets import fetch_california_housing
+    X_, y_ = fetch_california_housing(return_X_y=True)
     X, y = dd.from_array(X_, chunksize=32), dd.from_array(y_, chunksize=32)
     reg = xgb.dask.DaskXGBRegressor(n_estimators=4).fit(X, y)
     booster = reg.get_booster()
@@ -1278,8 +1278,8 @@ def test_feature_weights(self, client: "Client") -> None:
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_custom_objective(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing
+        X, y = fetch_california_housing(return_X_y=True)
         X, y = da.from_array(X), da.from_array(y)
         rounds = 20
 
@@ -1423,8 +1423,8 @@ def run_shap_cls_sklearn(self, X: Any, y: Any, client: "Client") -> None:
         assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5)
 
     def test_shap(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston, load_digits
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing, load_digits
+        X, y = fetch_california_housing(return_X_y=True)
         params: Dict[str, Any] = {'objective': 'reg:squarederror'}
         self.run_shap(X, y, params, client)
 
@@ -1468,8 +1468,8 @@ def run_shap_interactions(
                             1e-5, 1e-5)
 
     def test_shap_interactions(self, client: "Client") -> None:
-        from sklearn.datasets import load_boston
-        X, y = load_boston(return_X_y=True)
+        from sklearn.datasets import fetch_california_housing
+        X, y = fetch_california_housing(return_X_y=True)
         params = {'objective': 'reg:squarederror'}
         self.run_shap_interactions(X, y, params, client)
 
3 changes: 2 additions & 1 deletion tests/python/test_with_shap.py
@@ -14,7 +14,8 @@
 # Check integration is not broken from xgboost side
 # Changes in binary format may cause problems
 def test_with_shap():
-    X, y = shap.datasets.boston()
+    from sklearn.datasets import fetch_california_housing
+    X, y = fetch_california_housing(return_X_y=True)
     dtrain = xgb.DMatrix(X, label=y)
     model = xgb.train({"learning_rate": 0.01}, dtrain, 10)
     explainer = shap.TreeExplainer(model)
44 changes: 19 additions & 25 deletions tests/python/test_with_sklearn.py
@@ -327,16 +327,16 @@ def test_select_feature():
 
 
 def test_num_parallel_tree():
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     reg = xgb.XGBRegressor(n_estimators=4, num_parallel_tree=4,
                            tree_method='hist')
-    boston = load_boston()
-    bst = reg.fit(X=boston['data'], y=boston['target'])
+    X, y = fetch_california_housing(return_X_y=True)
+    bst = reg.fit(X=X, y=y)
     dump = bst.get_booster().get_dump(dump_format='json')
     assert len(dump) == 16
 
     reg = xgb.XGBRFRegressor(n_estimators=4)
-    bst = reg.fit(X=boston['data'], y=boston['target'])
+    bst = reg.fit(X=X, y=y)
     dump = bst.get_booster().get_dump(dump_format='json')
     assert len(dump) == 4
 
@@ -345,14 +345,12 @@ def test_num_parallel_tree():
                 'num_parallel_tree']) == 4
 
 
-def test_boston_housing_regression():
+def test_calif_housing_regression():
     from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import KFold
 
-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
@@ -376,12 +374,12 @@ def test_boston_housing_regression():
     xgb_model.feature_names_in_
 
 
-def run_boston_housing_rf_regression(tree_method):
+def run_calif_housing_rf_regression(tree_method):
     from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import KFold
 
-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRFRegressor(random_state=42, tree_method=tree_method).fit(
@@ -396,39 +394,35 @@ def run_boston_housing_rf_regression(tree_method):
         rfreg.fit(X, y, early_stopping_rounds=10)
 
 
-def test_boston_housing_rf_regression():
-    run_boston_housing_rf_regression("hist")
+def test_calif_housing_rf_regression():
+    run_calif_housing_rf_regression("hist")
 
 
 def test_parameter_tuning():
     from sklearn.model_selection import GridSearchCV
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
 
-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     xgb_model = xgb.XGBRegressor(learning_rate=0.1)
     clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                    'n_estimators': [50, 100, 200]},
                        cv=3, verbose=1)
     clf.fit(X, y)
     assert clf.best_score_ < 0.7
-    assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
+    assert clf.best_params_ == {'n_estimators': 200, 'max_depth': 4}
 
 
 def test_regression_with_custom_objective():
     from sklearn.metrics import mean_squared_error
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import KFold
 
     def objective_ls(y_true, y_pred):
         grad = (y_pred - y_true)
         hess = np.ones(len(y_true))
         return grad, hess
 
-    boston = load_boston()
-    y = boston['target']
-    X = boston['data']
+    X, y = fetch_california_housing(return_X_y=True)
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
         xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
@@ -840,13 +834,13 @@ def test_save_load_model():
 
 
 def test_RFECV():
-    from sklearn.datasets import load_boston
+    from sklearn.datasets import fetch_california_housing
     from sklearn.datasets import load_breast_cancer
     from sklearn.datasets import load_iris
     from sklearn.feature_selection import RFECV
 
     # Regression
-    X, y = load_boston(return_X_y=True)
+    X, y = fetch_california_housing(return_X_y=True)
     bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                            n_estimators=10,
                            objective='reg:squarederror',
8 changes: 5 additions & 3 deletions tests/python/testing.py
@@ -229,8 +229,8 @@ def __repr__(self):
 
 
 @memory.cache
-def get_boston():
-    data = datasets.load_boston()
+def get_california_housing():
+    data = datasets.fetch_california_housing()
     return data.data, data.target
 
 
@@ -315,7 +315,9 @@ def make_categorical(
 
 _unweighted_datasets_strategy = strategies.sampled_from(
     [
-        TestDataset("boston", get_boston, "reg:squarederror", "rmse"),
+        TestDataset(
+            "calif_housing", get_california_housing, "reg:squarederror", "rmse"
+        ),
         TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
         TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
         TestDataset(