Commit
Merge pull request #15 from uclamii/earl_stop_imb_score
Early stopping + Verbosity Fix + Bootstrap Fix
lshpaner authored Jul 2, 2024
2 parents 1991bd6 + 36708a1 commit 658557b
Showing 3 changed files with 103 additions and 17 deletions.
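The Python-side changes below route XGBoost's fit-time arguments explicitly: eval_set and verbose are fit() keywords rather than estimator parameters, so they are stripped from the tuned grid before set_params() and forwarded to fit() instead. A minimal, self-contained sketch of that pattern, assuming a one-step scikit-learn pipeline whose step is named "xgb" (illustrative only, not the model_tuner API):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# In recent xgboost releases, early_stopping_rounds and eval_metric are
# constructor parameters, so they can live in the estimator itself.
pipe = Pipeline([
    ("xgb", XGBClassifier(
        objective="binary:logistic",
        n_estimators=10000,
        early_stopping_rounds=10,
        eval_metric="logloss",
    ))
])

best_params = {"xgb__max_depth": 3, "xgb__verbose": False}
verbose = best_params.pop("xgb__verbose")  # keep the fit-time flag out of set_params()

pipe.set_params(**best_params).fit(
    X_train, y_train,
    xgb__eval_set=[(X_valid, y_valid)],  # routed to XGBClassifier.fit via the step prefix
    xgb__verbose=verbose,
)
print("stopped at iteration:", pipe.named_steps["xgb"].best_iteration)

This is why early_stopping_rounds and eval_metric can stay in the tuning grid while verbose has to be popped and passed at fit time.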
69 changes: 67 additions & 2 deletions notebooks/binary_test.ipynb
@@ -15,7 +15,6 @@
"\n",
"from sklearn.datasets import load_breast_cancer\n",
"\n",
"sys.path.append(os.path.join(os.pardir))\n",
"from functions import *\n",
"from model_tuner import *"
]
@@ -140,7 +139,73 @@
"metadata": {},
"outputs": [],
"source": [
"y_prob"
"from xgboost import XGBClassifier\n",
"\n",
"\n",
"estimator = XGBClassifier(\n",
" objective=\"binary:logistic\",\n",
")\n",
"\n",
"estimator_name = \"xgb\"\n",
"xgbearly = True\n",
"\n",
"tuned_parameters = {\n",
" f\"{estimator_name}__max_depth\": [3, 200],\n",
" f\"{estimator_name}__learning_rate\": [1e-4],\n",
" f\"{estimator_name}__n_estimators\": [100000],\n",
" f\"{estimator_name}__early_stopping_rounds\": [10],\n",
" f\"{estimator_name}__verbose\": [False],\n",
" f\"{estimator_name}__eval_metric\": [\"logloss\"],\n",
"}"
]
},
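A quick way to see why xgb__verbose needs special handling downstream: it is the one grid key that does not correspond to a settable pipeline parameter, because verbose is only accepted by XGBClassifier.fit(). A small, illustrative check (assumes a recent xgboost release where early_stopping_rounds and eval_metric are constructor parameters):

from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

pipe = Pipeline([(estimator_name, XGBClassifier(objective="binary:logistic"))])

# Grid keys that are not settable via set_params() must be passed at fit time instead.
unknown_keys = set(tuned_parameters) - set(pipe.get_params())
print(unknown_keys)  # expected: {'xgb__verbose'}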
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"kfold = False\n",
"calibrate = False\n",
"\n",
"# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
"\n",
"model = Model(\n",
" name=\"XGBoost Early\",\n",
" estimator_name=estimator_name,\n",
" calibrate=calibrate,\n",
" estimator=estimator,\n",
" kfold=kfold,\n",
" stratify_y=True,\n",
" grid=tuned_parameters,\n",
" randomized_grid=True,\n",
" n_iter=40,\n",
" xgboost_early=True,\n",
" scoring=[\"roc_auc\"],\n",
" n_splits=10,\n",
" selectKBest=False,\n",
" n_jobs=-2,\n",
" random_state=42,\n",
")\n",
"\n",
"\n",
"model.grid_search_param_tuning(X, y)\n",
"\n",
"X_train, y_train = model.get_train_data(X, y)\n",
"X_test, y_test = model.get_test_data(X, y)\n",
"X_valid, y_valid = model.get_valid_data(X, y)\n",
"\n",
"model.fit(X_train, y_train)\n",
"\n",
"print(\"Validation Metrics\")\n",
"model.return_metrics(X_valid, y_valid)\n",
"print(\"Test Metrics\")\n",
"model.return_metrics(X_test, y_test)\n",
"\n",
"y_prob = model.predict_proba(X_test)\n",
"\n",
"### F1 Weighted\n",
"y_pred = model.predict(X_test, optimal_threshold=True)"
]
},
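The probabilities and thresholded predictions from the cell above can be cross-checked directly with scikit-learn; this assumes predict_proba returns an (n_samples, 2) array of class probabilities, as in the notebook:

from sklearn.metrics import f1_score, roc_auc_score

print("Test ROC AUC:", roc_auc_score(y_test, y_prob[:, 1]))
print("Test F1 (weighted):", f1_score(y_test, y_pred, average="weighted"))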
{
2 changes: 1 addition & 1 deletion notebooks/regression_test.ipynb
@@ -634,7 +634,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.9"
}
},
"nbformat": 4,
49 changes: 35 additions & 14 deletions src/model_tuner/model_tuner_utils.py
@@ -1,3 +1,4 @@

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
@@ -348,6 +349,11 @@ def calibrateModel(

# reset estimator in case of calibrated model
self.reset_estimator()
# print(self.estimator[-1].get_params())
if 'device' in self.estimator[-1].get_params():
print("Change back to CPU")
self.estimator[-1].set_params(**{'device': 'cpu'})

# fit estimator
if self.imbalance_sampler:
self.process_imbalance_sampler(X_train, y_train)
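The new guard in calibrateModel() resets XGBoost's device parameter (introduced in xgboost 2.0) to "cpu" before refitting, presumably so a GPU-configured booster is not carried into the CPU-side calibration refit. A standalone illustration of the same check, outside the class:

from xgboost import XGBClassifier

clf = XGBClassifier(device="cuda")
if "device" in clf.get_params():  # older xgboost versions do not expose this key
    clf.set_params(device="cpu")
print(clf.get_params()["device"])  # -> 'cpu'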
@@ -360,6 +366,7 @@ def calibrateModel(
validation_data=(X_valid, y_valid),
)
# calibrate model, and save output

self.estimator = CalibratedClassifierCV(
self.estimator,
cv="prefit",
@@ -438,7 +445,6 @@ def fit(self, X, y, validation_data=None, score=None):
if self.xgboost_early:
X_valid, y_valid = validation_data
best_params = self.best_params_per_score[self.scoring[0]]["params"]
print(best_params)
if self.selectKBest or self.pipeline:

params_no_estimator = {
@@ -471,41 +477,52 @@ def fit(self, X, y, validation_data=None, score=None):
estimator_eval_set: eval_set,
estimator_verbosity: self.verbosity,
}
self.estimator.fit(X, y, **xgb_params)
if estimator_verbosity in best_params:
best_params.pop(estimator_verbosity)
self.estimator.set_params(**best_params).fit(X, y, **xgb_params)
else:
self.estimator.fit(X, y)
self.estimator.set_params(**best_params).fit(X, y)
else:
if self.xgboost_early:
X_valid, y_valid = validation_data
## Uses the current K Best selected to transform the
## Eval set before it is used in early stopping.

best_params = self.best_params_per_score[self.scoring[0]]["params"]
if self.selectKBest or self.pipeline:
print(self.best_params_per_score[score])
params_without_stopping = self.best_params_per_score[score][
"params"
].copy()

params_no_estimator = {
key: value
for key, value in best_params.items()
if not key.startswith(f"{self.estimator_name}__")
}
if self.imbalance_sampler:
# self.estimator[:-2].set_param()fit(X, y)
self.estimator[:-2].set_params(**params_no_estimator).fit(
X, y
)
X_valid_selected = self.estimator[:-2].transform(X_valid)
else:
self.estimator[:-1].fit(X, y)
self.estimator[:-1].set_params(**params_no_estimator).fit(
X, y
)
X_valid_selected = self.estimator[:-1].transform(X_valid)
else:
X_valid_selected = X_valid

X_valid, y_valid = validation_data
if isinstance(X_valid, pd.DataFrame):
eval_set = [(X_valid_selected, y_valid.values)]
else:
eval_set = [(X_valid_selected, y_valid)]

estimator_eval_set = f"{self.estimator_name}__eval_set"
estimator_verbosity = f"{self.estimator_name}__verbose"

xgb_params = {
estimator_eval_set: eval_set,
estimator_verbosity: self.verbosity,
}
if estimator_verbosity in self.best_params_per_score[score]["params"]:
self.best_params_per_score[score]["params"].pop(
estimator_verbosity
)

self.estimator.set_params(
**self.best_params_per_score[score]["params"]
).fit(X, y, **xgb_params)
@@ -732,6 +749,7 @@ def grid_search_param_tuning(

if params.get(estimator_verbosity):
self.verbosity = params[estimator_verbosity]
params.pop(estimator_verbosity)
else:
self.verbosity = False

@@ -765,12 +783,15 @@ def grid_search_param_tuning(

estimator_eval_set = f"{self.estimator_name}__eval_set"
estimator_verbosity = f"{self.estimator_name}__verbose"
estimator_eval_metric = f"{self.estimator_name}__eval_metric"

xgb_params = {
estimator_eval_set: eval_set,
estimator_verbosity: self.verbosity,
}

if estimator_verbosity in params:
params.pop(estimator_verbosity)

clf = self.estimator.set_params(**params).fit(
X_train, y_train, **xgb_params
)
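The added "in params" checks matter because the earlier branch relies on params.get(estimator_verbosity), which is falsy when verbose is False, so the pop inside that branch never runs and the key would otherwise still reach set_params(). A small illustration with a hypothetical param dict (not the tuner's internals):

params = {"xgb__verbose": False, "xgb__max_depth": 3}

if params.get("xgb__verbose"):       # False is falsy, so this branch is skipped
    verbosity = params.pop("xgb__verbose")
else:
    verbosity = False

if "xgb__verbose" in params:         # the added guard still removes the fit-only key
    params.pop("xgb__verbose")

print(params)  # -> {'xgb__max_depth': 3}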
