-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from UBC-MDS/model_script
upload scripts for eda, model fitting, and model evaluation
- Loading branch information
Showing
5 changed files
with
13,872 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# eda.py | ||
# author: Yichi Zhang | ||
# date: 2024-12-07 | ||
|
||
import click | ||
import os | ||
import numpy as np | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
|
||
@click.command()
@click.option('--processed-training-data', type=str, help="Path to processed training data")
@click.option('--plot-to', type=str, help="Path to directory where the plot will be written to")
def main(processed_training_data, plot_to):
    '''Saves exploratory summaries of the processed training data.

    For every numeric column a histogram is written to
    ``<plot_to>/figures/histogram_<column>.png``. For every object-dtype
    column a frequency/percentage table is rendered as an image and written
    to ``<plot_to>/figures/<column>_frequency_table.png``.

    Parameters
    ----------
    processed_training_data : str
        Path to the CSV file holding the processed training data.
    plot_to : str
        Base output directory; a ``figures`` sub-directory is created
        inside it when it does not exist yet.
    '''
    mushroom_train = pd.read_csv(processed_training_data)

    # Make sure the target directory exists so savefig cannot fail on a
    # fresh checkout.
    figures_dir = os.path.join(plot_to, "figures")
    os.makedirs(figures_dir, exist_ok=True)

    # Histograms for the numeric columns
    numeric_columns = mushroom_train.select_dtypes(include='number')
    for column in numeric_columns.columns:
        fig, ax = plt.subplots(figsize=(5, 5))
        ax.hist(mushroom_train[column], bins=15, edgecolor='black', alpha=0.7)
        ax.set_title(f'Histogram of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')
        fig.savefig(os.path.join(figures_dir, f"histogram_{column}.png"), dpi=300)
        # Close explicitly: pyplot keeps every figure alive until closed,
        # so one figure per column would otherwise pile up in memory.
        plt.close(fig)

    # Frequency tables for the categorical (object-dtype) columns
    categorical_columns = mushroom_train.select_dtypes(include='object')
    for column in categorical_columns.columns:
        counts = mushroom_train[column].value_counts()
        shares = (mushroom_train[column].value_counts(normalize=True) * 100).round(2)
        freq_percent_df = pd.DataFrame({
            "Frequency": counts,
            "Percentage": shares,
        })

        fig, ax = plt.subplots(figsize=(6, 2))  # Adjust the figure size as needed
        ax.axis('off')  # Only the table should be visible, not the axes

        # Render the DataFrame as a matplotlib table
        table = ax.table(
            cellText=freq_percent_df.values,
            colLabels=freq_percent_df.columns,
            rowLabels=freq_percent_df.index,
            loc='center',
            cellLoc='center',
        )

        # Style adjustments for readability
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.auto_set_column_width(col=list(range(len(freq_percent_df.columns))))

        file_path = os.path.join(figures_dir, f"{column}_frequency_table.png")
        fig.savefig(file_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"Saved styled table for '{column}'")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
# fit_mushroom_classifier.py | ||
# author: Yichi Zhang | ||
# date: 2024-12-07 | ||
|
||
import click | ||
import os | ||
import pickle | ||
import json | ||
import logging | ||
from ucimlrepo import fetch_ucirepo | ||
import pandas as pd | ||
import numpy as np | ||
import pandera as pa | ||
from pandera import Check | ||
from deepchecks import Dataset | ||
import matplotlib.pyplot as plt | ||
from scipy.stats import loguniform, randint | ||
from sklearn import set_config | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from sklearn.svm import SVC | ||
from sklearn.linear_model import LogisticRegression | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.preprocessing import QuantileTransformer,OneHotEncoder | ||
from sklearn.compose import make_column_transformer | ||
from sklearn.pipeline import make_pipeline | ||
from sklearn.metrics import ConfusionMatrixDisplay, make_scorer, fbeta_score, accuracy_score, precision_score, recall_score | ||
from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV | ||
|
||
@click.command()
@click.option('--scaled-test-data', type=str, help="Path to scaled test data")
@click.option('--pipeline-from', type=str, help="Path to the pickled fit pipeline object")
@click.option('--results-to', type=str, help="Path to directory where the plot will be written to")
@click.option('--seed', type=int, help="Random seed", default=123)
def main(scaled_test_data, pipeline_from, results_to, seed):
    '''Evaluates the mushroom classifier on the test data
    and saves the evaluation results.

    Outputs written below ``results_to``:

    - ``test_scores.csv``: accuracy and F2 score (positive label ``'p'``)
    - ``tables/confusion_matrix.csv``: cross-tabulation of true vs. predicted
    - ``figures/confusion_matrix.png``: the same matrix as a plot

    Parameters
    ----------
    scaled_test_data : str
        Path to the test-data CSV; it must contain a ``target`` column.
    pipeline_from : str
        Path to the pickle file holding the fitted pipeline.
    results_to : str
        Base output directory; ``tables`` and ``figures`` sub-directories
        are created when missing.
    seed : int
        Seed passed to ``np.random.seed``.
    '''
    np.random.seed(seed)
    set_config(transform_output="pandas")

    mushroom_test = pd.read_csv(scaled_test_data)
    X_test = mushroom_test.drop(columns=["target"])
    y_test = mushroom_test["target"]

    # NOTE: pickle executes arbitrary code on load; only point this at
    # pipeline files produced by this project.
    with open(pipeline_from, 'rb') as f:
        mushroom_fit = pickle.load(f)

    tables_dir = os.path.join(results_to, "tables")
    figures_dir = os.path.join(results_to, "figures")
    os.makedirs(tables_dir, exist_ok=True)
    os.makedirs(figures_dir, exist_ok=True)

    # Compute accuracy
    accuracy = mushroom_fit.score(X_test, y_test)

    # Compute F2 score (beta = 2). Predict from the features only, so the
    # label column is never handed to the model and the input matches what
    # score() above receives.
    mushroom_preds = mushroom_test.assign(
        predicted=mushroom_fit.predict(X_test)
    )
    f2_beta_2_score = fbeta_score(
        mushroom_preds['target'],
        mushroom_preds['predicted'],
        beta=2,
        pos_label='p'
    )

    test_scores = pd.DataFrame({'accuracy': [accuracy],
                                'F2 score (beta = 2)': [f2_beta_2_score]})
    test_scores.to_csv(os.path.join(results_to, "test_scores.csv"), index=False)

    confusion_matrix = pd.crosstab(
        mushroom_preds["target"],
        mushroom_preds["predicted"]
    )
    confusion_matrix.to_csv(os.path.join(tables_dir, "confusion_matrix.csv"))

    # from_predictions already draws the matrix, so save that figure
    # directly instead of calling plot() and drawing it a second time.
    disp = ConfusionMatrixDisplay.from_predictions(
        mushroom_preds["target"],
        mushroom_preds["predicted"]
    )
    disp.figure_.savefig(os.path.join(figures_dir, "confusion_matrix.png"), dpi=300)
    plt.close(disp.figure_)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
# fit_mushroom_classifier.py | ||
# author: Yichi Zhang | ||
# date: 2024-12-07 | ||
|
||
import click | ||
import os | ||
import pickle | ||
import json | ||
import logging | ||
from ucimlrepo import fetch_ucirepo | ||
import pandas as pd | ||
import numpy as np | ||
import pandera as pa | ||
from pandera import Check | ||
from deepchecks import Dataset | ||
import matplotlib.pyplot as plt | ||
from scipy.stats import loguniform, randint | ||
from sklearn import set_config | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from sklearn.svm import SVC | ||
from sklearn.linear_model import LogisticRegression | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.preprocessing import QuantileTransformer,OneHotEncoder | ||
from sklearn.compose import make_column_transformer | ||
from sklearn.pipeline import make_pipeline | ||
from sklearn.metrics import ConfusionMatrixDisplay, make_scorer, fbeta_score, accuracy_score, precision_score, recall_score | ||
from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV | ||
|
||
|
||
@click.command()
@click.option('--processed-training-data', type=str, help="Path to processed training data")
@click.option('--preprocessor', type=str, help="Path to preprocessor object")
@click.option('--pipeline-to', type=str, help="Path to directory where the pipeline object will be written to")
@click.option('--plot-to', type=str, help="Path to directory where the plot will be written to")
@click.option('--results-to', type=str, help="Path to directory where the plot will be written to")
@click.option('--seed', type=int, help="Random seed", default=123)
def main(processed_training_data, preprocessor, pipeline_to, plot_to, results_to, seed):
    '''Fits a mushroom classifier to the training data
    and saves the pipeline object.

    Three candidate pipelines (KNN, logistic regression, SVC) are tuned with
    randomized search, scored on accuracy and F2 (positive label ``'p'``).
    The best candidate of each search is summarised in a CSV, and the tuned
    SVC pipeline is pickled together with a training-set confusion matrix.

    Parameters
    ----------
    processed_training_data : str
        Path to the training-data CSV; it must contain a ``target`` column.
    preprocessor : str
        Path to the pickled preprocessor object.
    pipeline_to : str
        Directory that receives ``mushroom_best_model.pickle``.
    plot_to : str
        Currently unused; the confusion-matrix figure is written under
        ``results_to``.
    results_to : str
        Base output directory; ``tables`` and ``figures`` sub-directories
        are created when missing.
    seed : int
        Seed for ``np.random.seed`` and every ``random_state`` argument.
    '''
    np.random.seed(seed)
    set_config(transform_output="pandas")

    # read in data & preprocessor
    mushroom_train = pd.read_csv(processed_training_data)
    X_train = mushroom_train.drop(columns=["target"])
    y_train = mushroom_train["target"]

    # NOTE: pickle executes arbitrary code on load; only point this at
    # preprocessor files produced by this project.
    with open(preprocessor, "rb") as f:
        mushroom_preprocessor = pickle.load(f)

    tables_dir = os.path.join(results_to, "tables")
    figures_dir = os.path.join(results_to, "figures")
    os.makedirs(tables_dir, exist_ok=True)
    os.makedirs(figures_dir, exist_ok=True)
    os.makedirs(pipeline_to, exist_ok=True)

    # create metrics
    scoring_metrics = {
        'accuracy': make_scorer(accuracy_score),
        'f2_score': make_scorer(fbeta_score, beta=2, pos_label='p', average='binary')
    }
    cv_results = dict()

    # tune model and save results
    # knn model
    knn = make_pipeline(mushroom_preprocessor, KNeighborsClassifier())
    knn_grid = {'kneighborsclassifier__n_neighbors': randint(5, 1000)}
    cv_results['knn'] = RandomizedSearchCV(
        knn, knn_grid, n_iter=5, n_jobs=-1, cv=3,
        scoring=scoring_metrics, random_state=seed,
        refit='f2_score'
    ).fit(X_train, y_train)

    # logistic regression model
    # The pipeline must be built from the loaded preprocessor object, not
    # from the `preprocessor` path string.
    logreg = make_pipeline(mushroom_preprocessor,
                           LogisticRegression(max_iter=5000, random_state=seed))
    logreg_grid = {'logisticregression__C': loguniform(1e-3, 1e3)}
    cv_results['logreg'] = RandomizedSearchCV(
        logreg, logreg_grid, n_iter=30, n_jobs=-1,
        scoring=scoring_metrics, random_state=seed,
        refit='f2_score'
    ).fit(X_train, y_train)

    # svc model
    svc = make_pipeline(mushroom_preprocessor, SVC(random_state=seed))
    svc_grid = {'svc__C': loguniform(1e-3, 1e3),
                'svc__gamma': loguniform(1e-3, 1e3)}
    cv_results['svc'] = RandomizedSearchCV(
        svc, svc_grid, n_iter=3, n_jobs=-1, cv=3,
        scoring=scoring_metrics, random_state=seed,
        refit='f2_score'
    ).fit(X_train, y_train)

    # compiling hyperparameters and scores of best models into one dataframe
    cols = ['params',
            'mean_fit_time',
            'mean_test_accuracy',
            'std_test_accuracy',
            'mean_test_f2_score',
            'std_test_f2_score']
    model_labels = {'knn': 'KNN', 'logreg': 'Logistic Regression', 'svc': 'SVC'}
    best_rows = []
    for key, result in cv_results.items():
        ranked = pd.DataFrame(result.cv_results_).query('rank_test_f2_score == 1')
        # Tied candidates all share rank 1; keep one row per model so the
        # summary always has exactly one labelled row per search.
        best_row = ranked[cols].head(1)
        best_row.index = [model_labels[key]]
        best_rows.append(best_row)
    final_results = pd.concat(best_rows)
    # NOTE(review): the file name does not describe its content (model
    # tuning results); kept unchanged because other steps may read this path.
    final_results.to_csv(
        os.path.join(tables_dir, "numeric_correlation_matrix.csv")
    )

    # save the best model
    # NOTE(review): the SVC pipeline is selected unconditionally, not by
    # comparing the searches' scores - confirm this is intended.
    # With refit='f2_score' the search has already refit best_estimator_ on
    # the full training data, so no additional fit call is needed.
    best_model = cv_results['svc'].best_estimator_

    with open(os.path.join(pipeline_to, "mushroom_best_model.pickle"), 'wb') as f:
        pickle.dump(best_model, f)

    # from_estimator already draws the matrix, so save that figure directly
    # instead of calling plot() and drawing it a second time.
    disp = ConfusionMatrixDisplay.from_estimator(
        best_model,
        X_train,
        y_train
    )
    disp.figure_.savefig(os.path.join(figures_dir, "confusion_matrix.png"), dpi=300)
    plt.close(disp.figure_)


if __name__ == '__main__':
    main()
Oops, something went wrong.