Merge pull request #24 from UBC-MDS/model_script
upload scripts for eda, model fitting, and model evaluation
hankunxiao authored Dec 6, 2024
2 parents 0b78126 + 997b18d commit 57452d0
Showing 5 changed files with 13,872 additions and 0 deletions.
71 changes: 71 additions & 0 deletions eda.py
@@ -0,0 +1,71 @@
# eda.py
# author: Yichi Zhang
# date: 2024-12-07

import click
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

@click.command()
@click.option('--processed-training-data', type=str, help="Path to processed training data")
@click.option('--plot-to', type=str, help="Path to directory where the plot will be written to")
def main(processed_training_data, plot_to):
    '''Plots the distribution of each numeric feature and the frequency table
    of each categorical feature in the processed training data, and saves
    each as a separate figure.'''

    mushroom_train = pd.read_csv(processed_training_data)

    # Select only numeric columns and draw a histogram for each
    numeric_columns = mushroom_train.select_dtypes(include='number')

    for column in numeric_columns.columns:
        plt.figure(figsize=(5, 5))
        plt.hist(mushroom_train[column], bins=15, edgecolor='black', alpha=0.7)
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')

        plt.savefig(os.path.join(plot_to, "figures", f"histogram_{column}.png"),
                    dpi=300)
        plt.close()  # close the figure so figures do not accumulate across iterations


    # Select only categorical columns and save a frequency table for each
    categorical_columns = mushroom_train.select_dtypes(include='object')

    for column in categorical_columns.columns:
        frequency = mushroom_train[column].value_counts()
        percentage = round(mushroom_train[column].value_counts(normalize=True) * 100, 2)
        freq_percent_df = pd.DataFrame({
            "Frequency": frequency,
            "Percentage": percentage
        })

        fig, ax = plt.subplots(figsize=(6, 2))  # adjust the figure size as needed
        ax.axis('off')  # turn off the axes

        # Render the frequency/percentage DataFrame as a table
        table = ax.table(
            cellText=freq_percent_df.values,
            colLabels=freq_percent_df.columns,
            rowLabels=freq_percent_df.index,
            loc='center',
            cellLoc='center'
        )

        # Style adjustments for readability
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.auto_set_column_width(col=list(range(len(freq_percent_df.columns))))

        file_path = os.path.join(plot_to, "figures", f"{column}_frequency_table.png")
        plt.savefig(file_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"Saved frequency table for '{column}'")


if __name__ == '__main__':
    main()
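A minimal sketch of how eda.py might be invoked; the data path and output directory below are illustrative placeholders, not paths taken from this commit, and the figures/ subdirectory is assumed to already exist.

import subprocess

# Run the EDA script against a hypothetical processed training set.
# eda.py writes one histogram per numeric column and one frequency table
# per categorical column into <plot-to>/figures/.
subprocess.run(
    [
        "python", "eda.py",
        "--processed-training-data", "data/processed/mushroom_train.csv",  # assumed path
        "--plot-to", "results",  # figures land in results/figures/
    ],
    check=True,  # raise CalledProcessError if the script exits non-zero
)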
82 changes: 82 additions & 0 deletions evaluate_mushroom_predictor.py
@@ -0,0 +1,82 @@
# evaluate_mushroom_predictor.py
# author: Yichi Zhang
# date: 2024-12-07

import click
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import set_config
from sklearn.metrics import ConfusionMatrixDisplay, fbeta_score

@click.command()
@click.option('--scaled-test-data', type=str, help="Path to scaled test data")
@click.option('--pipeline-from', type=str, help="Path to directory where the fit pipeline object lives")
@click.option('--results-to', type=str, help="Path to directory where the evaluation results will be written")
@click.option('--seed', type=int, help="Random seed", default=123)
def main(scaled_test_data, pipeline_from, results_to, seed):
    '''Evaluates the mushroom classifier on the test data
    and saves the evaluation results.'''
    np.random.seed(seed)
    set_config(transform_output="pandas")

    mushroom_test = pd.read_csv(scaled_test_data)

    with open(pipeline_from, 'rb') as f:
        mushroom_fit = pickle.load(f)

    # Compute accuracy on the held-out test set
    accuracy = mushroom_fit.score(
        mushroom_test.drop(columns=["target"]),
        mushroom_test["target"]
    )

    # Compute F2 score (beta = 2 weights recall more heavily than precision,
    # since failing to flag a poisonous mushroom is the costlier error)
    mushroom_preds = mushroom_test.assign(
        predicted=mushroom_fit.predict(mushroom_test.drop(columns=["target"]))
    )
    f2_beta_2_score = fbeta_score(
        mushroom_preds['target'],
        mushroom_preds['predicted'],
        beta=2,
        pos_label='p'
    )

    test_scores = pd.DataFrame({'accuracy': [accuracy],
                                'F2 score (beta = 2)': [f2_beta_2_score]})
    test_scores.to_csv(os.path.join(results_to, "test_scores.csv"), index=False)

    # Save the confusion matrix as both a table and a figure
    confusion_matrix = pd.crosstab(
        mushroom_preds["target"],
        mushroom_preds["predicted"]
    )
    confusion_matrix.to_csv(os.path.join(results_to, "tables", "confusion_matrix.csv"))

    ConfusionMatrixDisplay.from_predictions(
        mushroom_preds["target"],
        mushroom_preds["predicted"]
    )
    plt.savefig(os.path.join(results_to, "figures", "confusion_matrix.png"), dpi=300)


if __name__ == '__main__':
    main()
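A usage sketch under assumed paths; the pipeline filename matches what fit_mushroom_classifier.py below writes, while the test-data path and results directory are placeholders.

import subprocess

# Evaluate the saved pipeline on the held-out test split; the script writes
# test_scores.csv, tables/confusion_matrix.csv and figures/confusion_matrix.png
# under --results-to (the tables/ and figures/ subdirectories are assumed to exist).
subprocess.run(
    [
        "python", "evaluate_mushroom_predictor.py",
        "--scaled-test-data", "data/processed/mushroom_test.csv",        # assumed path
        "--pipeline-from", "results/models/mushroom_best_model.pickle",  # assumed path
        "--results-to", "results",
        "--seed", "123",
    ],
    check=True,
)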
122 changes: 122 additions & 0 deletions fit_mushroom_classifier.py
@@ -0,0 +1,122 @@
# fit_mushroom_classifier.py
# author: Yichi Zhang
# date: 2024-12-07

import click
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import loguniform, randint
from sklearn import set_config
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import ConfusionMatrixDisplay, make_scorer, fbeta_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV


@click.command()
@click.option('--processed-training-data', type=str, help="Path to processed training data")
@click.option('--preprocessor', type=str, help="Path to preprocessor object")
@click.option('--pipeline-to', type=str, help="Path to directory where the pipeline object will be written to")
@click.option('--plot-to', type=str, help="Path to directory where the plot will be written to")
@click.option('--results-to', type=str, help="Path to directory where the cross-validation results will be written")
@click.option('--seed', type=int, help="Random seed", default=123)
def main(processed_training_data, preprocessor, pipeline_to, plot_to, results_to, seed):
    '''Fits a mushroom classifier to the training data
    and saves the pipeline object.'''
    np.random.seed(seed)
    set_config(transform_output="pandas")

    # read in data & preprocessor
    mushroom_train = pd.read_csv(processed_training_data)
    with open(preprocessor, "rb") as f:
        mushroom_preprocessor = pickle.load(f)

    # create metrics: accuracy plus an F2 score that treats 'p' (poisonous) as the positive class
    scoring_metrics = {
        'accuracy': make_scorer(accuracy_score),
        'f2_score': make_scorer(fbeta_score, beta=2, pos_label='p', average='binary')
    }
    cv_results = dict()

    # tune each candidate model with randomized search and save the results
    # knn model
    knn = make_pipeline(mushroom_preprocessor, KNeighborsClassifier())
    knn_grid = {'kneighborsclassifier__n_neighbors': randint(5, 1000)}
    cv_results['knn'] = RandomizedSearchCV(
        knn, knn_grid, n_iter=5, n_jobs=-1, cv=3,
        scoring=scoring_metrics, random_state=seed,
        refit='f2_score'
    ).fit(mushroom_train.drop(columns=["target"]),
          mushroom_train["target"])

    # logistic regression model
    logreg = make_pipeline(mushroom_preprocessor, LogisticRegression(max_iter=5000, random_state=seed))
    logreg_grid = {'logisticregression__C': loguniform(1e-3, 1e3)}
    cv_results['logreg'] = RandomizedSearchCV(
        logreg, logreg_grid, n_iter=30, n_jobs=-1,
        scoring=scoring_metrics, random_state=seed,
        refit='f2_score'
    ).fit(mushroom_train.drop(columns=["target"]),
          mushroom_train["target"])

    # svc model
    svc = make_pipeline(mushroom_preprocessor, SVC(random_state=seed))
    svc_grid = {'svc__C': loguniform(1e-3, 1e3),
                'svc__gamma': loguniform(1e-3, 1e3)}
    cv_results['svc'] = RandomizedSearchCV(
        svc, svc_grid, n_iter=3, n_jobs=-1, cv=3,
        scoring=scoring_metrics, random_state=seed,
        refit='f2_score'
    ).fit(mushroom_train.drop(columns=["target"]),
          mushroom_train["target"])

    # compiling hyperparameters and scores of the best model of each type into one dataframe
    cols = ['params',
            'mean_fit_time',
            'mean_test_accuracy',
            'std_test_accuracy',
            'mean_test_f2_score',
            'std_test_f2_score']
    final_results = pd.concat(
        [pd.DataFrame(result.cv_results_).query('rank_test_f2_score == 1')[cols] for _, result in cv_results.items()]
    )
    final_results.index = ['KNN', 'Logistic Regression', 'SVC']
    final_results.to_csv(
        os.path.join(results_to, "tables", "numeric_correlation_matrix.csv")
    )

    # save the best model
    best_model = cv_results['svc'].best_estimator_
    best_model.fit(
        mushroom_train.drop(columns=["target"]),
        mushroom_train["target"]
    )

    with open(os.path.join(pipeline_to, "mushroom_best_model.pickle"), 'wb') as f:
        pickle.dump(best_model, f)

    # save a confusion matrix for the best model on the training data
    ConfusionMatrixDisplay.from_estimator(
        best_model,
        mushroom_train.drop(columns=["target"]),
        mushroom_train["target"]
    )
    plt.savefig(os.path.join(results_to, "figures", "confusion_matrix.png"), dpi=300)


if __name__ == '__main__':
    main()
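For context, a sketch of how the pipeline pickled by this script could be reloaded and used for prediction; the file path and the new-data path are hypothetical, and the input frame must carry the same feature columns as the processed training data.

import pickle
import pandas as pd

# Load the tuned pipeline (preprocessing + SVC) saved by fit_mushroom_classifier.py.
with open("results/models/mushroom_best_model.pickle", "rb") as f:  # assumed path
    pipeline = pickle.load(f)

# new_mushrooms is a placeholder frame with the same feature columns as the training data.
new_mushrooms = pd.read_csv("data/processed/mushroom_new.csv")  # assumed path
predictions = pipeline.predict(new_mushrooms.drop(columns=["target"], errors="ignore"))
print(predictions[:10])  # class labels, e.g. 'p' for poisonous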