From f01553051dc6e1e95924f068239c19eb03f6bfa2 Mon Sep 17 00:00:00 2001
From: Yichi Zhang
Date: Mon, 2 Dec 2024 15:30:34 -0800
Subject: [PATCH 1/3] create fit_mushroom_classifier.py

---
 fit_mushroom_classifier.py | 112 +++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 fit_mushroom_classifier.py

diff --git a/fit_mushroom_classifier.py b/fit_mushroom_classifier.py
new file mode 100644
index 0000000..0b9c813
--- /dev/null
+++ b/fit_mushroom_classifier.py
@@ -0,0 +1,112 @@
+# fit_mushroom_classifier.py
+# author: Yichi Zhang
+# date: 2024-12-07
+
+import click
+import os
+import pickle
+import json
+import logging
+from ucimlrepo import fetch_ucirepo
+import pandas as pd
+import numpy as np
+import pandera as pa
+from pandera import Check
+from deepchecks import Dataset
+import matplotlib.pyplot as plt
+from scipy.stats import loguniform, randint
+from sklearn import set_config
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import QuantileTransformer,OneHotEncoder
+from sklearn.compose import make_column_transformer
+from sklearn.pipeline import make_pipeline
+from sklearn.metrics import ConfusionMatrixDisplay, make_scorer, fbeta_score, accuracy_score, precision_score, recall_score
+from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning, module="deepchecks")
+
+
+@click.command()
+@click.option('--training-data', type=str, help="Path to training data")
+@click.option('--preprocessor', type=str, help="Path to preprocessor object")
+@click.option('--columns-to-drop', type=str, help="Optional: columns to drop")
+@click.option('--pipeline-to', type=str, help="Path to directory where the pipeline object will be written to")
+@click.option('--plot-to', type=str, help="Path to directory where the plot will be written to")
+@click.option('--seed', type=int, help="Random seed", default=123)
+def main(training_data, preprocessor, columns_to_drop, pipeline_to, plot_to, seed):
+    '''Fits a breast cancer classifier to the training data
+    and saves the pipeline object.'''
+    np.random.seed(seed)
+    set_config(transform_output="pandas")
+
+    # read in data & preprocessor
+    mushroom_train = pd.read_csv(training_data)
+    mushroom_preprocessor = pickle.load(open(preprocessor, "rb"))
+
+    if columns_to_drop:
+        to_drop = pd.read_csv(columns_to_drop).feats_to_drop.tolist()
+        mushroom_train = mushroom_train.drop(columns=to_drop)
+
+    # create metrics
+    scoring_metrics = {
+        'accuracy':make_scorer(accuracy_score),
+        'f2_score':make_scorer(fbeta_score, beta=2, pos_label='p',average='binary')
+    }
+    cv_results = dict()
+
+    # tune model and save results
+    # knn model
+    knn = make_pipeline(mushroom_preprocessor, KNeighborsClassifier())
+    knn_grid = {'kneighborsclassifier__n_neighbors':randint(5,1000)}
+    cv_results['knn'] = RandomizedSearchCV(
+        knn, knn_grid, n_iter=5, n_jobs=-1, cv=3,
+        scoring=scoring_metrics, random_state=seed,
+        refit='f2_score'
+    ).fit(mushroom_train.drop(columns=["target"]),
+          mushroom_train["target"])
+
+    # logistic regression model
+    logreg = make_pipeline(mushroom_preprocessor,LogisticRegression(max_iter=5000,random_state=seed))
+    logreg_grid = {'logisticregression__C':loguniform(1e-3,1e3)}
+    cv_results['logreg'] = RandomizedSearchCV(
+        logreg,logreg_grid,n_iter=30,n_jobs=-1,
+        scoring=scoring_metrics,random_state=seed,
+        refit='f2_score'
+    ).fit(mushroom_train.drop(columns=["target"]),
+          mushroom_train["target"])
+
+    # svc model
+    svc = make_pipeline(mushroom_preprocessor,SVC(random_state=seed))
+    svc_grid = {'svc__C':loguniform(1e-3,1e3),
+                'svc__gamma':loguniform(1e-3,1e3)}
+    cv_results['svc'] = RandomizedSearchCV(
+        svc,svc_grid,n_iter=3,n_jobs=-1,cv=3,
+        scoring=scoring_metrics,random_state=seed,
+        refit='f2_score'
+    ).fit(mushroom_train.drop(columns=["target"]),
+          mushroom_train["target"])
+
+    # compilng hyperparameters and scores of best models into one dataframe
+    cols = ['params','mean_fit_time','mean_test_accuracy','std_test_accuracy','mean_test_f2_score','std_test_f2_score']
+    final_results = pd.concat(
+        [pd.DataFrame(result.cv_results_).query('rank_test_f2_score == 1')[cols] for _,result in cv_results.items()]
+    )
+    final_results.index = ['KNN','Logisic Regression','SVC']
+
+    with open("table.pkl", "wb") as f:
+        pickle.dump(final_results, f)
+
+    # save the best model
+    best_model = cv_results['svc'].best_estimator_
+    best_model.fit(mushroom_train.drop(columns=["target"]), mushroom_train["target"])
+
+    with open(os.path.join(pipeline_to, "mushroom_pipeline.pickle"), 'wb') as f:
+        pickle.dump(best_model, f)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
From f46ae74af29ec11fa302f41adc8287f5c35b829a Mon Sep 17 00:00:00 2001
From: Yichi Zhang
Date: Thu, 5 Dec 2024 21:37:50 -0800
Subject: [PATCH 2/3] create eda.py

---
 eda.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 eda.py

diff --git a/eda.py b/eda.py
new file mode 100644
index 0000000..095cf18
--- /dev/null
+++ b/eda.py
@@ -0,0 +1,71 @@
+# eda.py
+# author: Yichi Zhang
+# date: 2024-12-07
+
+import click
+import os
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+@click.command()
+@click.option('--processed-training-data', type=str, help="Path to processed training data")
+@click.option('--plot-to', type=str, help="Path to directory where the plot will be written to")
+def main(processed_training_data, plot_to):
+    '''Saves a histogram for every numeric column and a frequency table image
+    for every categorical column of the processed training data.'''
+
+    mushroom_train = pd.read_csv(processed_training_data)
+    figures_dir = os.path.join(plot_to, "figures")
+    os.makedirs(figures_dir, exist_ok=True)  # savefig does not create missing directories
+
+    # One histogram per numeric column
+    numeric_columns = mushroom_train.select_dtypes(include='number')  # Select only numeric columns
+
+    for column in numeric_columns.columns:
+        plt.figure(figsize=(5,5))
+        plt.hist(mushroom_train[column], bins=15, edgecolor='black', alpha=0.7)
+        plt.title(f'Histogram of {column}')
+        plt.xlabel(column)
+        plt.ylabel('Frequency')
+
+        plt.savefig(os.path.join(figures_dir, f"histogram_{column}.png"),
+                    dpi=300)
+        plt.close()  # release the figure; otherwise one stays open per column
+
+
+    # One frequency/percentage table (rendered as an image) per categorical column
+    categorical_columns = mushroom_train.select_dtypes(include='object')  # Select only categorical columns
+
+    for column in categorical_columns.columns:
+        frequency = mushroom_train[column].value_counts()
+        percentage = round(mushroom_train[column].value_counts(normalize=True) * 100, 2)
+        freq_percent_df = pd.DataFrame({
+            "Frequency": frequency,
+            "Percentage": percentage
+        })
+        fig, ax = plt.subplots(figsize=(6, 2))  # Adjust the figure size as needed
+        ax.axis('off')  # Turn off the axes
+
+        # Create a table from the DataFrame
+        table = ax.table(
+            cellText=freq_percent_df.values,
+            colLabels=freq_percent_df.columns,
+            rowLabels=freq_percent_df.index,
+            loc='center',
+            cellLoc='center'
+        )
+
+        # Style adjustments for readability
+        table.auto_set_font_size(False)
+        table.set_fontsize(10)
+        table.auto_set_column_width(col=list(range(len(freq_percent_df.columns))))
+
+        file_path = os.path.join(figures_dir, f"{column}_frequency_table.png")
+        fig.savefig(file_path, dpi=300, bbox_inches='tight')
+        plt.close(fig)  # release the figure before moving to the next column
+        print(f"Saved styled table for '{column}'")
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
From 997b18dd36459de19ce5131c44bc1ab7419c6727 Mon Sep 17 00:00:00 2001
From: Yichi Zhang
Date: Thu, 5 Dec 2024 21:39:18 -0800
Subject: [PATCH 3/3] create scripts

---
 evaluate_mushroom_predictor.py | 82 +
 fit_mushroom_classifier.py | 42 +-
.../Load_Data_and_EDA-checkpoint.html | 10308 ++++++++++++++++
 .../Load_Data_and_EDA-checkpoint.ipynb | 3289 +++++
 4 files changed, 13705 insertions(+), 16 deletions(-)
 create mode 100644 evaluate_mushroom_predictor.py
 create mode 100644 notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.html
 create mode 100644 notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.ipynb

diff --git a/evaluate_mushroom_predictor.py b/evaluate_mushroom_predictor.py
new file mode 100644
index 0000000..d1ba1da
--- /dev/null
+++ b/evaluate_mushroom_predictor.py
@@ -0,0 +1,82 @@
+# evaluate_mushroom_predictor.py
+# author: Yichi Zhang
+# date: 2024-12-07
+
+import click
+import os
+import pickle
+import json
+import logging
+from ucimlrepo import fetch_ucirepo
+import pandas as pd
+import numpy as np
+import pandera as pa
+from pandera import Check
+from deepchecks import Dataset
+import matplotlib.pyplot as plt
+from scipy.stats import loguniform, randint
+from sklearn import set_config
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import QuantileTransformer,OneHotEncoder
+from sklearn.compose import make_column_transformer
+from sklearn.pipeline import make_pipeline
+from sklearn.metrics import ConfusionMatrixDisplay, make_scorer, fbeta_score, accuracy_score, precision_score, recall_score
+from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV
+
+@click.command()
+@click.option('--scaled-test-data', type=str, help="Path to scaled test data")
+@click.option('--pipeline-from', type=str, help="Path to the pickled fit pipeline object")
+@click.option('--results-to', type=str, help="Path to directory where the results (tables and figures) will be written to")
+@click.option('--seed', type=int, help="Random seed", default=123)
+def main(scaled_test_data, pipeline_from, results_to, seed):
+    '''Evaluates the mushroom classifier on the test data
+    and saves the evaluation results.'''
+    np.random.seed(seed)
+    set_config(transform_output="pandas")
+
+    mushroom_test = pd.read_csv(scaled_test_data)
+
+    with open(pipeline_from, 'rb') as f:
+        mushroom_fit = pickle.load(f)
+
+    # Compute accuracy
+    accuracy = mushroom_fit.score(
+        mushroom_test.drop(columns=["target"]),
+        mushroom_test["target"]
+    )
+
+    # Compute F2 score (beta = 2)
+    mushroom_preds = mushroom_test.assign(
+        predicted=mushroom_fit.predict(mushroom_test.drop(columns=["target"]))
+    )
+    f2_beta_2_score = fbeta_score(
+        mushroom_preds['target'],
+        mushroom_preds['predicted'],
+        beta=2,
+        pos_label='p'
+    )
+
+    test_scores = pd.DataFrame({'accuracy': [accuracy],
+                                'F2 score (beta = 2)': [f2_beta_2_score]})
+    test_scores.to_csv(os.path.join(results_to, "test_scores.csv"), index=False)
+
+    confusion_matrix = pd.crosstab(
+        mushroom_preds["target"],
+        mushroom_preds["predicted"]
+    )
+    confusion_matrix.to_csv(os.path.join(results_to, "tables", "confusion_matrix.csv"))
+
+    disp = ConfusionMatrixDisplay.from_predictions(
+        mushroom_preds["target"],
+        mushroom_preds["predicted"]
+    )
+    # from_predictions already draws the matrix; save that figure instead of plotting again
+    disp.figure_.savefig(os.path.join(results_to, "figures", "confusion_matrix.png"), dpi=300)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/fit_mushroom_classifier.py b/fit_mushroom_classifier.py
index 0b9c813..109838f 100644
--- a/fit_mushroom_classifier.py
+++ b/fit_mushroom_classifier.py
@@ -26,31 +26,25 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.metrics import ConfusionMatrixDisplay, make_scorer, fbeta_score, accuracy_score, precision_score, recall_score
 from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning, module="deepchecks")


 @click.command()
-@click.option('--training-data', type=str, help="Path to training data")
+@click.option('--processed-training-data', type=str, help="Path to processed training data")
 @click.option('--preprocessor', type=str, help="Path to preprocessor object")
-@click.option('--columns-to-drop', type=str, help="Optional: columns to drop")
 @click.option('--pipeline-to', type=str, help="Path to directory where the pipeline object will be written to")
 @click.option('--plot-to', type=str, help="Path to directory where the plot will be written to")
+@click.option('--results-to', type=str, help="Path to directory where the results (tables and figures) will be written to")
 @click.option('--seed', type=int, help="Random seed", default=123)
-def main(training_data, preprocessor, columns_to_drop, pipeline_to, plot_to, seed):
+def main(processed_training_data, preprocessor, pipeline_to, plot_to, results_to, seed):
     '''Fits a breast cancer classifier to the training data
     and saves the pipeline object.'''
     np.random.seed(seed)
     set_config(transform_output="pandas")

     # read in data & preprocessor
-    mushroom_train = pd.read_csv(training_data)
+    mushroom_train = pd.read_csv(processed_training_data)
     mushroom_preprocessor = pickle.load(open(preprocessor, "rb"))

-    if columns_to_drop:
-        to_drop = pd.read_csv(columns_to_drop).feats_to_drop.tolist()
-        mushroom_train = mushroom_train.drop(columns=to_drop)
-
     # create metrics
     scoring_metrics = {
         'accuracy':make_scorer(accuracy_score),
@@ -91,22 +85,38 @@ def main(training_data, preprocessor, columns_to_drop, pipeline_to, plot_to, see
           mushroom_train["target"])

     # compilng hyperparameters and scores of best models into one dataframe
-    cols = ['params','mean_fit_time','mean_test_accuracy','std_test_accuracy','mean_test_f2_score','std_test_f2_score']
+    cols = ['params',
+            'mean_fit_time',
+            'mean_test_accuracy',
+            'std_test_accuracy',
+            'mean_test_f2_score',
+            'std_test_f2_score']
     final_results = pd.concat(
         [pd.DataFrame(result.cv_results_).query('rank_test_f2_score == 1')[cols] for _,result in cv_results.items()]
     )
     final_results.index = ['KNN','Logisic Regression','SVC']
-
-    with open("table.pkl", "wb") as f:
-        pickle.dump(final_results, f)
+    # NOTE(review): this file holds the cross-validation summary, not a correlation
+    # matrix; rename it together with whatever reads it downstream.
+    final_results.to_csv(os.path.join(results_to, "tables", "numeric_correlation_matrix.csv"))

     # save the best model
     best_model = cv_results['svc'].best_estimator_
-    best_model.fit(mushroom_train.drop(columns=["target"]), mushroom_train["target"])
+    best_model.fit(
+        mushroom_train.drop(columns=["target"]),
+        mushroom_train["target"]
+    )

-    with open(os.path.join(pipeline_to, "mushroom_pipeline.pickle"), 'wb') as f:
+    with open(os.path.join(pipeline_to, "mushroom_best_model.pickle"), 'wb') as f:
         pickle.dump(best_model, f)

+    disp = ConfusionMatrixDisplay.from_estimator(
+        best_model,
+        mushroom_train.drop(columns=["target"]),
+        mushroom_train["target"]
+    )
+    # from_estimator already draws the plot. NOTE(review): the evaluate script saves to this same file name.
+    disp.figure_.savefig(os.path.join(results_to, "figures", "confusion_matrix.png"), dpi=300)
+

 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.html b/notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.html
new file mode 100644
index 0000000..4986c35
--- /dev/null
+++ b/notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.html
@@ -0,0 +1,10308 @@
+ + + + + +Load_Data_and_EDA + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + diff --git a/notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.ipynb new file mode 100644 index 0000000..c1c34a3 --- /dev/null +++ b/notebooks/.ipynb_checkpoints/Load_Data_and_EDA-checkpoint.ipynb @@ -0,0 +1,3289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1b40693f-6d55-4951-96a5-48a10ccb6773", + "metadata": {}, + "source": [ + "# Mushroom Edibility Classification Using Feature-Based Machine Learning Approach" + ] + }, + { + "cell_type": "markdown", + "id": "18590e2e-138a-4ed1-821b-8a1850fdce9b", + "metadata": {}, + "source": [ + "by Benjamin Frizzell, Hankun Xiao, Essie Zhang, Mason Zhang 2024/11/23" + ] + }, + { + "cell_type": "markdown", + "id": "81a65442-e81c-4e9d-9755-885bb2aebac9", + "metadata": {}, + "source": [ + "#### Import Library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0d2500b5-022e-4ff0-818c-ad1013efb69d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from ucimlrepo import fetch_ucirepo \n", + "import pandas as pd\n", + "import numpy as np\n", + "import pandera as pa\n", + "from pandera import Check\n", + "from deepchecks import Dataset\n", + "import json\n", + "import logging\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "id": "f49175e2-5eab-4b03-816c-f20995c50c96", + "metadata": {}, + "source": [ + "## Summary" + ] + }, + { + "cell_type": "markdown", + "id": "a1f39a05-24c5-4a5e-a34c-830e8efeee78", + "metadata": {}, + "source": [ + "In this project, a Support Vector Classifier was built and tuned to identify mushrooms edibility. A mushroom is classified as edible or poisonous with given color, habitat, class, and others. The final classifier performed quite well on unseen test data, with a final overall accuracy of 0.99 and $F_{\\beta}$ score with $\\beta = 2$ of 0.99. 
Furthermore, we use a confusion matrix to show how accurately the model classifies mushrooms as poisonous or edible. The model makes 12174 correct predictions out of 12214 test observations. 17 mistakes were predicting a poisonous mushroom as edible (false negative), while 23 mistakes were predicting an edible mushroom as poisonous (false positive). The model’s performance shows promise for implementation, prioritizing safety by minimizing false negatives that could result in consuming poisonous mushrooms. While false positives may lead to unnecessarily discarding safe mushrooms, they pose no safety risk. Further development is needed to make this model useful. Research should focus on improving performance and analyzing cases of incorrect predictions." + ] + }, + { + "cell_type": "markdown", + "id": "5626172b-fdbc-486d-ba3d-550432375290", + "metadata": {}, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "id": "d6ca1f13-3cd2-4bc6-bfae-4d0d6440e1b7", + "metadata": {}, + "source": [ + "Mushrooms are a common food that is rich in vitamins and minerals. However, not all mushrooms can be consumed directly; most of them are poisonous, and telling edible from poisonous mushrooms with the naked eye is quite difficult. Our aim is to use machine learning to identify mushroom edibility. In this project, three methods are used to detect the edibility of mushrooms: Support Vector Classifier (SVC), K-Nearest Neighbors (KNN), and Logistic Regression. " + ] + }, + { + "cell_type": "markdown", + "id": "921597ef-c12e-4c4c-b8bf-b1eb20e90814", + "metadata": {}, + "source": [ + "## Methods" + ] + }, + { + "cell_type": "markdown", + "id": "a0920cdf-10c1-4151-bbf3-a689486257dd", + "metadata": {}, + "source": [ + "### Data" + ] + }, + { + "cell_type": "markdown", + "id": "549db552-150b-4744-ba90-497604b5b601", + "metadata": {}, + "source": [ + "The dataset used in this project is the Secondary Mushroom Dataset created by Wagner, D., Heider, D., & Hattab, G. 
from the UCI Machine Learning Repository. This dataset contains 61069 hypothetical mushrooms with caps based on 173 species (353 mushrooms per species). Each mushroom is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended (the latter class was combined with the poisonous class)." + ] + }, + { + "cell_type": "markdown", + "id": "809c0908-7030-437e-bb4c-56bdf0066119", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "614cbeca-8401-49b3-8eed-cce9dc41d292", + "metadata": {}, + "source": [ + "The mushroom dataset is fairly balanced, with 56% poisonous mushrooms and 44% edible mushrooms. All variables were standardized and variables with more than 15% missing values were dropped, because imputing a variable that has a significant proportion of missing data might introduce too much noise or bias, making it unreliable. The data was split, with 80% partitioned into the training set and 20% partitioned into the test set. Three classification models, Support Vector Classifier (SVC), K-Nearest Neighbors (KNN), and Logistic Regression, were used to predict whether a mushroom is edible or poisonous. The fine-tuned Support Vector Classifier has the best overall performance. The hyperparameters were chosen using 5-fold cross validation with the $F_{\\beta}$ score as the classification metric. $\\beta$ was set to 2 for the $F_{\\beta}$ score to increase the weight on recall during fitting, because predicting a mushroom to be edible when it is in fact poisonous could have severe health consequences. Therefore, the goal is to prioritize the minimization of false negatives. The Python programming language (Van Rossum and Drake, 2009) and the following Python packages were used to perform the analysis: Matplotlib (Hunter, 2007), Pandas (McKinney, 2010), Scikit-learn (Pedregosa et al., 2011), NumPy (Harris et al., 2020), SciPy (Virtanen et al., 2020), UCIMLRepo." 
+ ] + }, + { + "cell_type": "markdown", + "id": "58647071-18ff-44cb-9f2d-fd243555cff0", + "metadata": {}, + "source": [ + "## Results & Discussion" + ] + }, + { + "cell_type": "markdown", + "id": "f94469b3-143e-4a67-8c22-5bfa73baeccc", + "metadata": {}, + "source": [ + "The EDA shows that all numeric columns in the mushroom dataset are nearly normal with some skewness. A robust preprocessing scheme, `QuantileTransformer`, is used because it can transform skewed data or heavy-tailed distributions into a more Gaussian-like shape and reduce the impact of outliers.\n", + "`OneHotEncoder` is applied to the categorical features in the mushroom dataset, because no feature has many categories and the categories are not ordered. It is critical to keep all important information in the features. Since the ring type feature has many missing values, they were filled in with a \"Missing\" class. Treating missing values as a distinct category provides a way to model the absence of data directly. This can be valuable because missingness itself might carry information." + ] + }, + { + "cell_type": "markdown", + "id": "7558f2ed-854e-492b-8a71-7e37cdecf1f3", + "metadata": {}, + "source": [ + "#### Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "20a3d74b-174b-420e-9745-6d68f9d7da5f", + "metadata": {}, + "outputs": [], + "source": [ + "# fetch dataset as pandas DataFrames\n", + "secondary_mushroom = fetch_ucirepo(id=848) \n", + "X = secondary_mushroom.data.features \n", + "y = secondary_mushroom.data.targets " + ] + }, + { + "cell_type": "markdown", + "id": "e4fa6da5-7876-43ba-b5ff-e12a66c78c75", + "metadata": {}, + "source": [ + "##### Before splitting the data into test and training sets, we want to check for missing values in each column to determine whether they can be used in our model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "eeee4dd9-6fa9-47d3-b86f-e128e791a96e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Missing Values by Column
 ColumnMissing Count
0cap-diameter0
1cap-shape0
2cap-surface14120
3cap-color0
4does-bruise-or-bleed0
5gill-attachment9884
6gill-spacing25063
7gill-color0
8stem-height0
9stem-width0
10stem-root51538
11stem-surface38124
12stem-color0
13veil-type57892
14veil-color53656
15has-ring0
16ring-type2471
17spore-print-color54715
18habitat0
19season0
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Check the missing values\n", + "missing_values = X.isnull().sum().reset_index()\n", + "missing_values.columns = ['Column', 'Missing Count']\n", + "\n", + "# Highlight values with a gradient\n", + "styled_missing = missing_values.style.format(\n", + " precision=0\n", + ").background_gradient(\n", + " subset=['Missing Count'],\n", + " cmap='YlOrRd'\n", + ").set_caption(\"Missing Values by Column\")\n", + "\n", + "# Display the styled DataFrame\n", + "display(styled_missing)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d362597a-3f05-4907-8fc6-2063d4579fa8", + "metadata": {}, + "outputs": [], + "source": [ + "colunms_to_drop = ['cap-surface', 'gill-attachment', 'gill-spacing', \n", + " 'stem-root', 'stem-surface', 'veil-type', 'veil-color', \n", + " 'spore-print-color']\n", + "X = X.drop(columns = colunms_to_drop)" + ] + }, + { + "cell_type": "markdown", + "id": "f997158b-7b42-4367-8be8-41683c425650", + "metadata": {}, + "source": [ + "After examining the data set, we decided to drop columns with a high proportion of missing values (over 15%), which include `cap-surface`, `gill-attachment`, `gill-spacing`, `stem-root`, `stem-surface`, `veil-type`, `veil-color`, and `spore-print-color`." + ] + }, + { + "cell_type": "markdown", + "id": "401edb8d-0321-43a2-9ada-fc4700368211", + "metadata": {}, + "source": [ + "#### Data Validation" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b3fec04e-ecc7-4af2-becf-eafa8b55b696", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cap-diametercap-shapecap-colordoes-bruise-or-bleedgill-colorstem-heightstem-widthstem-colorhas-ringring-typehabitatseasontarget
015.26xofw16.9517.09wtgdwp
116.60xofw17.9918.19wtgdup
214.07xofw17.8017.74wtgdwp
314.17fefw15.7715.98wtpdwp
414.64xofw16.5317.20wtpdwp
..........................................
610641.18syff3.936.22yffdap
610651.27fyff3.185.43yffdap
610661.27syff3.866.37yffdup
610671.24fyff3.565.44yffdup
610681.17syff3.255.45yffdup
\n", + "

60903 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " cap-diameter cap-shape cap-color does-bruise-or-bleed gill-color \\\n", + "0 15.26 x o f w \n", + "1 16.60 x o f w \n", + "2 14.07 x o f w \n", + "3 14.17 f e f w \n", + "4 14.64 x o f w \n", + "... ... ... ... ... ... \n", + "61064 1.18 s y f f \n", + "61065 1.27 f y f f \n", + "61066 1.27 s y f f \n", + "61067 1.24 f y f f \n", + "61068 1.17 s y f f \n", + "\n", + " stem-height stem-width stem-color has-ring ring-type habitat season \\\n", + "0 16.95 17.09 w t g d w \n", + "1 17.99 18.19 w t g d u \n", + "2 17.80 17.74 w t g d w \n", + "3 15.77 15.98 w t p d w \n", + "4 16.53 17.20 w t p d w \n", + "... ... ... ... ... ... ... ... \n", + "61064 3.93 6.22 y f f d a \n", + "61065 3.18 5.43 y f f d a \n", + "61066 3.86 6.37 y f f d u \n", + "61067 3.56 5.44 y f f d u \n", + "61068 3.25 5.45 y f f d u \n", + "\n", + " target \n", + "0 p \n", + "1 p \n", + "2 p \n", + "3 p \n", + "4 p \n", + "... ... \n", + "61064 p \n", + "61065 p \n", + "61066 p \n", + "61067 p \n", + "61068 p \n", + "\n", + "[60903 rows x 13 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# combine features and target to the same dataframe named mushroom\n", + "mushroom = X.copy()\n", + "mushroom['target'] = y\n", + "\n", + "schema = pa.DataFrameSchema(\n", + " \n", + " {\n", + " \"target\": pa.Column(str, pa.Check.isin(['e', 'p'])),\n", + " # check missing value proportion (threshold=15%) AND value ranges for all features\n", + " \"cap-diameter\": pa.Column(float, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'cap-diameter' column.\"), \n", + " pa.Check.between(0, 100)],\n", + " nullable=True), \n", + " \"cap-shape\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'cap-shape' column.\"), \n", + " pa.Check.isin(['x', 'f', 'p', 'b', 
'c', 's', 'o'])],\n", + " nullable=True), \n", + " \"cap-color\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'cap-color' column.\"), \n", + " pa.Check.isin(['o', 'e', 'n', 'g', 'r', 'w', 'y', 'p', 'u', 'b', 'l', 'k'])],\n", + " nullable=True),\n", + " \"does-bruise-or-bleed\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'does-bruise-or-bleed' column.\"), \n", + " pa.Check.isin(['f', 't'])],\n", + " nullable=True),\n", + " \"gill-color\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'gill-color' column.\"), \n", + " pa.Check.isin(['w', 'n', 'p', 'u', 'b', 'g', 'y', 'r', 'e', 'o', 'k', 'l', 'f'])],\n", + " nullable=True),\n", + " \"stem-height\": pa.Column(float, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'stem-height' column.\"), \n", + " pa.Check.between(0, 100)],\n", + " nullable=True),\n", + " \"stem-width\": pa.Column(float, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'stem-width' column.\"), \n", + " pa.Check.between(0, 150)],\n", + " nullable=True),\n", + " \"stem-color\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'stem-color' column.\"), \n", + " pa.Check.isin(['o', 'e', 'n', 'g', 'r', 'w', 'y', 'p', 'u', 'b', 'l', 'k', 'f'])],\n", + " nullable=True),\n", + " \"has-ring\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'has-ring' column.\"), \n", + " pa.Check.isin(['t', 'f'])],\n", + " nullable=True),\n", + " \"ring-type\": 
pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'ring-type' column.\"), \n", + " pa.Check.isin(['c', 'e', 'r', 'g', 'l', 'p', 's', 'z', 'y', 'm', 'f'])],\n", + " nullable=True),\n", + " \"habitat\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'habitat' column.\"), \n", + " pa.Check.isin(['g', 'l', 'm', 'p', 'h', 'u', 'w', 'd'])],\n", + " nullable=True),\n", + " \"season\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'season' column.\"),\n", + " pa.Check.isin(['s', 'u', 'a', 'w'])],\n", + " nullable=True)\n", + " \n", + " },\n", + " checks=[\n", + " pa.Check(lambda df: ~mushroom.duplicated().any(), error=\"Duplicate rows found.\"),\n", + " pa.Check(lambda df: ~(mushroom.isna().all(axis=1)).any(), error=\"Empty rows found.\")\n", + " ],\n", + " drop_invalid_rows=True\n", + ")\n", + "\n", + "schema.validate(mushroom, lazy=True).drop_duplicates().dropna(how=\"all\")" + ] + }, + { + "cell_type": "markdown", + "id": "5b05535a-7bae-42ca-ae5d-aa42e3e95adc", + "metadata": {}, + "source": [ + "##### create validation_error.log file" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1ccb0cfd-78eb-4f53-8d0e-4cf62b9e821d", + "metadata": {}, + "outputs": [], + "source": [ + "# create validation_error.log file\n", + "\n", + "# Configure logging\n", + "logging.basicConfig(\n", + " filename=\"validation_errors.log\",\n", + " filemode=\"w\",\n", + " format=\"%(asctime)s - %(message)s\",\n", + " level=logging.INFO,\n", + ")\n", + "\n", + "# Define the schema\n", + "schema = pa.DataFrameSchema(\n", + " \n", + " {\n", + " \"target\": pa.Column(str, pa.Check.isin(['e', 'p'])),\n", + " \n", + " \"cap-diameter\": pa.Column(float, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " 
element_wise=False, \n", + " error=\"Too many null values in 'cap-diameter' column.\"), \n", + " pa.Check.between(0, 100)],\n", + " nullable=True),\n", + " \"cap-shape\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'cap-shape' column.\"), \n", + " pa.Check.isin(['x', 'f', 'p', 'b', 'c', 's', 'o'])],\n", + " nullable=True), \n", + " \"cap-color\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'cap-color' column.\"), \n", + " pa.Check.isin(['o', 'e', 'n', 'g', 'r', 'w', 'y', 'p', 'u', 'b', 'l', 'k'])],\n", + " nullable=True),\n", + " \"does-bruise-or-bleed\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'does-bruise-or-bleed' column.\"), \n", + " pa.Check.isin(['f', 't'])],\n", + " nullable=True),\n", + " \"gill-color\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'gill-color' column.\"), \n", + " pa.Check.isin(['w', 'n', 'p', 'u', 'b', 'g', 'y', 'r', 'e', 'o', 'k', 'l', 'f'])],\n", + " nullable=True),\n", + " \"stem-height\": pa.Column(float, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'stem-height' column.\"), \n", + " pa.Check.between(0, 100)],\n", + " nullable=True),\n", + " \"stem-width\": pa.Column(float, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'stem-width' column.\"), \n", + " pa.Check.between(0, 150)],\n", + " nullable=True),\n", + " \"stem-color\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'stem-color' column.\"), \n", + " 
pa.Check.isin(['o', 'e', 'n', 'g', 'r', 'w', 'y', 'p', 'u', 'b', 'l', 'k', 'f'])],\n", + " nullable=True),\n", + " \"has-ring\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'has-ring' column.\"), \n", + " pa.Check.isin(['t', 'f'])],\n", + " nullable=True),\n", + " \"ring-type\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'ring-type' column.\"), \n", + " pa.Check.isin(['c', 'e', 'r', 'g', 'l', 'p', 's', 'z', 'y', 'm', 'f'])],\n", + " nullable=True),\n", + " \"habitat\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'habitat' column.\"), \n", + " pa.Check.isin(['g', 'l', 'm', 'p', 'h', 'u', 'w', 'd'])],\n", + " nullable=True),\n", + " \"season\": pa.Column(str, checks = [pa.Check(lambda s: s.isna().mean() <= 0.15, \n", + " element_wise=False, \n", + " error=\"Too many null values in 'season' column.\"),\n", + " pa.Check.isin(['s', 'u', 'a', 'w'])],\n", + " nullable=True)\n", + " \n", + " },\n", + " checks=[\n", + " pa.Check(lambda df: ~mushroom.duplicated().any(), error=\"Duplicate rows found.\"),\n", + " pa.Check(lambda df: ~(mushroom.isna().all(axis=1)).any(), error=\"Empty rows found.\")\n", + " ],\n", + " drop_invalid_rows = False\n", + ")\n", + "\n", + "# Initialize error cases DataFrame\n", + "error_cases = pd.DataFrame()\n", + "data = mushroom.copy()\n", + "\n", + "# Validate data and handle errors\n", + "try:\n", + " validated_data = schema.validate(data, lazy=True)\n", + "except pa.errors.SchemaErrors as e:\n", + " error_cases = e.failure_cases\n", + "\n", + " # Convert the error message to a JSON string\n", + " error_message = json.dumps(e.message, indent=2)\n", + " logging.error(\"\\n\" + error_message)\n", + "\n", + "# Filter out invalid rows based on the error cases\n", + "if 
not error_cases.empty:\n", + " invalid_indices = error_cases[\"index\"].dropna().unique()\n", + " validated_data = (\n", + " data.drop(index=invalid_indices)\n", + " .reset_index(drop=True)\n", + " .drop_duplicates()\n", + " .dropna(how=\"all\")\n", + " )\n", + "else:\n", + " validated_data = data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "66437bbd-1394-4685-8901-474bff89f28a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60903, 13)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validated_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "58de532e-c52c-46a6-876b-8d1f62a4e35b", + "metadata": {}, + "outputs": [], + "source": [ + "X = validated_data.drop(['target'], axis=1)\n", + "y = validated_data['target']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8f810d3b-d836-4679-9c5d-68563375c888", + "metadata": {}, + "outputs": [], + "source": [ + "# Split the data test and training set\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=123\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "208ef84f-8ef1-4022-8d69-edd4bbf9a534", + "metadata": {}, + "outputs": [], + "source": [ + "mushroom_train = X_train.copy()\n", + "mushroom_train['target'] = y_train" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "95fa0221-e522-4f53-826b-d375a3a5c6db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cap-diameterstem-heightstem-width
cap-diameter1.0000000.4200870.692574
stem-height0.4200871.0000000.431192
stem-width0.6925740.4311921.000000
\n", + "
" + ], + "text/plain": [ + " cap-diameter stem-height stem-width\n", + "cap-diameter 1.000000 0.420087 0.692574\n", + "stem-height 0.420087 1.000000 0.431192\n", + "stem-width 0.692574 0.431192 1.000000" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check anomalous correlations\n", + "numeric_columns = mushroom_train.select_dtypes(include='number')\n", + "corr_matrix = numeric_columns.corr()\n", + "corr_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7fd37ab8-281b-45f4-a30e-0b07ce76a767", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# check the distribution of target variable\n", + "category_counts = y_train.value_counts()\n", + "\n", + "# Plotting the bar chart\n", + "category_counts.plot(kind='bar', color='skyblue', edgecolor='black')\n", + "\n", + "# Adding labels and title\n", + "plt.title('Distribution of Target Variable', fontsize=14)\n", + "plt.xlabel('Categories', fontsize=12)\n", + "plt.ylabel('Count', fontsize=12)\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "1b334cbb-673a-4019-9d88-1bb80c957ad7", + "metadata": {}, + "source": [ + "#### EDA" + ] + }, + { + "cell_type": "markdown", + "id": "01b21f83-3fcc-4e0c-a7e6-8b9e0cd0c4ec", + "metadata": {}, + "source": [ + "##### Part 1: Missing Values" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "565a884a-16fc-4538-9e7e-535021dfdcf1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Missing Values by Column
 ColumnMissing Count
0cap-diameter0
1cap-shape0
2cap-color0
3does-bruise-or-bleed0
4gill-color0
5stem-height0
6stem-width0
7stem-color0
8has-ring0
9ring-type2471
10habitat0
11season0
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Check the missing values\n", + "missing_values = X.isnull().sum().reset_index()\n", + "missing_values.columns = ['Column', 'Missing Count']\n", + "\n", + "# Highlight values with a gradient\n", + "styled_missing = missing_values.style.format(\n", + " precision=0\n", + ").background_gradient(\n", + " subset=['Missing Count'],\n", + " cmap='YlOrRd'\n", + ").set_caption(\"Missing Values by Column\")\n", + "\n", + "# Display the styled DataFrame\n", + "display(styled_missing)" + ] + }, + { + "cell_type": "markdown", + "id": "302deb34-f082-41b2-bd10-7b20ba0b3dbd", + "metadata": {}, + "source": [ + "The missing-value check on the full feature set `X` shows no missing values in the remaining features except for `ring-type`. However, the proportion of missing values in this feature is reasonable, and simply dropping this column could result in loss of potentially valuable information, introduction of biases etc., which might reduce the overall accuracy of the classifier. Therefore, we decided to retain this column and perform imputation on `ring-type` in the data preprocessing phase. " + ] + }, + { + "cell_type": "markdown", + "id": "9dd6a42f-b507-4d81-aa68-1274d20872c1", + "metadata": {}, + "source": [ + "##### Part 2: The distribution of numeric features" + ] + }, + { + "cell_type": "markdown", + "id": "dc6a5a2b-84f8-402a-ac8c-0bb727ec5c13", + "metadata": {}, + "source": [ + "To understand the numeric features in the data set, we plotted histograms for each numeric column in `X_train`, which helps identify the distribution patterns as well as detect any skewness or outliers. The numeric columns being plotted are `cap-diameter`, `stem-height`, and `stem-width`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c7c4cb2e-0f89-44fd-8faa-11088dd290e2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "numeric_columns = X_train.select_dtypes(include='number') # Select only numeric columns\n", + "\n", + "for column in numeric_columns.columns:\n", + " plt.figure(figsize=(5,5))\n", + " plt.hist(X_train[column], bins=15, edgecolor='black', alpha=0.7)\n", + " plt.title(f'Histogram of {column}')\n", + " plt.xlabel(column)\n", + " plt.ylabel('Frequency')\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "717bbe9d-18ee-4463-9a52-9128566851d5", + "metadata": {}, + "source": [ + "Based on the histograms, here are our findings for each feature being plotted.\n", + "\n", + "1. `cap-diameter`: The distribution is highly skewed to the right, with most values concentrated between 0 and 10 cm. There are also some outliers sitting at around 40 to 60 cm. \n", + "\n", + "2. `stem-height`: Slightly right-skewed distribution. The majority of mushrooms have stem heights between 4 and 10 cm, with few having stem heights over 20 cm.\n", + "\n", + "3. `stem-width`: Another heavily right-skewed distribution, with the majority of mushrooms having stem width below 20 mm, and some rare cases exceeding 50 mm.\n", + "\n", + "The skewness observed across the 3 numeric features will be addressed in the preprocessing phase with `QuantileTransformer` from `sklearn.preprocessing`, which maps data to a normal distribution while retaining the relative rank of values, making them more suitable for models sensitive to feature distributions, such as `SVC` and `LogisticRegression`. 
" + ] + }, + { + "cell_type": "markdown", + "id": "1b80b242-d2c4-48ba-9124-0f1a75233cf7", + "metadata": {}, + "source": [ + "##### Part 3: The distribution of categorical features" + ] + }, + { + "cell_type": "markdown", + "id": "fc0e83a8-e064-416d-9043-c9a340d24f18", + "metadata": {}, + "source": [ + "To understand the categorical features in the data set, we analyzed their frequency and percentage distributions, providing insights into the variability and class imbalance that might occur for each feature. " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c5d9010f-313a-4abf-a50c-e55c3b64b186", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Frequency and Percentage for 'cap-shape':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
cap-shape  
x2151044.15
f1069821.96
s571711.73
b46159.47
o26345.41
p20984.31
c14502.98
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'cap-color':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
cap-color  
n1940739.83
y687614.11
w617512.67
g34107.00
e32056.58
o29055.96
r13992.87
u13552.78
p13322.73
k10162.09
b9641.98
l6781.39
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'does-bruise-or-bleed':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
does-bruise-or-bleed  
f4033382.78
t838917.22
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'gill-color':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
gill-color  
w1483630.45
n774215.89
y759515.59
p47699.79
g32956.76
f27345.61
o23124.75
k19083.92
r11312.32
e8421.73
u8271.70
b7311.50
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'stem-color':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
stem-color  
w1837737.72
n1447829.72
y629112.91
g20904.29
o17333.56
e16283.34
u11892.44
p8141.67
f7051.45
k6761.39
r4200.86
l1810.37
b1400.29
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'has-ring':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
has-ring  
f3649574.90
t1222725.10
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'ring-type':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
ring-type  
f3844082.28
e19644.20
z17133.67
l11512.46
r11292.42
p10282.20
g10052.15
m2870.61
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'habitat':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
habitat  
d3516272.17
g640313.14
l25395.21
m23444.81
h15983.28
p2980.61
w2880.59
u900.18
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n", + "Frequency and Percentage for 'season':\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 FrequencyPercentage
season  
a2407949.42
u1830037.56
w41498.52
s21944.50
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------- \n", + "\n" + ] + } + ], + "source": [ + "categorical_columns = X_train.select_dtypes(include='object') # Select only categorical columns\n", + "\n", + "# Calculate frequency and percentage for each categorical features\n", + "for column in categorical_columns.columns:\n", + " print(f\"Frequency and Percentage for '{column}':\")\n", + " \n", + " # Frequency\n", + " frequency = X_train[column].value_counts()\n", + " # Percentage\n", + " percentage = round(X_train[column].value_counts(normalize=True) * 100, 2)\n", + " \n", + " # Combine into one DataFrame\n", + " freq_percent_df = pd.DataFrame({\n", + " \"Frequency\": frequency,\n", + " \"Percentage\": percentage\n", + " })\n", + "\n", + " # Highlight values with a gradient\n", + " styled_df = freq_percent_df.style.format(\n", + " precision=2\n", + " ).background_gradient(\n", + " subset=['Percentage'],\n", + " cmap='YlOrRd'\n", + " )\n", + "\n", + " # Display the styled DataFrame\n", + " display(styled_df)\n", + " print(\"-\" * 40, '\\n')" + ] + }, + { + "cell_type": "markdown", + "id": "12823bac-ddf4-47d7-bda9-785b424a1837", + "metadata": {}, + "source": [ + "Based on the Frequency and Percentage distributions, here are our findings:\n", + "\n", + "1. `cap-shape`: The most common cap shape is `x` (convex), comprising 44.15% of the data. Other shapes like `f` (flat) and `s` (sunken) are also prevalent, while `c` (conical) is the least common with 2.98% appearance.\n", + "\n", + "2. `cap-color`: The most frequently appearing color is `n` (brown), with 39.83% of the data. Other colors like `y` (yellow), `w` (white), and `g` (gray) are also well-represented, while rare colors like `b` (buff) and `l` (blue) appear in less than 2% of the data.\n", + "\n", + "3. 
`does-bruise-or-bleed`: The majority of the mushrooms are `f` (do not bruise or bleed), while those that do (`t`) make up 17.22% of the data.\n", + "\n", + "4. `gill-color`: The most common gill color is `w` (white), with 30.45% of the data. Other colors such as `n` (brown) and `y` (yellow) are also frequent, while rare gill colors like `e` (red), `b` (buff) and `u` (purple) appear in less than 2% of the data.\n", + "\n", + "5. `stem-color`: `w` (white) and `n` (brown) are the dominating stem colors, accounting for 37.72% and 29.72% of the data, respectively. Other colors like `r` (green), `l` (blue) and `b` (buff) are less frequent, appearing in less than 1% of the observations.\n", + "\n", + "6. `has-ring`: Most mushrooms are `f` (do not have a ring), with 74.90% of the observations. The remaining 25.10% of mushrooms are `t` (have a ring).\n", + "\n", + "7. `ring-type`: `f` (none) is the most common ring type, accounting for 82.28% of the non-missing values. Other types like `e` (evanescent) and `z` (zone) are less frequent, while rare types like `m` (movable) occur in less than 1% of the data.\n", + "\n", + "8. `habitat`: The predominant habitat is `d` (woods), with 72.17% appearance. Other habitats such as `g` (grasses) and `l` (leaves) are less common, while `w` (waste), `p` (paths), and `u` (urban) each make up less than 1% of the data.\n", + "\n", + "9. `season`: Most mushrooms grow in `a` (autumn), comprising 49.42% of the data, followed by `u` (summer) at 37.56%. The other two seasons `w` (winter) and `s` (spring) are less frequent.\n", + "\n", + "Categorical features will be encoded into binary format in the following preprocessing phase with `OneHotEncoder`. Since we are dealing with a mix of binary and non-binary categorical features, features like `does-bruise-or-bleed` and `has-ring` that have two unique values will be handled with the `drop='if_binary'` argument to reduce redundancy while still capturing the information. 
" + ] + }, + { + "cell_type": "markdown", + "id": "a8a13abe-906b-4230-8772-2d799a51a857", + "metadata": {}, + "source": [ + "##### Part 4: The distribution of the target" + ] + }, + { + "cell_type": "markdown", + "id": "e175de25-1893-40f0-a794-1153470d7230", + "metadata": {}, + "source": [ + "The target variable `target` represents whether a mushroom is `p` (poisonous) or `e` (edible). Understanding the distribution of the target helps assess class balance, which might have an impact on the models' performance." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9e47fdb0-f94a-4777-a3c1-954e7d62202a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FrequencyPercentage
target
p2699655.41
e2172644.59
\n", + "
" + ], + "text/plain": [ + " Frequency Percentage\n", + "target \n", + "p 26996 55.41\n", + "e 21726 44.59" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + " # Frequency\n", + "frequency = y_train.value_counts()\n", + "# Percentage\n", + "percentage = round(y_train.value_counts(normalize=True) * 100, 2)\n", + "\n", + "# Combine into one DataFrame\n", + "freq_percent_df = pd.DataFrame({\n", + " \"Frequency\": frequency,\n", + " \"Percentage\": percentage\n", + "})\n", + "freq_percent_df" + ] + }, + { + "cell_type": "markdown", + "id": "10f25100-5adc-46c3-8503-6faebcc29100", + "metadata": {}, + "source": [ + "Based on the Frequency and Percentage distribution, here are our findings:\n", + "\n", + "1. `p` (Poisonous): There are 26,996 instances of poisonous mushrooms, accounting for 55.41% of the data.\n", + "\n", + "2. `e` (Edible): There are 21,726 instances of edible mushrooms, constituting 44.59% of the data.\n", + "\n", + "Using $F_{\\beta}$, precision, recall, or confusion matrix to evaluate the model's performance is advisable in the following procedure. " + ] + }, + { + "cell_type": "markdown", + "id": "e6000a8d-7e7b-4fd1-aa27-b3c8074e6d91", + "metadata": {}, + "source": [ + "#### Preprocessing and Model Building\n", + "\n", + "Three classification models including Support Vector Classifier (SVC), K-Nearest Neighbors (KNN), and Logistic Regression are used to predict whether a mushroom is edible or poisonous. Predicting a mushroom to be edible when it is in fact poisonous could have severe health consequences. Therefore the best model should prioritize the minimization of this error. To do this, we can evaluate models on an $F_{\\beta}$ score with $\\beta = 2$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2b26607e-2448-452d-8e0b-e25c56ceba44", + "metadata": {}, + "outputs": [], + "source": [ + "# loading in some models\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.linear_model import LogisticRegression" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "32e69dc9-c9c6-4734-8f01-37a6813ef1d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ColumnTransformer(transformers=[('quantiletransformer',\n",
+       "                                 QuantileTransformer(output_distribution='normal',\n",
+       "                                                     random_state=123),\n",
+       "                                 ['cap-diameter', 'stem-height', 'stem-width']),\n",
+       "                                ('pipeline',\n",
+       "                                 Pipeline(steps=[('simpleimputer',\n",
+       "                                                  SimpleImputer(fill_value='missing',\n",
+       "                                                                strategy='constant')),\n",
+       "                                                 ('onehotencoder',\n",
+       "                                                  OneHotEncoder(drop='if_binary',\n",
+       "                                                                handle_unknown='ignore',\n",
+       "                                                                sparse_output=False))]),\n",
+       "                                 ['ring-type']),\n",
+       "                                ('onehotencoder',\n",
+       "                                 OneHotEncoder(drop='if_binary',\n",
+       "                                               handle_unknown='ignore',\n",
+       "                                               sparse_output=False),\n",
+       "                                 ['does-bruise-or-bleed', 'has-ring',\n",
+       "                                  'cap-shape', 'cap-color', 'gill-color',\n",
+       "                                  'stem-color', 'habitat', 'season'])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "ColumnTransformer(transformers=[('quantiletransformer',\n", + " QuantileTransformer(output_distribution='normal',\n", + " random_state=123),\n", + " ['cap-diameter', 'stem-height', 'stem-width']),\n", + " ('pipeline',\n", + " Pipeline(steps=[('simpleimputer',\n", + " SimpleImputer(fill_value='missing',\n", + " strategy='constant')),\n", + " ('onehotencoder',\n", + " OneHotEncoder(drop='if_binary',\n", + " handle_unknown='ignore',\n", + " sparse_output=False))]),\n", + " ['ring-type']),\n", + " ('onehotencoder',\n", + " OneHotEncoder(drop='if_binary',\n", + " handle_unknown='ignore',\n", + " sparse_output=False),\n", + " ['does-bruise-or-bleed', 'has-ring',\n", + " 'cap-shape', 'cap-color', 'gill-color',\n", + " 'stem-color', 'habitat', 'season'])])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# importing required preprocessors, pipelines, etc.\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import QuantileTransformer,OneHotEncoder\n", + "from sklearn.compose import make_column_transformer\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "# converting targets to Series objects to avoid warnings\n", + "y_train = y_train.squeeze()\n", + "y_test = y_test.squeeze()\n", + "\n", + "# random state for reproducability\n", + "SEED = 123\n", + "\n", + "# feature sets for each transformation\n", + "numeric_cols = ['cap-diameter','stem-height','stem-width']\n", + "categorical_cols = ['does-bruise-or-bleed','has-ring','cap-shape','cap-color','gill-color','stem-color','habitat','season']\n", + "impute_cols = ['ring-type']\n", + "\n", + "# creating transformers\n", + "numeric_transformer = QuantileTransformer(output_distribution='normal',random_state=SEED)\n", + "categorical_transformer = OneHotEncoder(drop='if_binary',handle_unknown='ignore',sparse_output=False)\n", + "impute_transformer = make_pipeline(\n", + " 
SimpleImputer(strategy='constant',fill_value = 'missing'),\n", + " categorical_transformer\n", + ")\n", + "\n", + "# final preprocessor\n", + "preprocessor = make_column_transformer(\n", + " (numeric_transformer,numeric_cols),\n", + " (impute_transformer,impute_cols),\n", + " (categorical_transformer,categorical_cols)\n", + ")\n", + "preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1a03aff3-0dd9-4d8d-acff-9a57fc86a6a8", + "metadata": {}, + "outputs": [], + "source": [ + "# create model pipelines\n", + "svc = make_pipeline(preprocessor,SVC(random_state=SEED))\n", + "knn = make_pipeline(preprocessor,KNeighborsClassifier())\n", + "logreg = make_pipeline(preprocessor,LogisticRegression(max_iter=5000,random_state=SEED))" + ] + }, + { + "cell_type": "markdown", + "id": "b80b1c6f-a12f-4b03-8ae6-384513426051", + "metadata": {}, + "source": [ + "#### Model Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "23716434-2c8d-4a6e-8627-dfbcc2a70eb8", + "metadata": {}, + "outputs": [], + "source": [ + "# decide which metrics to use: f_beta score? 
Weighted toward recall (beta > 1) to lower false negatives\n", + "from sklearn.metrics import ConfusionMatrixDisplay, make_scorer, fbeta_score, accuracy_score, precision_score, recall_score\n", + "from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b81f3323-048e-4911-82a6-c80f5af91105", + "metadata": {}, + "outputs": [], + "source": [ + "# define the hyperparameter grid\n", + "from scipy.stats import loguniform, randint\n", + "\n", + "knn_grid = {'kneighborsclassifier__n_neighbors':randint(5,1000)}\n", + "\n", + "svc_grid = {'svc__C':loguniform(1e-3,1e3),\n", + " 'svc__gamma':loguniform(1e-3,1e3)}\n", + "\n", + "logreg_grid = {'logisticregression__C':loguniform(1e-3,1e3)}" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3b1c6975-7d4d-4834-9ae8-17caea713141", + "metadata": {}, + "outputs": [], + "source": [ + "# create metrics\n", + "scoring_metrics = {\n", + " 'accuracy':make_scorer(accuracy_score),\n", + " 'f2_score':make_scorer(fbeta_score,beta=2,pos_label='p',average='binary') \n", + "}\n", + "cv_results = dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "038cd3ae-2375-4350-b564-39d243b37dae", + "metadata": {}, + "outputs": [], + "source": [ + "# hyperparameter tuning\n", + "cv_results['logreg'] = RandomizedSearchCV(\n", + " logreg,logreg_grid,n_iter=30,n_jobs=-1,\n", + " scoring=scoring_metrics,random_state=SEED,\n", + " refit='f2_score'\n", + ").fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "71ef0579-1ba4-4f5a-bf9e-160ada934a8b", + "metadata": {}, + "outputs": [], + "source": [ + "cv_results['svc'] = RandomizedSearchCV(\n", + " svc,svc_grid,n_iter=3,n_jobs=-1,cv=3,\n", + " scoring=scoring_metrics,random_state=SEED,\n", + " refit='f2_score'\n", + ").fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": 
"434f3c94-b7bf-4c7a-919e-31e67b637b46", + "metadata": {}, + "outputs": [], + "source": [ + "cv_results['knn'] = RandomizedSearchCV(\n", + " knn,knn_grid,n_iter=5,n_jobs=-1,cv=3,\n", + " scoring=scoring_metrics,random_state=SEED,\n", + " refit='f2_score'\n", + ").fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1c18117a-eaa9-4d89-85ca-ccabc5553f10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paramsmean_fit_timemean_test_accuracystd_test_accuracymean_test_f2_scorestd_test_f2_score
Logisic Regression{'logisticregression__C': 0.05784745785308777}0.3356790.7465010.0037110.7793130.003517
SVC{'svc__C': 20.74024196289186, 'svc__gamma': 0....34.7748240.9965720.0003190.9970740.000163
KNN{'kneighborsclassifier__n_neighbors': 327}0.1048820.9318170.0017840.9368280.001980
\n", + "
" + ], + "text/plain": [ + " params \\\n", + "Logisic Regression {'logisticregression__C': 0.05784745785308777} \n", + "SVC {'svc__C': 20.74024196289186, 'svc__gamma': 0.... \n", + "KNN {'kneighborsclassifier__n_neighbors': 327} \n", + "\n", + " mean_fit_time mean_test_accuracy std_test_accuracy \\\n", + "Logisic Regression 0.335679 0.746501 0.003711 \n", + "SVC 34.774824 0.996572 0.000319 \n", + "KNN 0.104882 0.931817 0.001784 \n", + "\n", + " mean_test_f2_score std_test_f2_score \n", + "Logisic Regression 0.779313 0.003517 \n", + "SVC 0.997074 0.000163 \n", + "KNN 0.936828 0.001980 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# compiling hyperparameters and scores of best models into one dataframe\n", + "cols = ['params','mean_fit_time','mean_test_accuracy','std_test_accuracy','mean_test_f2_score','std_test_f2_score']\n", + "final_results = pd.concat(\n", + " [pd.DataFrame(result.cv_results_).query('rank_test_f2_score == 1')[cols] for _,result in cv_results.items()]\n", + ")\n", + "final_results.index = ['Logisic Regression','SVC','KNN']\n", + "final_results" + ] + }, + { + "cell_type": "markdown", + "id": "6c8c2497-cce7-4473-9604-2fee9235914f", + "metadata": {}, + "source": [ + "After tuning the hyperparameters, the Logistic Regression model has a mean accuracy of 0.75 and a mean $F_{\\beta}$ ($\\beta = 2$) score of 0.78 on the validation folds. The KNN model has a mean accuracy of 0.93 and a mean $F_{\\beta}$ score of 0.94. The SVC clearly outperforms both Logistic Regression and KNN, with a mean accuracy and a mean $F_{\\beta}$ score of about 0.997 each. Thus, SVC is the best choice for distinguishing edible from poisonous mushrooms, where recall on the poisonous class is the highest priority." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "9764dd35-0038-4c6d-8214-26700bf4052e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAg0AAAGwCAYAAAAqpFaiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABA1ElEQVR4nO3deXhU9dn/8c9km4SQDISQhEBEUIhgIiIoCfgILgRQttqnaKkRWgwqAkVArOCCWogbi0qLFC1QhAf91eKGjWBVWoSARKMCkWrZgiQkSkggZJ05vz8iQ8eAznAmJOS8X9d1rjLnfM+Ze6Zj5p77uxybYRiGAAAAfkJAYwcAAADODyQNAADAKyQNAADAKyQNAADAKyQNAADAKyQNAADAKyQNAADAK0GNHYAZLpdLhw4dUkREhGw2W2OHAwDwkWEYOnbsmOLj4xUQ0HC/YysrK1VdXW36OiEhIQoNDfVDROen8zppOHTokBISEho7DACASfn5+erQoUODXLuyslKdOrZUYZHT9LXi4uK0d+9eyyYO53XSEBERIUmavGGg7OHBjRwN0DA+7stnG81XrWq0Se+4/543hOrqahUWObU/50JFRpx9NaPsmEsde+1TdXU1ScP56GSXhD08WPaW/GFF8xRk47ONZuz7Gxmciy7mlhE2tYw4++dxiW7w8zppAADAW07DJaeJuy05DZf/gjlPkTQAACzBJUMunX3WYObc5oIplwAAwCtUGgAAluCSS2Y6GMyd3TyQNAAALMFpGHIaZ9/FYObc5oLuCQAA4BUqDQAAS2AgpHkkDQAAS3DJkJOkwRS6JwAAgFeoNAAALIHuCfNIGgAAlsDsCfPongAAAF6h0gAAsATX95uZ862OpAEAYAlOk7MnzJzbXJA0AAAswWnI5F0u/RfL+YoxDQAAwCtUGgAAlsCYBvNIGgAAluCSTU7ZTJ1vdXRPAAAAr1BpAABYgsuo28ycb3VUGgAAluD8vnvCzOaLzMxMXXnllYqIiFBMTIxGjhyp3bt3e7QZO3asbDabx5aSkuLRpqqqSpMmTVJ0dLTCw8M1fPhwHTx40KNNSUmJ0tPT5XA45HA4lJ6erqNHj3q0OXDggIYNG6bw8HBFR0dr8uTJqq6u9uk1kTQAANAANm7cqHvuuUfZ2dnasGGDamtrlZaWpvLyco92gwcPVkFBgXt75513PI5PmTJFa9eu1Zo1a7Rp0yYdP35cQ4cOldPpdLcZPXq0cnNzlZWVpaysLOXm5io9Pd193Ol06qabblJ5ebk2bdqkNWvW6LXXXtO0adN8ek10TwAALOFsqgU/PN8XWVlZHo+XLVummJgY5eTk6JprrnHvt9vtiouLO+01SktL9dJLL2nlypW64YYbJEkvv/yyEhIS9N5772nQoEHKy8tTVlaWsrOz1adPH0nS0qVLlZqaqt27dysxMVHr16/Xrl27lJ+fr/j4eEnSvHnzNHbsWM2ZM0eRkZFevSYqDQAAS3AZNtObJJWVlXlsVVVVXj1/aWmpJCkqKspj/4cffqiYmBh17dpVGRkZKioqch/LyclRTU2N0tLS3Pvi4+OVlJSkzZs3S5K2bNkih8PhThgkKSUlRQ6Hw6NNUlKSO2GQpEGDBqmqqko5OTlev4ckDQAA+CAhIcE9dsDhcCgzM/MnzzEMQ1OnTtXVV1+tpKQk9/4hQ4Zo1apV
ev/99zVv3jx9/PHHuu6669yJSGFhoUJCQtS6dWuP68XGxqqwsNDdJiYmpt5zxsTEeLSJjY31ON66dWuFhIS423iD7gkAgCX4q3siPz/fo5xvt9t/8tyJEyfq888/16ZNmzz233LLLe5/JyUlqXfv3urYsaPWrVunm2+++YzXMwxDNtup1/Lf/zbT5qdQaQAAWIJTAaY3SYqMjPTYfippmDRpkt5880198MEH6tChw4+2bdeunTp27KivvvpKkhQXF6fq6mqVlJR4tCsqKnJXDuLi4nT48OF61youLvZo88OKQklJiWpqaupVIH4MSQMAwBIMk+MZDMO3KoVhGJo4caL+9re/6f3331enTp1+8pzvvvtO+fn5ateunSSpV69eCg4O1oYNG9xtCgoKtGPHDvXt21eSlJqaqtLSUm3bts3dZuvWrSotLfVos2PHDhUUFLjbrF+/Xna7Xb169fL6NdE9AQBAA7jnnnu0evVqvfHGG4qIiHD/0nc4HAoLC9Px48c1e/Zs/fznP1e7du20b98+zZw5U9HR0frZz37mbjtu3DhNmzZNbdq0UVRUlKZPn67k5GT3bIpu3bpp8ODBysjI0JIlSyRJ48eP19ChQ5WYmChJSktLU/fu3ZWenq6nn35aR44c0fTp05WRkeH1zAmJSgMAwCLO9eJOixcvVmlpqQYMGKB27dq5t1deeUWSFBgYqC+++EIjRoxQ165dNWbMGHXt2lVbtmxRRESE+zoLFizQyJEjNWrUKPXr108tWrTQW2+9pcDAQHebVatWKTk5WWlpaUpLS9Nll12mlStXuo8HBgZq3bp1Cg0NVb9+/TRq1CiNHDlSzzzzjE+vyWYYxnm7MGZZWZkcDofu23yj7C2DGzscoEFk9+Czjear1qjRh3pDpaWlPv3i9cXJ74q/f95J4RFn/1u5/JhLQy7b26CxNnVUGgAAgFcY0wAAsASXbHKZ+K3s0nlbmPcbkgYAgCWc62WkmyO6JwAAgFeoNAAALMFpBMhpnP1vZef5O2/Ab0gaAACWUDem4ey7GMyc21zQPQEAALxCpQEAYAmu/7p/xNmdT/cESQMAwBIY02AeSQMAwBJcCmCdBpMY0wAAALxCpQEAYAlOwyanj7e3/uH5VkfSAACwBKfJgZBOuifongAAAN6h0gAAsASXESCXidkTLmZPkDQAAKyB7gnz6J4AAABeodIAALAEl8zNgHD5L5TzFkkDAMASzC/uRHGedwAAAHiFSgMAwBLM33uC39kkDQAAS3DJJpfMjGlgRUiSBgCAJVBpMI93AAAAeIVKAwDAEswv7sTvbJIGAIAluAybXGbWaeAul6RNAADAO1QaAACW4DLZPcHiTiQNAACLMH+XS5IG3gEAAOAVKg0AAEtwyianiQWazJzbXJA0AAAsge4J83gHAACAV6g0AAAswSlzXQxO/4Vy3iJpAABYAt0T5pE0AAAsgRtWmcc7AAAAvEKlAQBgCYZscpkY02Aw5ZKkAQBgDXRPmMc7AAAAvEKlAQBgCdwa2zySBgCAJThN3uXSzLnNBe8AAADwCpUGAIAl0D1hHkkDAMASXAqQy0SB3cy5zQXvAAAA8AqVBgCAJTgNm5wmuhjMnNtckDQAACyBMQ3mkTQAACzBMHmXS4MVIRnTAAAAvEOlAQBgCU7Z5DRx0ykz5zYXJA0AAEtwGebGJbgMPwZznqJ7AgAAeIVKQzP2zUsBOvIPmyr22hRglyIuN3TBFKfCLjzV5sh7Nh3+a4DK82yqPWpT8is1Cr+k/rWOfWZT/vMBOv6FTbZgqUWioW5/cCogVCr92Ka8O07/UUpaVauWSXXp+fEdNh14tu65JKnlpYYuuNd52ucDzpVbJh7Wb2YWau3SaL3wSHtJUmgLp8bNKlDqoDJFtq7V4YMheuOlaL39l+hGjhZmuEwOhDRzbnNB0tCMlW23KfYWl1peashwSvnPByrvriD1+FutAlvUtXFW1CUTbdJc2vPo6T8Oxz6z
6csJgYr/jUsX/s4pW7B04t82d50q4nJDV/yjxuOcg38IVGm2TeGX1iUMznIp7+5ARQ0w1GlWrYxa6eDiQH15d5B6rq9VQHCDvQ3AGXXtcUI33nZEe3aGeuy/69FD6tH3uJ6adIEO54foiv7HNCnzoL47HKwt7zoaKVqY5ZJNLhPjEsyc21yQNjVj3RY7FTPCUIuLpfBE6aLHnKousLl/6UtS22GGOtzlUmSfM3fW7X86QHG/dKn9OJdaXCyFdZTaDDQUEFJ3PCBYCok+tQU5pJIPbWo70iXb909Vsc8mZ5lNHe6pq3S0uFjqcJdTNUdsqi5swDcBOIPQFk7dv2i/Ft7XQcdKAz2Odet1Qhv+X5Q+39JShw+G6O+r2mjPrjB1uexEI0ULNA0kDRbiPF73v0GR3o/mqflOOv5FgIKjpB23Byrn2iDt/E2gyj45c8ZdstGmmqNS2xEu976wCw0FtTZUtDZArhrJVSkVrQ1Q2EWG7O3O9hUBZ2/i3G+07R+R+vRfEfWO7dwWrpS0UrWJq5FkqEff42rfuUo5G+u3xfnj5IqQZjara/SkwTAMPfXUU+rcubPCwsLUo0cP/fWvf23ssJodw5D2PxOoiJ4uteji/XmV39T9R3LwhQDF3OzSJX+sVXg3Q3njA1Wx//TnFK8NUKu+huxxp/YFhkvdX6zVt+sCtO2qIG1LDdLRzQG65A+1stFJhnOs/4gSXZxcoT9nnj5j/eND8Trw71Ct/mSX1u3/XL9ftUeLHmivndtanuNI4U8nxzSY2ayu0d+BBx98UMuWLdPixYu1c+dO3Xvvvbrtttu0cePGem2rqqpUVlbmscE7+zIDVP6VTRc/6fTtxO+LBTH/61LMSEPh3aQL73Mp7EKp+PX6H5+qw9LRzTa1/ZnL8zKV0n8eCVTE5YaSVjp16QqnWlxk6Mt7guSqPMsXBZyFtvHVuvuxQ3pq0gWqqTr9n8CR477VJb1O6OExF2ri4K5a+li8JmZ+o57/c+wcR4vzWWZmpq688kpFREQoJiZGI0eO1O7duz3aGIah2bNnKz4+XmFhYRowYIB27tzp0aaqqkqTJk1SdHS0wsPDNXz4cB08eNCjTUlJidLT0+VwOORwOJSenq6jR496tDlw4ICGDRum8PBwRUdHa/LkyaqurvbpNTVq0lBeXq758+frz3/+swYNGqTOnTtr7Nixuu2227RkyZJ67TMzM91viMPhUEJCQiNEff7Zmxmgkg8D1H1preyxvp0bHF3XlRHW2bNLI7SToarC+qW64tcDFOSQWvf3bP/tOzZVH7LposecaplkKOIyQxc/4VTVN9KRDyj54dy5+LIKtW5bq0VZ/9Y7Bz7TOwc+U4++5Rox7lu9c+Az2cOcGvu7Qv1pdry2bnBob16Y3lwWrY1vttL/3lXc2OHDBJds7vtPnNXm40DIjRs36p577lF2drY2bNig2tpapaWlqby83N3mqaee0vz587Vo0SJ9/PHHiouL08CBA3Xs2KkEdcqUKVq7dq3WrFmjTZs26fjx4xo6dKiczlM/AkePHq3c3FxlZWUpKytLubm5Sk9Pdx93Op266aabVF5erk2bNmnNmjV67bXXNG3aNJ9eU6MWhnft2qXKykoNHDjQY391dbV69uxZr/0DDzygqVOnuh+XlZWROPwIw6irMBx5P0DdX6pVaAffr2FvLwW3NVS5zybpVCJQud+mVld7VhMMQyp+I0Bth7nqzYZwVX4/2+K//puz2b5/zIIpOIdy/9VS46/t6rFv2oJ85X8dqlf/0FaBgVJwiCGX58dbLqdkC+DDej4zTM6eMHw8Nysry+PxsmXLFBMTo5ycHF1zzTUyDEMLFy7UrFmzdPPNN0uSVqxYodjYWK1evVp33nmnSktL9dJLL2nlypW64YYbJEkvv/yyEhIS9N5772nQoEHKy8tTVlaWsrOz1adPH0nS0qVLlZqaqt27dysxMVHr16/Xrl27lJ+fr/j4eEnSvHnzNHbs
WM2ZM0eRkZFevaZGTRpc3/9XuW7dOrVv397jmN1ur9febrefdj9Ob9/cAH379wAlLnQqMFyq/rZuf1BLKeD7GWa1pVJVgVRdXPcfQ8X3yUHw9zMhbDYpfqxLBxcHqEWiofBEQ8VvBqhin9R1nudf1bJtNlV9Y1PMD7omJMmR6tL+BQHaN7duJobhkg79OVC2ICnySv4Q49ypKA/U/t1hHvsqTwToWMmp/Z9tDlfGQwWqrgzQ4YPBuiy1XDf8b4n+9Gh8Y4QMP/HXXS5/2DXu7XdTaWmpJCkqKkqStHfvXhUWFiotLc3jWv3799fmzZt15513KicnRzU1NR5t4uPjlZSUpM2bN2vQoEHasmWLHA6HO2GQpJSUFDkcDm3evFmJiYnasmWLkpKS3AmDJA0aNEhVVVXKycnRtdde69V70KhJQ/fu3WW323XgwAH179+/MUNplg6/WjeNbNc4z/+bOz9Wq5gRdV/URz60ac/Dp45/fX/dv9vf5VTC3XVf/u1uc8lVJe1/OlC1pd8v7PSCU6E/KPIUrQ1Qy8tdCutcP5awTlLic05980KAdtweJNmk8EsMXfJHp0La+usVA/6ReXdH/WZmge5ftF8RrZwq+iZEy59sp7f/0qaxQ0MT8MMK9yOPPKLZs2f/6DmGYWjq1Km6+uqrlZSUJEkqLKybbx4b69lvHBsbq/3797vbhISEqHXr1vXanDy/sLBQMTEx9Z4zJibGo80Pn6d169YKCQlxt/FGoyYNERERmj59uu699165XC5dffXVKisr0+bNm9WyZUuNGTOmMcM776V8VvOTbWJGGIoZ8dPt2o+rW6fhx3R54scHWbZKNdQq1ceBmMA5MON/L/Z4XFIcrHn3XtBI0aCh+GtFyPz8fI9yvjdVhokTJ+rzzz/Xpk2b6h2z2TyrH4Zh1Nv3Qz9sc7r2Z9PmpzT6ZLfHH39cMTExyszM1J49e9SqVStdccUVmjlzZmOHBgBoRvzVPREZGen1GABJmjRpkt58803985//VIcOpwaXxcXVzUsvLCxUu3anpv8WFRW5qwJxcXGqrq5WSUmJR7WhqKhIffv2dbc5fPhwvectLi72uM7WrVs9jpeUlKimpqZeBeLHNPqUS5vNpsmTJ+vLL79UdXW1ioqKlJWVpWuuuaaxQwMA4KwZhqGJEyfqb3/7m95//3116tTJ43inTp0UFxenDRs2uPdVV1dr48aN7oSgV69eCg4O9mhTUFCgHTt2uNukpqaqtLRU27Ztc7fZunWrSktLPdrs2LFDBQUF7jbr16+X3W5Xr169vH5NjV5pAADgXDjX95645557tHr1ar3xxhuKiIhwjx1wOBwKCwuTzWbTlClTNHfuXHXp0kVdunTR3Llz1aJFC40ePdrddty4cZo2bZratGmjqKgoTZ8+XcnJye7ZFN26ddPgwYOVkZHhXq5g/PjxGjp0qBITEyVJaWlp6t69u9LT0/X000/ryJEjmj59ujIyMnyqmpA0AAAswV/dE95avHixJGnAgAEe+5ctW6axY8dKkmbMmKGKigpNmDBBJSUl6tOnj9avX6+IiFNLli9YsEBBQUEaNWqUKioqdP3112v58uUKDDx1z5RVq1Zp8uTJ7lkWw4cP16JFi9zHAwMDtW7dOk2YMEH9+vVTWFiYRo8erWeeecan12QzDOO8ne9WVlYmh8Oh+zbfKHtLbpOI5im7B59tNF+1Ro0+1BsqLS316RevL05+V9z07h0KDg856+vUlFdr3aAXGzTWpo5KAwDAEs51paE5ImkAAFgCSYN5jT57AgAAnB+oNAAALIFKg3kkDQAASzDk+7TJH55vdSQNAABLoNJgHmMaAACAV6g0AAAsgUqDeSQNAABLIGkwj+4JAADgFSoNAABLoNJgHkkDAMASDMMmw8QXv5lzmwu6JwAAgFeoNAAALMElm6nFncyc21yQNAAALIExDebRPQEAALxCpQEAYAkMhDSPpAEAYAl0T5hH0gAAsAQq
DeYxpgEAAHiFSgMAwBIMk90TVBpIGgAAFmFIMgxz51sd3RMAAMArVBoAAJbgkk02VoQ0haQBAGAJzJ4wj+4JAADgFSoNAABLcBk22VjcyRSSBgCAJRiGydkTTJ+gewIAAHiHSgMAwBIYCGkeSQMAwBJIGswjaQAAWAIDIc1jTAMAAPAKlQYAgCUwe8I8kgYAgCXUJQ1mxjT4MZjzFN0TAADAK1QaAACWwOwJ80gaAACWYHy/mTnf6uieAAAAXqHSAACwBLonzCNpAABYA/0TppE0AACswWSlQVQaGNMAAAC8Q6UBAGAJrAhpHkkDAMASGAhpHt0TAADAK1QaAADWYNjMDWak0kDSAACwBsY0mEf3BAAA8AqVBgCANbC4k2kkDQAAS2D2hHleJQ3PPfec1xecPHnyWQcDAACaLq+ShgULFnh1MZvNRtIAAGi66GIwxaukYe/evQ0dBwAADYruCfPOevZEdXW1du/erdraWn/GAwBAwzD8sFmcz0nDiRMnNG7cOLVo0UKXXnqpDhw4IKluLMMTTzzh9wABAEDT4HPS8MADD+izzz7Thx9+qNDQUPf+G264Qa+88opfgwMAwH9sftiszecpl6+//rpeeeUVpaSkyGY79QZ2795d//nPf/waHAAAfsM6Dab5XGkoLi5WTExMvf3l5eUeSQQAAFb2z3/+U8OGDVN8fLxsNptef/11j+Njx46VzWbz2FJSUjzaVFVVadKkSYqOjlZ4eLiGDx+ugwcPerQpKSlRenq6HA6HHA6H0tPTdfToUY82Bw4c0LBhwxQeHq7o6GhNnjxZ1dXVPr8mn5OGK6+8UuvWrXM/PpkoLF26VKmpqT4HAADAOXGOB0KWl5erR48eWrRo0RnbDB48WAUFBe7tnXfe8Tg+ZcoUrV27VmvWrNGmTZt0/PhxDR06VE6n091m9OjRys3NVVZWlrKyspSbm6v09HT3cafTqZtuuknl5eXatGmT1qxZo9dee03Tpk3z7QXpLLonMjMzNXjwYO3atUu1tbV69tlntXPnTm3ZskUbN270OQAAAM4JP93lsqyszGO33W6X3W6v13zIkCEaMmTIj17SbrcrLi7utMdKS0v10ksvaeXKlbrhhhskSS+//LISEhL03nvvadCgQcrLy1NWVpays7PVp08fSad+xO/evVuJiYlav369du3apfz8fMXHx0uS5s2bp7Fjx2rOnDmKjIz0+i3wudLQt29fffTRRzpx4oQuuugirV+/XrGxsdqyZYt69erl6+UAADivJCQkuLsCHA6HMjMzz/paH374oWJiYtS1a1dlZGSoqKjIfSwnJ0c1NTVKS0tz74uPj1dSUpI2b94sSdqyZYscDoc7YZCklJQUORwOjzZJSUnuhEGSBg0apKqqKuXk5PgU71ndeyI5OVkrVqw4m1MBAGgU/ro1dn5+vsev89NVGbwxZMgQ/eIXv1DHjh21d+9ePfTQQ7ruuuuUk5Mju92uwsJChYSEqHXr1h7nxcbGqrCwUJJUWFh42nGGMTExHm1iY2M9jrdu3VohISHuNt46q6TB6XRq7dq1ysvLk81mU7du3TRixAgFBXH/KwBAE+Wn2RORkZE+lfTP5JZbbnH/OykpSb1791bHjh21bt063XzzzWcOwzA8Jh6cbhLC2bTxhs/f8jt27NCIESNUWFioxMRESdK///1vtW3bVm+++aaSk5N9vSQAAJbXrl07dezYUV999ZUkKS4uTtXV1SopKfGoNhQVFalv377uNocPH653reLiYnd1IS4uTlu3bvU4XlJSopqamnoViJ/i85iGO+64Q5deeqkOHjyoTz75RJ988ony8/N12WWXafz48b5eDgCAc+PkQEgzWwP67rvvlJ+fr3bt2kmSevXqpeDgYG3YsMHdpqCgQDt27HAnDampqSotLdW2bdvcbbZu3arS0lKPNjt27FBBQYG7zfr162W3230ei+hzpeGzzz7T9u3bPbKe1q1ba86cObryyit9
vRwAAOeEzajbzJzvi+PHj+vrr792P967d69yc3MVFRWlqKgozZ49Wz//+c/Vrl077du3TzNnzlR0dLR+9rOfSZIcDofGjRunadOmqU2bNoqKitL06dOVnJzsnk3RrVs3DR48WBkZGVqyZIkkafz48Ro6dKi7NyAtLU3du3dXenq6nn76aR05ckTTp09XRkaGz90sPlcaEhMTT1sKKSoq0sUXX+zr5QAAODfO8ToN27dvV8+ePdWzZ09J0tSpU9WzZ089/PDDCgwM1BdffKERI0aoa9euGjNmjLp27aotW7YoIiLCfY0FCxZo5MiRGjVqlPr166cWLVrorbfeUmBgoLvNqlWrlJycrLS0NKWlpemyyy7TypUr3ccDAwO1bt06hYaGql+/fho1apRGjhypZ555xrcXJMlmGD89lvS/56Ru2rRJM2bM0OzZs90rV2VnZ+uxxx7TE088oRtvvNHnIM5WWVmZHA6H7tt8o+wtg8/Z8wLnUnYPPttovmqNGn2oN1RaWuqXwYWnc/K7ImHhYwoIC/3pE87AVVGp/CkPN2isTZ1X3ROtWrXyGGFpGIZGjRrl3ncy7xg2bJjHKlUAADQZflrcycq8Sho++OCDho4DAICGxQ2rTPMqaejfv39DxwEAAJq4s16N6cSJEzpw4EC9u2RddtllpoMCAMDvqDSY5nPSUFxcrF//+tf6+9//ftrjjGkAADRJJA2m+TzlcsqUKSopKVF2drbCwsKUlZWlFStWqEuXLnrzzTcbIkYAANAE+FxpeP/99/XGG2/oyiuvVEBAgDp27KiBAwcqMjJSmZmZuummmxoiTgAAzGH2hGk+VxrKy8vdd9SKiopScXGxpLo7X37yySf+jQ4AAD85uSKkmc3qzmpFyN27d0uSLr/8ci1ZskTffPONXnjhBfd62QAAoPnxuXtiypQp7ptePPLIIxo0aJBWrVqlkJAQLV++3N/xAQDgHwyENM3npOFXv/qV+989e/bUvn379OWXX+qCCy5QdHS0X4MDAABNx1mv03BSixYtdMUVV/gjFgAAGoxNJu9y6bdIzl9eJQ1Tp071+oLz588/62AAAEDT5VXS8Omnn3p1sf++qdW59HHfYAXZuBMgmqd3D+U2dghAgyk75lLrrufoyZhyaRo3rAIAWAMDIU3zecolAACwJtMDIQEAOC9QaTCNpAEAYAlmV3VkRUi6JwAAgJeoNAAArIHuCdPOqtKwcuVK9evXT/Hx8dq/f78kaeHChXrjjTf8GhwAAH5j+GGzOJ+ThsWLF2vq1Km68cYbdfToUTmdTklSq1attHDhQn/HBwAAmgifk4bnn39eS5cu1axZsxQYGOje37t3b33xxRd+DQ4AAH/h1tjm+TymYe/everZs2e9/Xa7XeXl5X4JCgAAv2NFSNN8rjR06tRJubm59fb//e9/V/fu3f0REwAA/seYBtN8rjTcd999uueee1RZWSnDMLRt2zb93//9nzIzM/Xiiy82RIwAAKAJ8Dlp+PWvf63a2lrNmDFDJ06c0OjRo9W+fXs9++yzuvXWWxsiRgAATGNxJ/POap2GjIwMZWRk6Ntvv5XL5VJMTIy/4wIAwL9Yp8E0U4s7RUdH+ysOAADQxPmcNHTq1Ek225lHkO7Zs8dUQAAANAiz0yapNPieNEyZMsXjcU1NjT799FNlZWXpvvvu81dcAAD4F90TpvmcNPz2t7897f4//OEP2r59u+mAAABA0+S3u1wOGTJEr732mr8uBwCAf7FOg2l+u8vlX//6V0VFRfnrcgAA+BVTLs3zOWno2bOnx0BIwzBUWFio4uJi/fGPf/RrcAAAoOnwOWkYOXKkx+OAgAC1bdtWAwYM0CWXXOKvuAAAQBPjU9JQW1urCy+8UIMGDVJcXFxDxQQAgP8xe8I0nwZCBgUF6e6771ZVVVVDxQMAQIPg1tjm+Tx7ok+fPvr0008bIhYAANCE+TymYcKECZo2bZoOHjyoXr16KTw83OP4ZZdd
5rfgAADwK6oFpnidNPzmN7/RwoULdcstt0iSJk+e7D5ms9lkGIZsNpucTqf/owQAwCzGNJjmddKwYsUKPfHEE9q7d29DxgMAAJoor5MGw6hLsTp27NhgwQAA0FBY3Mk8n8Y0/NjdLQEAaNLonjDNp6Sha9euP5k4HDlyxFRAAACgafIpaXj00UflcDgaKhYAABoM3RPm+ZQ03HrrrYqJiWmoWAAAaDh0T5jm9eJOjGcAAMDafJ49AQDAeYlKg2leJw0ul6sh4wAAoEExpsE8n5eRBgDgvESlwTSfb1gFAACsiUoDAMAaqDSYRtIAALAExjSYR/cEAADwCpUGAIA10D1hGkkDAMAS6J4wj+4JAAAawD//+U8NGzZM8fHxstlsev311z2OG4ah2bNnKz4+XmFhYRowYIB27tzp0aaqqkqTJk1SdHS0wsPDNXz4cB08eNCjTUlJidLT0+VwOORwOJSenq6jR496tDlw4ICGDRum8PBwRUdHa/Lkyaqurvb5NZE0AACswfDD5oPy8nL16NFDixYtOu3xp556SvPnz9eiRYv08ccfKy4uTgMHDtSxY8fcbaZMmaK1a9dqzZo12rRpk44fP66hQ4fK6XS624wePVq5ubnKyspSVlaWcnNzlZ6e7j7udDp10003qby8XJs2bdKaNWv02muvadq0ab69INE9AQCwinM8pmHIkCEaMmTI6S9lGFq4cKFmzZqlm2++WZK0YsUKxcbGavXq1brzzjtVWlqql156SStXrtQNN9wgSXr55ZeVkJCg9957T4MGDVJeXp6ysrKUnZ2tPn36SJKWLl2q1NRU7d69W4mJiVq/fr127dql/Px8xcfHS5LmzZunsWPHas6cOYqMjPT6NVFpAADAB2VlZR5bVVWVz9fYu3evCgsLlZaW5t5nt9vVv39/bd68WZKUk5Ojmpoajzbx8fFKSkpyt9myZYscDoc7YZCklJQUORwOjzZJSUnuhEGSBg0apKqqKuXk5PgUN0kDAMASbH7YJCkhIcE9fsDhcCgzM9PnWAoLCyVJsbGxHvtjY2PdxwoLCxUSEqLWrVv/aJuYmJh614+JifFo88Pnad26tUJCQtxtvEX3BADAGvzUPZGfn+9R0rfb7Wd9SZvN5vHYMIx6++qF8YM2p2t/Nm28QaUBAGAJJ6dcmtkkKTIy0mM7m6QhLi5Okur90i8qKnJXBeLi4lRdXa2SkpIfbXP48OF61y8uLvZo88PnKSkpUU1NTb0KxE8haQAA4Bzr1KmT4uLitGHDBve+6upqbdy4UX379pUk9erVS8HBwR5tCgoKtGPHDneb1NRUlZaWatu2be42W7duVWlpqUebHTt2qKCgwN1m/fr1stvt6tWrl09x0z0BALCGczx74vjx4/r666/dj/fu3avc3FxFRUXpggsu0JQpUzR37lx16dJFXbp00dy5c9WiRQuNHj1akuRwODRu3DhNmzZNbdq0UVRUlKZPn67k5GT3bIpu3bpp8ODBysjI0JIlSyRJ48eP19ChQ5WYmChJSktLU/fu3ZWenq6nn35aR44c0fTp05WRkeHTzAmJpAEAYCXncFXH7du369prr3U/njp1qiRpzJgxWr58uWbMmKGKigpNmDBBJSUl6tOnj9avX6+IiAj3OQsWLFBQUJBGjRqliooKXX/99Vq+fLkCAwPdbVatWqXJkye7Z1kMHz7cY22IwMBArVu3ThMmTFC/fv0UFham0aNH65lnnvH5NdkMwzhvF8YsKyuTw+HQAI1QkC24scMBGsS7h3IbOwSgwZQdc6l11z0qLS31+Vev18/x/XfFpXfOVWBI6Flfx1ldqZ1LZjZorE0dlQYAgCVw7wnzSBoAANbAXS5NY/YEAADwCpUGAIAl0D1hHkkDAMAa6J4wje4JAADgFSoNAABLoHvCPJIGAIA10D1hGkkDAMAaSBpMY0wDAADwCpUGAIAlMKbBPJIGAIA10D1hGt0TAADAK1QaAACWYDMM2Uzc
2NnMuc0FSQMAwBronjCN7gkAAOAVKg0AAEtg9oR5JA0AAGuge8I0uicAAIBXqDQAACyB7gnzSBoAANZA94RpJA0AAEug0mAeYxoAAIBXqDQAAKyB7gnTSBoAAJZBF4M5dE8AAACvUGkAAFiDYdRtZs63OJIGAIAlMHvCPLonAACAV6g0AACsgdkTppE0AAAsweaq28ycb3V0TwAAAK9QaUA9t00rVPq0wx77jhQF6ZeXXypJ6jfkqG5M/05dLquQI8qpuwd21Z6dYY0RKixuzfMx+uidVsr/2q6QUJe69z6hcbMOKeHiKo92B76y66Xfx+vz7JYyXFLHxErNemGfYjrUSJIO7QvR0sfitXNbS9VU29Tr2jLd8/tv1Lptbb3nrK6y6bc3ddWeXWH64/rduiipwn1sUPzl9dpPeiJfQ2//zr8vHGeH7gnTSBpwWvu+DNXvbunsfuxy2tz/Dm3h0q6Pw/Wvt1vp3mcONkZ4gCTp8y0tNWzst+p6+Qk5a6XlT7bTzF9epKUbv1Roi7pa8qF9IZo6sosG3/qd0qcXKjzSqQNfhSoktO4boPJEgGb+8iJ17l6hJ//f15KkFU+108NjOunZt79SwA/qsS/9Pl5t4mq0Z9fpE+VpCw6o97Vl7sfhEc4GeOU4G8yeMI+kAafldEolxcGnPfaP16IkSbEdqs9lSEA9c1fv8Xg8bcEB3ZKcrK8+D1NySrkkafkT7XTVdWW646ECd7t2HU99dnduC9fh/BD9Yf1uhUe43Nf53+7Jyt3UUldcc9zd9uP3I5SzMUIPvbhXH78fedqYWkY6FRVTv0KBJoB1GkxjTANOq32naq3+ZKdWZOfpgcX7FXdB1U+fBDSy8rJASVJEq7pf9y6XtO0fkWrfuUozf9lZo5Iv1eSbumjz3x3uc2qqbZJNCg459YUQYncpIMDQzm0t3ftKioO08L4EzXh+v+xhZ/7y+MOD7fWLS5M0aUhXvf2XNnIxeA7NSKMmDQMGDNDEiRM1ceJEtWrVSm3atNGDDz4o4wzZXFVVlcrKyjw2+N+Xn7TQ05MTNHN0Zy28r4Nat63Rgje/VkRrfj2h6TIM6U+z2+vSq47rwksqJUlHvw1SRXmgXlkUo97XHlPm/+1Rv8GleuyOC/X5lnBJ0iW9yhXawqWX5sSr8oRNlScCtPTxeLlcNh0pCnJf+5kpF+im9O/UtUfFGWMYM6NAs5bs0xOv/Ef9R5ToT4/Ga81zsQ3/4uGVk90TZjara/TuiRUrVmjcuHHaunWrtm/frvHjx6tjx47KyMio1zYzM1OPPvpoI0RpLds/OFV23feltGt7Cy3f8qUG/qJEf/tT20aMDDizP8xsr715YZr3+lfufcb3v/JTB5Xp5vHFkqSLkiq0a3u41v0lWpellqtVG6ceXLJPzz/QQW+8FC1bgHTtyBJdnHxCAXWFC73xUrROHAvQLZMO//BpPYyecur4yQGSqxbEeexHI2IgpGmNnjQkJCRowYIFstlsSkxM1BdffKEFCxacNml44IEHNHXqVPfjsrIyJSQknMtwLamqIlD7vgxV+050UaBp+sOs9tqy3qF5a79W2/ga9/7IKKcCgwx17Frp0T6hS6V2bgt3P+414JiWb8lT6XeBCgySWjqcurXHpYpLqPvM534UoS8/CdfQC3t4XGfikK667uYS3ffsgdPG1e2KEzpxLFAlxUGnnYkBnG8aPWlISUmRzXZqZH5qaqrmzZsnp9OpwMBAj7Z2u112u/1ch2h5wSEuJVxcpR1bw3+6MXAOGUZdwrA5y6Gn//q14i7wHJwbHGKoa48TOvgfz78b3+yxu6db/jdHm7qxELmbWurot0FKSavrAp3w+EGNvf/U36PvCoM1c/RFmvnCPl3S88QZ4/t6R5hCQl0Kj2QGRVPA7AnzGj1pQNOT8fAhZa+PVNE3wWoVXavRU4rUIsKpDa/WzZqIaFWrtu1r1Ca27o9uwkV1v+JKioLOOOMCaAiLZnbQB2tb
a/ayPQpr6XKPQQiPcLoHK/5iQpHm3tVRSSnH1aPvcW3/IFLZG+qSjJPeXROlC7pUytGmVnk54Vr8cHv9bHyxe72HugTjVJIRGl7X7xHfsdpd2cheH6kjxUHq3uuEQkJd+uyjllr+ZDvd+KvvFGLn26ZJYPaEaY2eNGRnZ9d73KVLl3pVBpw70e1q9MAf9ysyyqnS7wL15SfhmjK0i4q+CZEkpaSVafrCfHf7mS/UlWZXzovVy/PiGiVmWNPbK6IlSff9vIvH/mkLDijtliOSpH5DSjX5iYNasyhWix/qoA6dq/TQ0r1K6lPubn/wP3Yty2ynY0cDFZtQrV9OPuweA+GtwGBDby+P1p9m2+Vy1U3rvP2+Ag0f+63JVwk0HTbjTFMVzoEBAwYoJydHGRkZuvPOO/XJJ58oIyND8+bN05133vmT55eVlcnhcGiARijIxi9cNE/vHspt7BCABlN2zKXWXfeotLRUkZGnX/vC9HN8/12ROuQxBQWHnvV1amsqteXvDzdorE1do1cabr/9dlVUVOiqq65SYGCgJk2apPHjxzd2WACA5obZE6Y1etIQHByshQsXavHixY0dCgAA+BGNnjQAAHAuMHvCPJIGAIA1uIy6zcz5FteoScOHH37YmE8PALASxjSYxg2rAACAV+ieAABYgk0mxzT4LZLzF0kDAMAaWBHSNLonAACAV6g0AAAsgSmX5pE0AACsgdkTptE9AQAAvEKlAQBgCTbDkM3EYEYz5zYXJA0AAGtwfb+ZOd/i6J4AAABeIWkAAFjCye4JM5svZs+eLZvN5rHFxcW5jxuGodmzZys+Pl5hYWEaMGCAdu7c6XGNqqoqTZo0SdHR0QoPD9fw4cN18OBBjzYlJSVKT0+Xw+GQw+FQenq6jh49etbv048haQAAWIPhh81Hl156qQoKCtzbF1984T721FNPaf78+Vq0aJE+/vhjxcXFaeDAgTp27Ji7zZQpU7R27VqtWbNGmzZt0vHjxzV06FA5nU53m9GjRys3N1dZWVnKyspSbm6u0tPTfQ/WC4xpAABYQyOsCBkUFORRXTh1KUMLFy7UrFmzdPPNN0uSVqxYodjYWK1evVp33nmnSktL9dJLL2nlypW64YYbJEkvv/yyEhIS9N5772nQoEHKy8tTVlaWsrOz1adPH0nS0qVLlZqaqt27dysxMfHsX+9pUGkAAMAHZWVlHltVVdUZ23711VeKj49Xp06ddOutt2rPnj2SpL1796qwsFBpaWnutna7Xf3799fmzZslSTk5OaqpqfFoEx8fr6SkJHebLVu2yOFwuBMGSUpJSZHD4XC38SeSBgCAJZxcEdLMJkkJCQnu8QMOh0OZmZmnfb4+ffroL3/5i959910tXbpUhYWF6tu3r7777jsVFhZKkmJjYz3OiY2NdR8rLCxUSEiIWrdu/aNtYmJi6j13TEyMu40/0T0BALAGP3VP5OfnKzIy0r3bbreftvmQIUPc/05OTlZqaqouuugirVixQikpKZIkm83z3pmGYdTbVz8Mzzana+/Ndc4GlQYAAHwQGRnpsZ0pafih8PBwJScn66uvvnKPc/hhNaCoqMhdfYiLi1N1dbVKSkp+tM3hw4frPVdxcXG9KoY/kDQAACzB5jK/mVFVVaW8vDy1a9dOnTp1UlxcnDZs2OA+Xl1drY0bN6pv376SpF69eik4ONijTUFBgXbs2OFuk5qaqtLSUm3bts3dZuvWrSotLXW38Se6JwAA1nCOZ09Mnz5dw4YN0wUXXKCioiL9/ve/V1lZmcaMGSObzaYpU6Zo7ty56tKli7p06aK5c+eqRYsWGj16tCTJ4XBo3LhxmjZtmtq0aaOoqChNnz5dycnJ7tkU3bp10+DBg5WRkaElS5ZIksaPH6+hQ4f6feaERNIAAECDOHjwoH75y1/q22+/Vdu2bZWSkqLs7Gx17NhRkjRjxgxVVFRowoQJKikpUZ8+fbR+/XpFRES4r7FgwQIFBQVp1KhR
qqio0PXXX6/ly5crMDDQ3WbVqlWaPHmye5bF8OHDtWjRogZ5TTbDOH/vwFFWViaHw6EBGqEgW3BjhwM0iHcP5TZ2CECDKTvmUuuue1RaWuoxuNCvz3Hyu+LKWQoKCj3r69TWVurDj+c0aKxNHZUGAIAlcJdL8xgICQAAvEKlAQBgDY2wjHRzQ9IAALAGQ5KZaZPkDCQNAABrYEyDeYxpAAAAXqHSAACwBkMmxzT4LZLzFkkDAMAaGAhpGt0TAADAK1QaAADW4JJk5m7RJm9Y1RyQNAAALIHZE+bRPQEAALxCpQEAYA0MhDSNpAEAYA0kDabRPQEAALxCpQEAYA1UGkwjaQAAWANTLk0jaQAAWAJTLs1jTAMAAPAKlQYAgDUwpsE0kgYAgDW4DMlm4ovfRdJA9wQAAPAKlQYAgDXQPWEaSQMAwCJMJg0iaaB7AgAAeIVKAwDAGuieMI2kAQBgDS5DproYmD1B9wQAAPAOlQYAgDUYrrrNzPkWR9IAALAGxjSYRtIAALAGxjSYxpgGAADgFSoNAABroHvCNJIGAIA1GDKZNPgtkvMW3RMAAMArVBoAANZA94RpJA0AAGtwuSSZWGvBxToNdE8AAACvUGkAAFgD3ROmkTQAAKyBpME0uicAAIBXqDQAAKyBZaRNI2kAAFiCYbhkmLhTpZlzmwuSBgCANRiGuWoBYxoY0wAAALxDpQEAYA2GyTENVBpIGgAAFuFySTYT4xIY00D3BAAA8A6VBgCANdA9YRpJAwDAEgyXS4aJ7gmmXNI9AQAAvESlAQBgDXRPmEbSAACwBpch2UgazKB7AgAAeIVKAwDAGgxDkpl1Gqg0kDQAACzBcBkyTHRPGCQNJA0AAIswXDJXaWDKJWMaAACAV6g0AAAsge4J80gaAADWQPeEaed10nAy66tVjan1OoCmrOwYf6jQfJUdr/t8n4tf8Wa/K2pV479gzlPnddJw7NgxSdImvdPIkQANp3XXxo4AaHjHjh2Tw+FokGuHhIQoLi5OmwrNf1fExcUpJCTED1Gdn2zGedxJ43K5dOjQIUVERMhmszV2OJZQVlamhIQE5efnKzIysrHDAfyKz/e5ZxiGjh07pvj4eAUENNzY/MrKSlVXV5u+TkhIiEJDQ/0Q0fnpvK40BAQEqEOHDo0dhiVFRkbyRxXNFp/vc6uhKgz/LTQ01NJf9v7ClEsAAOAVkgYAAOAVkgb4xG6365FHHpHdbm/sUAC/4/MN/LjzeiAkAAA4d6g0AAAAr5A0AAAAr5A0AAAAr5A0AAAAr5A0AAAAr5A0AAAAr5A0wCuGYeipp55S586dFRYWph49euivf/1rY4cF+MWAAQM0ceJETZw4Ua1atVKbNm304IMPnpM7LwLnE5IGeOXBBx/UsmXLtHjxYu3cuVP33nuvbrvtNm3cuLGxQwP8YsWKFQoKCtLWrVv13HPPacGCBXrxxRcbOyygSWFxJ/yk8vJyRUdH6/3331dqaqp7/x133KETJ05o9erVjRgdYN6AAQNUVFSknTt3uu+Y+7vf/U5vvvmmdu3a1cjRAU3HeX2XS5wbu3btUmVlpQYOHOixv7q6Wj179mykqAD/SklJcScMkpSamqp58+bJ6XQqMDCwESMDmg6SBvwkl8slSVq3bp3at2/vcYw1+gHAOkga8JO6d+8uu92uAwcOqH///o0dDtAgsrOz6z3u0qULVQbgv5A04CdFRERo+vTpuvfee+VyuXT11VerrKxMmzdvVsuWLTVmzJjGDhEwLT8/X1OnTtWdd96pTz75RM8//7zmzZvX2GEBTQpJA7zy+OOPKyYmRpmZmdqzZ49atWqlK664QjNnzmzs0AC/uP3221VRUaGrrrpKgYGBmjRpksaPH9/YYQFNCrMnAFjegAEDdPnll2vhwoWNHQrQpLFOAwAA8ApJAwAA8ArdEwAAwCtUGgAAgFdIGgAAgFdIGgAAgFdIGgAAgFdIGgAAgFdIGgCTZs+ercsvv9z9
eOzYsRo5cuQ5j2Pfvn2y2WzKzc09Y5sLL7zQpwWMli9frlatWpmOzWaz6fXXXzd9HQCNi6QBzdLYsWNls9lks9kUHByszp07a/r06SovL2/w53722We1fPlyr9p680UPAE0F955AszV48GAtW7ZMNTU1+te//qU77rhD5eXlWrx4cb22NTU1Cg4O9svzOhwOv1wHAJoaKg1otux2u+Li4pSQkKDRo0frV7/6lbtEfrJL4c9//rM6d+4su90uwzBUWlqq8ePHKyYmRpGRkbruuuv02WefeVz3iSeeUGxsrCIiIjRu3DhVVlZ6HP9h94TL5dKTTz6piy++WHa7XRdccIHmzJkjSerUqZMkqWfPnrLZbBowYID7vGXLlqlbt24KDQ3VJZdcoj/+8Y8ez7Nt2zb17NlToaGh6t27tz799FOf36P58+crOTlZ4eHhSkhI0IQJE3T8+PF67V5//XV17dpVoaGhGjhwoPLz8z2Ov/XWW+rVq5dCQ0PVuXNnPfroo6qtrfU5HgBNG0kDLCMsLEw1NTXux19//bVeffVVvfbaa+7ugZtuukmFhYV65513lJOToyuuuELXX3+9jhw5Ikl69dVX9cgjj2jOnDnavn272rVrV+/L/IceeOABPfnkk3rooYe0a9curV69WrGxsZLqvvgl6b333lNBQYH+9re/SZKWLl2qWbNmac6cOcrLy9PcuXP10EMPacWKFZKk8vJyDR06VImJicrJydHs2bM1ffp0n9+TgIAAPffcc9qxY4dWrFih999/XzNmzPBoc+LECc2ZM0crVqzQRx99pLKyMt16663u4++++65uu+02TZ48Wbt27dKSJUu0fPlyd2IEoBkxgGZozJgxxogRI9yPt27darRp08YYNWqUYRiG8cgjjxjBwcFGUVGRu80//vEPIzIy0qisrPS41kUXXWQsWbLEMAzDSE1NNe666y6P43369DF69Ohx2ucuKysz7Ha7sXTp0tPGuXfvXkOS8emnn3rsT0hIMFavXu2x7/HHHzdSU1MNwzCMJUuWGFFRUUZ5ebn7+OLFi097rf/WsWNHY8GCBWc8/uqrrxpt2rRxP162bJkhycjOznbvy8vLMyQZW7duNQzDMP7nf/7HmDt3rsd1Vq5cabRr1879WJKxdu3aMz4vgPMDYxrQbL399ttq2bKlamtrVVNToxEjRuj55593H+/YsaPatm3rfpyTk6Pjx4+rTZs2HtepqKjQf/7zH0lSXl6e7rrrLo/jqamp+uCDD04bQ15enqqqqnT99dd7HXdxcbHy8/M1btw4ZWRkuPfX1ta6x0vk5eWpR48eatGihUccvvrggw80d+5c7dq1S2VlZaqtrVVlZaXKy8sVHh4uSQoKClLv3r3d51xyySVq1aqV8vLydNVVVyknJ0cff/yxR2XB6XSqsrJSJ06c8IgRwPmNpAHN1rXXXqvFixcrODhY8fHx9QY6nvxSPMnlcqldu3b68MMP613rbKcdhoWF+XyOy+WSVNdF0adPH49jgYGBkiTDD/eZ279/v2688UbdddddevzxxxUVFaVNmzZp3LhxHt04Ut2UyR86uc/lcunRRx/VzTffXK9NaGio6TgBNB0kDWi2wsPDdfHFF3vd/oorrlBhYaGCgoJ04YUXnrZNt27dlJ2drdtvv929Lzs7+4zX7NKli8LCwvSPf/xDd9xxR73jISEhkup+mZ8UGxur9u3ba8+ePfrVr3512ut2795dK1euVEVFhTsx+bE4Tmf79u2qra3VvHnzFBBQN7zp1VdfrdeutrZW27dv11VXXSVJ2r17t44ePapLLrlEUt37tnv3bp/eawDnJ5IG4Hs33HCDUlNTNXLkSD355JNKTEzUoUOH9M4772jkyJHq3bu3fvvb32rMmDHq3bu3rr76aq1atUo7d+5U586dT3vN0NBQ3X///ZoxY4ZCQkLUr18/FRcXa+fOnRo3bpxiYmIUFhamrKwsdejQQaGhoXI4HJo9e7Ym
T56syMhIDRkyRFVVVdq+fbtKSko0depUjR49WrNmzdK4ceP04IMPat++fXrmmWd8er0XXXSRamtr9fzzz2vYsGH66KOP9MILL9RrFxwcrEmTJum5555TcHCwJk6cqJSUFHcS8fDDD2vo0KFKSEjQL37xCwUEBOjzzz/XF198od///ve+/x8BoMli9gTwPZvNpnfeeUfXXHONfvOb36hr16669dZbtW/fPvdsh1tuuUUPP/yw7r//fvXq1Uv79+/X3Xff/aPXfeihhzRt2jQ9/PDD6tatm2655RYVFRVJqhsv8Nxzz2nJkiWKj4/XiBEjJEl33HGHXnzxRS1fvlzJycnq37+/li9f7p6i2bJlS7311lvatWuXevbsqVmzZunJJ5/06fVefvnlmj9/vp588kklJSVp1apVyszMrNeuRYsWuv/++zV69GilpqYqLCxMa9ascR8fNGiQ3n77bW3YsEFXXnmlUlJSNH/+fHXs2NGneAA0fTbDH52jAACg2aPSAAAAvELSAAAAvELSAAAAvELSAAAAvELSAAAAvELSAAAAvELSAAAAvELSAAAAvELSAAAAvELSAAAAvELSAAAAvPL/AXc/a8mQm2whAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "best_model = cv_results['svc'].best_estimator_\n", + "best_model.fit(X_train,y_train)\n", + "\n", + "# confusion matrix on the training data (X_train, y_train)\n", + "ConfusionMatrixDisplay.from_estimator(\n", + " best_model,\n", + " X_train,\n", + " y_train\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "7162513f-f781-4d4c-a535-1b63d10fdb4a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test F2-Score: 0.9968782518210197\n", + "Test Accuracy: 0.9963057220261062\n" + ] + } + ], + "source": [ + "# Finally, report the test score and confusion matrix \n", + "y_test_predict = best_model.predict(X_test)\n", + "\n", + "test_f2_score = fbeta_score(y_test,y_test_predict,beta=2,pos_label='p')\n", + "test_accuracy = accuracy_score(y_test,y_test_predict)\n", + "print(f'Test F2-Score: {test_f2_score}\\nTest Accuracy: {test_accuracy}')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "14991874-e219-46e1-b434-c96c96528cb7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting confusion matrix for test set\n", + "ConfusionMatrixDisplay.from_predictions(\n", + " y_test,\n", + " y_test_predict\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "17aa63ee-0e26-4bcf-82cd-740d6493ea65", + "metadata": {}, + "source": [ + "The prediction model performed quite well on the test data, with a final overall accuracy of about 0.996 and an $F_{\\beta}$ ($\\beta = 2$) score of about 0.997. The model makes only 40 mistakes out of 12214 test samples: 17 mistakes were predicting a poisonous mushroom as edible (false negatives), while 23 mistakes were predicting an edible mushroom as poisonous (false positives). The model’s performance is promising for implementation: false negatives represent potential safety risks, because these errors could lead to consuming poisonous mushrooms, so they are kept to a minimum to protect users. On the other hand, false positives are less harmful; they may lead to discarding safe mushrooms unnecessarily but do not endanger safety." + ] + }, + { + "cell_type": "markdown", + "id": "ebd8961e-06e7-4f8d-a4a8-3745e2dffe3c", + "metadata": {}, + "source": [ + "While the overall performance of the SVC model is impressive, further efforts could focus on reducing false negatives to enhance the safety of predictions. It would be worthwhile to take a closer look at the 40 misclassified observations to identify specific features contributing to these misclassifications. Feature engineering on those features, such as encoding rare categories differently, could enhance the model’s power and reduce the number of misclassifications. 
Additionally, trying other classifiers such as Decision Tree and Random Forest, which are less sensitive to scaling and irrelevant features, might improve the predictions.\n" + ] + }, + { + "cell_type": "markdown", + "id": "4a4fb6c1-9c89-4ff0-ac25-81461d4c0245", + "metadata": {}, + "source": [ + "## References\n", + "Wagner, D., Heider, D., & Hattab, G. (2021). 
Secondary Mushroom [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5FP5Q.\n", + "\n", + "Scikit-learn developers. (n.d.). QuantileTransformer. Scikit-learn. Retrieved November 21, 2024, from https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.QuantileTransformer.html\n", + "\n", + "Hunter, J. D. (2007). Matplotlib: A 2D Graphics Environment. Computing in Science & Engineering, 9(3), 90–95.\n", + "\n", + "McKinney, W. (2010). Data Structures for Statistical Computing in Python. Proceedings of the 9th Python in Science Conference, 51–56.\n", + "\n", + "Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., … Duchesnay, E. (2011). Scikit-learn: Machine Learning in Python. Journal of Machine Learning Research, 12, 2825–2830.\n", + "\n", + "Harris, C. R., Millman, K. J., van der Walt, S. J., Gommers, R., Virtanen, P., Cournapeau, D., … Oliphant, T. E. (2020). Array programming with NumPy. Nature, 585(7825), 357–362.\n", + "\n", + "Virtanen, P., Gommers, R., Oliphant, T. E., Haberland, M., Reddy, T., Cournapeau, D., … van der Walt, S. J. (2020). SciPy 1.0: Fundamental Algorithms for Scientific Computing in Python. Nature Methods, 17, 261–272.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:mushroom_classifier_env]", + "language": "python", + "name": "conda-env-mushroom_classifier_env-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}