diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index 5ab044f..e7197bb 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1352,7 +1352,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Save the processed data to be further used" + "## Data Validation" ] }, { @@ -1360,6 +1360,90 @@ "execution_count": 24, "metadata": {}, "outputs": [], + "source": [ + "import pandera as pa \n", + "from pandera import Column, Check, DataFrameSchema\n", + "# Pandera schema listing the expected columns, dtypes and allowed values. NOTE(review): presumably meant for df; this cell only defines the schema, so confirm schema.validate(df) is run before the data is saved.\n", + "schema = pa.DataFrameSchema(\n", + " {\n", + " # Numeric columns with no missing values allowed\n", + " \"age\": pa.Column(int, pa.Check.between(0, 120), nullable=False),\n", + " \"sex\": pa.Column(int, pa.Check.isin([0, 1]), nullable=False),\n", + "\n", + " # Categorical columns with string types\n", + " \"chest_pain_type\": pa.Column(\n", + " str,\n", + " pa.Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n", + " nullable=False\n", + " ),\n", + "\n", + " # Numeric columns with range checks and no missing values\n", + " \"resting_blood_pressure\": pa.Column(int, pa.Check.between(50, 250), nullable=False),\n", + " \"cholesterol\": pa.Column(int, pa.Check.between(100, 600), nullable=False),\n", + " \"fasting_blood_sugar\": pa.Column(int, pa.Check.isin([0, 1]), nullable=False),\n", + "\n", + " # Categorical columns\n", + " \"rest_ecg\": pa.Column(\n", + " str,\n", + " pa.Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n", + " nullable=False\n", + " ),\n", + " # Numeric columns with range checks\n", + " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n", + " \n", + " # Categorical column with a string type\n", + " \"exercise_induced_angina\": Column(\n", + " str,\n", + " Check.isin([\"yes\", \"no\"]),\n", + " nullable=False), \n", + " \n", + " # Numeric columns with missing values allowed up 
to 5% (multiple checks must be passed to Column as one list)\n", + " \"st_depression\": Column(\n", + " float,\n", + " [Check.between(0.0, 10.0),\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", + " error=\"Too many null values in 'st_depression' column.\")],\n", + " nullable=True),\n", + " \n", + " \"num_of_vessels\": Column(\n", + " float,\n", + " [Check.between(0, 4), # NaN entries are skipped (Check ignore_na defaults to True)\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", + " error=\"Too many null values in 'num_of_vessels' column.\")],\n", + " nullable=True),\n", + " \n", + " # Categorical column with missing values allowed up to 5%\n", + " \"thalassemia\": Column(\n", + " str,\n", + " [Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]),\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", + " error=\"Too many null values in 'thalassemia' column.\")],\n", + " nullable=True), \n", + " \n", + " # Target column\n", + " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n", + " },\n", + " checks=[\n", + " # Check for duplicate rows\n", + " Check(lambda df: not df.duplicated().any(), error=\"Duplicate rows found.\"),\n", + " # Check for empty rows\n", + " Check(lambda df: not df.isna().all(axis=1).any(), error=\"Empty rows found.\")\n", + " ]\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the processed data to be further used" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], "source": [ "df.to_csv(\"../data/processed/processed_heart_disease_data.csv\", index=False)" ] @@ -1373,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1383,7 +1467,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1392,7 +1476,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, 
"outputs": [ { @@ -1442,7 +1526,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1466,7 +1550,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1475,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1541,7 +1625,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1576,7 +1660,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -2019,12 +2103,13 @@ " 'num_of_vessels', 'thalassemia'])])" ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "\n", "#splitting the features \n", "\n", "numeric_features = [\n", @@ -2059,7 +2144,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -2072,7 +2157,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2484,7 +2569,7 @@ "[207 rows x 25 columns]" ] }, - "execution_count": 34, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2500,7 +2585,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2519,7 +2604,7 @@ " dtype='object')" ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2530,7 +2615,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -2559,7 +2644,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2590,12 +2675,12 @@ " \n", " \n", " fit_time\n", - " 0.011\n", - " 0.004\n", + " 0.015\n", 
+ " 0.005\n", " \n", " \n", " score_time\n", - " 0.012\n", + " 0.014\n", " 0.002\n", " \n", " \n", @@ -2644,8 +2729,8 @@ ], "text/plain": [ " mean std\n", - "fit_time 0.011 0.004\n", - "score_time 0.012 0.002\n", + "fit_time 0.015 0.005\n", + "score_time 0.014 0.002\n", "test_accuracy 0.677 0.123\n", "train_accuracy 1.000 0.000\n", "test_precision 0.659 0.111\n", @@ -2656,7 +2741,7 @@ "train_f1 1.000 0.000" ] }, - "execution_count": 37, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2676,7 +2761,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -2689,7 +2774,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -2699,7 +2784,7 @@ " [ 6, 31]])" ] }, - "execution_count": 39, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2719,7 +2804,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -2778,7 +2863,7 @@ "accuracy 0.755556 0.755556 0.755556" ] }, - "execution_count": 40, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2804,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -2835,12 +2920,12 @@ " \n", " \n", " fit_time\n", - " 0.013\n", - " 0.004\n", + " 0.016\n", + " 0.008\n", " \n", " \n", " score_time\n", - " 0.010\n", + " 0.012\n", " 0.001\n", " \n", " \n", @@ -2889,8 +2974,8 @@ ], "text/plain": [ " mean std\n", - "fit_time 0.013 0.004\n", - "score_time 0.010 0.001\n", + "fit_time 0.016 0.008\n", + "score_time 0.012 0.001\n", "test_accuracy 0.841 0.068\n", "train_accuracy 0.890 0.011\n", "test_precision 0.850 0.092\n", @@ -2901,7 +2986,7 @@ "train_f1 0.882 0.014" ] }, - "execution_count": 41, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2920,7 +3005,7 @@ }, { 
"cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -2945,7 +3030,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -3428,7 +3513,7 @@ " LogisticRegression(max_iter=1000, random_state=123))])" ] }, - "execution_count": 43, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -3448,7 +3533,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -3457,7 +3542,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -3472,245 +3557,245 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Table 1: Logistic Regression Coefficients
 FeatureCoefficientFeatureCoefficient
7chest_pain_type_asymptomatic1.2426177chest_pain_type_asymptomatic1.242617
20num_of_vessels_2.00.93676120num_of_vessels_2.00.936761
24thalassemia_reversable defect0.90860724thalassemia_reversable defect0.908607
16slope_flat0.78917916slope_flat0.789179
6sex0.4677936sex0.467793
1resting_blood_pressure0.4643031resting_blood_pressure0.464303
21num_of_vessels_3.00.41697021num_of_vessels_3.00.416970
5st_depression0.4004225st_depression0.400422
12rest_ecg_left ventricular hypertrophy0.26895912rest_ecg_left ventricular hypertrophy0.268959
14exercise_induced_angina_yes0.21078314exercise_induced_angina_yes0.210783
8chest_pain_type_atypical angina0.1516168chest_pain_type_atypical angina0.151616
11rest_ecg_ST-T wave abnormality0.15090411rest_ecg_ST-T wave abnormality0.150904
3cholesterol0.1416753cholesterol0.141675
19num_of_vessels_1.00.13159819num_of_vessels_1.00.131598
0age-0.0891450age-0.089145
2fasting_blood_sugar-0.2263292fasting_blood_sugar-0.226329
17slope_upsloping-0.37399517slope_upsloping-0.373995
15slope_downsloping-0.38506115slope_downsloping-0.385061
13rest_ecg_normal-0.38974013rest_ecg_normal-0.389740
22thalassemia_fixed defect-0.41137022thalassemia_fixed defect-0.411370
23thalassemia_normal-0.46711323thalassemia_normal-0.467113
4max_heart_rate-0.6484894max_heart_rate-0.648489
9chest_pain_type_non-anginal pain-0.6705719chest_pain_type_non-anginal pain-0.670571
10chest_pain_type_typical angina-0.69353910chest_pain_type_typical angina-0.693539
18num_of_vessels_0.0-1.45520618num_of_vessels_0.0-1.455206
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 46, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -3730,7 +3815,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -3758,7 +3843,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -3769,7 +3854,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -3779,7 +3864,7 @@ " [ 8, 29]])" ] }, - "execution_count": 49, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -3799,7 +3884,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -3858,7 +3943,7 @@ "accuracy 0.822222 0.822222 0.822222" ] }, - "execution_count": 50, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -3938,9 +4023,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:heart_disease_522]", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "conda-env-heart_disease_522-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -3952,7 +4037,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.11.9" } }, "nbformat": 4,