diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index 3624be0..e7197bb 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -93,7 +93,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -1357,7 +1357,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -1441,7 +1441,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -1457,7 +1457,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
@@ -1467,7 +1467,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -1476,7 +1476,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 28,
"metadata": {},
"outputs": [
{
@@ -1526,7 +1526,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -1550,7 +1550,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -1559,7 +1559,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 31,
"metadata": {},
"outputs": [
{
@@ -1625,7 +1625,7 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@@ -1660,7 +1660,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 33,
"metadata": {},
"outputs": [
{
@@ -2103,18 +2103,48 @@
" 'num_of_vessels', 'thalassemia'])])"
]
},
- "execution_count": 32,
+ "execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "\n"
+ "\n",
+ "# Split the features: numeric columns are scaled, categorical columns are one-hot encoded\n",
+ "\n",
+ "numeric_features = [\n",
+ " \"age\", \n",
+ " \"resting_blood_pressure\", \n",
+ " \"fasting_blood_sugar\", \n",
+ " \"cholesterol\", \n",
+ " \"max_heart_rate\", \n",
+ " \"st_depression\", \n",
+ " \"sex\"\n",
+ "]\n",
+ "categorical_features = [\n",
+ " \"chest_pain_type\", \n",
+ " \"rest_ecg\", \n",
+ " \"exercise_induced_angina\", \n",
+ " \"slope\", \n",
+ " \"num_of_vessels\", \n",
+ " \"thalassemia\"\n",
+ "]\n",
+ "\n",
+ "numeric_transformer = StandardScaler()\n",
+ "categorical_transformer = OneHotEncoder(drop=\"if_binary\", handle_unknown=\"ignore\")\n",
+ "\n",
+ "# Build the column transformer: each transformer is applied to its own feature list\n",
+ "preprocessor = make_column_transformer(\n",
+ " (numeric_transformer, numeric_features),\n",
+ " (categorical_transformer, categorical_features)\n",
+ ")\n",
+ "\n",
+ "preprocessor\n"
]
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@@ -2127,7 +2157,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 35,
"metadata": {},
"outputs": [
{
@@ -2539,7 +2569,7 @@
"[207 rows x 25 columns]"
]
},
- "execution_count": 34,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@@ -2555,7 +2585,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 36,
"metadata": {},
"outputs": [
{
@@ -2574,7 +2604,7 @@
" dtype='object')"
]
},
- "execution_count": 35,
+ "execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
@@ -2585,7 +2615,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
@@ -2614,7 +2644,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 38,
"metadata": {},
"outputs": [
{
@@ -2645,12 +2675,12 @@
"
\n",
" \n",
" fit_time | \n",
- " 0.011 | \n",
- " 0.004 | \n",
+ " 0.015 | \n",
+ " 0.005 | \n",
"
\n",
" \n",
" score_time | \n",
- " 0.012 | \n",
+ " 0.014 | \n",
" 0.002 | \n",
"
\n",
" \n",
@@ -2699,8 +2729,8 @@
],
"text/plain": [
" mean std\n",
- "fit_time 0.011 0.004\n",
- "score_time 0.012 0.002\n",
+ "fit_time 0.015 0.005\n",
+ "score_time 0.014 0.002\n",
"test_accuracy 0.677 0.123\n",
"train_accuracy 1.000 0.000\n",
"test_precision 0.659 0.111\n",
@@ -2711,7 +2741,7 @@
"train_f1 1.000 0.000"
]
},
- "execution_count": 37,
+ "execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
@@ -2731,7 +2761,7 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
@@ -2744,7 +2774,7 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 40,
"metadata": {},
"outputs": [
{
@@ -2754,7 +2784,7 @@
" [ 6, 31]])"
]
},
- "execution_count": 39,
+ "execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@@ -2774,7 +2804,7 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
@@ -2833,7 +2863,7 @@
"accuracy 0.755556 0.755556 0.755556"
]
},
- "execution_count": 40,
+ "execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
@@ -2859,7 +2889,7 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 42,
"metadata": {},
"outputs": [
{
@@ -2890,12 +2920,12 @@
"
\n",
" \n",
" fit_time | \n",
- " 0.013 | \n",
- " 0.004 | \n",
+ " 0.016 | \n",
+ " 0.008 | \n",
"
\n",
" \n",
" score_time | \n",
- " 0.010 | \n",
+ " 0.012 | \n",
" 0.001 | \n",
"
\n",
" \n",
@@ -2944,8 +2974,8 @@
],
"text/plain": [
" mean std\n",
- "fit_time 0.013 0.004\n",
- "score_time 0.010 0.001\n",
+ "fit_time 0.016 0.008\n",
+ "score_time 0.012 0.001\n",
"test_accuracy 0.841 0.068\n",
"train_accuracy 0.890 0.011\n",
"test_precision 0.850 0.092\n",
@@ -2956,7 +2986,7 @@
"train_f1 0.882 0.014"
]
},
- "execution_count": 41,
+ "execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
@@ -2975,7 +3005,7 @@
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 43,
"metadata": {},
"outputs": [
{
@@ -3000,7 +3030,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 44,
"metadata": {},
"outputs": [
{
@@ -3483,7 +3513,7 @@
" LogisticRegression(max_iter=1000, random_state=123))])"
]
},
- "execution_count": 43,
+ "execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
@@ -3503,7 +3533,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
@@ -3512,7 +3542,7 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@@ -3527,245 +3557,245 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
- "\n",
+ "\n",
" Table 1: Logistic Regression Coefficients\n",
" \n",
" \n",
" | \n",
- " Feature | \n",
- " Coefficient | \n",
+ " Feature | \n",
+ " Coefficient | \n",
"
\n",
" \n",
" \n",
" \n",
- " 7 | \n",
- " chest_pain_type_asymptomatic | \n",
- " 1.242617 | \n",
+ " 7 | \n",
+ " chest_pain_type_asymptomatic | \n",
+ " 1.242617 | \n",
"
\n",
" \n",
- " 20 | \n",
- " num_of_vessels_2.0 | \n",
- " 0.936761 | \n",
+ " 20 | \n",
+ " num_of_vessels_2.0 | \n",
+ " 0.936761 | \n",
"
\n",
" \n",
- " 24 | \n",
- " thalassemia_reversable defect | \n",
- " 0.908607 | \n",
+ " 24 | \n",
+ " thalassemia_reversable defect | \n",
+ " 0.908607 | \n",
"
\n",
" \n",
- " 16 | \n",
- " slope_flat | \n",
- " 0.789179 | \n",
+ " 16 | \n",
+ " slope_flat | \n",
+ " 0.789179 | \n",
"
\n",
" \n",
- " 6 | \n",
- " sex | \n",
- " 0.467793 | \n",
+ " 6 | \n",
+ " sex | \n",
+ " 0.467793 | \n",
"
\n",
" \n",
- " 1 | \n",
- " resting_blood_pressure | \n",
- " 0.464303 | \n",
+ " 1 | \n",
+ " resting_blood_pressure | \n",
+ " 0.464303 | \n",
"
\n",
" \n",
- " 21 | \n",
- " num_of_vessels_3.0 | \n",
- " 0.416970 | \n",
+ " 21 | \n",
+ " num_of_vessels_3.0 | \n",
+ " 0.416970 | \n",
"
\n",
" \n",
- " 5 | \n",
- " st_depression | \n",
- " 0.400422 | \n",
+ " 5 | \n",
+ " st_depression | \n",
+ " 0.400422 | \n",
"
\n",
" \n",
- " 12 | \n",
- " rest_ecg_left ventricular hypertrophy | \n",
- " 0.268959 | \n",
+ " 12 | \n",
+ " rest_ecg_left ventricular hypertrophy | \n",
+ " 0.268959 | \n",
"
\n",
" \n",
- " 14 | \n",
- " exercise_induced_angina_yes | \n",
- " 0.210783 | \n",
+ " 14 | \n",
+ " exercise_induced_angina_yes | \n",
+ " 0.210783 | \n",
"
\n",
" \n",
- " 8 | \n",
- " chest_pain_type_atypical angina | \n",
- " 0.151616 | \n",
+ " 8 | \n",
+ " chest_pain_type_atypical angina | \n",
+ " 0.151616 | \n",
"
\n",
" \n",
- " 11 | \n",
- " rest_ecg_ST-T wave abnormality | \n",
- " 0.150904 | \n",
+ " 11 | \n",
+ " rest_ecg_ST-T wave abnormality | \n",
+ " 0.150904 | \n",
"
\n",
" \n",
- " 3 | \n",
- " cholesterol | \n",
- " 0.141675 | \n",
+ " 3 | \n",
+ " cholesterol | \n",
+ " 0.141675 | \n",
"
\n",
" \n",
- " 19 | \n",
- " num_of_vessels_1.0 | \n",
- " 0.131598 | \n",
+ " 19 | \n",
+ " num_of_vessels_1.0 | \n",
+ " 0.131598 | \n",
"
\n",
" \n",
- " 0 | \n",
- " age | \n",
- " -0.089145 | \n",
+ " 0 | \n",
+ " age | \n",
+ " -0.089145 | \n",
"
\n",
" \n",
- " 2 | \n",
- " fasting_blood_sugar | \n",
- " -0.226329 | \n",
+ " 2 | \n",
+ " fasting_blood_sugar | \n",
+ " -0.226329 | \n",
"
\n",
" \n",
- " 17 | \n",
- " slope_upsloping | \n",
- " -0.373995 | \n",
+ " 17 | \n",
+ " slope_upsloping | \n",
+ " -0.373995 | \n",
"
\n",
" \n",
- " 15 | \n",
- " slope_downsloping | \n",
- " -0.385061 | \n",
+ " 15 | \n",
+ " slope_downsloping | \n",
+ " -0.385061 | \n",
"
\n",
" \n",
- " 13 | \n",
- " rest_ecg_normal | \n",
- " -0.389740 | \n",
+ " 13 | \n",
+ " rest_ecg_normal | \n",
+ " -0.389740 | \n",
"
\n",
" \n",
- " 22 | \n",
- " thalassemia_fixed defect | \n",
- " -0.411370 | \n",
+ " 22 | \n",
+ " thalassemia_fixed defect | \n",
+ " -0.411370 | \n",
"
\n",
" \n",
- " 23 | \n",
- " thalassemia_normal | \n",
- " -0.467113 | \n",
+ " 23 | \n",
+ " thalassemia_normal | \n",
+ " -0.467113 | \n",
"
\n",
" \n",
- " 4 | \n",
- " max_heart_rate | \n",
- " -0.648489 | \n",
+ " 4 | \n",
+ " max_heart_rate | \n",
+ " -0.648489 | \n",
"
\n",
" \n",
- " 9 | \n",
- " chest_pain_type_non-anginal pain | \n",
- " -0.670571 | \n",
+ " 9 | \n",
+ " chest_pain_type_non-anginal pain | \n",
+ " -0.670571 | \n",
"
\n",
" \n",
- " 10 | \n",
- " chest_pain_type_typical angina | \n",
- " -0.693539 | \n",
+ " 10 | \n",
+ " chest_pain_type_typical angina | \n",
+ " -0.693539 | \n",
"
\n",
" \n",
- " 18 | \n",
- " num_of_vessels_0.0 | \n",
- " -1.455206 | \n",
+ " 18 | \n",
+ " num_of_vessels_0.0 | \n",
+ " -1.455206 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 46,
+ "execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
@@ -3785,7 +3815,7 @@
},
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 48,
"metadata": {},
"outputs": [
{
@@ -3813,7 +3843,7 @@
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
@@ -3824,7 +3854,7 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 50,
"metadata": {},
"outputs": [
{
@@ -3834,7 +3864,7 @@
" [ 8, 29]])"
]
},
- "execution_count": 49,
+ "execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
@@ -3854,7 +3884,7 @@
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 51,
"metadata": {},
"outputs": [
{
@@ -3913,7 +3943,7 @@
"accuracy 0.822222 0.822222 0.822222"
]
},
- "execution_count": 50,
+ "execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@@ -3930,122 +3960,6 @@
"loges_report_filtered"
]
},
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "ename": "FileNotFoundError",
- "evalue": "[Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[17], line 55\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load your dataset\u001b[39;00m\n\u001b[1;32m 54\u001b[0m file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/raw_heart_disease_data.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# Update with the path to your dataset\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# Validate the dataset against the schema\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
- "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
- "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles 
\u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
- "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 876\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 877\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
- "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'"
- ]
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import pandera as pa\n",
- "from pandera import Column, Check, DataFrameSchema\n",
- "\n",
- "# Define the data validation schema\n",
- "schema = DataFrameSchema(\n",
- " {\n",
- " # Validate numeric columns\n",
- " \"age\": Column(int, Check.between(0, 120), nullable=False),\n",
- " \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n",
- " \"chest_pain_type\": Column(\n",
- " str, \n",
- " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]), \n",
- " nullable=False\n",
- " ),\n",
- " \"resting_blood_pressure\": Column(int, Check.between(50, 250), nullable=False),\n",
- " \"cholesterol\": Column(int, Check.between(100, 600), nullable=False),\n",
- " \"fasting_blood_sugar\": Column(int, Check.isin([0, 1]), nullable=False),\n",
- " \"rest_ecg\": Column(\n",
- " str, \n",
- " Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n",
- " nullable=False\n",
- " ),\n",
- " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n",
- " \"exercise_induced_angina\": Column(str, Check.isin([\"yes\", \"no\"]), nullable=False),\n",
- " \"st_depression\": Column(float, Check.between(0.0, 10.0), nullable=True),\n",
- " \"slope\": Column(\n",
- " str, \n",
- " Check.isin([\"upsloping\", \"flat\", \"downsloping\"]), \n",
- " nullable=False\n",
- " ),\n",
- " \"num_of_vessels\": Column(\n",
- " float, \n",
- " Check(lambda x: x.isna() | ((x >= 0) & (x <= 4)), element_wise=True), \n",
- " nullable=True\n",
- " ),\n",
- " \"thalassemia\": Column(\n",
- " str, \n",
- " Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]), \n",
- " nullable=True\n",
- " ),\n",
- " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n",
- " },\n",
- " # Additional checks for the entire DataFrame\n",
- " checks=[\n",
- " # Ensure no duplicate rows\n",
- " Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n",
- " # Ensure no empty rows\n",
- " Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\"),\n",
- " ]\n",
- ")\n",
- "\n",
- "# Load your dataset\n",
- "file_path = \"data/raw_heart_disease_data.csv\" # Update with the path to your dataset\n",
- "data = pd.read_csv(file_path)\n",
- "\n",
- "# Validate the dataset against the schema\n",
- "try:\n",
- " validated_data = schema.validate(data)\n",
- " print(\"Data validation passed successfully!\")\n",
- "except pa.errors.SchemaError as e:\n",
- " print(f\"Data validation failed: {e}\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Changing column correct Column Names**\n",
- "column_mapping = {\n",
- "}\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# No empty observations \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# missingness \n"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},