diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index 3624be0..e7197bb 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -1357,7 +1357,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -1441,7 +1441,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1457,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1467,7 +1467,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1476,7 +1476,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1526,7 +1526,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1550,7 +1550,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1559,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1625,7 +1625,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1660,7 +1660,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -2103,18 +2103,48 @@ " 'num_of_vessels', 'thalassemia'])])" ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\n" + "\n", + "#splitting the features \n", + "\n", + "numeric_features = [\n", + " \"age\", \n", + " \"resting_blood_pressure\", \n", + " \"fasting_blood_sugar\", \n", + " \"cholesterol\", \n", + " \"max_heart_rate\", \n", + " \"st_depression\", \n", + " \"sex\"\n", + "]\n", + "categorical_features = [\n", + " \"chest_pain_type\", \n", + " \"rest_ecg\", \n", + " \"exercise_induced_angina\", \n", + " \"slope\", \n", + " \"num_of_vessels\", \n", + " \"thalassemia\"\n", + "]\n", + "\n", + "numeric_transformer = StandardScaler()\n", + "categorical_transformer = OneHotEncoder(drop=\"if_binary\", handle_unknown=\"ignore\")\n", + "\n", + "# Create Column transformer\n", + "preprocessor = make_column_transformer(\n", + " (numeric_transformer, numeric_features),\n", + " (categorical_transformer, categorical_features)\n", + ")\n", + "\n", + "preprocessor\n" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -2127,7 +2157,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2539,7 +2569,7 @@ "[207 rows x 25 columns]" ] }, - "execution_count": 34, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2555,7 +2585,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2574,7 +2604,7 @@ " dtype='object')" ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2585,7 +2615,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -2614,7 +2644,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2645,12 +2675,12 @@ " \n", " \n", " fit_time\n", - " 0.011\n", - " 0.004\n", + " 0.015\n", + " 0.005\n", " \n", " \n", " score_time\n", - " 0.012\n", + " 0.014\n", " 0.002\n", " \n", " \n", @@ -2699,8 +2729,8 @@ ], "text/plain": [ " mean std\n", - "fit_time 0.011 0.004\n", - "score_time 0.012 0.002\n", + "fit_time 0.015 0.005\n", + "score_time 0.014 0.002\n", "test_accuracy 0.677 0.123\n", "train_accuracy 1.000 0.000\n", "test_precision 0.659 0.111\n", @@ -2711,7 +2741,7 @@ "train_f1 1.000 0.000" ] }, - "execution_count": 37, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2731,7 +2761,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -2744,7 +2774,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -2754,7 +2784,7 @@ " [ 6, 31]])" ] }, - "execution_count": 39, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2774,7 +2804,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -2833,7 +2863,7 @@ "accuracy 0.755556 0.755556 0.755556" ] }, - "execution_count": 40, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2859,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -2890,12 +2920,12 @@ " \n", " \n", " fit_time\n", - " 0.013\n", - " 0.004\n", + " 0.016\n", + " 0.008\n", " \n", " \n", " score_time\n", - " 0.010\n", + " 0.012\n", " 0.001\n", " \n", " \n", @@ -2944,8 +2974,8 @@ ], "text/plain": [ " mean std\n", - "fit_time 0.013 0.004\n", - "score_time 0.010 0.001\n", + "fit_time 0.016 0.008\n", + "score_time 0.012 0.001\n", "test_accuracy 0.841 0.068\n", "train_accuracy 0.890 0.011\n", "test_precision 0.850 0.092\n", @@ -2956,7 +2986,7 @@ "train_f1 0.882 0.014" ] }, - "execution_count": 41, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2975,7 +3005,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -3000,7 +3030,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -3483,7 +3513,7 @@ " LogisticRegression(max_iter=1000, random_state=123))])" ] }, - "execution_count": 43, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -3503,7 +3533,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -3512,7 +3542,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -3527,245 +3557,245 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Table 1: Logistic Regression Coefficients
 FeatureCoefficientFeatureCoefficient
7chest_pain_type_asymptomatic1.2426177chest_pain_type_asymptomatic1.242617
20num_of_vessels_2.00.93676120num_of_vessels_2.00.936761
24thalassemia_reversable defect0.90860724thalassemia_reversable defect0.908607
16slope_flat0.78917916slope_flat0.789179
6sex0.4677936sex0.467793
1resting_blood_pressure0.4643031resting_blood_pressure0.464303
21num_of_vessels_3.00.41697021num_of_vessels_3.00.416970
5st_depression0.4004225st_depression0.400422
12rest_ecg_left ventricular hypertrophy0.26895912rest_ecg_left ventricular hypertrophy0.268959
14exercise_induced_angina_yes0.21078314exercise_induced_angina_yes0.210783
8chest_pain_type_atypical angina0.1516168chest_pain_type_atypical angina0.151616
11rest_ecg_ST-T wave abnormality0.15090411rest_ecg_ST-T wave abnormality0.150904
3cholesterol0.1416753cholesterol0.141675
19num_of_vessels_1.00.13159819num_of_vessels_1.00.131598
0age-0.0891450age-0.089145
2fasting_blood_sugar-0.2263292fasting_blood_sugar-0.226329
17slope_upsloping-0.37399517slope_upsloping-0.373995
15slope_downsloping-0.38506115slope_downsloping-0.385061
13rest_ecg_normal-0.38974013rest_ecg_normal-0.389740
22thalassemia_fixed defect-0.41137022thalassemia_fixed defect-0.411370
23thalassemia_normal-0.46711323thalassemia_normal-0.467113
4max_heart_rate-0.6484894max_heart_rate-0.648489
9chest_pain_type_non-anginal pain-0.6705719chest_pain_type_non-anginal pain-0.670571
10chest_pain_type_typical angina-0.69353910chest_pain_type_typical angina-0.693539
18num_of_vessels_0.0-1.45520618num_of_vessels_0.0-1.455206
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 46, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -3785,7 +3815,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -3813,7 +3843,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -3824,7 +3854,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -3834,7 +3864,7 @@ " [ 8, 29]])" ] }, - "execution_count": 49, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -3854,7 +3884,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -3913,7 +3943,7 @@ "accuracy 0.822222 0.822222 0.822222" ] }, - "execution_count": 50, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -3930,122 +3960,6 @@ "loges_report_filtered" ] }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[17], line 55\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load your dataset\u001b[39;00m\n\u001b[1;32m 54\u001b[0m file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/raw_heart_disease_data.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# Update with the path to your dataset\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# Validate the dataset against the schema\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 876\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 877\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import pandera as pa\n", - "from pandera import Column, Check, DataFrameSchema\n", - "\n", - "# Define the data validation schema\n", - "schema = DataFrameSchema(\n", - " {\n", - " # Validate numeric columns\n", - " \"age\": Column(int, Check.between(0, 120), nullable=False),\n", - " \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n", - " \"chest_pain_type\": Column(\n", - " str, \n", - " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]), \n", - " nullable=False\n", - " ),\n", - " \"resting_blood_pressure\": Column(int, Check.between(50, 250), nullable=False),\n", - " \"cholesterol\": Column(int, Check.between(100, 600), nullable=False),\n", - " \"fasting_blood_sugar\": Column(int, Check.isin([0, 1]), nullable=False),\n", - " \"rest_ecg\": Column(\n", - " str, \n", - " Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n", - " nullable=False\n", - " ),\n", - " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n", - " \"exercise_induced_angina\": Column(str, Check.isin([\"yes\", \"no\"]), nullable=False),\n", - " \"st_depression\": Column(float, Check.between(0.0, 10.0), nullable=True),\n", - " \"slope\": Column(\n", - " str, \n", - " Check.isin([\"upsloping\", \"flat\", \"downsloping\"]), \n", - " nullable=False\n", - " ),\n", - " \"num_of_vessels\": Column(\n", - " float, \n", - " Check(lambda x: x.isna() | ((x >= 0) & (x <= 4)), element_wise=True), \n", - " nullable=True\n", - " ),\n", - " \"thalassemia\": Column(\n", - " str, \n", - " Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]), \n", - " nullable=True\n", - " ),\n", - " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n", - " },\n", - " # Additional checks for the entire DataFrame\n", - " checks=[\n", - " # Ensure no duplicate rows\n", - " Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n", - " # Ensure no empty rows\n", - " Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\"),\n", - " ]\n", - ")\n", - "\n", - "# Load your dataset\n", - "file_path = \"data/raw_heart_disease_data.csv\" # Update with the path to your dataset\n", - "data = pd.read_csv(file_path)\n", - "\n", - "# Validate the dataset against the schema\n", - "try:\n", - " validated_data = schema.validate(data)\n", - " print(\"Data validation passed successfully!\")\n", - "except pa.errors.SchemaError as e:\n", - " print(f\"Data validation failed: {e}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing column correct Column Names**\n", - "column_mapping = {\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# No empty observations \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# missingness \n" - ] - }, { "cell_type": "markdown", "metadata": {},