diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index 5ab044f..e7197bb 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1352,7 +1352,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Save the processed data to be further used" + "## Data Validation" ] }, { @@ -1360,6 +1360,90 @@ "execution_count": 24, "metadata": {}, "outputs": [], + "source": [ + "import pandera as pa \n", + "from pandera import Column, Check, DataFrameSchema\n", + "\n", + "schema = DataFrameSchema(\n", + " {\n", + " # Numeric columns with no missing values allowed\n", + " \"age\": Column(int, Check.between(0, 120), nullable=False),\n", + " \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n", + "\n", + " # Categorical columns with string types\n", + " \"chest_pain_type\": Column(\n", + " str,\n", + " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n", + " nullable=False\n", + " ),\n", + "\n", + " # Numeric columns with range checks and no missing values\n", + " \"resting_blood_pressure\": Column(int, Check.between(50, 250), nullable=False),\n", + " \"cholesterol\": Column(int, Check.between(100, 600), nullable=False),\n", + " \"fasting_blood_sugar\": Column(int, Check.isin([0, 1]), nullable=False),\n", + "\n", + " # Categorical columns\n", + " \"rest_ecg\": Column(\n", + " str,\n", + " Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n", + " nullable=False\n", + " ),\n", + " # Numeric columns with range checks\n", + " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n", + " \n", + " # Categorical column with a string type\n", + " \"exercise_induced_angina\": Column(\n", + " str,\n", + " Check.isin([\"yes\", \"no\"]),\n", + " nullable=False), \n", + " \n", + " # Numeric columns with missing values allowed up 
to 5%\n", + " \"st_depression\": Column(\n", + " float,\n", + " checks=[Check.between(0.0, 10.0),\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, ignore_na=False, \n", + " error=\"Too many null values in 'st_depression' column.\")],\n", + " nullable=True),\n", + " \n", + " \"num_of_vessels\": Column(\n", + " float,\n", + " checks=[Check.between(0, 4), # nulls are skipped: ignore_na defaults to True\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, ignore_na=False, \n", + " error=\"Too many null values in 'num_of_vessels' column.\")],\n", + " nullable=True),\n", + " \n", + " # Categorical column with missing values allowed up to 5%\n", + " \"thalassemia\": Column(\n", + " str,\n", + " checks=[Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]),\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, ignore_na=False, \n", + " error=\"Too many null values in 'thalassemia' column.\")],\n", + " nullable=True), \n", + " \n", + " # Target column\n", + " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n", + " },\n", + " checks=[\n", + " # Check for duplicate rows\n", + " Check(lambda df: not df.duplicated().any(), error=\"Duplicate rows found.\"),\n", + " # Check for empty rows\n", + " Check(lambda df: not df.isna().all(axis=1).any(), ignore_na=False, error=\"Empty rows found.\")\n", + " ]\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the processed data to be further used" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], "source": [ "df.to_csv(\"../data/processed/processed_heart_disease_data.csv\", index=False)" ] @@ -1373,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1383,7 +1467,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1392,7 +1476,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, 
"outputs": [ { @@ -1442,7 +1526,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1466,7 +1550,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1475,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1541,7 +1625,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1576,7 +1660,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -2019,12 +2103,13 @@ " 'num_of_vessels', 'thalassemia'])])" ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "\n", "#splitting the features \n", "\n", "numeric_features = [\n", @@ -2059,7 +2144,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -2072,7 +2157,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2484,7 +2569,7 @@ "[207 rows x 25 columns]" ] }, - "execution_count": 34, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2500,7 +2585,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2519,7 +2604,7 @@ " dtype='object')" ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2530,7 +2615,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -2559,7 +2644,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2590,12 +2675,12 @@ "
\n", "\n", - " | Feature | \n", - "Coefficient | \n", + "Feature | \n", + "Coefficient | \n", "|
---|---|---|---|---|---|
7 | \n", - "chest_pain_type_asymptomatic | \n", - "1.242617 | \n", + "7 | \n", + "chest_pain_type_asymptomatic | \n", + "1.242617 | \n", "
20 | \n", - "num_of_vessels_2.0 | \n", - "0.936761 | \n", + "20 | \n", + "num_of_vessels_2.0 | \n", + "0.936761 | \n", "
24 | \n", - "thalassemia_reversable defect | \n", - "0.908607 | \n", + "24 | \n", + "thalassemia_reversable defect | \n", + "0.908607 | \n", "
16 | \n", - "slope_flat | \n", - "0.789179 | \n", + "16 | \n", + "slope_flat | \n", + "0.789179 | \n", "
6 | \n", - "sex | \n", - "0.467793 | \n", + "6 | \n", + "sex | \n", + "0.467793 | \n", "
1 | \n", - "resting_blood_pressure | \n", - "0.464303 | \n", + "1 | \n", + "resting_blood_pressure | \n", + "0.464303 | \n", "
21 | \n", - "num_of_vessels_3.0 | \n", - "0.416970 | \n", + "21 | \n", + "num_of_vessels_3.0 | \n", + "0.416970 | \n", "
5 | \n", - "st_depression | \n", - "0.400422 | \n", + "5 | \n", + "st_depression | \n", + "0.400422 | \n", "
12 | \n", - "rest_ecg_left ventricular hypertrophy | \n", - "0.268959 | \n", + "12 | \n", + "rest_ecg_left ventricular hypertrophy | \n", + "0.268959 | \n", "
14 | \n", - "exercise_induced_angina_yes | \n", - "0.210783 | \n", + "14 | \n", + "exercise_induced_angina_yes | \n", + "0.210783 | \n", "
8 | \n", - "chest_pain_type_atypical angina | \n", - "0.151616 | \n", + "8 | \n", + "chest_pain_type_atypical angina | \n", + "0.151616 | \n", "
11 | \n", - "rest_ecg_ST-T wave abnormality | \n", - "0.150904 | \n", + "11 | \n", + "rest_ecg_ST-T wave abnormality | \n", + "0.150904 | \n", "
3 | \n", - "cholesterol | \n", - "0.141675 | \n", + "3 | \n", + "cholesterol | \n", + "0.141675 | \n", "
19 | \n", - "num_of_vessels_1.0 | \n", - "0.131598 | \n", + "19 | \n", + "num_of_vessels_1.0 | \n", + "0.131598 | \n", "
0 | \n", - "age | \n", - "-0.089145 | \n", + "0 | \n", + "age | \n", + "-0.089145 | \n", "
2 | \n", - "fasting_blood_sugar | \n", - "-0.226329 | \n", + "2 | \n", + "fasting_blood_sugar | \n", + "-0.226329 | \n", "
17 | \n", - "slope_upsloping | \n", - "-0.373995 | \n", + "17 | \n", + "slope_upsloping | \n", + "-0.373995 | \n", "
15 | \n", - "slope_downsloping | \n", - "-0.385061 | \n", + "15 | \n", + "slope_downsloping | \n", + "-0.385061 | \n", "
13 | \n", - "rest_ecg_normal | \n", - "-0.389740 | \n", + "13 | \n", + "rest_ecg_normal | \n", + "-0.389740 | \n", "
22 | \n", - "thalassemia_fixed defect | \n", - "-0.411370 | \n", + "22 | \n", + "thalassemia_fixed defect | \n", + "-0.411370 | \n", "
23 | \n", - "thalassemia_normal | \n", - "-0.467113 | \n", + "23 | \n", + "thalassemia_normal | \n", + "-0.467113 | \n", "
4 | \n", - "max_heart_rate | \n", - "-0.648489 | \n", + "4 | \n", + "max_heart_rate | \n", + "-0.648489 | \n", "
9 | \n", - "chest_pain_type_non-anginal pain | \n", - "-0.670571 | \n", + "9 | \n", + "chest_pain_type_non-anginal pain | \n", + "-0.670571 | \n", "
10 | \n", - "chest_pain_type_typical angina | \n", - "-0.693539 | \n", + "10 | \n", + "chest_pain_type_typical angina | \n", + "-0.693539 | \n", "
18 | \n", - "num_of_vessels_0.0 | \n", - "-1.455206 | \n", + "18 | \n", + "num_of_vessels_0.0 | \n", + "-1.455206 | \n", "