From 9b7d405455c9c8a4b71c3d51dbd352d762c246ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:11:46 -0800
Subject: [PATCH 01/10] Added DataFrameSchema for data validation

---
 report/heart_disease_predictor_report.ipynb | 504 ++++++--------------
 1 file changed, 159 insertions(+), 345 deletions(-)
diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index 5ab044f..cb96fdd 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -93,7 +93,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,12 +101,13 @@
     "import pandas as pd\n",
     "import warnings\n",
     "\n",
+    "\n",
     "warnings.filterwarnings('ignore')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -142,7 +143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -405,7 +406,7 @@
        "[303 rows x 14 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -417,7 +418,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -433,321 +434,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>age</th>\n",
-       "      <th>sex</th>\n",
-       "      <th>chest_pain_type</th>\n",
-       "      <th>resting_blood_pressure</th>\n",
-       "      <th>cholesterol</th>\n",
-       "      <th>fasting_blood_sugar</th>\n",
-       "      <th>rest_ecg</th>\n",
-       "      <th>max_heart_rate</th>\n",
-       "      <th>exercise_induced_angina</th>\n",
-       "      <th>st_depression</th>\n",
-       "      <th>slope</th>\n",
-       "      <th>num_of_vessels</th>\n",
-       "      <th>thalassemia</th>\n",
-       "      <th>diagnosis</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>63</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>145</td>\n",
-       "      <td>233</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
-       "      <td>150</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2.3</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>6.0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>67</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>160</td>\n",
-       "      <td>286</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>108</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.5</td>\n",
-       "      <td>2</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>67</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>120</td>\n",
-       "      <td>229</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>129</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2.6</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>37</td>\n",
-       "      <td>1</td>\n",
-       "      <td>3</td>\n",
-       "      <td>130</td>\n",
-       "      <td>250</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>187</td>\n",
-       "      <td>0</td>\n",
-       "      <td>3.5</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>41</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>130</td>\n",
-       "      <td>204</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>172</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1.4</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>298</th>\n",
-       "      <td>45</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>110</td>\n",
-       "      <td>264</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>132</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1.2</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>299</th>\n",
-       "      <td>68</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>144</td>\n",
-       "      <td>193</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>141</td>\n",
-       "      <td>0</td>\n",
-       "      <td>3.4</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>300</th>\n",
-       "      <td>57</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>130</td>\n",
-       "      <td>131</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>115</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.2</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>301</th>\n",
-       "      <td>57</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>130</td>\n",
-       "      <td>236</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>174</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>302</th>\n",
-       "      <td>38</td>\n",
-       "      <td>1</td>\n",
-       "      <td>3</td>\n",
-       "      <td>138</td>\n",
-       "      <td>175</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>173</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>3.0</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>303 rows × 14 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     age  sex  chest_pain_type  resting_blood_pressure  cholesterol  \\\n",
-       "0     63    1                1                     145          233   \n",
-       "1     67    1                4                     160          286   \n",
-       "2     67    1                4                     120          229   \n",
-       "3     37    1                3                     130          250   \n",
-       "4     41    0                2                     130          204   \n",
-       "..   ...  ...              ...                     ...          ...   \n",
-       "298   45    1                1                     110          264   \n",
-       "299   68    1                4                     144          193   \n",
-       "300   57    1                4                     130          131   \n",
-       "301   57    0                2                     130          236   \n",
-       "302   38    1                3                     138          175   \n",
-       "\n",
-       "     fasting_blood_sugar  rest_ecg  max_heart_rate  exercise_induced_angina  \\\n",
-       "0                      1         2             150                        0   \n",
-       "1                      0         2             108                        1   \n",
-       "2                      0         2             129                        1   \n",
-       "3                      0         0             187                        0   \n",
-       "4                      0         2             172                        0   \n",
-       "..                   ...       ...             ...                      ...   \n",
-       "298                    0         0             132                        0   \n",
-       "299                    1         0             141                        0   \n",
-       "300                    0         0             115                        1   \n",
-       "301                    0         2             174                        0   \n",
-       "302                    0         0             173                        0   \n",
-       "\n",
-       "     st_depression  slope  num_of_vessels  thalassemia  diagnosis  \n",
-       "0              2.3      3             0.0          6.0          0  \n",
-       "1              1.5      2             3.0          3.0          2  \n",
-       "2              2.6      2             2.0          7.0          1  \n",
-       "3              3.5      3             0.0          3.0          0  \n",
-       "4              1.4      1             0.0          3.0          0  \n",
-       "..             ...    ...             ...          ...        ...  \n",
-       "298            1.2      2             0.0          7.0          1  \n",
-       "299            3.4      2             2.0          7.0          2  \n",
-       "300            1.2      2             1.0          7.0          3  \n",
-       "301            0.0      2             1.0          3.0          1  \n",
-       "302            0.0      1             NaN          3.0          0  \n",
-       "\n",
-       "[303 rows x 14 columns]"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "age                          int64\n",
-       "sex                          int64\n",
-       "chest_pain_type              int64\n",
-       "resting_blood_pressure       int64\n",
-       "cholesterol                  int64\n",
-       "fasting_blood_sugar          int64\n",
-       "rest_ecg                     int64\n",
-       "max_heart_rate               int64\n",
-       "exercise_induced_angina      int64\n",
-       "st_depression              float64\n",
-       "slope                        int64\n",
-       "num_of_vessels             float64\n",
-       "thalassemia                float64\n",
-       "diagnosis                    int64\n",
-       "dtype: object"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Check datatypes\n",
     "\n",
@@ -1348,6 +1046,35 @@
     "df.isnull().sum()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Validation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "schema = pa.DataFrameSchema(\n",
+    "    {\n",
+    "        # Numeric columns with no missing values allowed\n",
+    "        \"age\": Column(int, Check.between(0, 120), nullable=False),\n",
+    "        \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n",
+    "        \n",
+    "        # Categorical columns with string types\n",
+    "        \"chest_pain_type\": Column(\n",
+    "            str,\n",
+    "            Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n",
+    "            nullable=False\n",
+    "             )\n",
+    "    }\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -2025,36 +1752,7 @@
     }
    ],
    "source": [
-    "#splitting the features \n",
-    "\n",
-    "numeric_features = [\n",
-    "    \"age\", \n",
-    "    \"resting_blood_pressure\", \n",
-    "    \"fasting_blood_sugar\", \n",
-    "    \"cholesterol\", \n",
-    "    \"max_heart_rate\", \n",
-    "    \"st_depression\", \n",
-    "    \"sex\"\n",
-    "]\n",
-    "categorical_features = [\n",
-    "    \"chest_pain_type\", \n",
-    "    \"rest_ecg\", \n",
-    "    \"exercise_induced_angina\", \n",
-    "    \"slope\", \n",
-    "    \"num_of_vessels\", \n",
-    "    \"thalassemia\"\n",
-    "]\n",
-    "\n",
-    "numeric_transformer = StandardScaler()\n",
-    "categorical_transformer = OneHotEncoder(drop=\"if_binary\", handle_unknown=\"ignore\")\n",
-    "\n",
-    "# Create Column transformer\n",
-    "preprocessor = make_column_transformer(\n",
-    "    (numeric_transformer, numeric_features),\n",
-    "    (categorical_transformer, categorical_features)\n",
-    ")\n",
-    "\n",
-    "preprocessor\n"
+    "\n"
    ]
   },
   {
@@ -3875,6 +3573,122 @@
     "loges_report_filtered"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[17], line 55\u001b[0m\n\u001b[1;32m     53\u001b[0m \u001b[38;5;66;03m# Load your dataset\u001b[39;00m\n\u001b[1;32m     54\u001b[0m file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/raw_heart_disease_data.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m  \u001b[38;5;66;03m# Update with the path to your dataset\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     57\u001b[0m \u001b[38;5;66;03m# Validate the dataset against the schema\u001b[39;00m\n\u001b[1;32m     58\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m   1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m   1014\u001b[0m     dialect,\n\u001b[1;32m   1015\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1022\u001b[0m     dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m   1023\u001b[0m )\n\u001b[1;32m   1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m    619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m    623\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1617\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1878\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1879\u001b[0m         mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1881\u001b[0m \u001b[43m    \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1882\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1883\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1884\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1885\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1886\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1887\u001b[0m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1888\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   
1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    869\u001b[0m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    870\u001b[0m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    871\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m    872\u001b[0m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m    874\u001b[0m             handle,\n\u001b[1;32m    875\u001b[0m             ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m    876\u001b[0m             encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m    877\u001b[0m             errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m    878\u001b[0m             newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m    879\u001b[0m         )\n\u001b[1;32m    880\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    881\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m    882\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import pandera as pa\n",
+    "from pandera import Column, Check, DataFrameSchema\n",
+    "\n",
+    "# Schema describing the expected dtype, allowed values/range and nullability of every column\n",
+    "schema = DataFrameSchema(\n",
+    "    {\n",
+    "        # Column-level rules; the str columns assume categorical codes were already mapped to labels (TODO confirm)\n",
+    "        \"age\": Column(int, Check.between(0, 120), nullable=False),\n",
+    "        \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n",
+    "        \"chest_pain_type\": Column(\n",
+    "            str,\n",
+    "            Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n",
+    "            nullable=False\n",
+    "        ),\n",
+    "        \"resting_blood_pressure\": Column(int, Check.between(50, 250), nullable=False),\n",
+    "        \"cholesterol\": Column(int, Check.between(100, 600), nullable=False),\n",
+    "        \"fasting_blood_sugar\": Column(int, Check.isin([0, 1]), nullable=False),\n",
+    "        \"rest_ecg\": Column(\n",
+    "            str,\n",
+    "            Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n",
+    "            nullable=False\n",
+    "        ),\n",
+    "        \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n",
+    "        \"exercise_induced_angina\": Column(str, Check.isin([\"yes\", \"no\"]), nullable=False),\n",
+    "        \"st_depression\": Column(float, Check.between(0.0, 10.0), nullable=True),\n",
+    "        \"slope\": Column(\n",
+    "            str,\n",
+    "            Check.isin([\"upsloping\", \"flat\", \"downsloping\"]),\n",
+    "            nullable=False\n",
+    "        ),\n",
+    "        \"num_of_vessels\": Column(\n",
+    "            float,\n",
+    "            Check.between(0, 4),  # vectorised range check; missing values are permitted by nullable=True\n",
+    "            nullable=True\n",
+    "        ),\n",
+    "        \"thalassemia\": Column(\n",
+    "            str,\n",
+    "            Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]),\n",
+    "            nullable=True\n",
+    "        ),\n",
+    "        \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),  # NOTE(review): displayed data shows values > 1; confirm the target is binarized before validation\n",
+    "    },\n",
+    "    # Frame-level checks: each lambda receives the whole DataFrame and returns a single bool\n",
+    "    checks=[\n",
+    "        # Fail if any row is an exact duplicate of an earlier row\n",
+    "        Check(lambda df: not df.duplicated().any(), error=\"Duplicate rows found.\"),\n",
+    "        # Fail if any row has every column missing\n",
+    "        Check(lambda df: not df.isna().all(axis=1).any(), error=\"Empty rows found.\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "# Load the dataset to validate\n",
+    "file_path = \"data/raw_heart_disease_data.csv\"  # NOTE(review): the recorded run of this cell raised FileNotFoundError for this path; confirm it against the notebook's working directory\n",
+    "data = pd.read_csv(file_path)\n",
+    "\n",
+    "# Validate against the schema; a SchemaError is reported rather than re-raised\n",
+    "try:\n",
+    "    validated_data = schema.validate(data)\n",
+    "    print(\"Data validation passed successfully!\")\n",
+    "except pa.errors.SchemaError as e:\n",
+    "    print(f\"Data validation failed: {e}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rename columns to their correct names (mapping not filled in yet)\n",
+    "column_mapping = {\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# No empty observations \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# missingness \n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -3938,9 +3752,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:heart_disease_522]",
+   "display_name": "Python [conda env:base] *",
    "language": "python",
-   "name": "conda-env-heart_disease_522-py"
+   "name": "conda-base-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -3952,7 +3766,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,

From cfcb46c00a416d8cfa4fa91196adfe2b2f2209bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:15:08 -0800
Subject: [PATCH 02/10] Add numeric columns with range checks and no missing
 values

---
 report/heart_disease_predictor_report.ipynb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index cb96fdd..f08757a 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -1070,7 +1070,9 @@
     "            str,\n",
     "            Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n",
     "            nullable=False\n",
-    "             )\n",
+    "             \n",
+    "        \n",
+    "        )\n",
     "    }\n",
     ")"
    ]

From fa02d0ae971c53f93bb8f14648edc305ce069f6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:17:32 -0800
Subject: [PATCH 03/10] Checking categorical columns for data validation

---
 report/heart_disease_predictor_report.ipynb | 28 ++++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index f08757a..f4fc198 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -1055,26 +1055,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
     "schema = pa.DataFrameSchema(\n",
     "    {\n",
     "        # Numeric columns with no missing values allowed\n",
-    "        \"age\": Column(int, Check.between(0, 120), nullable=False),\n",
-    "        \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n",
-    "        \n",
+    "        \"age\": pa.Column(int, pa.Check.between(0, 120), nullable=False),\n",
+    "        \"sex\": pa.Column(int, pa.Check.isin([0, 1]), nullable=False),\n",
+    "\n",
     "        # Categorical columns with string types\n",
-    "        \"chest_pain_type\": Column(\n",
+    "        \"chest_pain_type\": pa.Column(\n",
     "            str,\n",
-    "            Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n",
+    "            pa.Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n",
+    "            nullable=False\n",
+    "        ),\n",
+    "\n",
+    "        # Numeric columns with range checks and no missing values\n",
+    "        \"resting_blood_pressure\": pa.Column(int, pa.Check.between(50, 250), nullable=False),\n",
+    "        \"cholesterol\": pa.Column(int, pa.Check.between(100, 600), nullable=False),\n",
+    "        \"fasting_blood_sugar\": pa.Column(int, pa.Check.isin([0, 1]), nullable=False),\n",
+    "\n",
+    "        # Categorical columns\n",
+    "        \"rest_ecg\": pa.Column(\n",
+    "            str,\n",
+    "            pa.Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n",
     "            nullable=False\n",
-    "             \n",
-    "        \n",
     "        )\n",
     "    }\n",
-    ")"
+    ")\n"
    ]
   },
   {

From c2b44ef9ccc421e7297943ab77d3596ccf6e11bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:18:56 -0800
Subject: [PATCH 04/10] Add numeric columns with range checks

---
 report/heart_disease_predictor_report.ipynb | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index f4fc198..8efef11 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -1055,9 +1055,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 31,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "closing parenthesis ')' does not match opening parenthesis '{' on line 2 (3202913183.py, line 27)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m  Cell \u001b[0;32mIn[31], line 27\u001b[0;36m\u001b[0m\n\u001b[0;31m    )\u001b[0m\n\u001b[0m    ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m closing parenthesis ')' does not match opening parenthesis '{' on line 2\n"
+     ]
+    }
+   ],
    "source": [
     "schema = pa.DataFrameSchema(\n",
     "    {\n",
@@ -1082,6 +1091,9 @@
     "            str,\n",
     "            pa.Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n",
     "            nullable=False\n",
+    "        ),\n",
+    "       # Numeric columns with range checks\n",
+    "        \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n",
     "        )\n",
     "    }\n",
     ")\n"

From b5aa95813ecbf813314aa2b694a2488578b4bbe8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:31:14 -0800
Subject: [PATCH 05/10] Add correct data types in each column

---
 report/heart_disease_predictor_report.ipynb | 28 ++++++++++++---------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index 8efef11..da69a2c 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -1055,18 +1055,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 36,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "SyntaxError",
-     "evalue": "closing parenthesis ')' does not match opening parenthesis '{' on line 2 (3202913183.py, line 27)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;36m  Cell \u001b[0;32mIn[31], line 27\u001b[0;36m\u001b[0m\n\u001b[0;31m    )\u001b[0m\n\u001b[0m    ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m closing parenthesis ')' does not match opening parenthesis '{' on line 2\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "schema = pa.DataFrameSchema(\n",
     "    {\n",
@@ -1094,7 +1085,20 @@
     "        ),\n",
     "       # Numeric columns with range checks\n",
     "        \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n",
-    "        )\n",
+    "        \n",
+    "        # Categorical column with a string type\n",
+    "        \"exercise_induced_angina\": Column(\n",
+    "            str,\n",
+    "            Check.isin([\"yes\", \"no\"]),\n",
+    "            nullable=False),    \n",
+    "        \n",
+    "        # Numeric columns with missing values allowed up to 5%\n",
+    "        \"st_depression\": Column(\n",
+    "            float,\n",
+    "            Check.between(0.0, 10.0),\n",
+    "            Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n",
+    "                  error=\"Too many null values in 'st_depression' column.\"),\n",
+    "            nullable=True),\n",
     "    }\n",
     ")\n"
    ]

From 8628739f12e885aa327a8f8583a7c210cbcbc086 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:32:32 -0800
Subject: [PATCH 06/10] Added correct data types in different column

---
 report/heart_disease_predictor_report.ipynb | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index da69a2c..aa6ebe6 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -1055,7 +1055,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1099,6 +1099,14 @@
     "            Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n",
     "                  error=\"Too many null values in 'st_depression' column.\"),\n",
     "            nullable=True),\n",
+    "        \n",
+    "        \"num_of_vessels\": Column(\n",
+    "            float,\n",
+    "            Check(lambda s: s.isna() | ((s >= 0) & (s <= 4)), element_wise=True),\n",
+    "            Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n",
+    "                  error=\"Too many null values in 'num_of_vessels' column.\"),\n",
+    "            nullable=True),\n",
+    "            \n",
     "    }\n",
     ")\n"
    ]

From c4158780ae1f8eb82217b57f7e5368c5b914f43b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:34:15 -0800
Subject: [PATCH 07/10] Added 5% missingness threshold

---
 report/heart_disease_predictor_report.ipynb | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index aa6ebe6..757407f 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -1055,7 +1055,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1106,7 +1106,14 @@
     "            Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n",
     "                  error=\"Too many null values in 'num_of_vessels' column.\"),\n",
     "            nullable=True),\n",
-    "            \n",
+    "        \n",
+    "        # Categorical column with missing values allowed up to 5%\n",
+    "        \"thalassemia\": Column(\n",
+    "            str,\n",
+    "            Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]),\n",
+    "            Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n",
+    "                  error=\"Too many null values in 'thalassemia' column.\"),\n",
+    "            nullable=True)          \n",
     "    }\n",
     ")\n"
    ]

From f60b8a74c8fddcc3ebfba6d4a6f460d21aeed8e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 14:36:08 -0800
Subject: [PATCH 08/10] Checked for duplicate and empty rows

---
 report/heart_disease_predictor_report.ipynb | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index 757407f..a974996 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -1055,7 +1055,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1113,8 +1113,17 @@
     "            Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]),\n",
     "            Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n",
     "                  error=\"Too many null values in 'thalassemia' column.\"),\n",
-    "            nullable=True)          \n",
-    "    }\n",
+    "            nullable=True), \n",
+    "     \n",
+    "        # Target column\n",
+    "        \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n",
+    "    },\n",
+    "        checks=[\n",
+    "        # Check for duplicate rows\n",
+    "        Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n",
+    "        # Check for empty rows\n",
+    "        Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\")\n",
+    "    ]\n",
     ")\n"
    ]
   },

From 67b19817423f3068a54c7f48852eccc742497b03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 15:46:02 -0800
Subject: [PATCH 09/10] Add data validation from 1-6

---
 report/heart_disease_predictor_report.ipynb | 335 +++++++++++++++++++-
 1 file changed, 320 insertions(+), 15 deletions(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index a974996..3624be0 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -93,7 +93,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,13 +101,12 @@
     "import pandas as pd\n",
     "import warnings\n",
     "\n",
-    "\n",
     "warnings.filterwarnings('ignore')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -143,7 +142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -406,7 +405,7 @@
        "[303 rows x 14 columns]"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -418,7 +417,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -434,18 +433,321 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>age</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>chest_pain_type</th>\n",
+       "      <th>resting_blood_pressure</th>\n",
+       "      <th>cholesterol</th>\n",
+       "      <th>fasting_blood_sugar</th>\n",
+       "      <th>rest_ecg</th>\n",
+       "      <th>max_heart_rate</th>\n",
+       "      <th>exercise_induced_angina</th>\n",
+       "      <th>st_depression</th>\n",
+       "      <th>slope</th>\n",
+       "      <th>num_of_vessels</th>\n",
+       "      <th>thalassemia</th>\n",
+       "      <th>diagnosis</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>63</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>145</td>\n",
+       "      <td>233</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>150</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2.3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>6.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>67</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>160</td>\n",
+       "      <td>286</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>108</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.5</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>67</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>120</td>\n",
+       "      <td>229</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>129</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.6</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>37</td>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "      <td>130</td>\n",
+       "      <td>250</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>187</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3.5</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>130</td>\n",
+       "      <td>204</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>172</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.4</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>298</th>\n",
+       "      <td>45</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>110</td>\n",
+       "      <td>264</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>132</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1.2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>299</th>\n",
+       "      <td>68</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>144</td>\n",
+       "      <td>193</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>141</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3.4</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>300</th>\n",
+       "      <td>57</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>130</td>\n",
+       "      <td>131</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>115</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>301</th>\n",
+       "      <td>57</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>130</td>\n",
+       "      <td>236</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>174</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>302</th>\n",
+       "      <td>38</td>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "      <td>138</td>\n",
+       "      <td>175</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>173</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>303 rows × 14 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     age  sex  chest_pain_type  resting_blood_pressure  cholesterol  \\\n",
+       "0     63    1                1                     145          233   \n",
+       "1     67    1                4                     160          286   \n",
+       "2     67    1                4                     120          229   \n",
+       "3     37    1                3                     130          250   \n",
+       "4     41    0                2                     130          204   \n",
+       "..   ...  ...              ...                     ...          ...   \n",
+       "298   45    1                1                     110          264   \n",
+       "299   68    1                4                     144          193   \n",
+       "300   57    1                4                     130          131   \n",
+       "301   57    0                2                     130          236   \n",
+       "302   38    1                3                     138          175   \n",
+       "\n",
+       "     fasting_blood_sugar  rest_ecg  max_heart_rate  exercise_induced_angina  \\\n",
+       "0                      1         2             150                        0   \n",
+       "1                      0         2             108                        1   \n",
+       "2                      0         2             129                        1   \n",
+       "3                      0         0             187                        0   \n",
+       "4                      0         2             172                        0   \n",
+       "..                   ...       ...             ...                      ...   \n",
+       "298                    0         0             132                        0   \n",
+       "299                    1         0             141                        0   \n",
+       "300                    0         0             115                        1   \n",
+       "301                    0         2             174                        0   \n",
+       "302                    0         0             173                        0   \n",
+       "\n",
+       "     st_depression  slope  num_of_vessels  thalassemia  diagnosis  \n",
+       "0              2.3      3             0.0          6.0          0  \n",
+       "1              1.5      2             3.0          3.0          2  \n",
+       "2              2.6      2             2.0          7.0          1  \n",
+       "3              3.5      3             0.0          3.0          0  \n",
+       "4              1.4      1             0.0          3.0          0  \n",
+       "..             ...    ...             ...          ...        ...  \n",
+       "298            1.2      2             0.0          7.0          1  \n",
+       "299            3.4      2             2.0          7.0          2  \n",
+       "300            1.2      2             1.0          7.0          3  \n",
+       "301            0.0      2             1.0          3.0          1  \n",
+       "302            0.0      1             NaN          3.0          0  \n",
+       "\n",
+       "[303 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "age                          int64\n",
+       "sex                          int64\n",
+       "chest_pain_type              int64\n",
+       "resting_blood_pressure       int64\n",
+       "cholesterol                  int64\n",
+       "fasting_blood_sugar          int64\n",
+       "rest_ecg                     int64\n",
+       "max_heart_rate               int64\n",
+       "exercise_induced_angina      int64\n",
+       "st_depression              float64\n",
+       "slope                        int64\n",
+       "num_of_vessels             float64\n",
+       "thalassemia                float64\n",
+       "diagnosis                    int64\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# Check datatypes\n",
     "\n",
@@ -1055,10 +1357,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import pandera as pa \n",
+    "from pandera import Column, Check, DataFrameSchema\n",
+    "\n",
     "schema = pa.DataFrameSchema(\n",
     "    {\n",
     "        # Numeric columns with no missing values allowed\n",
@@ -1119,7 +1424,7 @@
     "        \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n",
     "    },\n",
     "        checks=[\n",
-    "        # Check for duplicate rows\n",
+    "        # Check for duplicate rows\n",
     "        Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n",
     "        # Check for empty rows\n",
     "        Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\")\n",
@@ -3804,9 +4109,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:base] *",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "conda-base-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -3818,7 +4123,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,

From 684549be8c5c6bd202cf5840d01555b0bbff9f27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CCeline?= <celine.habashy@gmail.com>
Date: Thu, 28 Nov 2024 15:55:55 -0800
Subject: [PATCH 10/10] Add missing content

---
 report/heart_disease_predictor_report.ipynb | 450 ++++++++------------
 1 file changed, 182 insertions(+), 268 deletions(-)

diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb
index 3624be0..e7197bb 100644
--- a/report/heart_disease_predictor_report.ipynb
+++ b/report/heart_disease_predictor_report.ipynb
@@ -93,7 +93,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1357,7 +1357,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1441,7 +1441,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1457,7 +1457,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1467,7 +1467,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1476,7 +1476,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
@@ -1526,7 +1526,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
@@ -1550,7 +1550,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1559,7 +1559,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [
     {
@@ -1625,7 +1625,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1660,7 +1660,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
@@ -2103,18 +2103,48 @@
        "                                  'num_of_vessels', 'thalassemia'])])"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 33,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "\n"
+    "\n",
+    "#splitting the features \n",
+    "\n",
+    "numeric_features = [\n",
+    "    \"age\", \n",
+    "    \"resting_blood_pressure\", \n",
+    "    \"fasting_blood_sugar\", \n",
+    "    \"cholesterol\", \n",
+    "    \"max_heart_rate\", \n",
+    "    \"st_depression\", \n",
+    "    \"sex\"\n",
+    "]\n",
+    "categorical_features = [\n",
+    "    \"chest_pain_type\", \n",
+    "    \"rest_ecg\", \n",
+    "    \"exercise_induced_angina\", \n",
+    "    \"slope\", \n",
+    "    \"num_of_vessels\", \n",
+    "    \"thalassemia\"\n",
+    "]\n",
+    "\n",
+    "numeric_transformer = StandardScaler()\n",
+    "categorical_transformer = OneHotEncoder(drop=\"if_binary\", handle_unknown=\"ignore\")\n",
+    "\n",
+    "# Create Column transformer\n",
+    "preprocessor = make_column_transformer(\n",
+    "    (numeric_transformer, numeric_features),\n",
+    "    (categorical_transformer, categorical_features)\n",
+    ")\n",
+    "\n",
+    "preprocessor\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2127,7 +2157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [
     {
@@ -2539,7 +2569,7 @@
        "[207 rows x 25 columns]"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 35,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2555,7 +2585,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [
     {
@@ -2574,7 +2604,7 @@
        "      dtype='object')"
       ]
      },
-     "execution_count": 35,
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2585,7 +2615,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2614,7 +2644,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
     {
@@ -2645,12 +2675,12 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>fit_time</th>\n",
-       "      <td>0.011</td>\n",
-       "      <td>0.004</td>\n",
+       "      <td>0.015</td>\n",
+       "      <td>0.005</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>score_time</th>\n",
-       "      <td>0.012</td>\n",
+       "      <td>0.014</td>\n",
        "      <td>0.002</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -2699,8 +2729,8 @@
       ],
       "text/plain": [
        "                  mean    std\n",
-       "fit_time         0.011  0.004\n",
-       "score_time       0.012  0.002\n",
+       "fit_time         0.015  0.005\n",
+       "score_time       0.014  0.002\n",
        "test_accuracy    0.677  0.123\n",
        "train_accuracy   1.000  0.000\n",
        "test_precision   0.659  0.111\n",
@@ -2711,7 +2741,7 @@
        "train_f1         1.000  0.000"
       ]
      },
-     "execution_count": 37,
+     "execution_count": 38,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2731,7 +2761,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2744,7 +2774,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [
     {
@@ -2754,7 +2784,7 @@
        "       [ 6, 31]])"
       ]
      },
-     "execution_count": 39,
+     "execution_count": 40,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2774,7 +2804,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
@@ -2833,7 +2863,7 @@
        "accuracy   0.755556  0.755556  0.755556"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 41,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2859,7 +2889,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [
     {
@@ -2890,12 +2920,12 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>fit_time</th>\n",
-       "      <td>0.013</td>\n",
-       "      <td>0.004</td>\n",
+       "      <td>0.016</td>\n",
+       "      <td>0.008</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>score_time</th>\n",
-       "      <td>0.010</td>\n",
+       "      <td>0.012</td>\n",
        "      <td>0.001</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -2944,8 +2974,8 @@
       ],
       "text/plain": [
        "                  mean    std\n",
-       "fit_time         0.013  0.004\n",
-       "score_time       0.010  0.001\n",
+       "fit_time         0.016  0.008\n",
+       "score_time       0.012  0.001\n",
        "test_accuracy    0.841  0.068\n",
        "train_accuracy   0.890  0.011\n",
        "test_precision   0.850  0.092\n",
@@ -2956,7 +2986,7 @@
        "train_f1         0.882  0.014"
       ]
      },
-     "execution_count": 41,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2975,7 +3005,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [
     {
@@ -3000,7 +3030,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [
     {
@@ -3483,7 +3513,7 @@
        "                 LogisticRegression(max_iter=1000, random_state=123))])"
       ]
      },
-     "execution_count": 43,
+     "execution_count": 44,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3503,7 +3533,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3512,7 +3542,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3527,245 +3557,245 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
        "<style type=\"text/css\">\n",
-       "#T_d6e4e_row0_col1 {\n",
+       "#T_0b6d4_row0_col1 {\n",
        "  background-color: #b40426;\n",
        "  color: #f1f1f1;\n",
        "}\n",
-       "#T_d6e4e_row1_col1 {\n",
+       "#T_0b6d4_row1_col1 {\n",
        "  background-color: #da5a49;\n",
        "  color: #f1f1f1;\n",
        "}\n",
-       "#T_d6e4e_row2_col1 {\n",
+       "#T_0b6d4_row2_col1 {\n",
        "  background-color: #dd5f4b;\n",
        "  color: #f1f1f1;\n",
        "}\n",
-       "#T_d6e4e_row3_col1 {\n",
+       "#T_0b6d4_row3_col1 {\n",
        "  background-color: #e8765c;\n",
        "  color: #f1f1f1;\n",
        "}\n",
-       "#T_d6e4e_row4_col1, #T_d6e4e_row5_col1 {\n",
+       "#T_0b6d4_row4_col1, #T_0b6d4_row5_col1 {\n",
        "  background-color: #f7a889;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row6_col1 {\n",
+       "#T_0b6d4_row6_col1 {\n",
        "  background-color: #f7af91;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row7_col1 {\n",
+       "#T_0b6d4_row7_col1 {\n",
        "  background-color: #f7b093;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row8_col1 {\n",
+       "#T_0b6d4_row8_col1 {\n",
        "  background-color: #f5c0a7;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row9_col1 {\n",
+       "#T_0b6d4_row9_col1 {\n",
        "  background-color: #f4c6af;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row10_col1, #T_d6e4e_row11_col1 {\n",
+       "#T_0b6d4_row10_col1, #T_0b6d4_row11_col1 {\n",
        "  background-color: #f1ccb8;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row12_col1 {\n",
+       "#T_0b6d4_row12_col1 {\n",
        "  background-color: #f1cdba;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row13_col1 {\n",
+       "#T_0b6d4_row13_col1 {\n",
        "  background-color: #f0cdbb;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row14_col1 {\n",
+       "#T_0b6d4_row14_col1 {\n",
        "  background-color: #dedcdb;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row15_col1 {\n",
+       "#T_0b6d4_row15_col1 {\n",
        "  background-color: #d1dae9;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row16_col1 {\n",
+       "#T_0b6d4_row16_col1 {\n",
        "  background-color: #c0d4f5;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row17_col1, #T_d6e4e_row18_col1 {\n",
+       "#T_0b6d4_row17_col1, #T_0b6d4_row18_col1 {\n",
        "  background-color: #bfd3f6;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row19_col1 {\n",
+       "#T_0b6d4_row19_col1 {\n",
        "  background-color: #bcd2f7;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row20_col1 {\n",
+       "#T_0b6d4_row20_col1 {\n",
        "  background-color: #b5cdfa;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row21_col1 {\n",
+       "#T_0b6d4_row21_col1 {\n",
        "  background-color: #9ebeff;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row22_col1 {\n",
+       "#T_0b6d4_row22_col1 {\n",
        "  background-color: #9bbcff;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row23_col1 {\n",
+       "#T_0b6d4_row23_col1 {\n",
        "  background-color: #98b9ff;\n",
        "  color: #000000;\n",
        "}\n",
-       "#T_d6e4e_row24_col1 {\n",
+       "#T_0b6d4_row24_col1 {\n",
        "  background-color: #3b4cc0;\n",
        "  color: #f1f1f1;\n",
        "}\n",
        "</style>\n",
-       "<table id=\"T_d6e4e\">\n",
+       "<table id=\"T_0b6d4\">\n",
        "  <caption>Table 1: Logistic Regression Coefficients</caption>\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"blank level0\" >&nbsp;</th>\n",
-       "      <th id=\"T_d6e4e_level0_col0\" class=\"col_heading level0 col0\" >Feature</th>\n",
-       "      <th id=\"T_d6e4e_level0_col1\" class=\"col_heading level0 col1\" >Coefficient</th>\n",
+       "      <th id=\"T_0b6d4_level0_col0\" class=\"col_heading level0 col0\" >Feature</th>\n",
+       "      <th id=\"T_0b6d4_level0_col1\" class=\"col_heading level0 col1\" >Coefficient</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row0\" class=\"row_heading level0 row0\" >7</th>\n",
-       "      <td id=\"T_d6e4e_row0_col0\" class=\"data row0 col0\" >chest_pain_type_asymptomatic</td>\n",
-       "      <td id=\"T_d6e4e_row0_col1\" class=\"data row0 col1\" >1.242617</td>\n",
+       "      <th id=\"T_0b6d4_level0_row0\" class=\"row_heading level0 row0\" >7</th>\n",
+       "      <td id=\"T_0b6d4_row0_col0\" class=\"data row0 col0\" >chest_pain_type_asymptomatic</td>\n",
+       "      <td id=\"T_0b6d4_row0_col1\" class=\"data row0 col1\" >1.242617</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row1\" class=\"row_heading level0 row1\" >20</th>\n",
-       "      <td id=\"T_d6e4e_row1_col0\" class=\"data row1 col0\" >num_of_vessels_2.0</td>\n",
-       "      <td id=\"T_d6e4e_row1_col1\" class=\"data row1 col1\" >0.936761</td>\n",
+       "      <th id=\"T_0b6d4_level0_row1\" class=\"row_heading level0 row1\" >20</th>\n",
+       "      <td id=\"T_0b6d4_row1_col0\" class=\"data row1 col0\" >num_of_vessels_2.0</td>\n",
+       "      <td id=\"T_0b6d4_row1_col1\" class=\"data row1 col1\" >0.936761</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row2\" class=\"row_heading level0 row2\" >24</th>\n",
-       "      <td id=\"T_d6e4e_row2_col0\" class=\"data row2 col0\" >thalassemia_reversable defect</td>\n",
-       "      <td id=\"T_d6e4e_row2_col1\" class=\"data row2 col1\" >0.908607</td>\n",
+       "      <th id=\"T_0b6d4_level0_row2\" class=\"row_heading level0 row2\" >24</th>\n",
+       "      <td id=\"T_0b6d4_row2_col0\" class=\"data row2 col0\" >thalassemia_reversable defect</td>\n",
+       "      <td id=\"T_0b6d4_row2_col1\" class=\"data row2 col1\" >0.908607</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row3\" class=\"row_heading level0 row3\" >16</th>\n",
-       "      <td id=\"T_d6e4e_row3_col0\" class=\"data row3 col0\" >slope_flat</td>\n",
-       "      <td id=\"T_d6e4e_row3_col1\" class=\"data row3 col1\" >0.789179</td>\n",
+       "      <th id=\"T_0b6d4_level0_row3\" class=\"row_heading level0 row3\" >16</th>\n",
+       "      <td id=\"T_0b6d4_row3_col0\" class=\"data row3 col0\" >slope_flat</td>\n",
+       "      <td id=\"T_0b6d4_row3_col1\" class=\"data row3 col1\" >0.789179</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row4\" class=\"row_heading level0 row4\" >6</th>\n",
-       "      <td id=\"T_d6e4e_row4_col0\" class=\"data row4 col0\" >sex</td>\n",
-       "      <td id=\"T_d6e4e_row4_col1\" class=\"data row4 col1\" >0.467793</td>\n",
+       "      <th id=\"T_0b6d4_level0_row4\" class=\"row_heading level0 row4\" >6</th>\n",
+       "      <td id=\"T_0b6d4_row4_col0\" class=\"data row4 col0\" >sex</td>\n",
+       "      <td id=\"T_0b6d4_row4_col1\" class=\"data row4 col1\" >0.467793</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row5\" class=\"row_heading level0 row5\" >1</th>\n",
-       "      <td id=\"T_d6e4e_row5_col0\" class=\"data row5 col0\" >resting_blood_pressure</td>\n",
-       "      <td id=\"T_d6e4e_row5_col1\" class=\"data row5 col1\" >0.464303</td>\n",
+       "      <th id=\"T_0b6d4_level0_row5\" class=\"row_heading level0 row5\" >1</th>\n",
+       "      <td id=\"T_0b6d4_row5_col0\" class=\"data row5 col0\" >resting_blood_pressure</td>\n",
+       "      <td id=\"T_0b6d4_row5_col1\" class=\"data row5 col1\" >0.464303</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row6\" class=\"row_heading level0 row6\" >21</th>\n",
-       "      <td id=\"T_d6e4e_row6_col0\" class=\"data row6 col0\" >num_of_vessels_3.0</td>\n",
-       "      <td id=\"T_d6e4e_row6_col1\" class=\"data row6 col1\" >0.416970</td>\n",
+       "      <th id=\"T_0b6d4_level0_row6\" class=\"row_heading level0 row6\" >21</th>\n",
+       "      <td id=\"T_0b6d4_row6_col0\" class=\"data row6 col0\" >num_of_vessels_3.0</td>\n",
+       "      <td id=\"T_0b6d4_row6_col1\" class=\"data row6 col1\" >0.416970</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row7\" class=\"row_heading level0 row7\" >5</th>\n",
-       "      <td id=\"T_d6e4e_row7_col0\" class=\"data row7 col0\" >st_depression</td>\n",
-       "      <td id=\"T_d6e4e_row7_col1\" class=\"data row7 col1\" >0.400422</td>\n",
+       "      <th id=\"T_0b6d4_level0_row7\" class=\"row_heading level0 row7\" >5</th>\n",
+       "      <td id=\"T_0b6d4_row7_col0\" class=\"data row7 col0\" >st_depression</td>\n",
+       "      <td id=\"T_0b6d4_row7_col1\" class=\"data row7 col1\" >0.400422</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row8\" class=\"row_heading level0 row8\" >12</th>\n",
-       "      <td id=\"T_d6e4e_row8_col0\" class=\"data row8 col0\" >rest_ecg_left ventricular hypertrophy</td>\n",
-       "      <td id=\"T_d6e4e_row8_col1\" class=\"data row8 col1\" >0.268959</td>\n",
+       "      <th id=\"T_0b6d4_level0_row8\" class=\"row_heading level0 row8\" >12</th>\n",
+       "      <td id=\"T_0b6d4_row8_col0\" class=\"data row8 col0\" >rest_ecg_left ventricular hypertrophy</td>\n",
+       "      <td id=\"T_0b6d4_row8_col1\" class=\"data row8 col1\" >0.268959</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row9\" class=\"row_heading level0 row9\" >14</th>\n",
-       "      <td id=\"T_d6e4e_row9_col0\" class=\"data row9 col0\" >exercise_induced_angina_yes</td>\n",
-       "      <td id=\"T_d6e4e_row9_col1\" class=\"data row9 col1\" >0.210783</td>\n",
+       "      <th id=\"T_0b6d4_level0_row9\" class=\"row_heading level0 row9\" >14</th>\n",
+       "      <td id=\"T_0b6d4_row9_col0\" class=\"data row9 col0\" >exercise_induced_angina_yes</td>\n",
+       "      <td id=\"T_0b6d4_row9_col1\" class=\"data row9 col1\" >0.210783</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row10\" class=\"row_heading level0 row10\" >8</th>\n",
-       "      <td id=\"T_d6e4e_row10_col0\" class=\"data row10 col0\" >chest_pain_type_atypical angina</td>\n",
-       "      <td id=\"T_d6e4e_row10_col1\" class=\"data row10 col1\" >0.151616</td>\n",
+       "      <th id=\"T_0b6d4_level0_row10\" class=\"row_heading level0 row10\" >8</th>\n",
+       "      <td id=\"T_0b6d4_row10_col0\" class=\"data row10 col0\" >chest_pain_type_atypical angina</td>\n",
+       "      <td id=\"T_0b6d4_row10_col1\" class=\"data row10 col1\" >0.151616</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row11\" class=\"row_heading level0 row11\" >11</th>\n",
-       "      <td id=\"T_d6e4e_row11_col0\" class=\"data row11 col0\" >rest_ecg_ST-T wave abnormality</td>\n",
-       "      <td id=\"T_d6e4e_row11_col1\" class=\"data row11 col1\" >0.150904</td>\n",
+       "      <th id=\"T_0b6d4_level0_row11\" class=\"row_heading level0 row11\" >11</th>\n",
+       "      <td id=\"T_0b6d4_row11_col0\" class=\"data row11 col0\" >rest_ecg_ST-T wave abnormality</td>\n",
+       "      <td id=\"T_0b6d4_row11_col1\" class=\"data row11 col1\" >0.150904</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row12\" class=\"row_heading level0 row12\" >3</th>\n",
-       "      <td id=\"T_d6e4e_row12_col0\" class=\"data row12 col0\" >cholesterol</td>\n",
-       "      <td id=\"T_d6e4e_row12_col1\" class=\"data row12 col1\" >0.141675</td>\n",
+       "      <th id=\"T_0b6d4_level0_row12\" class=\"row_heading level0 row12\" >3</th>\n",
+       "      <td id=\"T_0b6d4_row12_col0\" class=\"data row12 col0\" >cholesterol</td>\n",
+       "      <td id=\"T_0b6d4_row12_col1\" class=\"data row12 col1\" >0.141675</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row13\" class=\"row_heading level0 row13\" >19</th>\n",
-       "      <td id=\"T_d6e4e_row13_col0\" class=\"data row13 col0\" >num_of_vessels_1.0</td>\n",
-       "      <td id=\"T_d6e4e_row13_col1\" class=\"data row13 col1\" >0.131598</td>\n",
+       "      <th id=\"T_0b6d4_level0_row13\" class=\"row_heading level0 row13\" >19</th>\n",
+       "      <td id=\"T_0b6d4_row13_col0\" class=\"data row13 col0\" >num_of_vessels_1.0</td>\n",
+       "      <td id=\"T_0b6d4_row13_col1\" class=\"data row13 col1\" >0.131598</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row14\" class=\"row_heading level0 row14\" >0</th>\n",
-       "      <td id=\"T_d6e4e_row14_col0\" class=\"data row14 col0\" >age</td>\n",
-       "      <td id=\"T_d6e4e_row14_col1\" class=\"data row14 col1\" >-0.089145</td>\n",
+       "      <th id=\"T_0b6d4_level0_row14\" class=\"row_heading level0 row14\" >0</th>\n",
+       "      <td id=\"T_0b6d4_row14_col0\" class=\"data row14 col0\" >age</td>\n",
+       "      <td id=\"T_0b6d4_row14_col1\" class=\"data row14 col1\" >-0.089145</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row15\" class=\"row_heading level0 row15\" >2</th>\n",
-       "      <td id=\"T_d6e4e_row15_col0\" class=\"data row15 col0\" >fasting_blood_sugar</td>\n",
-       "      <td id=\"T_d6e4e_row15_col1\" class=\"data row15 col1\" >-0.226329</td>\n",
+       "      <th id=\"T_0b6d4_level0_row15\" class=\"row_heading level0 row15\" >2</th>\n",
+       "      <td id=\"T_0b6d4_row15_col0\" class=\"data row15 col0\" >fasting_blood_sugar</td>\n",
+       "      <td id=\"T_0b6d4_row15_col1\" class=\"data row15 col1\" >-0.226329</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row16\" class=\"row_heading level0 row16\" >17</th>\n",
-       "      <td id=\"T_d6e4e_row16_col0\" class=\"data row16 col0\" >slope_upsloping</td>\n",
-       "      <td id=\"T_d6e4e_row16_col1\" class=\"data row16 col1\" >-0.373995</td>\n",
+       "      <th id=\"T_0b6d4_level0_row16\" class=\"row_heading level0 row16\" >17</th>\n",
+       "      <td id=\"T_0b6d4_row16_col0\" class=\"data row16 col0\" >slope_upsloping</td>\n",
+       "      <td id=\"T_0b6d4_row16_col1\" class=\"data row16 col1\" >-0.373995</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row17\" class=\"row_heading level0 row17\" >15</th>\n",
-       "      <td id=\"T_d6e4e_row17_col0\" class=\"data row17 col0\" >slope_downsloping</td>\n",
-       "      <td id=\"T_d6e4e_row17_col1\" class=\"data row17 col1\" >-0.385061</td>\n",
+       "      <th id=\"T_0b6d4_level0_row17\" class=\"row_heading level0 row17\" >15</th>\n",
+       "      <td id=\"T_0b6d4_row17_col0\" class=\"data row17 col0\" >slope_downsloping</td>\n",
+       "      <td id=\"T_0b6d4_row17_col1\" class=\"data row17 col1\" >-0.385061</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row18\" class=\"row_heading level0 row18\" >13</th>\n",
-       "      <td id=\"T_d6e4e_row18_col0\" class=\"data row18 col0\" >rest_ecg_normal</td>\n",
-       "      <td id=\"T_d6e4e_row18_col1\" class=\"data row18 col1\" >-0.389740</td>\n",
+       "      <th id=\"T_0b6d4_level0_row18\" class=\"row_heading level0 row18\" >13</th>\n",
+       "      <td id=\"T_0b6d4_row18_col0\" class=\"data row18 col0\" >rest_ecg_normal</td>\n",
+       "      <td id=\"T_0b6d4_row18_col1\" class=\"data row18 col1\" >-0.389740</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row19\" class=\"row_heading level0 row19\" >22</th>\n",
-       "      <td id=\"T_d6e4e_row19_col0\" class=\"data row19 col0\" >thalassemia_fixed defect</td>\n",
-       "      <td id=\"T_d6e4e_row19_col1\" class=\"data row19 col1\" >-0.411370</td>\n",
+       "      <th id=\"T_0b6d4_level0_row19\" class=\"row_heading level0 row19\" >22</th>\n",
+       "      <td id=\"T_0b6d4_row19_col0\" class=\"data row19 col0\" >thalassemia_fixed defect</td>\n",
+       "      <td id=\"T_0b6d4_row19_col1\" class=\"data row19 col1\" >-0.411370</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row20\" class=\"row_heading level0 row20\" >23</th>\n",
-       "      <td id=\"T_d6e4e_row20_col0\" class=\"data row20 col0\" >thalassemia_normal</td>\n",
-       "      <td id=\"T_d6e4e_row20_col1\" class=\"data row20 col1\" >-0.467113</td>\n",
+       "      <th id=\"T_0b6d4_level0_row20\" class=\"row_heading level0 row20\" >23</th>\n",
+       "      <td id=\"T_0b6d4_row20_col0\" class=\"data row20 col0\" >thalassemia_normal</td>\n",
+       "      <td id=\"T_0b6d4_row20_col1\" class=\"data row20 col1\" >-0.467113</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row21\" class=\"row_heading level0 row21\" >4</th>\n",
-       "      <td id=\"T_d6e4e_row21_col0\" class=\"data row21 col0\" >max_heart_rate</td>\n",
-       "      <td id=\"T_d6e4e_row21_col1\" class=\"data row21 col1\" >-0.648489</td>\n",
+       "      <th id=\"T_0b6d4_level0_row21\" class=\"row_heading level0 row21\" >4</th>\n",
+       "      <td id=\"T_0b6d4_row21_col0\" class=\"data row21 col0\" >max_heart_rate</td>\n",
+       "      <td id=\"T_0b6d4_row21_col1\" class=\"data row21 col1\" >-0.648489</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row22\" class=\"row_heading level0 row22\" >9</th>\n",
-       "      <td id=\"T_d6e4e_row22_col0\" class=\"data row22 col0\" >chest_pain_type_non-anginal pain</td>\n",
-       "      <td id=\"T_d6e4e_row22_col1\" class=\"data row22 col1\" >-0.670571</td>\n",
+       "      <th id=\"T_0b6d4_level0_row22\" class=\"row_heading level0 row22\" >9</th>\n",
+       "      <td id=\"T_0b6d4_row22_col0\" class=\"data row22 col0\" >chest_pain_type_non-anginal pain</td>\n",
+       "      <td id=\"T_0b6d4_row22_col1\" class=\"data row22 col1\" >-0.670571</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row23\" class=\"row_heading level0 row23\" >10</th>\n",
-       "      <td id=\"T_d6e4e_row23_col0\" class=\"data row23 col0\" >chest_pain_type_typical angina</td>\n",
-       "      <td id=\"T_d6e4e_row23_col1\" class=\"data row23 col1\" >-0.693539</td>\n",
+       "      <th id=\"T_0b6d4_level0_row23\" class=\"row_heading level0 row23\" >10</th>\n",
+       "      <td id=\"T_0b6d4_row23_col0\" class=\"data row23 col0\" >chest_pain_type_typical angina</td>\n",
+       "      <td id=\"T_0b6d4_row23_col1\" class=\"data row23 col1\" >-0.693539</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_d6e4e_level0_row24\" class=\"row_heading level0 row24\" >18</th>\n",
-       "      <td id=\"T_d6e4e_row24_col0\" class=\"data row24 col0\" >num_of_vessels_0.0</td>\n",
-       "      <td id=\"T_d6e4e_row24_col1\" class=\"data row24 col1\" >-1.455206</td>\n",
+       "      <th id=\"T_0b6d4_level0_row24\" class=\"row_heading level0 row24\" >18</th>\n",
+       "      <td id=\"T_0b6d4_row24_col0\" class=\"data row24 col0\" >num_of_vessels_0.0</td>\n",
+       "      <td id=\"T_0b6d4_row24_col1\" class=\"data row24 col1\" >-1.455206</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
       ],
       "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x155eded20>"
+       "<pandas.io.formats.style.Styler at 0x145f2acd0>"
       ]
      },
-     "execution_count": 46,
+     "execution_count": 47,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3785,7 +3815,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [
     {
@@ -3813,7 +3843,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 49,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3824,7 +3854,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 50,
    "metadata": {},
    "outputs": [
     {
@@ -3834,7 +3864,7 @@
        "       [ 8, 29]])"
       ]
      },
-     "execution_count": 49,
+     "execution_count": 50,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3854,7 +3884,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 51,
    "metadata": {},
    "outputs": [
     {
@@ -3913,7 +3943,7 @@
        "accuracy   0.822222  0.822222  0.822222"
       ]
      },
-     "execution_count": 50,
+     "execution_count": 51,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3930,122 +3960,6 @@
     "loges_report_filtered"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "FileNotFoundError",
-     "evalue": "[Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[17], line 55\u001b[0m\n\u001b[1;32m     53\u001b[0m \u001b[38;5;66;03m# Load your dataset\u001b[39;00m\n\u001b[1;32m     54\u001b[0m file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/raw_heart_disease_data.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m  \u001b[38;5;66;03m# Update with the path to your dataset\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     57\u001b[0m \u001b[38;5;66;03m# Validate the dataset against the schema\u001b[39;00m\n\u001b[1;32m     58\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m   1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m   1014\u001b[0m     dialect,\n\u001b[1;32m   1015\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1022\u001b[0m     dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m   1023\u001b[0m )\n\u001b[1;32m   1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m    619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m    623\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1617\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1878\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1879\u001b[0m         mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1881\u001b[0m \u001b[43m    \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1882\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1883\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1884\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1885\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1886\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1887\u001b[0m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1888\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   
1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    869\u001b[0m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    870\u001b[0m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    871\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m    872\u001b[0m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m    874\u001b[0m             handle,\n\u001b[1;32m    875\u001b[0m             ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m    876\u001b[0m             encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m    877\u001b[0m             errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m    878\u001b[0m             newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m    879\u001b[0m         )\n\u001b[1;32m    880\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    881\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m    882\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
-      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'"
-     ]
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import pandera as pa\n",
-    "from pandera import Column, Check, DataFrameSchema\n",
-    "\n",
-    "# Define the data validation schema\n",
-    "schema = DataFrameSchema(\n",
-    "    {\n",
-    "        # Validate numeric columns\n",
-    "        \"age\": Column(int, Check.between(0, 120), nullable=False),\n",
-    "        \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n",
-    "        \"chest_pain_type\": Column(\n",
-    "            str, \n",
-    "            Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]), \n",
-    "            nullable=False\n",
-    "        ),\n",
-    "        \"resting_blood_pressure\": Column(int, Check.between(50, 250), nullable=False),\n",
-    "        \"cholesterol\": Column(int, Check.between(100, 600), nullable=False),\n",
-    "        \"fasting_blood_sugar\": Column(int, Check.isin([0, 1]), nullable=False),\n",
-    "        \"rest_ecg\": Column(\n",
-    "            str, \n",
-    "            Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n",
-    "            nullable=False\n",
-    "        ),\n",
-    "        \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n",
-    "        \"exercise_induced_angina\": Column(str, Check.isin([\"yes\", \"no\"]), nullable=False),\n",
-    "        \"st_depression\": Column(float, Check.between(0.0, 10.0), nullable=True),\n",
-    "        \"slope\": Column(\n",
-    "            str, \n",
-    "            Check.isin([\"upsloping\", \"flat\", \"downsloping\"]), \n",
-    "            nullable=False\n",
-    "        ),\n",
-    "        \"num_of_vessels\": Column(\n",
-    "            float, \n",
-    "            Check(lambda x: x.isna() | ((x >= 0) & (x <= 4)), element_wise=True), \n",
-    "            nullable=True\n",
-    "        ),\n",
-    "        \"thalassemia\": Column(\n",
-    "            str, \n",
-    "            Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]), \n",
-    "            nullable=True\n",
-    "        ),\n",
-    "        \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n",
-    "    },\n",
-    "    # Additional checks for the entire DataFrame\n",
-    "    checks=[\n",
-    "        # Ensure no duplicate rows\n",
-    "        Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n",
-    "        # Ensure no empty rows\n",
-    "        Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\"),\n",
-    "    ]\n",
-    ")\n",
-    "\n",
-    "# Load your dataset\n",
-    "file_path = \"data/raw_heart_disease_data.csv\"  # Update with the path to your dataset\n",
-    "data = pd.read_csv(file_path)\n",
-    "\n",
-    "# Validate the dataset against the schema\n",
-    "try:\n",
-    "    validated_data = schema.validate(data)\n",
-    "    print(\"Data validation passed successfully!\")\n",
-    "except pa.errors.SchemaError as e:\n",
-    "    print(f\"Data validation failed: {e}\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Changing column correct Column Names**\n",
-    "column_mapping = {\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# No empty observations \n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# missingness \n"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},

	age	sex	chest_pain_type	resting_blood_pressure	cholesterol	fasting_blood_sugar	rest_ecg	max_heart_rate	exercise_induced_angina	st_depression	slope	num_of_vessels	thalassemia	diagnosis
0	63	1	1	145	233	1	2	150	0	2.3	3	0.0	6.0	0
1	67	1	4	160	286	0	2	108	1	1.5	2	3.0	3.0	2
2	67	1	4	120	229	0	2	129	1	2.6	2	2.0	7.0	1
3	37	1	3	130	250	0	0	187	0	3.5	3	0.0	3.0	0
4	41	0	2	130	204	0	2	172	0	1.4	1	0.0	3.0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
298	45	1	1	110	264	0	0	132	0	1.2	2	0.0	7.0	1
299	68	1	4	144	193	1	0	141	0	3.4	2	2.0	7.0	2
300	57	1	4	130	131	0	0	115	1	1.2	2	1.0	7.0	3
301	57	0	2	130	236	0	2	174	0	0.0	2	1.0	3.0	1
302	38	1	3	138	175	0	0	173	0	0.0	1	NaN	3.0	0
	age	sex	chest_pain_type	resting_blood_pressure	cholesterol	fasting_blood_sugar	rest_ecg	max_heart_rate	exercise_induced_angina	st_depression	slope	num_of_vessels	thalassemia	diagnosis
0	63	1	1	145	233	1	2	150	0	2.3	3	0.0	6.0	0
1	67	1	4	160	286	0	2	108	1	1.5	2	3.0	3.0	2
2	67	1	4	120	229	0	2	129	1	2.6	2	2.0	7.0	1
3	37	1	3	130	250	0	0	187	0	3.5	3	0.0	3.0	0
4	41	0	2	130	204	0	2	172	0	1.4	1	0.0	3.0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
298	45	1	1	110	264	0	0	132	0	1.2	2	0.0	7.0	1
299	68	1	4	144	193	1	0	141	0	3.4	2	2.0	7.0	2
300	57	1	4	130	131	0	0	115	1	1.2	2	1.0	7.0	3
301	57	0	2	130	236	0	2	174	0	0.0	2	1.0	3.0	1
302	38	1	3	138	175	0	0	173	0	0.0	1	NaN	3.0	0
	Feature	Coefficient	Feature	Coefficient
7	chest_pain_type_asymptomatic	1.242617	7	chest_pain_type_asymptomatic	1.242617
20	num_of_vessels_2.0	0.936761	20	num_of_vessels_2.0	0.936761
24	thalassemia_reversable defect	0.908607	24	thalassemia_reversable defect	0.908607
16	slope_flat	0.789179	16	slope_flat	0.789179
6	sex	0.467793	6	sex	0.467793
1	resting_blood_pressure	0.464303	1	resting_blood_pressure	0.464303
21	num_of_vessels_3.0	0.416970	21	num_of_vessels_3.0	0.416970
5	st_depression	0.400422	5	st_depression	0.400422
12	rest_ecg_left ventricular hypertrophy	0.268959	12	rest_ecg_left ventricular hypertrophy	0.268959
14	exercise_induced_angina_yes	0.210783	14	exercise_induced_angina_yes	0.210783
8	chest_pain_type_atypical angina	0.151616	8	chest_pain_type_atypical angina	0.151616
11	rest_ecg_ST-T wave abnormality	0.150904	11	rest_ecg_ST-T wave abnormality	0.150904
3	cholesterol	0.141675	3	cholesterol	0.141675
19	num_of_vessels_1.0	0.131598	19	num_of_vessels_1.0	0.131598
0	age	-0.089145	0	age	-0.089145
2	fasting_blood_sugar	-0.226329	2	fasting_blood_sugar	-0.226329
17	slope_upsloping	-0.373995	17	slope_upsloping	-0.373995
15	slope_downsloping	-0.385061	15	slope_downsloping	-0.385061
13	rest_ecg_normal	-0.389740	13	rest_ecg_normal	-0.389740
22	thalassemia_fixed defect	-0.411370	22	thalassemia_fixed defect	-0.411370
23	thalassemia_normal	-0.467113	23	thalassemia_normal	-0.467113
4	max_heart_rate	-0.648489	4	max_heart_rate	-0.648489
9	chest_pain_type_non-anginal pain	-0.670571	9	chest_pain_type_non-anginal pain	-0.670571
10	chest_pain_type_typical angina	-0.693539	10	chest_pain_type_typical angina	-0.693539
18	num_of_vessels_0.0	-1.455206	18	num_of_vessels_0.0	-1.455206