From 9b7d405455c9c8a4b71c3d51dbd352d762c246ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:11:46 -0800 Subject: [PATCH 01/10] Added dataframeschema for data validation --- report/heart_disease_predictor_report.ipynb | 504 ++++++-------------- 1 file changed, 159 insertions(+), 345 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index 5ab044f..cb96fdd 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -101,12 +101,13 @@ "import pandas as pd\n", "import warnings\n", "\n", + "\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -142,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -405,7 +406,7 @@ "[303 rows x 14 columns]" ] }, - "execution_count": 3, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -417,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -433,321 +434,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
agesexchest_pain_typeresting_blood_pressurecholesterolfasting_blood_sugarrest_ecgmax_heart_rateexercise_induced_anginast_depressionslopenum_of_vesselsthalassemiadiagnosis
063111452331215002.330.06.00
167141602860210811.523.03.02
267141202290212912.622.07.01
337131302500018703.530.03.00
441021302040217201.410.03.00
.............................................
29845111102640013201.220.07.01
29968141441931014103.422.07.02
30057141301310011511.221.07.03
30157021302360217400.021.03.01
30238131381750017300.01NaN3.00
\n", - "

303 rows × 14 columns

\n", - "
" - ], - "text/plain": [ - " age sex chest_pain_type resting_blood_pressure cholesterol \\\n", - "0 63 1 1 145 233 \n", - "1 67 1 4 160 286 \n", - "2 67 1 4 120 229 \n", - "3 37 1 3 130 250 \n", - "4 41 0 2 130 204 \n", - ".. ... ... ... ... ... \n", - "298 45 1 1 110 264 \n", - "299 68 1 4 144 193 \n", - "300 57 1 4 130 131 \n", - "301 57 0 2 130 236 \n", - "302 38 1 3 138 175 \n", - "\n", - " fasting_blood_sugar rest_ecg max_heart_rate exercise_induced_angina \\\n", - "0 1 2 150 0 \n", - "1 0 2 108 1 \n", - "2 0 2 129 1 \n", - "3 0 0 187 0 \n", - "4 0 2 172 0 \n", - ".. ... ... ... ... \n", - "298 0 0 132 0 \n", - "299 1 0 141 0 \n", - "300 0 0 115 1 \n", - "301 0 2 174 0 \n", - "302 0 0 173 0 \n", - "\n", - " st_depression slope num_of_vessels thalassemia diagnosis \n", - "0 2.3 3 0.0 6.0 0 \n", - "1 1.5 2 3.0 3.0 2 \n", - "2 2.6 2 2.0 7.0 1 \n", - "3 3.5 3 0.0 3.0 0 \n", - "4 1.4 1 0.0 3.0 0 \n", - ".. ... ... ... ... ... \n", - "298 1.2 2 0.0 7.0 1 \n", - "299 3.4 2 2.0 7.0 2 \n", - "300 1.2 2 1.0 7.0 3 \n", - "301 0.0 2 1.0 3.0 1 \n", - "302 0.0 1 NaN 3.0 0 \n", - "\n", - "[303 rows x 14 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "age int64\n", - "sex int64\n", - "chest_pain_type int64\n", - "resting_blood_pressure int64\n", - "cholesterol int64\n", - "fasting_blood_sugar int64\n", - "rest_ecg int64\n", - "max_heart_rate int64\n", - "exercise_induced_angina int64\n", - "st_depression float64\n", - "slope int64\n", - "num_of_vessels float64\n", - "thalassemia float64\n", - "diagnosis int64\n", - "dtype: object" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Check datatypes\n", "\n", @@ -1348,6 +1046,35 @@ "df.isnull().sum()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Validation" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "schema = pa.DataFrameSchema(\n", + " {\n", + " # Numeric columns with no missing values allowed\n", + " \"age\": Column(int, Check.between(0, 120), nullable=False),\n", + " \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n", + " \n", + " # Categorical columns with string types\n", + " \"chest_pain_type\": Column(\n", + " str,\n", + " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n", + " nullable=False\n", + " )\n", + " }\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2025,36 +1752,7 @@ } ], "source": [ - "#splitting the features \n", - "\n", - "numeric_features = [\n", - " \"age\", \n", - " \"resting_blood_pressure\", \n", - " \"fasting_blood_sugar\", \n", - " \"cholesterol\", \n", - " \"max_heart_rate\", \n", - " \"st_depression\", \n", - " \"sex\"\n", - "]\n", - "categorical_features = [\n", - " \"chest_pain_type\", \n", - " \"rest_ecg\", \n", - " \"exercise_induced_angina\", \n", - " \"slope\", \n", - " \"num_of_vessels\", \n", - " \"thalassemia\"\n", - "]\n", - "\n", - "numeric_transformer = StandardScaler()\n", - "categorical_transformer = OneHotEncoder(drop=\"if_binary\", handle_unknown=\"ignore\")\n", - "\n", - "# Create Column transformer\n", - "preprocessor = make_column_transformer(\n", - " (numeric_transformer, numeric_features),\n", - " (categorical_transformer, categorical_features)\n", - ")\n", - "\n", - "preprocessor\n" + "\n" ] }, { @@ -3875,6 +3573,122 @@ "loges_report_filtered" ] }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 55\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load your dataset\u001b[39;00m\n\u001b[1;32m 54\u001b[0m file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/raw_heart_disease_data.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# Update with the path to your dataset\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# Validate the dataset against the schema\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 876\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 877\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import pandera as pa\n", + "from pandera import Column, Check, DataFrameSchema\n", + "\n", + "# Define the data validation schema\n", + "schema = DataFrameSchema(\n", + " {\n", + " # Validate numeric columns\n", + " \"age\": Column(int, Check.between(0, 120), nullable=False),\n", + " \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n", + " \"chest_pain_type\": Column(\n", + " str, \n", + " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]), \n", + " nullable=False\n", + " ),\n", + " \"resting_blood_pressure\": Column(int, Check.between(50, 250), nullable=False),\n", + " \"cholesterol\": Column(int, Check.between(100, 600), nullable=False),\n", + " \"fasting_blood_sugar\": Column(int, Check.isin([0, 1]), nullable=False),\n", + " \"rest_ecg\": Column(\n", + " str, \n", + " Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n", + " nullable=False\n", + " ),\n", + " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n", + " \"exercise_induced_angina\": Column(str, Check.isin([\"yes\", \"no\"]), nullable=False),\n", + " \"st_depression\": Column(float, Check.between(0.0, 10.0), nullable=True),\n", + " \"slope\": Column(\n", + " str, \n", + " Check.isin([\"upsloping\", \"flat\", \"downsloping\"]), \n", + " nullable=False\n", + " ),\n", + " \"num_of_vessels\": Column(\n", + " float, \n", + " Check(lambda x: x.isna() | ((x >= 0) & (x <= 4)), element_wise=True), \n", + " nullable=True\n", + " ),\n", + " \"thalassemia\": Column(\n", + " str, \n", + " Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]), \n", + " nullable=True\n", + " ),\n", + " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n", + " },\n", + " # Additional checks for the entire DataFrame\n", + " checks=[\n", + " # Ensure no duplicate rows\n", + " Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n", + " # Ensure no empty rows\n", + " Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\"),\n", + " ]\n", + ")\n", + "\n", + "# Load your dataset\n", + "file_path = \"data/raw_heart_disease_data.csv\" # Update with the path to your dataset\n", + "data = pd.read_csv(file_path)\n", + "\n", + "# Validate the dataset against the schema\n", + "try:\n", + " validated_data = schema.validate(data)\n", + " print(\"Data validation passed successfully!\")\n", + "except pa.errors.SchemaError as e:\n", + " print(f\"Data validation failed: {e}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing column correct Column Names**\n", + "column_mapping = {\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# No empty observations \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# missingness \n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -3938,9 +3752,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:heart_disease_522]", + "display_name": "Python [conda env:base] *", "language": "python", - "name": "conda-env-heart_disease_522-py" + "name": "conda-base-py" }, "language_info": { "codemirror_mode": { @@ -3952,7 +3766,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.11.10" } }, "nbformat": 4, From cfcb46c00a416d8cfa4fa91196adfe2b2f2209bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:15:08 -0800 Subject: [PATCH 02/10] Add numeric columns with range checks and no missing values --- report/heart_disease_predictor_report.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index cb96fdd..f08757a 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1070,7 +1070,9 @@ " str,\n", " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n", " nullable=False\n", - " )\n", + " \n", + " \n", + " )\n", " }\n", ")" ] From fa02d0ae971c53f93bb8f14648edc305ce069f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:17:32 -0800 Subject: [PATCH 03/10] Checking catergorical columns for data validation --- report/heart_disease_predictor_report.ipynb | 28 ++++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index f08757a..f4fc198 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1055,26 +1055,36 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "schema = pa.DataFrameSchema(\n", " {\n", " # Numeric columns with no missing values allowed\n", - " \"age\": Column(int, Check.between(0, 120), nullable=False),\n", - " \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n", - " \n", + " \"age\": pa.Column(int, pa.Check.between(0, 120), nullable=False),\n", + " \"sex\": pa.Column(int, pa.Check.isin([0, 1]), nullable=False),\n", + "\n", " # Categorical columns with string types\n", - " \"chest_pain_type\": Column(\n", + " \"chest_pain_type\": pa.Column(\n", " str,\n", - " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n", + " pa.Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]),\n", + " nullable=False\n", + " ),\n", + "\n", + " # Numeric columns with range checks and no missing values\n", + " \"resting_blood_pressure\": pa.Column(int, pa.Check.between(50, 250), nullable=False),\n", + " \"cholesterol\": pa.Column(int, pa.Check.between(100, 600), nullable=False),\n", + " \"fasting_blood_sugar\": pa.Column(int, pa.Check.isin([0, 1]), nullable=False),\n", + "\n", + " # Categorical columns\n", + " \"rest_ecg\": pa.Column(\n", + " str,\n", + " pa.Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n", " nullable=False\n", - " \n", - " \n", " )\n", " }\n", - ")" + ")\n" ] }, { From c2b44ef9ccc421e7297943ab77d3596ccf6e11bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:18:56 -0800 Subject: [PATCH 04/10] Add numeric columns with range checks --- report/heart_disease_predictor_report.ipynb | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index f4fc198..8efef11 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1055,9 +1055,18 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "closing parenthesis ')' does not match opening parenthesis '{' on line 2 (3202913183.py, line 27)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[31], line 27\u001b[0;36m\u001b[0m\n\u001b[0;31m )\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m closing parenthesis ')' does not match opening parenthesis '{' on line 2\n" + ] + } + ], "source": [ "schema = pa.DataFrameSchema(\n", " {\n", @@ -1082,6 +1091,9 @@ " str,\n", " pa.Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n", " nullable=False\n", + " ),\n", + " # Numeric columns with range checks\n", + " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n", " )\n", " }\n", ")\n" From b5aa95813ecbf813314aa2b694a2488578b4bbe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:31:14 -0800 Subject: [PATCH 05/10] Add correct data types in each column --- report/heart_disease_predictor_report.ipynb | 28 ++++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index 8efef11..da69a2c 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1055,18 +1055,9 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 36, "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "closing parenthesis ')' does not match opening parenthesis '{' on line 2 (3202913183.py, line 27)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[31], line 27\u001b[0;36m\u001b[0m\n\u001b[0;31m )\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m closing parenthesis ')' does not match opening parenthesis '{' on line 2\n" - ] - } - ], + "outputs": [], "source": [ "schema = pa.DataFrameSchema(\n", " {\n", @@ -1094,7 +1085,20 @@ " ),\n", " # Numeric columns with range checks\n", " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n", - " )\n", + " \n", + " # Categorical column with a string type\n", + " \"exercise_induced_angina\": Column(\n", + " str,\n", + " Check.isin([\"yes\", \"no\"]),\n", + " nullable=False), \n", + " \n", + " # Numeric columns with missing values allowed up to 5%\n", + " \"st_depression\": Column(\n", + " float,\n", + " Check.between(0.0, 10.0),\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", + " error=\"Too many null values in 'st_depression' column.\"),\n", + " nullable=True),\n", " }\n", ")\n" ] From 8628739f12e885aa327a8f8583a7c210cbcbc086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:32:32 -0800 Subject: [PATCH 06/10] Added correct data types in different column --- report/heart_disease_predictor_report.ipynb | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index da69a2c..aa6ebe6 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1055,7 +1055,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1099,6 +1099,14 @@ " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", " error=\"Too many null values in 'st_depression' column.\"),\n", " nullable=True),\n", + " \n", + " \"num_of_vessels\": Column(\n", + " float,\n", + " Check(lambda s: s.isna() | ((s >= 0) & (s <= 4)), element_wise=True),\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", + " error=\"Too many null values in 'num_of_vessels' column.\"),\n", + " nullable=True),\n", + " \n", " }\n", ")\n" ] From c4158780ae1f8eb82217b57f7e5368c5b914f43b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:34:15 -0800 Subject: [PATCH 07/10] Added 5% missingness threshold --- report/heart_disease_predictor_report.ipynb | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index aa6ebe6..757407f 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1055,7 +1055,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -1106,7 +1106,14 @@ " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", " error=\"Too many null values in 'num_of_vessels' column.\"),\n", " nullable=True),\n", - " \n", + " \n", + " # Categorical column with missing values allowed up to 5%\n", + " \"thalassemia\": Column(\n", + " str,\n", + " Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]),\n", + " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", + " error=\"Too many null values in 'thalassemia' column.\"),\n", + " nullable=True) \n", " }\n", ")\n" ] From f60b8a74c8fddcc3ebfba6d4a6f460d21aeed8e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 14:36:08 -0800 Subject: [PATCH 08/10] Checked for duplicate and empty rows --- report/heart_disease_predictor_report.ipynb | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index 757407f..a974996 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -1055,7 +1055,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -1113,8 +1113,17 @@ " Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]),\n", " Check(lambda s: s.isna().mean() <= 0.05, element_wise=False, \n", " error=\"Too many null values in 'thalassemia' column.\"),\n", - " nullable=True) \n", - " }\n", + " nullable=True), \n", + " \n", + " # Target column\n", + " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n", + " },\n", + " checks=[\n", + " # Check for duplicate rows\n", + " Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n", + " # Check for empty rows\n", + " Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\")\n", + " ]\n", ")\n" ] }, From 67b19817423f3068a54c7f48852eccc742497b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 15:46:02 -0800 Subject: [PATCH 09/10] Add data validation from 1-6 --- report/heart_disease_predictor_report.ipynb | 335 +++++++++++++++++++- 1 file changed, 320 insertions(+), 15 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index a974996..3624be0 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -101,13 +101,12 @@ "import pandas as pd\n", "import warnings\n", "\n", - "\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -143,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -406,7 +405,7 @@ "[303 rows x 14 columns]" ] }, - "execution_count": 13, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -418,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -434,18 +433,321 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesexchest_pain_typeresting_blood_pressurecholesterolfasting_blood_sugarrest_ecgmax_heart_rateexercise_induced_anginast_depressionslopenum_of_vesselsthalassemiadiagnosis
063111452331215002.330.06.00
167141602860210811.523.03.02
267141202290212912.622.07.01
337131302500018703.530.03.00
441021302040217201.410.03.00
.............................................
29845111102640013201.220.07.01
29968141441931014103.422.07.02
30057141301310011511.221.07.03
30157021302360217400.021.03.01
30238131381750017300.01NaN3.00
\n", + "

303 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " age sex chest_pain_type resting_blood_pressure cholesterol \\\n", + "0 63 1 1 145 233 \n", + "1 67 1 4 160 286 \n", + "2 67 1 4 120 229 \n", + "3 37 1 3 130 250 \n", + "4 41 0 2 130 204 \n", + ".. ... ... ... ... ... \n", + "298 45 1 1 110 264 \n", + "299 68 1 4 144 193 \n", + "300 57 1 4 130 131 \n", + "301 57 0 2 130 236 \n", + "302 38 1 3 138 175 \n", + "\n", + " fasting_blood_sugar rest_ecg max_heart_rate exercise_induced_angina \\\n", + "0 1 2 150 0 \n", + "1 0 2 108 1 \n", + "2 0 2 129 1 \n", + "3 0 0 187 0 \n", + "4 0 2 172 0 \n", + ".. ... ... ... ... \n", + "298 0 0 132 0 \n", + "299 1 0 141 0 \n", + "300 0 0 115 1 \n", + "301 0 2 174 0 \n", + "302 0 0 173 0 \n", + "\n", + " st_depression slope num_of_vessels thalassemia diagnosis \n", + "0 2.3 3 0.0 6.0 0 \n", + "1 1.5 2 3.0 3.0 2 \n", + "2 2.6 2 2.0 7.0 1 \n", + "3 3.5 3 0.0 3.0 0 \n", + "4 1.4 1 0.0 3.0 0 \n", + ".. ... ... ... ... ... \n", + "298 1.2 2 0.0 7.0 1 \n", + "299 3.4 2 2.0 7.0 2 \n", + "300 1.2 2 1.0 7.0 3 \n", + "301 0.0 2 1.0 3.0 1 \n", + "302 0.0 1 NaN 3.0 0 \n", + "\n", + "[303 rows x 14 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "age int64\n", + "sex int64\n", + "chest_pain_type int64\n", + "resting_blood_pressure int64\n", + "cholesterol int64\n", + "fasting_blood_sugar int64\n", + "rest_ecg int64\n", + "max_heart_rate int64\n", + "exercise_induced_angina int64\n", + "st_depression float64\n", + "slope int64\n", + "num_of_vessels float64\n", + "thalassemia float64\n", + "diagnosis int64\n", + "dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Check datatypes\n", "\n", @@ -1055,10 +1357,13 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ + "import pandera as pa \n", + "from pandera import Column, Check, DataFrameSchema\n", + "\n", "schema = pa.DataFrameSchema(\n", " {\n", " # Numeric columns with no missing values allowed\n", @@ -1119,7 +1424,7 @@ " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n", " },\n", " checks=[\n", - " # Check for duplicate rows\n", + " # Check for8\n", " Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n", " # Check for empty rows\n", " Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\")\n", @@ -3804,9 +4109,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:base] *", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "conda-base-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -3818,7 +4123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.11.9" } }, "nbformat": 4, From 684549be8c5c6bd202cf5840d01555b0bbff9f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCeline?= Date: Thu, 28 Nov 2024 15:55:55 -0800 Subject: [PATCH 10/10] Add missing content --- report/heart_disease_predictor_report.ipynb | 450 ++++++++------------ 1 file changed, 182 insertions(+), 268 deletions(-) diff --git a/report/heart_disease_predictor_report.ipynb b/report/heart_disease_predictor_report.ipynb index 3624be0..e7197bb 100644 --- a/report/heart_disease_predictor_report.ipynb +++ b/report/heart_disease_predictor_report.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -1357,7 +1357,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -1441,7 +1441,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1457,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1467,7 +1467,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1476,7 +1476,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1526,7 +1526,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1550,7 +1550,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1559,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1625,7 +1625,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1660,7 +1660,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -2103,18 +2103,48 @@ " 'num_of_vessels', 'thalassemia'])])" ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\n" + "\n", + "#splitting the features \n", + "\n", + "numeric_features = [\n", + " \"age\", \n", + " \"resting_blood_pressure\", \n", + " \"fasting_blood_sugar\", \n", + " \"cholesterol\", \n", + " \"max_heart_rate\", \n", + " \"st_depression\", \n", + " \"sex\"\n", + "]\n", + "categorical_features = [\n", + " \"chest_pain_type\", \n", + " \"rest_ecg\", \n", + " \"exercise_induced_angina\", \n", + " \"slope\", \n", + " \"num_of_vessels\", \n", + " \"thalassemia\"\n", + "]\n", + "\n", + "numeric_transformer = StandardScaler()\n", + "categorical_transformer = OneHotEncoder(drop=\"if_binary\", handle_unknown=\"ignore\")\n", + "\n", + "# Create Column transformer\n", + "preprocessor = make_column_transformer(\n", + " (numeric_transformer, numeric_features),\n", + " (categorical_transformer, categorical_features)\n", + ")\n", + "\n", + "preprocessor\n" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -2127,7 +2157,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2539,7 +2569,7 @@ "[207 rows x 25 columns]" ] }, - "execution_count": 34, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2555,7 +2585,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2574,7 +2604,7 @@ " dtype='object')" ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2585,7 +2615,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -2614,7 +2644,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2645,12 +2675,12 @@ " \n", " \n", " fit_time\n", - " 0.011\n", - " 0.004\n", + " 0.015\n", + " 0.005\n", " \n", " \n", " score_time\n", - " 0.012\n", + " 0.014\n", " 0.002\n", " \n", " \n", @@ -2699,8 +2729,8 @@ ], "text/plain": [ " mean std\n", - "fit_time 0.011 0.004\n", - "score_time 0.012 0.002\n", + "fit_time 0.015 0.005\n", + "score_time 0.014 0.002\n", "test_accuracy 0.677 0.123\n", "train_accuracy 1.000 0.000\n", "test_precision 0.659 0.111\n", @@ -2711,7 +2741,7 @@ "train_f1 1.000 0.000" ] }, - "execution_count": 37, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2731,7 +2761,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -2744,7 +2774,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -2754,7 +2784,7 @@ " [ 6, 31]])" ] }, - "execution_count": 39, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2774,7 +2804,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -2833,7 +2863,7 @@ "accuracy 0.755556 0.755556 0.755556" ] }, - "execution_count": 40, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2859,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -2890,12 +2920,12 @@ " \n", " \n", " fit_time\n", - " 0.013\n", - " 0.004\n", + " 0.016\n", + " 0.008\n", " \n", " \n", " score_time\n", - " 0.010\n", + " 0.012\n", " 0.001\n", " \n", " \n", @@ -2944,8 +2974,8 @@ ], "text/plain": [ " mean std\n", - "fit_time 0.013 0.004\n", - "score_time 0.010 0.001\n", + "fit_time 0.016 0.008\n", + "score_time 0.012 0.001\n", "test_accuracy 0.841 0.068\n", "train_accuracy 0.890 0.011\n", "test_precision 0.850 0.092\n", @@ -2956,7 +2986,7 @@ "train_f1 0.882 0.014" ] }, - "execution_count": 41, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2975,7 +3005,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -3000,7 +3030,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -3483,7 +3513,7 @@ " LogisticRegression(max_iter=1000, random_state=123))])" ] }, - "execution_count": 43, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -3503,7 +3533,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -3512,7 +3542,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -3527,245 +3557,245 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Table 1: Logistic Regression Coefficients
 FeatureCoefficientFeatureCoefficient
7chest_pain_type_asymptomatic1.2426177chest_pain_type_asymptomatic1.242617
20num_of_vessels_2.00.93676120num_of_vessels_2.00.936761
24thalassemia_reversable defect0.90860724thalassemia_reversable defect0.908607
16slope_flat0.78917916slope_flat0.789179
6sex0.4677936sex0.467793
1resting_blood_pressure0.4643031resting_blood_pressure0.464303
21num_of_vessels_3.00.41697021num_of_vessels_3.00.416970
5st_depression0.4004225st_depression0.400422
12rest_ecg_left ventricular hypertrophy0.26895912rest_ecg_left ventricular hypertrophy0.268959
14exercise_induced_angina_yes0.21078314exercise_induced_angina_yes0.210783
8chest_pain_type_atypical angina0.1516168chest_pain_type_atypical angina0.151616
11rest_ecg_ST-T wave abnormality0.15090411rest_ecg_ST-T wave abnormality0.150904
3cholesterol0.1416753cholesterol0.141675
19num_of_vessels_1.00.13159819num_of_vessels_1.00.131598
0age-0.0891450age-0.089145
2fasting_blood_sugar-0.2263292fasting_blood_sugar-0.226329
17slope_upsloping-0.37399517slope_upsloping-0.373995
15slope_downsloping-0.38506115slope_downsloping-0.385061
13rest_ecg_normal-0.38974013rest_ecg_normal-0.389740
22thalassemia_fixed defect-0.41137022thalassemia_fixed defect-0.411370
23thalassemia_normal-0.46711323thalassemia_normal-0.467113
4max_heart_rate-0.6484894max_heart_rate-0.648489
9chest_pain_type_non-anginal pain-0.6705719chest_pain_type_non-anginal pain-0.670571
10chest_pain_type_typical angina-0.69353910chest_pain_type_typical angina-0.693539
18num_of_vessels_0.0-1.45520618num_of_vessels_0.0-1.455206
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 46, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -3785,7 +3815,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -3813,7 +3843,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -3824,7 +3854,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -3834,7 +3864,7 @@ " [ 8, 29]])" ] }, - "execution_count": 49, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -3854,7 +3884,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -3913,7 +3943,7 @@ "accuracy 0.822222 0.822222 0.822222" ] }, - "execution_count": 50, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -3930,122 +3960,6 @@ "loges_report_filtered" ] }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[17], line 55\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Load your dataset\u001b[39;00m\n\u001b[1;32m 54\u001b[0m file_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/raw_heart_disease_data.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# Update with the path to your dataset\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# Validate the dataset against the schema\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 876\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 877\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/raw_heart_disease_data.csv'" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import pandera as pa\n", - "from pandera import Column, Check, DataFrameSchema\n", - "\n", - "# Define the data validation schema\n", - "schema = DataFrameSchema(\n", - " {\n", - " # Validate numeric columns\n", - " \"age\": Column(int, Check.between(0, 120), nullable=False),\n", - " \"sex\": Column(int, Check.isin([0, 1]), nullable=False),\n", - " \"chest_pain_type\": Column(\n", - " str, \n", - " Check.isin([\"typical angina\", \"atypical angina\", \"non-anginal pain\", \"asymptomatic\"]), \n", - " nullable=False\n", - " ),\n", - " \"resting_blood_pressure\": Column(int, Check.between(50, 250), nullable=False),\n", - " \"cholesterol\": Column(int, Check.between(100, 600), nullable=False),\n", - " \"fasting_blood_sugar\": Column(int, Check.isin([0, 1]), nullable=False),\n", - " \"rest_ecg\": Column(\n", - " str, \n", - " Check.isin([\"normal\", \"ST-T wave abnormality\", \"left ventricular hypertrophy\"]),\n", - " nullable=False\n", - " ),\n", - " \"max_heart_rate\": Column(int, Check.between(50, 220), nullable=False),\n", - " \"exercise_induced_angina\": Column(str, Check.isin([\"yes\", \"no\"]), nullable=False),\n", - " \"st_depression\": Column(float, Check.between(0.0, 10.0), nullable=True),\n", - " \"slope\": Column(\n", - " str, \n", - " Check.isin([\"upsloping\", \"flat\", \"downsloping\"]), \n", - " nullable=False\n", - " ),\n", - " \"num_of_vessels\": Column(\n", - " float, \n", - " Check(lambda x: x.isna() | ((x >= 0) & (x <= 4)), element_wise=True), \n", - " nullable=True\n", - " ),\n", - " \"thalassemia\": Column(\n", - " str, \n", - " Check.isin([\"normal\", \"fixed defect\", \"reversable defect\"]), \n", - " nullable=True\n", - " ),\n", - " \"diagnosis\": Column(int, Check.isin([0, 1]), nullable=False),\n", - " },\n", - " # Additional checks for the entire DataFrame\n", - " checks=[\n", - " # Ensure no duplicate rows\n", - " Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n", - " # Ensure no empty rows\n", - " Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\"),\n", - " ]\n", - ")\n", - "\n", - "# Load your dataset\n", - "file_path = \"data/raw_heart_disease_data.csv\" # Update with the path to your dataset\n", - "data = pd.read_csv(file_path)\n", - "\n", - "# Validate the dataset against the schema\n", - "try:\n", - " validated_data = schema.validate(data)\n", - " print(\"Data validation passed successfully!\")\n", - "except pa.errors.SchemaError as e:\n", - " print(f\"Data validation failed: {e}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing column correct Column Names**\n", - "column_mapping = {\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# No empty observations \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# missingness \n" - ] - }, { "cell_type": "markdown", "metadata": {},