From b0765c07db0d5f71edec84e74ec105317702d6ef Mon Sep 17 00:00:00 2001 From: Nonso Ebele-Muolokwu Date: Fri, 29 Nov 2024 18:09:53 -0800 Subject: [PATCH] Nonso - Added new checks to the schema --- analysis.ipynb | 78 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/analysis.ipynb b/analysis.ipynb index 3f1cd9d..79e52e3 100644 --- a/analysis.ipynb +++ b/analysis.ipynb @@ -193,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -233,17 +233,31 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Uses janitor to clean column names\n", - "wine_df = wine_df.clean_names()" + "wine_df = wine_df.clean_names()\n", + "\n", + "wine_df = wine_df.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def check_corr_feats(df):\n", + " aa = (df.corr().abs() < 0.9)\n", + " np.fill_diagonal(aa.values, True)\n", + " return aa.all().all()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -255,23 +269,49 @@ } ], "source": [ - "# Define the schema\n", "schema = pa.DataFrameSchema({\n", - " \"fixed_acidity\": Column(pa.Float, nullable=False),\n", - " \"volatile_acidity\": Column(pa.Float, nullable=False),\n", - " \"citric_acid\": Column(pa.Float, nullable=False),\n", - " \"residual_sugar\": Column(pa.Float, nullable=False),\n", - " \"chlorides\": Column(pa.Float, nullable=False),\n", - " \"free_sulfur_dioxide\": Column(pa.Float, nullable=False),\n", - " \"total_sulfur_dioxide\": Column(pa.Float, nullable=False),\n", - " \"density\": Column(pa.Float, nullable=False),\n", - " \"ph\": Column(pa.Float, nullable=False),\n", - " \"sulphates\": Column(pa.Float, nullable=False),\n", - " \"alcohol\": Column(pa.Float, nullable=False),\n", - " \"quality\": Column(pa.Int, Check.in_range(0, 10), nullable=False), # Assuming quality is rated 0-10\n", - "})\n", + " \"fixed_acidity\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"volatile_acidity\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"citric_acid\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"residual_sugar\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"chlorides\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"free_sulfur_dioxide\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"total_sulfur_dioxide\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"density\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"ph\": Column(pa.Float, [Check.ge(0), Check.le(14)], nullable=False),\n", + " \"sulphates\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"alcohol\": Column(pa.Float, Check.ge(0), nullable=False),\n", + " \"quality\": Column(pa.Int, Check.isin([3, 4, 5, 6, 7, 8, 9]), nullable=False) # Example: Replace with valid levels\n", + "},\n", + " checks=[\n", + " # Ensure no duplicate rows\n", + " pa.Check(lambda df: ~df.duplicated().any(), error=\"Duplicate rows found.\"),\n", + " \n", + " # Ensure no empty rows\n", + " pa.Check(lambda df: ~(df.isna().all(axis=1)).any(), error=\"Empty rows found.\"),\n", + " \n", + " # Check for missingness threshold (e.g., <5%)\n", + " pa.Check(lambda df: (df.isna().mean() < 0.05).all(), error=\"Missingness exceeds threshold.\"),\n", + " \n", + " # Ensure the target variable distribution meets expectations\n", + " pa.Check(lambda df: df['quality'].value_counts(normalize=True).between(0.0001, 0.5).all(), \n", + " error=\"Quality distribution is outside expected bounds.\"),\n", + " \n", + " # Check no anomalous correlations between target and features\n", + " pa.Check(lambda df: (df.corr()['quality'].abs()[:-1] < 0.9).all(), \n", + " error=\"Anomalous correlations found between quality and features.\"),\n", + " \n", + " # Check no anomalous correlations between features\n", + " pa.Check(lambda df: check_corr_feats(df), \n", + " error=\"Anomalous correlations found between features.\"),\n", + " # pa.Check(\n", + " # lambda df: df.apply(lambda x: ((x >= x.quantile(0.01)) & (x <= x.quantile(0.99))).all(), axis=0).all(),\n", + " # error=\"Outliers detected beyond 1st and 99th percentiles.\"\n", + " # )\n", + " ]\n", + ")\n", + "\n", "\n", - "# Validate the DataFrame\n", "try:\n", " schema.validate(wine_df)\n", " print(\"Data is valid!\")\n",