Commit

Enable checks on PySpark dataframes with more rows than IntegerType allows (#309)

* Add failing test if PySpark dataframe is out of 32-bit integer limit

* Use bigints for rows and violations
runkelcorey authored Sep 5, 2024
1 parent a69ddd1 commit 5e828fc
Showing 2 changed files with 9 additions and 1 deletion.
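
For context, Spark's IntegerType is a signed 32-bit value, so the previous results schema could not store a row count above 2,147,483,647. A quick arithmetic check (not part of the commit) makes the boundary concrete:

INT32_MAX = 2**31 - 1               # 2_147_483_647, the IntegerType ceiling
print(2_500_000_000 > INT32_MAX)    # True: the dataframe in the new test exceeds it
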
2 changes: 1 addition & 1 deletion cuallee/pyspark_validation.py
@@ -855,7 +855,7 @@ def _value(x):
)
for rule in check.rules
],
schema="id int, timestamp string, check string, level string, column string, rule string, value string, rows int, violations int, pass_rate double, pass_threshold double, status string",
schema="id int, timestamp string, check string, level string, column string, rule string, value string, rows bigint, violations bigint, pass_rate double, pass_threshold double, status string",
)

return result
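
The schema string change above is the functional fix: rows and violations move from int (32-bit) to bigint (Spark's 64-bit LongType). A minimal sketch, not taken from the repository, of why the wider type matters when materializing the results DataFrame:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
big_count = 2_500_000_000  # larger than a 32-bit IntegerType can hold

# With "rows int" this call would fail schema verification for big_count;
# with "rows bigint" the value is stored as a LongType without overflow.
summary = spark.createDataFrame(
    [(1, big_count, 0)],
    schema="id int, rows bigint, violations bigint",
)
summary.show()
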
8 changes: 8 additions & 0 deletions test/unit/pyspark_dataframe/test_spark_validation.py
@@ -201,6 +201,14 @@ def test_timestamp_column_validation(spark):
PSV.validate_data_types(check.rules, df)


def test_bigint_rows(spark):
df = spark.range(2_500_000_000) # beyond 32-bit integer limit
check = Check(CheckLevel.WARNING, "pytest")
check.is_complete("id")
rs = check.validate(df)
assert rs.first().status == "PASS"


def test_get_compute_dictionary(spark):
df = spark.range(10)
check = (
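
The new test exercises the fix end to end: spark.range(2_500_000_000) builds a lazy DataFrame of 2.5 billion rows, and the completeness check's row count now fits in the bigint columns. A hedged sketch of the same flow outside pytest, assuming Check and CheckLevel are importable from the top-level cuallee package:

from cuallee import Check, CheckLevel
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(2_500_000_000)  # 2.5e9 rows, past the 32-bit limit

check = Check(CheckLevel.WARNING, "bigint-demo")
check.is_complete("id")  # range() never produces null ids, so the check passes

result = check.validate(df)
result.select("rows", "violations", "status").show()
# rows and violations are bigint after this change, so the counts do not overflow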
