Commit

Enable checks on PySpark dataframes with more rows than IntegerType allows (#309)

* Add failing test if PySpark dataframe is out of 32-bit integer limit

* Use bigints for rows and violations
runkelcorey authored Sep 5, 2024
1 parent a69ddd1 commit 5e828fc
Showing 2 changed files with 9 additions and 1 deletion.
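
For context, Spark's IntegerType is a signed 32-bit value, so the previous results schema could not store a row count above 2,147,483,647. A quick arithmetic check (not part of the commit) makes the boundary concrete:

INT32_MAX = 2**31 - 1               # 2_147_483_647, the IntegerType ceiling
print(2_500_000_000 > INT32_MAX)    # True: the dataframe in the new test exceeds it
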
2 changes: 1 addition & 1 deletion cuallee/pyspark_validation.py
@@ -855,7 +855,7 @@ def _value(x):
)
for rule in check.rules
],
schema="id int, timestamp string, check string, level string, column string, rule string, value string, rows int, violations int, pass_rate double, pass_threshold double, status string",
schema="id int, timestamp string, check string, level string, column string, rule string, value string, rows bigint, violations bigint, pass_rate double, pass_threshold double, status string",
)

return result
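
The schema string change above is the functional fix: rows and violations move from int (32-bit) to bigint (Spark's 64-bit LongType). A minimal sketch, not taken from the repository, of why the wider type matters when materializing the results DataFrame:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
big_count = 2_500_000_000  # larger than a 32-bit IntegerType can hold

# With "rows int" this call would fail schema verification for big_count;
# with "rows bigint" the value is stored as a LongType without overflow.
summary = spark.createDataFrame(
    [(1, big_count, 0)],
    schema="id int, rows bigint, violations bigint",
)
summary.show()
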
8 changes: 8 additions & 0 deletions test/unit/pyspark_dataframe/test_spark_validation.py
@@ -201,6 +201,14 @@ def test_timestamp_column_validation(spark):
PSV.validate_data_types(check.rules, df)


def test_bigint_rows(spark):
df = spark.range(2_500_000_000) # beyond 32-bit integer limit
check = Check(CheckLevel.WARNING, "pytest")
check.is_complete("id")
rs = check.validate(df)
assert rs.first().status == "PASS"


def test_get_compute_dictionary(spark):
df = spark.range(10)
check = (
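
The new test exercises the fix end to end: spark.range(2_500_000_000) builds a lazy DataFrame of 2.5 billion rows, and the completeness check's row count now fits in the bigint columns. A hedged sketch of the same flow outside pytest, assuming Check and CheckLevel are importable from the top-level cuallee package:

from cuallee import Check, CheckLevel
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(2_500_000_000)  # 2.5e9 rows, past the 32-bit limit

check = Check(CheckLevel.WARNING, "bigint-demo")
check.is_complete("id")  # range() never produces null ids, so the check passes

result = check.validate(df)
result.select("rows", "violations", "status").show()
# rows and violations are bigint after this change, so the counts do not overflow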
