Skip to content

Commit

Permalink
Feature bigquery r complete (#113)
Browse files Browse the repository at this point in the history
* add are_complete

* finish are_complete test cases

* correct bracket typo

* reverse conftest for local dvp

* fix test cases
  • Loading branch information
vestalisvirginis authored Aug 25, 2023
1 parent 905f59a commit d08ad66
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 5 deletions.
13 changes: 10 additions & 3 deletions cuallee/bigquery_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,16 @@ def is_complete(self, rule: Rule):
)
return self.compute_instruction

# def is_complete(self, rule: Rule) -> str:
# """Verify the absence of null values in a column"""
# return f"SUM(CAST({rule.column} IS NOT NULL AS INTEGER))"
def are_complete(self, rule: Rule):
    """Verify the absence of null values across a set of columns.

    One ``<column> IS NOT NULL`` predicate is built for every entry of
    ``rule.column``. The SQL expression adds up the result of
    ``_sum_predicate_to_integer`` for each predicate and divides that total
    by the number of columns. The resulting instruction is stored on
    ``self.compute_instruction`` and returned.
    """
    not_null_checks = [f"{name} IS NOT NULL" for name in rule.column]
    # Join the per-column terms with "+" so they form a single SQL sum.
    summed_terms = "+".join(map(self._sum_predicate_to_integer, not_null_checks))
    expression = f"({summed_terms})/{len(rule.column)}"
    self.compute_instruction = ComputeInstruction(
        not_null_checks,
        expression,
        ComputeMethod.SQL,
    )
    return self.compute_instruction


def _get_expressions(compute_set: Dict[str, ComputeInstruction]) -> str:
"""Get the expression for all the rules in check in one string"""
Expand Down
49 changes: 49 additions & 0 deletions test/unit/bigquery/test_are_complete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest
import pandas as pd

from google.cloud import bigquery

from cuallee import Check, CheckLevel


def test_positive():
    """are_complete on taxi_id/unique_key is expected to pass with zero violations."""
    table = bigquery.dataset.Table('bigquery-public-data.chicago_taxi_trips.taxi_trips')
    completeness = Check(CheckLevel.WARNING, "pytest")
    completeness.are_complete(("taxi_id", "unique_key"))
    result = completeness.validate(table)
    # The assertions read the result row labelled 1.
    passed = result.status.str.match('PASS')
    assert passed[1]
    assert result.violations[1] == 0
    assert result.pass_rate[1] == 1.0


def test_negative():
    """are_complete on the trip timestamps is expected to fail at the default threshold."""
    table = bigquery.dataset.Table('bigquery-public-data.chicago_taxi_trips.taxi_trips')
    completeness = Check(CheckLevel.WARNING, "pytest")
    completeness.are_complete(("trip_start_timestamp", "trip_end_timestamp"))
    result = completeness.validate(table)
    # The assertions read the result row labelled 1.
    failed = result.status.str.match('FAIL')
    assert failed[1]
    assert result.violations[1] == 9217
    assert result.pass_threshold[1] == 1.0
    assert result.pass_rate[1] == 0.9999558876219533


@pytest.mark.parametrize(
    "rule_column",
    [("taxi_id", "unique_key"), ["taxi_id", "unique_key"]],
    ids=("tuple", "list"),
)
def test_parameters(spark, rule_column):
    """are_complete accepts the column set as either a tuple or a list."""
    # NOTE(review): the `spark` fixture is requested but never used here —
    # confirm whether this BigQuery test really needs it.
    table = bigquery.dataset.Table('bigquery-public-data.chicago_taxi_trips.taxi_trips')
    completeness = Check(CheckLevel.WARNING, "pytest")
    completeness.are_complete(rule_column)
    result = completeness.validate(table)
    passed = result.status.str.match('PASS')
    assert passed[1]


def test_coverage():
    """With the threshold lowered to 0.7 the same timestamp check is expected to pass."""
    table = bigquery.dataset.Table('bigquery-public-data.chicago_taxi_trips.taxi_trips')
    completeness = Check(CheckLevel.WARNING, "pytest")
    completeness.are_complete(("trip_start_timestamp", "trip_end_timestamp"), 0.7)
    result = completeness.validate(table)
    # The assertions read the result row labelled 1.
    passed = result.status.str.match('PASS')
    assert passed[1]
    assert result.violations[1] == 9217
    assert result.pass_threshold[1] == 0.7
    assert result.pass_rate[1] == 0.9999558876219533  # 207167439/207176656
5 changes: 3 additions & 2 deletions test/unit/bigquery/test_is_complete.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def test_negative():
rs = check.validate(df)
assert rs.status.str.match('FAIL')[1]
assert rs.violations[1] == 18434
#assert rs.pass_rate[1] == 207158238/207176656
assert rs.pass_threshold[1] == 1.0
assert rs.pass_rate[1] == 0.9999117752439066


# def test_parameters():
Expand All @@ -37,4 +38,4 @@ def test_coverage():
assert rs.status.str.match('PASS')[1]
assert rs.violations[1] == 18434
assert rs.pass_threshold[1] == 0.7
# assert rs.pass_rate[1] == 207158238/207176656
assert rs.pass_rate[1] == 0.9999117752439066 #207158222/207176656

0 comments on commit d08ad66

Please sign in to comment.