Skip to content

Commit

Permalink
Feature add biochecks (#282)
Browse files Browse the repository at this point in the history
* Modified pyspark validation to accommodate custom functions

* Added CDS verification

* Added bio checks test cases

* Added README updates
  • Loading branch information
canimus authored Jul 13, 2024
1 parent f949962 commit fbef98f
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 9 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Provider | API | Versions
![databricks](logos/databricks.svg?raw=true "PySpark DataFrame API")| `pyspark` & `spark-connect` |`3.5.x`, `3.4.0`, `3.3.x`, `3.2.x`
![bigquery](logos/bigquery.png?raw=true "BigQuery Client API")| `bigquery` | `3.4.1`
![pandas](logos/pandas.svg?raw=true "Pandas DataFrame API")| `pandas`| `2.0.2`, `1.5.x`, `1.4.x`
![duckdb](logos/duckdb.png?raw=true "DuckDB API")|`duckdb` | `0.10.2`,~~`0.9.2`~~,~~`0.8.0`~~, ~~`0.7.1`~~
![duckdb](logos/duckdb.png?raw=true "DuckDB API")|`duckdb` | `1.0.0`,~~`0.10.2`~~,~~`0.9.2`~~,~~`0.8.0`~~, ~~`0.7.1`~~
![polars](logos/polars.svg?raw=true "Polars API")|`polars`| `1.0.0`,~~`0.19.6`~~
![daft](logos/daft.png?raw=true "Daft API")|`daft`| `0.2.24`, ~~`0.2.19`~~

Expand Down
5 changes: 3 additions & 2 deletions cuallee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def __post_init__(self):
self.name = self.method

def __repr__(self):
# NOTE(review): two `return` lines are visible here; the second swaps the
# trailing `status` field for `ordinal`. As written only the first can run —
# presumably a before/after pair from a rendered diff; confirm which one the
# real file keeps.
# NOTE(review): both strings open with "Rule(" but never emit the closing ")".
return f"Rule(method:{self.name}, column:{self.column}, value:{self.value}, data_type:{self.data_type}, coverage:{self.coverage}, status:{self.status}"
return f"Rule(method:{self.name}, column:{self.column}, value:{self.value}, data_type:{self.data_type}, coverage:{self.coverage}, ordinal:{self.ordinal}"

def __rshift__(self, rule_dict: Dict[str, Any]) -> Dict[str, Any]:
rule_dict[self.key] = self
Expand Down Expand Up @@ -300,7 +300,8 @@ def add_rule(self, method: str, *arg, **kwargs):
Args:
method (str): Check name
arg (list): Parameters of the check
arg (list): Parameters of the Rule
kwargs (dict): Dictionary of options for the Rule
"""
return operator.methodcaller(method, *arg, **kwargs)(self)

Expand Down
9 changes: 8 additions & 1 deletion cuallee/bio/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,15 @@ def is_dna(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name
return self._check

def is_protein(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_protein"}):
    """Verifies that a sequence contains only valid amino acid 1-letter codes

    The rule is registered through ``has_pattern`` with a regular expression
    made of a single character class holding every known 1-letter code,
    anchored at both ends. Because the class is repeated with ``*``, an empty
    string also matches.

    Args:
        column (str): Name of the column holding the sequences
        pct (float): Forwarded unchanged to ``has_pattern``
        options (dict): Forwarded unchanged to ``has_pattern``

    Returns:
        The check stored in ``self._check``, so calls can be chained
    """
    # `_aminoacids` is presumably a pandas DataFrame of reference data — TODO confirm;
    # all that is required here is that the column exposes `.tolist()`.
    codes = "".join(self._aminoacids["1_letter_code"].tolist())
    self._check.has_pattern(column, rf"^[{codes}]*$", pct, options=options)
    return self._check

def is_cds(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_cds"}):
    """Verifies that a sequence is framed like a coding sequence (CDS)

    A value satisfies the rule when it starts with ``ATG``, ends with one of
    ``TAA``, ``TAG`` or ``TGA``, and its length is a multiple of 3. The three
    conditions are combined into one SQL predicate and registered through
    ``satisfies``.

    Args:
        column (str): Name of the column holding the sequences
        pct (float): Forwarded unchanged to ``satisfies``
        options (dict): Forwarded unchanged to ``satisfies``

    Returns:
        The check stored in ``self._check``, so calls can be chained
    """
    starts_with_atg = f"({column} rlike '^ATG.*')"
    ends_with_stop = f"({column} rlike '.*(TAA|TAG|TGA)$')"
    whole_codons = f"(length({column}) % 3 == 0)"
    predicate = " and ".join((starts_with_atg, ends_with_stop, whole_codons))
    self._check.satisfies(column, predicate, pct, options=options)
    return self._check
8 changes: 7 additions & 1 deletion cuallee/duckdb_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pandas as pd # type: ignore
from toolz import first # type: ignore
from string import Template
import re

from cuallee import Check, Rule

Expand Down Expand Up @@ -106,7 +107,12 @@ def has_correlation(self, rule: Rule) -> str:
return f"CORR({rule.column[0]}, {rule.column[1]}) = {rule.value}"

def satisfies(self, rule: Rule) -> str:
    """Allows arbitrary SQL statement execution as rules

    The predicate text in ``rule.value`` is wrapped as
    ``SUM(CAST((<predicate>) AS INTEGER))``. Before wrapping, every standalone
    ``rlike`` keyword (any letter case) is rewritten to ``SIMILAR TO`` so that
    predicates written with that operator can be reused here.

    Args:
        rule (Rule): Rule whose ``value`` holds the SQL predicate text

    Returns:
        str: The SQL aggregate expression for the rule
    """
    # Compatibility with other dataframe regular expression comparisons.
    # Word boundaries keep identifiers that merely contain "rlike" intact.
    # The predicate is not parsed, so the word inside a quoted literal is
    # rewritten as well.
    rlike_keyword = re.compile(r"\brlike\b", re.IGNORECASE)
    predicate = rlike_keyword.sub("SIMILAR TO", rule.value)
    return f"SUM(CAST(({predicate}) AS INTEGER))"

def has_entropy(self, rule: Rule) -> str:
return f"ENTROPY({rule.column}) = {rule.value}"
Expand Down
10 changes: 6 additions & 4 deletions cuallee/pyspark_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,11 +801,13 @@ def summary(check: Check, dataframe: DataFrame) -> DataFrame:
spark = SparkSession.builder.getOrCreate()

def _value(x):
# NOTE(review): this span looks like a rendered diff with indentation stripped,
# so the removed and added lines of `_value` appear interleaved. Read top to
# bottom it is not one runnable body; consult the real file before editing.
"""Removes verbosity for Callable values"""
if isinstance(x, Callable):
"""Removes verbosity for Callable values"""
# A truthy dict in `x.options` supplies the displayed value through its
# "custom_value" key, falling back to the literal "f(x)" when the key is absent.
if x.options and isinstance(x.options, dict):
return x.options.get("custom_value", "f(x)")
# Callable rule values are displayed as the placeholder "f(x)".
elif isinstance(x.value, Callable):
return "f(x)"
else:
return str(x)
return str(x.value)

# Compute the expression
computed_expressions = compute(check._rule)
Expand Down Expand Up @@ -845,7 +847,7 @@ def _value(x):
check.level.name,
str(rule.column),
str(rule.name),
_value(rule.value),
_value(rule),
int(check.rows),
int(rule.violations),
float(rule.pass_rate),
Expand Down
40 changes: 40 additions & 0 deletions test/unit/bio_checks/test_duckdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pytest
import polars as pl
import duckdb

def test_is_dna(check, db: duckdb.DuckDBPyConnection):
    """Sequences made only of A/C/G/T are expected to PASS `is_dna` on DuckDB."""
    # `df` is never passed explicitly: the check is pointed at it by name via
    # `table_name` (presumably resolved by DuckDB from Python scope — verify).
    df = pl.DataFrame(
        {"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
    )
    check.table_name = "df"
    check.bio.is_dna("sequence")
    outcome = check.validate(db)
    assert outcome.status.str.match("PASS").all()

def test_is_not_dna(check, db: duckdb.DuckDBPyConnection):
    """Sequences of X/Y/Z letters are expected to FAIL `is_dna` on DuckDB."""
    # `df` is never passed explicitly: the check is pointed at it by name via
    # `table_name` (presumably resolved by DuckDB from Python scope — verify).
    df = pl.DataFrame({"sequence": ["XXX", "YYY", "ZZZ"]})
    check.table_name = "df"
    check.bio.is_dna("sequence")
    outcome = check.validate(db)
    assert outcome.status.str.match("FAIL").all()

def test_is_cds(check, db: duckdb.DuckDBPyConnection):
    """ATG-initiated sequences ending in TAA/TAG/TGA are expected to PASS `is_cds` on DuckDB."""
    # `df` is never passed explicitly: the check is pointed at it by name via
    # `table_name` (presumably resolved by DuckDB from Python scope — verify).
    df = pl.DataFrame(
        {"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
    )
    check.table_name = "df"
    check.bio.is_cds("sequence")
    outcome = check.validate(db)
    assert outcome.status.str.match("PASS").all()


def test_is_not_cds(check, db: duckdb.DuckDBPyConnection):
    """Sequences that do not end in TAA/TAG/TGA are expected to FAIL `is_cds` on DuckDB."""
    # `df` is never passed explicitly: the check is pointed at it by name via
    # `table_name` (presumably resolved by DuckDB from Python scope — verify).
    df = pl.DataFrame(
        {"sequence": ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]}
    )
    check.table_name = "df"
    check.bio.is_cds("sequence")
    outcome = check.validate(db)
    assert outcome.status.str.match("FAIL").all()

def test_is_protein(check, db: duckdb.DuckDBPyConnection):
    """The sample sequences ARND/PSTW/GHIL are expected to PASS `is_protein` on DuckDB."""
    # `df` is never passed explicitly: the check is pointed at it by name via
    # `table_name` (presumably resolved by DuckDB from Python scope — verify).
    df = pl.DataFrame({"sequence": ["ARND", "PSTW", "GHIL"]})
    check.table_name = "df"
    check.bio.is_protein("sequence")
    outcome = check.validate(db)
    assert outcome.status.str.match("PASS").all()

def test_is_not_protein(check, db: duckdb.DuckDBPyConnection):
    """The sample sequences XXX/OO1/UU2 are expected to FAIL `is_protein` on DuckDB."""
    # `df` is never passed explicitly: the check is pointed at it by name via
    # `table_name` (presumably resolved by DuckDB from Python scope — verify).
    df = pl.DataFrame({"sequence": ["XXX", "OO1", "UU2"]})
    check.table_name = "df"
    check.bio.is_protein("sequence")
    outcome = check.validate(db)
    assert outcome.status.str.match("FAIL").all()
45 changes: 45 additions & 0 deletions test/unit/bio_checks/test_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import pytest
import polars as pl

def test_is_dna(check):
    """Sequences made only of A/C/G/T are expected to PASS `is_dna` on polars."""
    df = pl.DataFrame(
        {"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
    )
    check.bio.is_dna("sequence")
    # Validate once and reuse the summary for the assertion.
    rs = check.validate(df)
    result = rs.select(pl.col("status")) == "PASS"
    assert all(result.to_series().to_list())

def test_is_not_dna(check):
    """Sequences of X/Y/Z letters are expected to FAIL `is_dna` on polars."""
    df = pl.DataFrame({"sequence": ["XXX", "YYY", "ZZZ"]})
    check.bio.is_dna("sequence")
    # Validate once and reuse the summary for the assertion.
    rs = check.validate(df)
    result = rs.select(pl.col("status")) == "FAIL"
    assert all(result.to_series().to_list())

def test_is_cds(check):
    """ATG-initiated sequences ending in TAA/TAG/TGA are expected to PASS `is_cds` on polars."""
    df = pl.DataFrame(
        {"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
    )
    check.bio.is_cds("sequence")
    # Validate once and reuse the summary for the assertion.
    rs = check.validate(df)
    result = rs.select(pl.col("status")) == "PASS"
    assert all(result.to_series().to_list())


def test_is_not_cds(check):
    """Sequences that do not end in TAA/TAG/TGA are expected to FAIL `is_cds` on polars."""
    df = pl.DataFrame(
        {"sequence": ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]}
    )
    check.bio.is_cds("sequence")
    # Validate once and reuse the summary for the assertion.
    rs = check.validate(df)
    result = rs.select(pl.col("status")) == "FAIL"
    assert all(result.to_series().to_list())

def test_is_protein(check):
    """The sample sequences ARND/PSTW/GHIL are expected to PASS `is_protein` on polars."""
    df = pl.DataFrame({"sequence": ["ARND", "PSTW", "GHIL"]})
    check.bio.is_protein("sequence")
    # Validate once and reuse the summary for the assertion.
    rs = check.validate(df)
    result = rs.select(pl.col("status")) == "PASS"
    assert all(result.to_series().to_list())

def test_is_not_protein(check):
    """The sample sequences XXX/OO1/UU2 are expected to FAIL `is_protein` on polars."""
    df = pl.DataFrame({"sequence": ["XXX", "OO1", "UU2"]})
    check.bio.is_protein("sequence")
    # Validate once and reuse the summary for the assertion.
    rs = check.validate(df)
    result = rs.select(pl.col("status")) == "FAIL"
    assert all(result.to_series().to_list())
50 changes: 50 additions & 0 deletions test/unit/bio_checks/test_pyspark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pytest

def test_is_dna(check, spark):
    """Sequences made only of A/C/G/T are expected to PASS `is_dna` on PySpark."""
    df = spark.createDataFrame(
        [("ATGCCCTTTGGGTAA",), ("ATGCCCTTTGGGTAG",), ("ATGCCCTTTGGGTGA",)],
        schema="sequence string",
    )
    check.bio.is_dna("sequence")
    rs = check.validate(df)
    # Fetch the first summary row once and assert on it, rather than calling
    # `first()` again for every field.
    row = rs.first()
    assert row.status == "PASS"
    assert row.violations == 0
    assert row.pass_threshold == 1.0

def test_is_not_dna(check, spark):
    """Sequences of X/Y/Z letters are expected to FAIL `is_dna` on PySpark, one violation per row."""
    df = spark.createDataFrame(
        [("XXX",), ("YYY",), ("ZZZ",)],
        schema="sequence string",
    )
    check.bio.is_dna("sequence")
    rs = check.validate(df)
    # Fetch the first summary row once and assert on it, rather than calling
    # `first()` again for every field.
    row = rs.first()
    assert row.status == "FAIL"
    assert row.violations == 3
    assert row.pass_threshold == 1.0

def test_is_cds(check, spark):
    """ATG-initiated sequences ending in TAA/TAG/TGA are expected to PASS `is_cds` on PySpark."""
    df = spark.createDataFrame(
        [("ATGCCCTTTGGGTAA",), ("ATGCCCTTTGGGTAG",), ("ATGCCCTTTGGGTGA",)],
        schema="sequence string",
    )
    check.bio.is_cds("sequence")
    rs = check.validate(df)
    # Fetch the first summary row once and assert on it, rather than calling
    # `first()` again for every field.
    row = rs.first()
    assert row.status == "PASS"
    assert row.violations == 0
    assert row.pass_threshold == 1.0


def test_is_not_cds(check, spark):
    """Sequences that do not end in TAA/TAG/TGA are expected to FAIL `is_cds` on PySpark, one violation per row."""
    df = spark.createDataFrame(
        [("ATGCCCTTTGGGTCC",), ("ATGCCCTTTGGGCCC",), ("ATGCCCTTTGGGTTT",)],
        schema="sequence string",
    )
    check.bio.is_cds("sequence")
    rs = check.validate(df)
    # Fetch the first summary row once and assert on it, rather than calling
    # `first()` again for every field.
    row = rs.first()
    assert row.status == "FAIL"
    assert row.violations == 3
    assert row.pass_threshold == 1.0

def test_is_protein(check, spark):
    """The sample sequences ARND/PSTW/GHIL are expected to PASS `is_protein` on PySpark."""
    df = spark.createDataFrame(
        [("ARND",), ("PSTW",), ("GHIL",)],
        schema="sequence string",
    )
    check.bio.is_protein("sequence")
    rs = check.validate(df)
    # Fetch the first summary row once and assert on it, rather than calling
    # `first()` again for every field.
    row = rs.first()
    assert row.status == "PASS"
    assert row.violations == 0
    assert row.pass_threshold == 1.0

def test_is_not_protein(check, spark):
    """The sample sequences XXX/OO1/UU2 are expected to FAIL `is_protein` on PySpark, one violation per row."""
    df = spark.createDataFrame(
        [("XXX",), ("OO1",), ("UU2",)],
        schema="sequence string",
    )
    check.bio.is_protein("sequence")
    rs = check.validate(df)
    # Fetch the first summary row once and assert on it, rather than calling
    # `first()` again for every field.
    row = rs.first()
    assert row.status == "FAIL"
    assert row.violations == 3
    assert row.pass_threshold == 1.0

0 comments on commit fbef98f

Please sign in to comment.