diff --git a/docs/getting-started/setup.md b/docs/getting-started/setup.md index 2c43f39b..e77435c3 100644 --- a/docs/getting-started/setup.md +++ b/docs/getting-started/setup.md @@ -48,7 +48,7 @@ create table if not exists `catalog`.`schema`.`{product}_rules` ( 7. `action_if_failed` There are 3 different types of actions. These are 'ignore', 'drop', and 'fail'. Ignore: The rule is run and the output is logged. No action is performed regardless of whether the rule has succeeded or failed. Applies for all 3 rule types. Drop: The rows that fail the rule get dropped from the dataset. Applies for only row_dq rule type. - Fail: DAG fails if the rule fails. Applies for all 3 rule types. + Fail: job fails if the rule fails. Applies for all 3 rule types. 8. `tag` provide some tag name to dq rule example: completeness, validity, uniqueness etc. 9. `description` Long description for the rule 10. `enable_for_source_dq_validation` flag to run the agg rule @@ -59,12 +59,11 @@ create table if not exists `catalog`.`schema`.`{product}_rules` ( 15. `query_dq_delimiter` segregate custom queries delimiter ex: $, @ etc. By default it is @. Users can override it with any other delimiter based on the need. The same delimiter mentioned here has to be used in the custom query. 16. `enable_querydq_custom_output` required custom query output in separate table -rule_type, enable_for_source_dq_validation and enable_for_target_dq_validation columns define source_agg_dq, target_agg_dq,source_query_dq and target_query_dq. please see the below definitions: -If rule_type is row_dq then row_dq is TRUE -If rule_type is agg_dq and enable_for_source_dq_validation is TRUE then source_agg_dq is TRUE -If rule_type is agg_dq and enable_for_target_dq_validation is TRUE then target_agg_dq is TRUE -If rule_type is query_dq and enable_for_source_dq_validation is TRUE then source_query_dq is TRUE -If rule_type is query_dq and enable_for_target_dq_validation is TRUE then target_query_dq is TRUE + +The Spark Expectations process consists of three phases: +1. When enable_for_source_dq_validation is true, execute agg_dq and query_dq on the source DataFrame +2. If the first step is successful, proceed to run row_dq +3. When enable_for_target_dq_validation is true, execute agg_dq and query_dq on the DataFrame resulting from row_dq ### Rule Type For Rules @@ -166,15 +165,20 @@ source_dq_expected_outcome string, -- (11)! source_dq_actual_row_count string, -- (12)! source_dq_error_row_count string, -- (13)! source_dq_row_count string, -- (14)! -target_expectations string, -- (15)! -target_dq_status string, -- (16)! -target_dq_actual_outcome string, -- (17)! -target_dq_expected_outcome string, -- (18)! -target_dq_actual_row_count string, -- (19)! -target_dq_error_row_count string, -- (20)! -target_dq_row_count string, -- (21)! -dq_date date, -- (22)! -dq_time string, -- (23)! +source_dq_start_time string, -- (15)! +source_dq_end_time string, -- (16)! +target_expectations string, -- (17)! +target_dq_status string, -- (18)! +target_dq_actual_outcome string, -- (19)! +target_dq_expected_outcome string, -- (20)! +target_dq_actual_row_count string, -- (21)! +target_dq_error_row_count string, -- (22)! +target_dq_row_count string, -- (23)! +target_dq_start_time string, -- (24)! +target_dq_end_time string, -- (25)! +dq_date date, -- (26)! +dq_time string, -- (27)! +dq_job_metadata_info string, -- (28)! ); ``` @@ -192,12 +196,17 @@ dq_time string, -- (23)! 12. `source_dq_actual_row_count` Number of rows of the source dq 13. 
`source_dq_error_row_count` Number of rows failed in the source dq 14. `source_dq_row_count` Number of rows of the source dq -15. `target_expectations` Actual Rule to be executed on the target dq -16. `target_dq_status` Status of the rule execution in the Target dq -17. `target_dq_actual_outcome` Actual outcome of the Target dq check -18. `target_dq_expected_outcome` Expected outcome of the Target dq check -19. `target_dq_actual_row_count` Number of rows of the target dq -20. `target_dq_error_row_count` Number of rows failed in the target dq -21. `target_dq_row_count` Number of rows of the target dq -22. `dq_date` Dq executed date -23. `dq_time` Dq executed timestamp +15. `source_dq_start_time` source dq start timestamp +16. `source_dq_end_time` source dq end timestamp +17. `target_expectations` Actual Rule to be executed on the target dq +18. `target_dq_status` Status of the rule execution in the Target dq +19. `target_dq_actual_outcome` Actual outcome of the Target dq check +20. `target_dq_expected_outcome` Expected outcome of the Target dq check +21. `target_dq_actual_row_count` Number of rows of the target dq +22. `target_dq_error_row_count` Number of rows failed in the target dq +23. `target_dq_row_count` Number of rows of the target dq +24. `target_dq_start_time` target dq start timestamp +25. `target_dq_end_time` target dq end timestamp +26. `dq_date` Dq executed date +27. `dq_time` Dq executed timestamp +28. `dq_job_metadata_info` dq job metadata diff --git a/prospector.yaml b/prospector.yaml index 9d32f4e9..5e250468 100644 --- a/prospector.yaml +++ b/prospector.yaml @@ -16,6 +16,7 @@ max-line-length: 120 pylint: disable: + - too-many-lines - too-many-branches - too-many-statements - too-many-instance-attributes diff --git a/spark_expectations/config/user_config.py b/spark_expectations/config/user_config.py index af17c40d..feb53229 100644 --- a/spark_expectations/config/user_config.py +++ b/spark_expectations/config/user_config.py @@ -64,6 +64,7 @@ class Constants: # declare const user config variables for agg query dq detailed stats se_enable_agg_dq_detailed_result = "spark.expectations.agg.dq.detailed.stats" se_enable_query_dq_detailed_result = "spark.expectations.query.dq.detailed.stats" + se_job_metadata = "spark.expectations.job.metadata" querydq_output_custom_table_name = "spark.expectations.query.dq.custom.table_name" diff --git a/spark_expectations/core/context.py b/spark_expectations/core/context.py index 6a2abca9..7ee6d159 100644 --- a/spark_expectations/core/context.py +++ b/spark_expectations/core/context.py @@ -75,6 +75,7 @@ def __post_init__(self) -> None: self._final_agg_dq_result: Optional[List[Dict[str, str]]] = None self._source_query_dq_result: Optional[List[Dict[str, str]]] = None self._final_query_dq_result: Optional[List[Dict[str, str]]] = None + self._job_metadata: Optional[str] = None self._source_agg_dq_detailed_stats: Optional[List[Tuple]] = None self._source_query_dq_detailed_stats: Optional[List[Tuple]] = None @@ -1911,3 +1912,26 @@ def get_dq_rules_params(self) -> dict: """ return self._dq_rules_params + + def set_job_metadata(self, job_metadata: Optional[str] = None) -> None: + """ + This function is used to set the job_metadata + + Returns: + None + + """ + self._job_metadata = job_metadata + + @property + def get_job_metadata(self) -> Optional[str]: + """ + This function is used to get the job metadata + + Returns: + str: Returns _job_metadata + + """ + if self._job_metadata is not None: + return str(self._job_metadata) + 
return None diff --git a/spark_expectations/core/expectations.py b/spark_expectations/core/expectations.py index 99cfb06c..c618b15a 100644 --- a/spark_expectations/core/expectations.py +++ b/spark_expectations/core/expectations.py @@ -46,7 +46,15 @@ class SparkExpectations: def __post_init__(self) -> None: if isinstance(self.rules_df, DataFrame): - self.spark: SparkSession = self.rules_df.sparkSession + try: + self.spark: Optional[SparkSession] = self.rules_df.sparkSession + except AttributeError: + self.spark = SparkSession.getActiveSession() + + if self.spark is None: + raise SparkExpectationsMiscException( + "Spark session is not available, please initialize a spark session before calling SE" + ) else: raise SparkExpectationsMiscException( "Input rules_df is not of dataframe type" @@ -112,7 +120,7 @@ def _except(func: Any) -> Any: # variable used for enabling notification at different level _default_notification_dict: Dict[ - str, Union[str, int, bool, Dict[str, str]] + str, Union[str, int, bool, Dict[str, str], None] ] = { user_config.se_notifications_on_start: False, user_config.se_notifications_on_completion: False, @@ -121,10 +129,13 @@ def _except(func: Any) -> Any: user_config.se_notifications_on_error_drop_threshold: 100, user_config.se_enable_agg_dq_detailed_result: False, user_config.se_enable_query_dq_detailed_result: False, + user_config.se_job_metadata: None, user_config.querydq_output_custom_table_name: f"{self.stats_table}_querydq_output", } - _notification_dict: Dict[str, Union[str, int, bool, Dict[str, str]]] = ( + _notification_dict: Dict[ + str, Union[str, int, bool, Dict[str, str], None] + ] = ( {**_default_notification_dict, **user_conf} if user_conf else _default_notification_dict @@ -262,6 +273,8 @@ def _except(func: Any) -> Any: else False ) + _job_metadata: str = user_config.se_job_metadata + notifications_on_error_drop_threshold = _notification_dict.get( user_config.se_notifications_on_error_drop_threshold, 100 ) @@ -280,6 +293,7 @@ def _except(func: Any) -> Any: self._context.set_dq_expectations(expectations) self._context.set_rules_execution_settings_config(rules_execution_settings) self._context.set_querydq_secondary_queries(dq_queries_dict) + self._context.set_job_metadata(_job_metadata) @self._notification.send_notification_decorator @self._statistics_decorator.collect_stats_decorator @@ -292,6 +306,7 @@ def wrapper(*args: tuple, **kwargs: dict) -> DataFrame: table_name: str = self._context.get_table_name _input_count = _df.count() + _log.info("data frame input record count: %s", _input_count) _output_count: int = 0 _error_count: int = 0 _source_dq_df: Optional[DataFrame] = None @@ -333,21 +348,28 @@ def wrapper(*args: tuple, **kwargs: dict) -> DataFrame: self._context.set_input_count(_input_count) self._context.set_error_drop_threshold(_error_drop_threshold) + _log.info( + "Spark Expectations run id for this run: %s", + self._context.get_run_id, + ) + if isinstance(_df, DataFrame): _log.info("The function dataframe is created") self._context.set_table_name(table_name) if write_to_temp_table: _log.info("Dropping to temp table started") - self.spark.sql(f"drop table if exists {table_name}_temp") + self.spark.sql(f"drop table if exists {table_name}_temp") # type: ignore _log.info("Dropping to temp table completed") _log.info("Writing to temp table started") + source_columns = _df.columns self._writer.save_df_as_table( _df, f"{table_name}_temp", self._context.get_target_and_error_table_writer_config, ) _log.info("Read from temp table started") - _df = 
self.spark.sql(f"select * from {table_name}_temp") + _df = self.spark.sql(f"select * from {table_name}_temp") # type: ignore + _df = _df.select(source_columns) _log.info("Read from temp table completed") func_process = self._process.execute_dq_process( @@ -544,7 +566,7 @@ def wrapper(*args: tuple, **kwargs: dict) -> DataFrame: "error occurred while processing spark " "expectations due to given dataframe is not type of dataframe" ) - self.spark.catalog.clearCache() + # self.spark.catalog.clearCache() return _row_dq_df diff --git a/spark_expectations/examples/base_setup.py b/spark_expectations/examples/base_setup.py index 4a78d7ce..d16e2598 100644 --- a/spark_expectations/examples/base_setup.py +++ b/spark_expectations/examples/base_setup.py @@ -25,7 +25,7 @@ """ -RULES_DATA = """ +RULES_DATA = """ ("your_product", "dq_spark_dev.customer_order", "row_dq", "sales_greater_than_zero", "sales", "sales > 2", "ignore", "accuracy", "sales value should be greater than zero", false, true, true, false, 0,null, null) ,("your_product", "dq_spark_{env}.customer_order", "row_dq", "discount_threshold", "discount", "discount*100 < 60","drop", "validity", "discount should be less than 40", true, true, true, false, 0,null, null) @@ -41,8 +41,8 @@ ,("your_product", "dq_spark_dev.customer_order", "query_dq", "order_count_validity_check", "*", "(select count(*) from order_source) > 10", "ignore", "validity", "row count threshold", true, true, true, false, 0, null, true) ,("your_product", "dq_spark_{env}.customer_order", "query_dq", "product_category", "*", "(select count(distinct category) from {table}) < 5", "ignore", "validity", "distinct product category", true, true, true, false, 0,null, true) ,("your_product", "dq_spark_{env}.customer_order", "agg_dq", "distinct_of_ship_mode", "ship_mode", "count(distinct ship_mode) <= 3", "ignore", "validity", "regex format validation for quantity", true, true, true, false, 0,null, null) - - + + """ diff --git a/spark_expectations/examples/sample_dq_bigquery.py b/spark_expectations/examples/sample_dq_bigquery.py index 89e3612c..5c71c7c5 100644 --- a/spark_expectations/examples/sample_dq_bigquery.py +++ b/spark_expectations/examples/sample_dq_bigquery.py @@ -19,6 +19,12 @@ .option("createDisposition", "CREATE_IF_NEEDED") .option("writeMethod", "direct") ) +dic_job_info = { + "job": "job_name", + "Region": "NA", + "Snapshot": "2024-04-15", +} +job_info = str(dic_job_info) # if wanted to use indirect method use below setting and spark session # writer = WrappedDataFrameWriter().mode("overwrite").format("bigquery").\ @@ -63,6 +69,7 @@ "env": "local", "table": "product", }, + user_config.se_job_metadata: job_info, } diff --git a/spark_expectations/examples/sample_dq_delta.py b/spark_expectations/examples/sample_dq_delta.py index 7f2cb04b..6520d95c 100644 --- a/spark_expectations/examples/sample_dq_delta.py +++ b/spark_expectations/examples/sample_dq_delta.py @@ -14,6 +14,12 @@ writer = WrappedDataFrameWriter().mode("append").format("delta") spark = set_up_delta() +dic_job_info = { + "job": "job_name", + "Region": "NA", + "Snapshot": "2024-04-15", +} +job_info = str(dic_job_info) se: SparkExpectations = SparkExpectations( product_id="your_product", @@ -47,6 +53,7 @@ "env": "dev", "table": "product", }, + user_config.se_job_metadata: job_info, } diff --git a/spark_expectations/examples/sample_dq_iceberg.py b/spark_expectations/examples/sample_dq_iceberg.py index b7859ace..16e352a2 100644 --- a/spark_expectations/examples/sample_dq_iceberg.py +++ 
b/spark_expectations/examples/sample_dq_iceberg.py @@ -11,6 +11,12 @@ from spark_expectations.config.user_config import Constants as user_config writer = WrappedDataFrameWriter().mode("append").format("iceberg") +dic_job_info = { + "job": "job_name", + "Region": "NA", + "Snapshot": "2024-04-15", +} +job_info = str(dic_job_info) spark = set_up_iceberg() @@ -48,6 +54,7 @@ "env": "local", "table": "product", }, + user_config.se_job_metadata: job_info, } diff --git a/spark_expectations/sinks/utils/writer.py b/spark_expectations/sinks/utils/writer.py index 67d93c55..261ceb2e 100644 --- a/spark_expectations/sinks/utils/writer.py +++ b/spark_expectations/sinks/utils/writer.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from typing import Dict, Optional, Tuple, List -from datetime import datetime +from datetime import datetime, timezone from pyspark.sql import DataFrame from pyspark.sql.functions import ( lit, @@ -15,6 +15,8 @@ col, split, current_date, + monotonically_increasing_id, + coalesce, ) from pyspark.sql.types import StructType from spark_expectations import _log @@ -125,7 +127,24 @@ def save_df_as_table( def get_row_dq_detailed_stats( self, ) -> List[ - Tuple[str, str, str, str, str, str, str, str, str, None, None, int, str, int] + Tuple[ + str, + str, + str, + str, + str, + str, + str, + str, + str, + None, + None, + int, + str, + int, + str, + str, + ] ]: """ This function writes the detailed stats for row dq into the detailed stats table @@ -145,44 +164,79 @@ def get_row_dq_detailed_stats( _input_count = self._context.get_input_count _row_dq_result = [] - _rowdq_rule_dict = {} + + _rowdq_expectations = self._context.get_dq_expectations + _row_dq_expectations = _rowdq_expectations["row_dq_rules"] + if ( self._context.get_summarized_row_dq_res is not None and len(self._context.get_summarized_row_dq_res) > 0 ): - _rowdq_expectations = self._context.get_dq_expectations - for _rowdq_rule in _rowdq_expectations["row_dq_rules"]: - _rowdq_rule_dict[_rowdq_rule["rule"]] = ( - _rowdq_rule["expectation"] - + "|" - + _rowdq_rule["tag"] - + "|" - + _rowdq_rule["description"] - ) - - for _dq_res in self._context.get_summarized_row_dq_res: - _rowdq_rules = str(_rowdq_rule_dict[_dq_res["rule"]]).split("|") - _rule_expectations = _rowdq_rules[0] - _rule_tag = _rowdq_rules[1] - _rule_desc = _rowdq_rules[2] - _row_dq_result.append( - ( - _run_id, - _product_id, - _table_name, - _dq_res["rule_type"], - _dq_res["rule"], - _rule_expectations, - _rule_tag, - _rule_desc, - "pass" if int(_dq_res["failed_row_count"]) == 0 else "fail", - None, - None, - (_input_count - int(_dq_res["failed_row_count"])), - _dq_res["failed_row_count"], - _input_count, + _row_dq_res = self._context.get_summarized_row_dq_res + _dq_res = {d["rule"]: d["failed_row_count"] for d in _row_dq_res} + + for _rowdq_rule in _row_dq_expectations: + if _rowdq_rule["rule"] in _dq_res: + failed_row_count = _dq_res[_rowdq_rule["rule"]] + _row_dq_result.append( + ( + _run_id, + _product_id, + _table_name, + _rowdq_rule["rule_type"], + _rowdq_rule["rule"], + _rowdq_rule["expectation"], + _rowdq_rule["tag"], + _rowdq_rule["description"], + "pass" if int(failed_row_count) == 0 else "fail", + None, + None, + (_input_count - int(failed_row_count)), + failed_row_count, + _input_count, + self._context.get_row_dq_start_time.replace( + tzinfo=timezone.utc + ).strftime("%Y-%m-%d %H:%M:%S") + if self._context.get_row_dq_start_time + else "1900-01-01 00:00:00", + self._context.get_row_dq_end_time.replace( + tzinfo=timezone.utc + ).strftime("%Y-%m-%d 
%H:%M:%S") + if self._context.get_row_dq_end_time + else "1900-01-01 00:00:00", + ) ) + _row_dq_expectations.remove(_rowdq_rule) + + for _rowdq_rule in _row_dq_expectations: + _row_dq_result.append( + ( + _run_id, + _product_id, + _table_name, + _rowdq_rule["rule_type"], + _rowdq_rule["rule"], + _rowdq_rule["expectation"], + _rowdq_rule["tag"], + _rowdq_rule["description"], + "pass", + None, + None, + _input_count, + "0", + _input_count, + self._context.get_row_dq_start_time.replace( + tzinfo=timezone.utc + ).strftime("%Y-%m-%d %H:%M:%S") + if self._context.get_row_dq_start_time + else "1900-01-01 00:00:00", + self._context.get_row_dq_end_time.replace( + tzinfo=timezone.utc + ).strftime("%Y-%m-%d %H:%M:%S") + if self._context.get_row_dq_end_time + else "1900-01-01 00:00:00", ) + ) return _row_dq_result @@ -266,6 +320,8 @@ def _prep_secondary_query_output(self) -> DataFrame: - alias_comp - target_output - dq_time + - dq_start_time + - dq_end_time """ _querydq_secondary_query_source_output = ( self._context.get_source_query_dq_output @@ -374,6 +430,8 @@ def _prep_detailed_stats( "source_dq_actual_row_count", "source_dq_error_row_count", "source_dq_row_count", + "source_dq_start_time", + "source_dq_end_time", ] ) _detailed_stats_target_dq_schema = self._create_schema( @@ -392,6 +450,8 @@ def _prep_detailed_stats( "target_dq_actual_row_count", "target_dq_error_row_count", "target_dq_row_count", + "target_dq_start_time", + "target_dq_end_time", ] ) rules_execution_settings = self._context.get_rules_execution_settings_config @@ -408,11 +468,7 @@ def _prep_detailed_stats( _source_querydq_detailed_stats_result ) - if ( - self._context.get_row_dq_status != "Skipped" - and self._context.get_summarized_row_dq_res is not None - and len(self._context.get_summarized_row_dq_res) > 0 - ): + if self._context.get_row_dq_status != "Skipped" and _row_dq: _rowdq_detailed_stats_result = self.get_row_dq_detailed_stats() else: @@ -461,6 +517,10 @@ def _prep_detailed_stats( "dq_date", current_date() ).withColumn("dq_time", lit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) + _df_detailed_stats = _df_detailed_stats.withColumn( + "dq_job_metadata_info", lit(self._context.get_job_metadata).cast("string") + ) + return _df_detailed_stats def write_detailed_stats(self) -> None: @@ -523,12 +583,18 @@ def write_detailed_stats(self) -> None: ) _log.info( - "Writing metrics to the querydq custom output table: %s, ended", - self._context.get_query_dq_output_custom_table_name, + "Writing metrics to the detailed stats table: %s, ended", + self._context.get_dq_detailed_stats_table_name, ) + # TODO Create a separate function for writing the custom query dq stats _df_custom_detailed_stats_source = self._prep_secondary_query_output() + _log.info( + "Writing metrics to the output custom table: %s, started", + self._context.get_query_dq_output_custom_table_name, + ) + self.save_df_as_table( _df_custom_detailed_stats_source, self._context.get_query_dq_output_custom_table_name, @@ -536,6 +602,11 @@ def write_detailed_stats(self) -> None: stats_table=True, ) + _log.info( + "Writing metrics to the output custom table: %s, ended", + self._context.get_query_dq_output_custom_table_name, + ) + except Exception as e: raise SparkExpectationsMiscException( f"error occurred while saving the data into the stats table {e}" @@ -729,16 +800,20 @@ def write_error_stats(self) -> None: stats_table=True, ) + _log.info( + "Writing metrics to the stats table: %s, ended", + self._context.get_dq_stats_table_name, + ) + if ( 
self._context.get_agg_dq_detailed_stats_status is True or self._context.get_query_dq_detailed_stats_status is True ): self.write_detailed_stats() - _log.info( - "Writing metrics to the stats table: %s, ended", - self._context.get_dq_stats_table_name, - ) + # TODO Implement the below function for writing the custom query dq stats + # if self._context.get_query_dq_detailed_stats_status is True: + # self.write_query_dq_custom_output() # TODO check if streaming_stats is set to off, if it's enabled only then this should run @@ -804,6 +879,19 @@ def write_error_records_final( ) -> Tuple[int, DataFrame]: try: _log.info("_write_error_records_final started") + df = df.withColumn("sequence_number", monotonically_increasing_id()) + + df_seq = df + + df = df.select( + "sequence_number", + *[ + dq_column + for dq_column in df.columns + if dq_column.startswith(f"{rule_type}") + ], + ) + df.cache() failed_records = [ f"size({dq_column}) != 0" @@ -841,7 +929,26 @@ def write_error_records_final( lit(self._context.get_run_date), ) ) - error_df = df.filter(f"size(meta_{rule_type}_results) != 0") + error_df_seq = df.filter(f"size(meta_{rule_type}_results) != 0") + + error_df = df_seq.join( + error_df_seq, + df_seq.sequence_number == error_df_seq.sequence_number, + "inner", + ) + + # sequence number column removing from the data frame + error_df_columns = [ + dq_column + for dq_column in error_df.columns + if ( + dq_column.startswith("sequence_number") + or dq_column.startswith(rule_type) + ) + is False + ] + + error_df = error_df.select(error_df_columns) self._context.print_dataframe_with_debugger(error_df) print( @@ -858,6 +965,27 @@ def write_error_records_final( # if _error_count > 0: self.generate_summarized_row_dq_res(error_df, rule_type) + # sequence number adding to dataframe for passing to action function + df = df_seq.join( + error_df_seq, + df_seq.sequence_number == error_df_seq.sequence_number, + "left", + ).withColumn( + f"meta_{rule_type}_results", + coalesce(col(f"meta_{rule_type}_results"), array()), + ) + + df = ( + df.select(error_df_columns) + .withColumn( + self._context.get_run_id_name, lit(self._context.get_run_id) + ) + .withColumn( + self._context.get_run_date_time_name, + lit(self._context.get_run_date), + ) + ) + _log.info("_write_error_records_final ended") return _error_count, df diff --git a/spark_expectations/utils/actions.py b/spark_expectations/utils/actions.py index 300fc02b..a3f689a9 100644 --- a/spark_expectations/utils/actions.py +++ b/spark_expectations/utils/actions.py @@ -1,5 +1,6 @@ from typing import Dict, List, Any, Optional, Tuple import re +from datetime import datetime from pyspark.sql import DataFrame @@ -120,7 +121,18 @@ def agg_query_dq_detailed_result( _context: SparkExpectationsContext, _dq_rule: Dict[str, str], df: DataFrame, - querydq_output: List[Tuple[str, str, str, str, Any, str, dict, str]], + querydq_output: List[ + Tuple[ + str, + str, + str, + str, + Any, + str, + dict, + str, + ] + ], _source_dq_status: bool = False, _target_dq_status: bool = False, ) -> Any: @@ -528,6 +540,11 @@ def run_dq_rules( _context.get_agg_dq_detailed_stats_status is True or _context.get_query_dq_detailed_stats_status is True ): + current_date = datetime.now() + dq_start_time = datetime.strftime( + current_date, "%Y-%m-%d %H:%M:%S" + ) + ( _querydq_output_list, _agg_query_dq_output_tuple, @@ -539,7 +556,13 @@ def run_dq_rules( _source_dq_status=_source_dq_enabled, _target_dq_status=_target_dq_enabled, ) - + current_date = datetime.now() + dq_end_time = datetime.strftime( + 
current_date, "%Y-%m-%d %H:%M:%S" + ) + _agg_query_dq_output_list = list(_agg_query_dq_output_tuple) + _agg_query_dq_output_list.extend([dq_start_time, dq_end_time]) + _agg_query_dq_output_tuple = tuple(_agg_query_dq_output_list) _agg_query_dq_results.append(_agg_query_dq_output_tuple) if ( @@ -654,9 +677,15 @@ def action_on_rules( """ try: + _df_dq_columns = [ + dq_column + for dq_column in _df_dq.columns + if (dq_column.startswith(f"meta_{_rule_type}_results")) is False + ] + _df_dq_columns.append("action_if_failed") _df_dq = _df_dq.withColumn( "action_if_failed", get_actions_list(col(f"meta_{_rule_type}_results")) - ).drop(f"meta_{_rule_type}_results") + ).select(_df_dq_columns) if ( not _df_dq.filter( @@ -682,8 +711,7 @@ def action_on_rules( f"expectations and the action_if_failed " "suggested to fail" ) - - return _df_dq.drop(_df_dq.action_if_failed) + return _df_dq.select(_df_dq_columns[:-1]) except Exception as e: raise SparkExpectationsMiscException( diff --git a/spark_expectations/utils/reader.py b/spark_expectations/utils/reader.py index 8bef122e..471e9197 100644 --- a/spark_expectations/utils/reader.py +++ b/spark_expectations/utils/reader.py @@ -1,5 +1,5 @@ import os -from typing import Optional, Union, Dict +from typing import Optional, Union, Dict, Tuple from dataclasses import dataclass from functools import reduce from pyspark.sql import DataFrame @@ -246,7 +246,7 @@ def get_rules_from_df( is_dlt: bool = False, tag: Optional[str] = None, params: Optional[dict] = None, - ) -> tuple[dict, dict, dict]: + ) -> Tuple[Dict, Dict, Dict]: """ This function fetches the data quality rules from the table and return it as a dictionary diff --git a/tests/core/test_context.py b/tests/core/test_context.py index 97247ff7..f85e91f6 100644 --- a/tests/core/test_context.py +++ b/tests/core/test_context.py @@ -30,12 +30,12 @@ def test_context_properties(): # Test that the getter properties return the correct values context = SparkExpectationsContext(product_id="product1", spark=spark) - context._run_id = 'test_run_id' - context._run_date = 'test_run_date' - context._dq_stats_table_name = 'test_dq_stats_table' - context._dq_detailed_stats_table_name = 'test_dq_stats_table' - context._final_table_name = 'test_final_table' - context._error_table_name = 'test_error_table' + context._run_id = "test_run_id" + context._run_date = "test_run_date" + context._dq_stats_table_name = "test_dq_stats_table" + context._dq_detailed_stats_table_name = "test_dq_stats_table" + context._final_table_name = "test_final_table" + context._error_table_name = "test_error_table" context._row_dq_rule_type_name = "row_dq_test" context._agg_dq_rule_type_name = "agg_dq_test" @@ -145,13 +145,12 @@ def test_context_properties(): context._se_streaming_stats_dict = {"a": "b", "c": "d"} context._se_streaming_stats_topic_name = "test_topic" - - assert context.get_run_id == 'test_run_id' - assert context.get_run_date == 'test_run_date' - assert context._dq_stats_table_name == 'test_dq_stats_table' - assert context._dq_detailed_stats_table_name == 'test_dq_stats_table' - assert context._final_table_name == 'test_final_table' - assert context._error_table_name == 'test_error_table' + assert context.get_run_id == "test_run_id" + assert context.get_run_date == "test_run_date" + assert context._dq_stats_table_name == "test_dq_stats_table" + assert context._dq_detailed_stats_table_name == "test_dq_stats_table" + assert context._final_table_name == "test_final_table" + assert context._error_table_name == "test_error_table" assert 
context.get_row_dq_rule_type_name == "row_dq_test" assert context.get_agg_dq_rule_type_name == "agg_dq_test" @@ -218,14 +217,14 @@ def test_context_properties(): assert context._run_date_time_name == "run_date_time" assert ( - context._supported_df_query_dq - == spark.createDataFrame( - [ - { - "spark_expectations_query_check": "supported_place_holder_dataset_to_run_query_check" - } - ] - ).collect() + context._supported_df_query_dq + == spark.createDataFrame( + [ + { + "spark_expectations_query_check": "supported_place_holder_dataset_to_run_query_check" + } + ] + ).collect() ) assert context._source_agg_dq_start_time == datetime_now @@ -325,9 +324,9 @@ def test_get_source_agg_dq_status_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._source_agg_dq_status = None with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_source_agg_dq_status' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_source_agg_dq_status' before \n accessing it", ): context.get_source_agg_dq_status @@ -336,9 +335,9 @@ def test_get_row_dq_status_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._row_dq_status = None with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_row_dq_status' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_row_dq_status' before \n accessing it", ): context.get_row_dq_status @@ -347,9 +346,9 @@ def test_get_final_agg_dq_status_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._final_agg_dq_status = None with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_final_agg_dq_status' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_final_agg_dq_status' before \n accessing it", ): context.get_final_agg_dq_status @@ -358,9 +357,9 @@ def test_get_dq_run_status_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._dq_run_status = None with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_dq_run_status' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_dq_run_status' before \n accessing it", ): context.get_dq_run_status @@ -369,9 +368,9 @@ def test_get_row_dq_rule_type_name_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._row_dq_rule_type_name = None with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_row_dq_rule_type_name' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_row_dq_rule_type_name' before \n accessing it", ): context.get_row_dq_rule_type_name @@ -380,9 +379,9 @@ def test_get_agg_dq_rule_type_name_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._agg_dq_rule_type_name = None 
with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_agg_dq_rule_type_name' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_agg_dq_rule_type_name' before \n accessing it", ): context.get_agg_dq_rule_type_name @@ -408,9 +407,9 @@ def test_get_query_dq_rule_type_name(): context._query_dq_rule_type_name = value if value is None: with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_query_dq_rule_type_name' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_query_dq_rule_type_name' before \n accessing it", ): context.get_query_dq_rule_type_name else: @@ -420,9 +419,9 @@ def test_get_query_dq_rule_type_name(): def test_get_dq_stats_table_name_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_dq_stats_table_name' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_dq_stats_table_name' before \n accessing it", ): context.get_dq_stats_table_name @@ -430,9 +429,9 @@ def test_get_dq_stats_table_name_exception(): def test_get_final_table_name_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_final_table_name' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_final_table_name' before \n accessing it", ): context.get_final_table_name @@ -440,9 +439,9 @@ def test_get_final_table_name_exception(): def test_get_error_table_name_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_error_table_name' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_error_table_name' before \n accessing it", ): context.get_error_table_name @@ -458,8 +457,8 @@ def test_get_config_file_path_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._dq_config_abs_path = None with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_dq_config_abs_path' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_dq_config_abs_path' before accessing it""", ): context.get_config_file_path @@ -637,9 +636,9 @@ def test_get_mail_smtp_server_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._mail_smtp_server = None with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_mail_smtp_server' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign 
" + "'_mail_smtp_server' before \n accessing it", ): context.get_mail_smtp_server @@ -648,9 +647,9 @@ def test_get_mail_smtp_port_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._mail_smtp_port = 0 with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_mail_smtp_port' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_mail_smtp_port' before \n accessing it", ): context.get_mail_smtp_port @@ -659,9 +658,9 @@ def test_get_to_mail_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._to_mail = False with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_to_mail' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_to_mail' before \n accessing it", ): context.get_to_mail @@ -670,9 +669,9 @@ def test_get_mail_subject_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._to_mail = False with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_mail_subject' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_mail_subject' before \n accessing it", ): context.get_mail_subject @@ -681,9 +680,9 @@ def test_get_mail_from_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._mail_from = False with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_mail_from' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_mail_from' before \n accessing it", ): context.get_mail_from @@ -692,9 +691,9 @@ def test_get_slack_webhook_url_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._slack_webhook_url = False with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_slack_webhook_url' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_slack_webhook_url' before \n accessing it", ): context.get_slack_webhook_url @@ -703,9 +702,9 @@ def test_get_teams_webhook_url_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._teams_webhook_url = False with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_teams_webhook_url' before \n accessing it", + SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_teams_webhook_url' before \n accessing it", ): context.get_teams_webhook_url @@ -714,9 +713,9 @@ def test_get_table_name_expection(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._table_name = "" with pytest.raises( - SparkExpectationsMiscException, - match="The spark expectations context is not set completely, please assign " - "'_table_name' before \n accessing it", + 
SparkExpectationsMiscException, + match="The spark expectations context is not set completely, please assign " + "'_table_name' before \n accessing it", ): context.get_table_name @@ -818,8 +817,8 @@ def test_get_error_threshold(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._error_drop_threshold = 0 with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_error_drop_threshold' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_error_drop_threshold' before accessing it""", ): context.get_error_drop_threshold @@ -829,8 +828,8 @@ def test_get_cerberus_url_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._cerberus_url = None with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_cerberus_url' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_cerberus_url' before accessing it""", ): context.get_cerberus_url @@ -840,8 +839,8 @@ def test_get_cerberus_cred_path_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._cerberus_cred_path = None with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_cerberus_cred_path' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_cerberus_cred_path' before accessing it""", ): context.get_cerberus_cred_path @@ -851,8 +850,8 @@ def test_get_cerberus_token_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._cerberus_token = None with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_cerberus_token' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_cerberus_token' before accessing it""", ): context.get_cerberus_token @@ -1056,8 +1055,8 @@ def test_get_run_id_name(): context._run_id_name = value if not value: with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_run_id_name' .*""", + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_run_id_name' .*""", ): context.get_run_id_name else: @@ -1071,8 +1070,8 @@ def test_get_run_date_name(): context._run_date_name = value if not value: with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_run_date_name' .*""", + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_run_date_name' .*""", ): context.get_run_date_name else: @@ -1086,8 +1085,8 @@ def test_get_run_date_time_name(): context._run_date_time_name = value if not value: with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_run_date_time_name' .*""", + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_run_date_time_name' .*""", ): context.get_run_date_time_name else: @@ -1125,8 +1124,8 @@ def 
test_get_num_row_dq_rules(): context = SparkExpectationsContext(product_id="product1", spark=spark) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_num_row_dq_rules' .*""", + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_num_row_dq_rules' .*""", ): context._num_row_dq_rules = None context.get_num_row_dq_rules @@ -1140,8 +1139,8 @@ def test_get_num_agg_dq_rules_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._num_agg_dq_rules = [1, 2, 3, 4] with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_num_agg_dq_rules' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_num_agg_dq_rules' before accessing it""", ): context.get_num_agg_dq_rules @@ -1151,8 +1150,8 @@ def test_get_num_query_dq_rules_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._num_query_dq_rules = [1, 2, 3, 4] with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_num_query_dq_rules' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_num_query_dq_rules' before accessing it""", ): context.get_num_query_dq_rules @@ -1161,8 +1160,8 @@ def test_get_num_query_dq_rules_exception(): def test_get_num_dq_rules(): context = SparkExpectationsContext(product_id="product1", spark=spark) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_num_dq_rules' .*""", + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_num_dq_rules' .*""", ): context._num_dq_rules = None context.get_num_dq_rules @@ -1230,8 +1229,8 @@ def test_get_source_query_dq_status_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._source_query_dq_status = None with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_source_query_dq_status' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_source_query_dq_status' before accessing it""", ): context.get_source_query_dq_status @@ -1241,8 +1240,8 @@ def test_get_final_query_dq_status_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._final_query_dq_status = None with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_final_query_dq_status' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_final_query_dq_status' before accessing it""", ): context.get_final_query_dq_status @@ -1252,16 +1251,16 @@ def test_set_supported_df_query_dq(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._supported_df_query_dq = context.set_supported_df_query_dq() assert ( - context.get_supported_df_query_dq.collect() - == get_spark_session() - .createDataFrame( - [ - { - "spark_expectations_query_check": "supported_place_holder_dataset_to_run_query_check" - } - ] - ) - 
.collect() + context.get_supported_df_query_dq.collect() + == get_spark_session() + .createDataFrame( + [ + { + "spark_expectations_query_check": "supported_place_holder_dataset_to_run_query_check" + } + ] + ) + .collect() ) @@ -1269,8 +1268,8 @@ def test_get_supported_df_query_dq(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._supported_df_query_dq = None with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign '_supported_df_query_dq' before + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_supported_df_query_dq' before accessing it""", ): context.get_supported_df_query_dq @@ -1393,8 +1392,8 @@ def test_get_secret_type_exception(): context.set_se_streaming_stats_dict({user_config.se_enable_streaming: "a"}) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign 'UserConfig.secret_type' before accessing it""", ): @@ -1426,8 +1425,8 @@ def test_get_server_url_key_exception(): ) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign 'UserConfig.cbs_kafka_server_url' before accessing it""", ): @@ -1459,8 +1458,8 @@ def test_get_token_endpoint_url_exception(): ) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign 'UserConfig.cbs_secret_token_url' before accessing it""", ): @@ -1492,8 +1491,8 @@ def test_get_token_exception(): ) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign 'UserConfig.cbs_secret_token' before accessing it""", ): @@ -1525,8 +1524,8 @@ def test_get_client_id_exception(): ) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign 'UserConfig.cbs_secret_app_name' before accessing it""", ): @@ -1558,8 +1557,8 @@ def test_get_topic_name_exception(): ) with pytest.raises( - SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, please assign + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign 'UserConfig.cbs_topic_name' before accessing it""", ): @@ -1578,8 +1577,8 @@ def test_get_se_streaming_stats_topic_name(): context.set_se_streaming_stats_topic_name("test_topic") assert ( - context.get_se_streaming_stats_topic_name - == context.get_se_streaming_stats_topic_name + context.get_se_streaming_stats_topic_name + == context.get_se_streaming_stats_topic_name ) @@ -1588,8 +1587,8 @@ def test_get_se_streaming_stats_topic_name_exception(): context.set_se_streaming_stats_topic_name("") with pytest.raises( - SparkExpectationsMiscException, - match="""The spark 
expectations context is not set completely, please assign + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_se_streaming_stats_topic_name' before accessing it""", ): @@ -1647,63 +1646,67 @@ def test_get_rules_exceds_threshold(): ] - def test_set_agg_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_agg_dq_detailed_stats_status(True) assert context.get_agg_dq_detailed_stats_status is True + def test_get_agg_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) assert context.get_agg_dq_detailed_stats_status is True + def test_set_query_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_query_dq_detailed_stats_status(True) assert context.get_query_dq_detailed_stats_status is True + def test_get_query_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) assert context.get_query_dq_detailed_stats_status is False - def test_set_target_agg_dq_detailed_stats(): context = SparkExpectationsContext(product_id="product1", spark=spark) target_agg_dq_detailed_stats = [ ("column1", "rule1"), ("column2", "rule2"), - ("column3", "rule3") + ("column3", "rule3"), ] context.set_target_agg_dq_detailed_stats(target_agg_dq_detailed_stats) assert context.get_target_agg_dq_detailed_stats == target_agg_dq_detailed_stats + def test_set_target_query_dq_detailed_stats(): context = SparkExpectationsContext(product_id="product1", spark=spark) target_query_dq_detailed_stats = [ ("column1", "rule1"), ("column2", "rule2"), - ("column3", "rule3") + ("column3", "rule3"), ] context.set_target_query_dq_detailed_stats(target_query_dq_detailed_stats) assert context.get_target_query_dq_detailed_stats == target_query_dq_detailed_stats + def test_set_source_agg_dq_detailed_stats(): context = SparkExpectationsContext(product_id="product1", spark=spark) source_agg_dq_detailed_stats = [ ("column1", "rule1"), ("column2", "rule2"), - ("column3", "rule3") + ("column3", "rule3"), ] context.set_source_agg_dq_detailed_stats(source_agg_dq_detailed_stats) assert context.get_source_agg_dq_detailed_stats == source_agg_dq_detailed_stats + def test_set_source_query_dq_detailed_stats(): context = SparkExpectationsContext(product_id="product1", spark=spark) source_query_dq_detailed_stats = [ ("column1", "rule1"), ("column2", "rule2"), - ("column3", "rule3") + ("column3", "rule3"), ] context.set_source_query_dq_detailed_stats(source_query_dq_detailed_stats) assert context.get_source_query_dq_detailed_stats == source_query_dq_detailed_stats @@ -1722,16 +1725,19 @@ def test_set_agg_dq_detailed_stats_status(): context.set_agg_dq_detailed_stats_status(True) assert context.get_agg_dq_detailed_stats_status is True + def test_get_agg_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_agg_dq_detailed_stats_status(True) assert context.get_agg_dq_detailed_stats_status is True + def test_set_query_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_query_dq_detailed_stats_status(True) assert context.get_query_dq_detailed_stats_status is True + def test_get_query_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_query_dq_detailed_stats_status(True) @@ -1743,6 +1749,7 @@ def 
test_set_agg_dq_detailed_stats_status(): context.set_agg_dq_detailed_stats_status(True) assert context.get_agg_dq_detailed_stats_status is True + def test_set_query_dq_detailed_stats_status(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_query_dq_detailed_stats_status(True) @@ -1756,7 +1763,6 @@ def test_set_dq_stats_table_name(): assert context.get_dq_stats_table_name == "dq_stats_table_name" - def test_set_dq_detailed_stats_table_name(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_dq_detailed_stats_table_name("dq_stats_table_name") @@ -1764,25 +1770,27 @@ def test_set_dq_detailed_stats_table_name(): context._enable_agg_dq_detailed_result = True context._enable_query_dq_detailed_result = True - assert context.get_agg_dq_detailed_stats_status is True assert context.get_query_dq_detailed_stats_status is True - + assert context._dq_detailed_stats_table_name == "dq_stats_table_name" assert context.get_dq_detailed_stats_table_name == "dq_stats_table_name" + def test_set_detailed_stats_table_writer_config(): context = SparkExpectationsContext(product_id="product1", spark=spark) config = {"setting1": "value1", "setting2": "value2"} context.set_detailed_stats_table_writer_config(config) assert context.get_detailed_stats_table_writer_config == config + def test_set_dq_expectations(): context = SparkExpectationsContext(product_id="product1", spark=spark) dq_expectations = {"column1": "expectation1", "column2": "expectation2"} context.set_dq_expectations(dq_expectations) assert context.get_dq_expectations == dq_expectations + def test_set_rules_execution_settings_config(): context = SparkExpectationsContext(product_id="product1", spark=spark) config = {"setting1": "value1", "setting2": "value2"} @@ -1795,28 +1803,33 @@ def test_get_row_dq_start_time(): context._row_dq_start_time = datetime.now() assert context.get_row_dq_start_time == context._row_dq_start_time -def test_get_row_dq_end_time(): + +def test_get_row_dq_end_time(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._row_dq_end_time = datetime.now() assert context.get_row_dq_end_time == context._row_dq_end_time - def test_get_row_dq_start_time_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._row_dq_start_time = None - with pytest.raises(SparkExpectationsMiscException, - match="""The spark expectations context is not set completely, + with pytest.raises( + SparkExpectationsMiscException, + match="""The spark expectations context is not set completely, please assign '_row_dq_start_time' before - accessing it"""): + accessing it""", + ): context.get_row_dq_start_time + def test_get_row_dq_end_time_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._row_dq_end_time = None - with pytest.raises(SparkExpectationsMiscException, + with pytest.raises( + SparkExpectationsMiscException, match="""The spark expectations context is not set completely, please assign '_row_dq_end_time' before - accessing it"""): + accessing it""", + ): context.get_row_dq_end_time @@ -1834,14 +1847,17 @@ def test_get_query_dq_output_custom_table_name(): context.set_query_dq_output_custom_table_name("test_table") assert context.get_query_dq_output_custom_table_name == "test_table" + def test_get_query_dq_output_custom_table_name_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_query_dq_detailed_stats_status(True) 
context.set_dq_detailed_stats_table_name(None) - with pytest.raises(SparkExpectationsMiscException, + with pytest.raises( + SparkExpectationsMiscException, match="""The spark expectations context is not set completely, please assign '_dq_detailed_stats_table_name,query_dq_detailed_stats_status' before - accessing it"""): + accessing it""", + ): context.get_query_dq_output_custom_table_name @@ -1851,42 +1867,52 @@ def test_get_dq_detailed_stats_table_name(): context.set_dq_detailed_stats_table_name("test_table") assert context.get_dq_detailed_stats_table_name == "test_table" -def test_get_dq_detailed_stats_table_name_exception(): + +def test_get_dq_detailed_stats_table_name_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context.set_query_dq_detailed_stats_status(True) context.set_dq_detailed_stats_table_name(None) - with pytest.raises(SparkExpectationsMiscException, + with pytest.raises( + SparkExpectationsMiscException, match="""The spark expectations context is not set completely, please assign '_dq_detailed_stats_table_name' before - accessing it"""): + accessing it""", + ): context.get_dq_detailed_stats_table_name -def test_get_dq_expectations(): + +def test_get_dq_expectations(): context = SparkExpectationsContext(product_id="product1", spark=spark) dq_expectations = {"column1": "expectation1", "column2": "expectation2"} context.set_dq_expectations(dq_expectations) assert context.get_dq_expectations == dq_expectations + def test_get_dq_expectations_exception(): context = SparkExpectationsContext(product_id="product1", spark=spark) context._dq_expectations = None - with pytest.raises(SparkExpectationsMiscException, + with pytest.raises( + SparkExpectationsMiscException, match="""The spark expectations context is not set completely, please assign '_dq_expectations' before - accessing it"""): + accessing it""", + ): context.get_dq_expectations + def test_set_querydq_secondary_queries(): context = SparkExpectationsContext(product_id="product1", spark=spark) querydq_secondary_queries = {"query1": "query1", "query2": "query2"} context.set_querydq_secondary_queries(querydq_secondary_queries) assert context.get_querydq_secondary_queries == querydq_secondary_queries + def test_get_querydq_secondary_queries(): context = SparkExpectationsContext(product_id="product1", spark=spark) querydq_secondary_queries = {"query1": "query1", "query2": "query2"} context.set_querydq_secondary_queries(querydq_secondary_queries) assert context.get_querydq_secondary_queries == querydq_secondary_queries + def test_get_source_query_dq_output(): context = SparkExpectationsContext(product_id="product1", spark=spark) source_query_dq_output = [{"column1": "value1", "column2": "value2"}] @@ -1900,18 +1926,21 @@ def test_set_target_query_dq_output(): context.set_target_query_dq_output(target_query_dq_output) assert context._target_query_dq_output == target_query_dq_output + def test_get_target_query_dq_output(): context = SparkExpectationsContext(product_id="product1", spark=spark) target_query_dq_output = [{"column1": "value1", "column2": "value2"}] context._target_query_dq_output = target_query_dq_output assert context.get_target_query_dq_output == target_query_dq_output + def test_get_source_query_dq_output(): context = SparkExpectationsContext(product_id="product1", spark=spark) source_query_dq_output = [{"column1": "value1", "column2": "value2"}] context._source_query_dq_output = source_query_dq_output assert context.get_source_query_dq_output == source_query_dq_output + def 
test_set_source_query_dq_output(): context = SparkExpectationsContext(product_id="product1", spark=spark) source_query_dq_output = [{"column1": "value1", "column2": "value2"}] @@ -1935,28 +1964,48 @@ def test_set_dq_rules_params(): assert context.get_dq_rules_params == {} # testing when passing parameterizied values to dq rules - context._dq_rules_params = {'env': 'local'} - assert context.get_dq_rules_params == {'env': 'local'} + context._dq_rules_params = {"env": "local"} + assert context.get_dq_rules_params == {"env": "local"} + def test_get_dq_expectations(): context = SparkExpectationsContext(product_id="test_product", spark=spark) context.set_dq_expectations({"rule1": "expectation1", "rule2": "expectation2"}) # Test when _dq_expectations is set - assert context.get_dq_expectations == {"rule1": "expectation1", "rule2": "expectation2"} + assert context.get_dq_expectations == { + "rule1": "expectation1", + "rule2": "expectation2", + } # Test when _dq_expectations is not set context._dq_expectations = None with pytest.raises(SparkExpectationsMiscException): context.get_dq_expectations() - + def test_set_dq_expectations(): context = SparkExpectationsContext(product_id="test_product", spark=spark) dq_expectations = { "rule1": "expectation1", "rule2": "expectation2", - "rule3": "expectation3" + "rule3": "expectation3", } context.set_dq_expectations(dq_expectations) - assert context._dq_expectations == dq_expectations \ No newline at end of file + assert context._dq_expectations == dq_expectations + + +def test_set_job_metadata(): + context = SparkExpectationsContext(product_id="test_product", spark=spark) + context.set_job_metadata("{'job_name': 'test_job_metadata'}") + assert context._job_metadata == "{'job_name': 'test_job_metadata'}" + + +def test_get_job_metadata(): + context = SparkExpectationsContext(product_id="test_product", spark=spark) + context._job_metadata = "{'job_name': 'test_job_metadata'}" + assert context.get_job_metadata == "{'job_name': 'test_job_metadata'}" + + # testing for None condition + context._job_metadata = None + assert context.get_job_metadata is None diff --git a/tests/core/test_expectations.py b/tests/core/test_expectations.py index 16c224b4..f17ee180 100644 --- a/tests/core/test_expectations.py +++ b/tests/core/test_expectations.py @@ -1,20 +1,21 @@ # pylint: disable=too-many-lines import os import datetime -from unittest.mock import Mock +from unittest.mock import Mock, PropertyMock from unittest.mock import patch import pytest -from pyspark.sql import DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.functions import lit, to_timestamp, col from pyspark.sql.types import StringType, IntegerType, StructField, StructType from spark_expectations.core.context import SparkExpectationsContext -from spark_expectations.core.expectations import SparkExpectations, WrappedDataFrameWriter +from spark_expectations.core.expectations import ( + SparkExpectations, + WrappedDataFrameWriter, +) from spark_expectations.config.user_config import Constants as user_config from spark_expectations.core import get_spark_session -from spark_expectations.core.exceptions import ( - SparkExpectationsMiscException -) +from spark_expectations.core.exceptions import SparkExpectationsMiscException # os.environ["UNIT_TESTING_ENV"] = "local" @@ -25,22 +26,29 @@ def fixture_setup_local_kafka_topic(): current_dir = os.path.dirname(os.path.abspath(__file__)) - if os.getenv('UNIT_TESTING_ENV') != "spark_expectations_unit_testing_on_github_actions": - + if ( + 
os.getenv("UNIT_TESTING_ENV") + != "spark_expectations_unit_testing_on_github_actions" + ): # remove if docker conatiner is running - os.system(f"sh {current_dir}/../../spark_expectations/examples/docker_scripts/docker_kafka_stop_script.sh") + os.system( + f"sh {current_dir}/../../spark_expectations/examples/docker_scripts/docker_kafka_stop_script.sh" + ) # start docker container and create the topic - os.system(f"sh {current_dir}/../../spark_expectations/examples/docker_scripts/docker_kafka_start_script.sh") + os.system( + f"sh {current_dir}/../../spark_expectations/examples/docker_scripts/docker_kafka_start_script.sh" + ) yield "docker container started" # remove docker container - os.system(f"sh {current_dir}/../../spark_expectations/examples/docker_scripts/docker_kafka_stop_script.sh") + os.system( + f"sh {current_dir}/../../spark_expectations/examples/docker_scripts/docker_kafka_stop_script.sh" + ) else: - yield "A Kafka server has been launched within a Docker container for the purpose of conducting tests in " \ - "a Jenkins environment" + yield "A Kafka server has been launched within a Docker container for the purpose of conducting tests in " "a Jenkins environment" @pytest.fixture(name="_fixture_df") @@ -58,11 +66,19 @@ def fixture_df(): @pytest.fixture(name="_fixture_dq_rules") def fixture_dq_rules(): # create a sample dq_rules map for above input - return {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}} + return { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + } @pytest.fixture(name="_fixture_rules_df") @@ -81,7 +97,7 @@ def fixture_rules_df(): "enable_for_target_dq_validation": True, "is_active": True, "enable_error_drop_alert": True, - "error_drop_threshold": "10" + "error_drop_threshold": "10", } return spark.createDataFrame([rules_dict]) @@ -120,13 +136,14 @@ def fixture_context(): def fixture_spark_expectations(_fixture_rules_df): # create a spark expectations class object writer = WrappedDataFrameWriter().mode("append").format("delta") - spark_expectations = SparkExpectations(product_id="product1", - rules_df=_fixture_rules_df, - stats_table="dq_spark.test_dq_stats_table", - stats_table_writer=writer, - target_and_error_table_writer=writer, - debugger=False, - ) + spark_expectations = SparkExpectations( + product_id="product1", + rules_df=_fixture_rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) def _error_threshold_exceeds(expectations): pass @@ -187,2092 +204,2620 @@ def _error_threshold_exceeds(expectations): # os.system("rm -rf /tmp/hive/warehouse/dq_spark.db") -@pytest.mark.parametrize("input_df, " - "expectations, " - "write_to_table, " - "write_to_temp_table, " - "expected_output, " - "input_count, " - "error_count, " - "output_count, " - "source_agg_dq_res, " - "final_agg_dq_res, " - "source_query_dq_res, " - "final_query_dq_res, " - "dq_rules, " - "status", - [ - ( - # Note: where err: refers error table and fnl: final table - # test case 0 - # In this test case, the action for failed rows is "ignore", - # so 
the function should return the input DataFrame with all rows. - # collect stats in the test_stats_table and - # log the error records into the error table. - - spark.createDataFrame( - [ - {"col1": 1, "col2": "a"}, - # row doesn't meet expectations(ignore), log into err & fnl - {"col1": 2, "col2": "b"}, - # row meets expectations(ignore), log into final table - {"col1": 3, "col2": "c"}, - # row meets expectations(ignore), log into final table - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col1_threshold", - "column_name": "col1", - "expectation": "col1 > 1", - "action_if_failed": "ignore", - "tag": "validity", - "description": "col1 value must be greater than 1", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10" - } - ], - True, # write to table - True, # write to temp table - # expected res - spark.createDataFrame( - [ - {"col1": 1, "col2": "a"}, - {"col1": 2, "col2": "b"}, - {"col1": 3, "col2": "c"}, - ] - ), - 3, # input count - 1, # error count - 3, # output count - None, # source_agg_dq_res - None, # final_agg_dq_res - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - # status at different stages for given input - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - - ( - # test case 1 - # In this test case, the action for failed rows is "drop", - # collect stats in the test_stats_table and - # log the error records into the error table. 
- spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row meets expectations(drop), log into err & fnl - {"col1": 2, "col2": "b", "col3": 5}, - # row doesn't meets expectations(drop), log into final table - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets expectations(drop), log into final table - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col2_set", - "column_name": "col2", - "expectation": "col2 in ('a', 'c')", - "action_if_failed": "drop", - "tag": "strict", - "description": "col2 value must be in ('a', 'b')", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "5", - } - ], - True, # write to table - True, # write to temp table - # expected res - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet expectations(ignore), log into err & fnl - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets expectations(ignore), log into final table - ] - ), - 3, # input count - 1, # error count - 2, # output count - None, # source_agg_dq_res - None, # final_agg_dq_res - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - # status at different stages for given input - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # test case 2 - # In this test case, the action for failed rows is "fail", - # spark expectations expected to fail - # collect stats in the test_stats_table and - # log the error records into the error table. 
- spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet expectations(fail), log into err & fnl - {"col1": 2, "col2": "b", "col3": 5}, - # row meets doesn't expectations(fail), log into final table - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets doesn't expectations(fail), log into final table - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_threshold", - "column_name": "col3", - "expectation": "col3 > 6", - "action_if_failed": "fail", - "tag": "strict", - "description": "col3 value must be greater than 6", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "15", - } - ], - True, # write to table - True, # write to temp table - SparkExpectationsMiscException, # expected res - 3, # input count - 3, # error count - 0, # output count - None, # source_agg_dq_res - None, # final_agg_dq_res - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - # status at different stages for given input - {"row_dq_status": "Failed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( # test case 3 - # In this test case, the action for failed rows is "ignore" & "drop", - # collect stats in the test_stats_table and - # log the error records into the error table. 
- spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet expectations1(ignore) 2(drop), log into err & fnl - {"col1": 2, "col2": "b", "col3": 5}, - # row meets expectations1(ignore), log into final table - {"col1": 3, "col2": "c", 'col3': 6}, - # row doesnt'meets expectations1(ignore), log into final table - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_threshold", - "column_name": "col3", - "expectation": "col3 > 6", - "action_if_failed": "ignore", - "tag": "strict", - "description": "col3 value must be greater than 6", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "10", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col1_add_col3_threshold", - "column_name": "col1", - "expectation": "(col1+col3) > 6", - "action_if_failed": "drop", - "tag": "strict", - "description": "col1_add_col3 value must be greater than 6", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "15", - } - ], - - True, # write to table - True, # write to temp table - # expected res - spark.createDataFrame( - [ - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - 3, # input count - 3, # error count - 2, # output count - None, # source_agg_dq_res - None, # final_agg_dq_res - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - # status at different stages for given input - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # test case 4 - # In this test case, the action for failed rows is "ignore" & "fail", - # collect stats in the test_stats_table and - # log the error records into the error table. 
- spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet expectations1(ignore), log into err & fnl - {"col1": 2, "col2": "b", "col3": 5}, - # row meets expectations1(ignore), log into final table - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets expectations1(ignore), log into final table - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_threshold", - "column_name": "col3", - "expectation": "col3 > 6", - "action_if_failed": "ignore", - "tag": "strict", - "description": "col3 value must be greater than 6", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_minus_col1_threshold", - "column_name": "col1", - "expectation": "(col3-col1) > 1", - "action_if_failed": "fail", - "tag": "strict", - "description": "col3_minus_col1 value must be greater than 1", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "5", - } - ], - True, # write to table - True, # write to temp table - # expected res - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - 3, # input count - 3, # error count - 3, # output count - None, # source_agg_dq_res - None, # final_agg_dq_res - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - # status at different stages for given input - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # Test case 5 - # In this test case, the action for failed rows is "drop" & "fail", - # collect stats in the test_stats_table and - # log the error records into the error table. 
- spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet expectations1(drop) & 2(drop), log into err & fnl - {"col1": 2, "col2": "b", "col3": 5}, - # row meets expectations1(drop) & 2(fail), log into final table - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets expectations1(drop), & 2(fail) log into final table - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_threshold", - "column_name": "col3", - "expectation": "col3 > 6", - "action_if_failed": "drop", - "tag": "strict", - "description": "col3 value must be greater than 6", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "25", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_minus_col1_threshold", - "column_name": "col1", - "expectation": "(col3-col1) = 1", - "action_if_failed": "fail", - "tag": "strict", - "description": "col3_minus_col1 value must be equals to 1", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "25", - } - ], - True, # write to table - True, # write to temp table - SparkExpectationsMiscException, # expected res - 3, # input count - 3, # error count - 0, # output count - None, # source_agg_dq_res - None, # final_agg_dq_res - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - # status at different stages for given input - {"row_dq_status": "Failed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( # Test case 6 - # In this test case, the action for failed rows is "drop" & "fail", - # collect stats in the test_stats_table and - # log the error records into the error table - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet expectations1(drop) & meets 2(fail), log into err & fnl - {"col1": 2, "col2": "b", "col3": 5}, - # row meets expectations1(drop) & meets 2(fail), log into final table - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets expectations1(drop) & meets 2(fail), log into final table - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_threshold", - "column_name": "col3", - "expectation": "col3 > 6", - "action_if_failed": "drop", - "tag": "strict", - "description": "col3 value must be greater than 6", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_mul_col1_threshold", - "column_name": "col1", - "expectation": "(col3*col1) > 1", - "action_if_failed": "fail", - "tag": "strict", - "description": "col3_mul_col1 value must be equals to 1", - "enable_for_source_dq_validation": False, - 
"enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10", - } - ], - True, # write to table - True, # write to temp table - # expected res - spark.createDataFrame([], schema=StructType([ - StructField("col1", IntegerType()), - StructField("col2", StringType()), - StructField("col3", IntegerType()) - ])), - 3, # input count - 3, # error count - 0, # output count - None, # source_agg_dq_res - None, # final_agg_dq_res - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - # status at different stages for given input - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # Test case 7 - # In this test case, the action for failed rows is "ignore", "drop" & "fail", - # collect stats in the test_stats_table and - # log the error records into the error table - - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet the expectation1(ignore) & expectation2(drop) - {"col1": 2, "col2": "b", "col3": 5}, - # row doesn't meet the expectation2(drop) - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets all the expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col1_threshold", - "column_name": "col1", - "expectation": "col1 > 1", - "action_if_failed": "ignore", - "tag": "strict", - "description": "col1 value must be greater than 1", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "0", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_threshold", - "column_name": "col3", - "expectation": "col3 > 5", - "action_if_failed": "drop", - "tag": "strict", - "description": "col3 value must be greater than 5", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_mul_col1_threshold", - "column_name": "col1", - "expectation": "(col3*col1) > 1", - "action_if_failed": "fail", - "tag": "strict", - "description": "col3_mul_col1 value must be equals to 1", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - True, # write_to_temp_table - spark.createDataFrame( # expected output - [ - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - 3, # input count - 2, # error count - 1, # output count - None, # source_agg_result - None, # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 3, "num_row_dq_rules": 3}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": 
{"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", # status - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # Test case 8 - # In this test case, dq run set for source_agg_dq - # collect stats in the test_stats_table - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "sum_col3_threshold", - "column_name": "col3", - "expectation": "sum(col3) > 20", - "action_if_failed": "ignore", - "tag": "strict", - "description": "sum col3 value must be greater than 20", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10", - } - ], - True, # write to table - True, # write to temp table - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), # expected result - 3, # input count - 0, # error count - 0, # output count - [{"description": "sum col3 value must be greater than 20", # source_ag_result - "rule": "sum_col3_threshold", - "rule_type": "agg_dq", "action_if_failed": "ignore", "tag": "strict"}], - None, # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Skipped", "source_agg_dq_status": "Passed", # status - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # Test case 9 - # In this test case, dq run set for source_agg_dq with action_if_failed fail - # collect stats in the test_stats_table - spark.createDataFrame( - [ - # avg of col3 is not more than 25 - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "avg_col3_threshold", - "column_name": "col3", - "expectation": "avg(col3) > 25", - "action_if_failed": "fail", - "tag": "strict", - "description": "avg col3 value must be greater than 25", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - True, # write to temp table - SparkExpectationsMiscException, # excepted result - 3, # input count - 0, # error count - 0, # output count - [{"description": "avg col3 value must be greater than 25", # source_agg_result - "rule": "avg_col3_threshold", - "rule_type": "agg_dq", "action_if_failed": "fail", "tag": "strict"}], - None, # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 
0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Skipped", "source_agg_dq_status": "Failed", # status - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # Test case 10 - # In this test case, dq run set for final_agg_dq with action_if_failed ignore - # collect stats in the test_stats_table - spark.createDataFrame( - [ - # minimum of col1 must be greater than 10 - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - [{ - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "min_col1_threshold", - "column_name": "col1", - "expectation": "min(col1) > 10", - "action_if_failed": "ignore", - "tag": "strict", - "description": "min col1 value must be greater than 10", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col2_set", - "column_name": "col2", - "expectation": "col2 in ('a', 'c')", - "action_if_failed": "drop", - "tag": "strict", - "description": "col2 value must be in ('a', 'b')", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "0", - } - ], - True, # write to table - True, # write to temp table - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 3, "col2": "c", 'col3': 6}, - - ] - ), # expected result but row_dq set to false - 3, # input count - 1, # error count - 2, # output count - None, # source_agg-result - [{"description": "min col1 value must be greater than 10", - "rule": "min_col1_threshold", - "rule_type": "agg_dq", "action_if_failed": "ignore", "tag": "strict"}], - # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Passed", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # Test case 11 - # In this test case, dq run set for row_dq & final_agg_dq - # with action_if_failed drop(row), fail(agg) - # collect stats in the test_stats_table & error into error_table - - spark.createDataFrame( - # standard deviation of col3 must be greater than 10 - [ - {"col1": 1, "col2": "a", "col3": 4}, - # row meet expectation - {"col1": 2, "col2": "b", "col3": 5}, - # row doesn't meet expectation - {"col1": 3, "col2": "c", 'col3': 6}, - # row meets expectations - - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "std_col3_threshold", - "column_name": "col3", - "expectation": "stddev(col3) > 10", - "action_if_failed": "fail", - "tag": "strict", - "description": "std col3 value must be greater than 10", - "enable_for_source_dq_validation": False, - 
"enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col2_set", - "column_name": "col2", - "expectation": "col2 in ('a', 'c')", - "action_if_failed": "drop", - "tag": "strict", - "description": "col2 value must be in ('a', 'b')", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "15", - } - ], - True, # write to table - True, # write temp table - SparkExpectationsMiscException, # expected result - 3, # input count - 1, # error count - 2, # output count - None, # source_agg_result - [{"description": "std col3 value must be greater than 10", - "rule": "std_col3_threshold", - "rule_type": "agg_dq", "action_if_failed": "fail", "tag": "strict"}], - # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Failed", "run_status": "Failed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - # status - ), - - ( - # Test case 12 - # In this test case, dq run set for row_dq - # with action_if_failed drop - # collect stats in the test_stats_table & error into error_table - spark.createDataFrame( - [ - {"col1": 1, "col2": "a"}, - # row doesn't meet the expectations - {"col1": 2, "col2": "b"}, - # row meets the expectation - {"col1": 3, "col2": "c"}, - # row meets the expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col1_threshold", - "column_name": "col1", - "expectation": "col1 > 1", - "action_if_failed": "drop", - "tag": "validity", - "description": "col1 value must be greater than 1", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "0", - } - ], - True, # write to table - True, # write to temp table - spark.createDataFrame([ # expected_output - {"col1": 2, "col2": "b"}, - {"col1": 3, "col2": "c"} - ]), # expected result - 3, # input count - 1, # error count - 2, # output count - None, # source_agg_result - None, # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, # dq_rules - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"} # status - ), - ( - # Test case 13 - # In this test case, dq run set for row_dq & source_agg_dq - # with action_if_failed (ignore, drop) and ignore(agg_dq) - # collect stats in the test_stats_table & error into error_table - - spark.createDataFrame( - [ - # count of distinct element in col2 must be greater 
than 2 - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet expectation1(ignore) - {"col1": 2, "col2": "b", "col3": 5}, - # row meets all row_dq expectations - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col1_threshold_1", - "column_name": "col1", - "expectation": "col1 > 1", - "action_if_failed": "ignore", - "tag": "validity", - "description": "col1 value must be greater than 1", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col2_set", - "column_name": "col2", - "expectation": "col2 in ('a', 'b', 'c')", - "action_if_failed": "drop", - "tag": "validity", - "description": "col1 value must be greater than 2", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "distinct_col2_threshold", - "column_name": "col2", - "expectation": "count(distinct col2) > 4", - "action_if_failed": "ignore", - "tag": "validity", - "description": "distinct of col2 value must be greater than 4", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - True, # write to temp table - spark.createDataFrame([ # expected_output - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", "col3": 6} - ]), - 3, # input count - 1, # error count - 3, # output count - [{"description": "distinct of col2 value must be greater than 4", - "rule": "distinct_col2_threshold", - "rule_type": "agg_dq", "action_if_failed": "ignore", "tag": "validity"}], - # source_agg_result - None, # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 3, "num_row_dq_rules": 2}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Passed", "source_agg_dq_status": "Passed", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"} # status - ), - - ( - # Test case 14 - # In this test case, dq run set for row_dq, source_agg_dq & final_agg_dq - # with action_if_failed r(ignore, drop) for row_dq and (ignore) for agg_dq - # collect stats in the test_stats_table & error into error_table - spark.createDataFrame( - [ - # avg of col1 must be greater than 4(ignore) for agg_dq - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet row_dq expectation1(ignore) - {"col1": 2, "col2": "b", "col3": 5}, - # row doesn't meet row_dq expectation2(drop) - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row-dq expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_threshold_4", - "column_name": "col3", - "expectation": 
"col3 > 4", - "action_if_failed": "drop", - "tag": "validity", - "description": "col3 value must be greater than 4", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col2_set", - "column_name": "col2", - "expectation": "col2 in ('a', 'b')", - "action_if_failed": "ignore", - "tag": "validity", - "description": "col2 value must be in (a, b)", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "2", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "avg_col1_threshold", - "column_name": "col1", - "expectation": "avg(col1) > 4", - "action_if_failed": "ignore", - "tag": "accuracy", - "description": "avg of col1 value must be greater than 4", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - True, # write to temp table - spark.createDataFrame([ # expected_output - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", "col3": 6} - ]), # expected result - 3, # input count - 2, # error count - 2, # output count - [{"description": "avg of col1 value must be greater than 4", - "rule": "avg_col1_threshold", - "rule_type": "agg_dq", "action_if_failed": "ignore", "tag": "accuracy"}], - # source_agg_dq - [{"description": "avg of col1 value must be greater than 4", - "rule": "avg_col1_threshold", - "rule_type": "agg_dq", "action_if_failed": "ignore", "tag": "accuracy"}], - # final_agg_dq - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 3, "num_row_dq_rules": 2}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Passed", "source_agg_dq_status": "Passed", - "final_agg_dq_status": "Passed", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"} # status - ), - ( - # Test case 15 - # In this test case, dq run set for row_dq, source_agg_dq & final_agg_dq - # with action_if_failed (ignore, drop) for row_dq and (ignore, fail) for agg_dq - # collect stats in the test_stats_table & error into error_table - spark.createDataFrame( - [ - # average of col1 must be greater than 4(ignore) - # standard deviation of col1 must be greater than 0(fail) - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet row_dq expectation1(drop) and expectation2(ignore) - {"col1": 2, "col2": "b", "col3": 5}, - # row meets all row_dq_expectations - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - {"col1": 2, "col2": "d", "col3": 7} - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_and_col1_threshold_4", - "column_name": "col3, col1", - "expectation": "((col3 * col1) - col3) > 5", - "action_if_failed": "drop", - "tag": "validity", - "description": "col3 and col1 operation value must be greater than 3", - "enable_for_source_dq_validation": False, - 
"enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "25", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col2_set", - "column_name": "col2", - "expectation": "col2 in ('b', 'c')", - "action_if_failed": "ignore", - "tag": "validity", - "description": "col2 value must be in (b, c)", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "30", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "avg_col1_threshold", - "column_name": "col1", - "expectation": "avg(col1) > 4", - "action_if_failed": "ignore", - "tag": "validity", - "description": "avg of col1 value must be greater than 4", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "stddev_col3_threshold", - "column_name": "col3", - "expectation": "stddev(col3) > 1", - "action_if_failed": "ignore", - "tag": "validity", - "description": "stddev of col3 value must be greater than one", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - True, # write to temp table - spark.createDataFrame([ # expected_output - {"col1": 3, "col2": "c", "col3": 6}, - {"col1": 2, "col2": "d", "col3": 7} - ]), # expected result - 4, # input count - 3, # error count - 2, # output count - [{"description": "avg of col1 value must be greater than 4", - "rule": "avg_col1_threshold", - "rule_type": "agg_dq", "action_if_failed": "ignore", "tag": "validity"}], - # source_agg_result - [{'action_if_failed': 'ignore', - 'description': 'avg of col1 value must be greater than 4', - 'rule': 'avg_col1_threshold', - 'rule_type': 'agg_dq', - 'tag': 'validity'}, - {'action_if_failed': 'ignore', - 'description': 'stddev of col3 value must be greater than one', - 'rule': 'stddev_col3_threshold', - 'rule_type': 'agg_dq', - 'tag': 'validity'}], - # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 5, "num_row_dq_rules": 2}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 2, "num_agg_dq_rules": 3, - "num_final_agg_dq_rules": 2}}, - {"row_dq_status": "Passed", "source_agg_dq_status": "Passed", - "final_agg_dq_status": "Passed", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"} # status - ), - ( - # Test case 16 - # In this test case, dq run set for query_dq source_query_dq - # with action_if_failed (ignore) for query_dq - # collect stats in the test_stats_table & error into error_table - spark.createDataFrame( - [ - # sum of col1 must be greater than 10(ignore) - # standard deviation of col3 must be greater than 0(ignore) - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet row_dq expectation1(drop) - {"col1": 2, "col2": "b", "col3": 5}, - # row meets all row_dq_expectations - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all 
row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "sum_col1_threshold", - "column_name": "col1", - "expectation": "(select sum(col1) from test_table) > 10", - "action_if_failed": "ignore", - "tag": "validity", - "description": "sum of col1 value must be greater than 10", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "stddev_col3_threshold", - "column_name": "col3", - "expectation": "(select stddev(col3) from test_table) > 0", - "action_if_failed": "ignore", - "tag": "validity", - "description": "stddev of col3 value must be greater than 0", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - False, # write to table - False, # write to temp table - None, # expected result - 3, # input count - 0, # error count - 0, # output count - None, # source_agg_result - None, # final_agg_result - # final_agg_result - [{"description": "sum of col1 value must be greater than 10", - "rule": "sum_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "validity"}], - # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 0}, - "query_dq_rules": {"num_final_query_dq_rules": 2, "num_source_query_dq_rules": 2, - "num_query_dq_rules": 2}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, - {"row_dq_status": "Skipped", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Passed", "final_query_dq_status": "Skipped"} # status - ), - - ( - # Test case 17 - # In this test case, dq run set for query_dq final_query_dq - # with action_if_failed (ignore) for query_dq - # collect stats in the test_stats_table & error into error_table - spark.createDataFrame( - [ - # max of col1 must be greater than 10(ignore) - # min of col3 must be greater than 0(ignore) - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet row_dq expectation1(drop) and expectation2(ignore) - {"col1": 2, "col2": "b", "col3": 5}, - # row meets all row_dq_expectations - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_and_col1_threshold_4", - "column_name": "col3, col1", - "expectation": "((col3 * col1) - col3) > 5", - "action_if_failed": "drop", - "tag": "validity", - "description": "col3 and col1 operation value must be greater than 3", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "max_col1_threshold", - "column_name": "col1", - "expectation": "(select max(col1) from test_final_table_view) > 10", - "action_if_failed": "ignore", - "tag": "strict", - "description": "max of col1 value must be greater than 10", - "enable_for_source_dq_validation": False, - 
"enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "min_col3_threshold", - "column_name": "col3", - "expectation": "(select min(col3) from test_final_table_view) > 0", - "action_if_failed": "ignore", - "tag": "validity", - "description": "min of col3 value must be greater than 0", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - False, # write to temp table - spark.createDataFrame( - [ - {"col1": 3, "col2": "c", "col3": 6}, - ]), # expected result - 3, # input count - 2, # error count - 1, # output count - None, # source_agg_result - None, # final_agg_result - # final_agg_result - None, # source_query_dq_res - [{"description": "max of col1 value must be greater than 10", - "rule": "max_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "strict"}], - # final_query_dq_res - {"rules": {"num_dq_rules": 3, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 2, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 2}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Passed"} # status - ), - ( - # Test case 18 - # In this test case, dq run set for query_dq source_query_dq(ignore, fail) - # with action_if_failed (fail) for query_dq - # collect stats in the test_stats_table, error into error_table & raises the error - spark.createDataFrame( - [ - # min of col1 must be greater than 10(fail) - # standard deviation of col3 must be greater than 0(ignore) - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet row_dq expectation1(drop) - {"col1": 2, "col2": "b", "col3": 5}, - # row meets all row_dq_expectations - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "min_col1_threshold", - "column_name": "col1", - "expectation": "(select min(col1) from test_final_table_view) > 10", - "action_if_failed": "fail", - "tag": "validity", - "description": "min of col1 value must be greater than 10", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "stddev_col3_threshold", - "column_name": "col3", - "expectation": "(select stddev(col3) from test_final_table_view) > 0", - "action_if_failed": "ignore", - "tag": "validity", - "description": "stddev of col3 value must be greater than 0", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - False, # write to table - False, # write to temp table - SparkExpectationsMiscException, # expected result - 3, # input count - 0, # error count - 0, # output count - None, # source_agg_result - 
None, # final_agg_result - [{'action_if_failed': 'fail', - 'description': 'min of col1 value must be greater than 10', - 'rule': 'min_col1_threshold', - 'rule_type': 'query_dq', - 'tag': 'validity'}, - {'action_if_failed': 'ignore', - 'description': 'stddev of col3 value must be greater than 0', - 'rule': 'stddev_col3_threshold', - 'rule_type': 'query_dq', - 'tag': 'validity'}], # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 0}, - "query_dq_rules": {"num_final_query_dq_rules": 2, "num_source_query_dq_rules": 2, - "num_query_dq_rules": 2}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, - {"row_dq_status": "Skipped", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Failed", "final_query_dq_status": "Skipped"} # status - ), - ( - # Test case 19 - # In this test case, dq run set for query_dq final_query_dq(ignore, fail) - # with action_if_failed (ignore, fail) for query_dq - # collect stats in the test_stats_table, error into error_table & raises error - spark.createDataFrame( - [ - # max of col1 must be greater than 10(ignore) - # min of col3 must be greater than 0(ignore) - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet row_dq expectation1(drop) and expectation2(ignore) - {"col1": 2, "col2": "b", "col3": 5}, - # row meets all row_dq_expectations - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_and_col1_threshold_4", - "column_name": "col3, col1", - "expectation": "((col3 * col1) - col3) > 5", - "action_if_failed": "drop", - "tag": "validity", - "description": "col3 and col1 operation value must be greater than 3", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "25", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "max_col1_threshold", - "column_name": "col1", - "expectation": "(select max(col1) from test_final_table_view) > 10", - "action_if_failed": "fail", - "tag": "strict", - "description": "max of col1 value must be greater than 10", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "min_col3_threshold", - "column_name": "col3", - "expectation": "(select min(col3) from test_final_table_view) > 0", - "action_if_failed": "ignore", - "tag": "validity", - "description": "min of col3 value must be greater than 0", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - False, # write to temp table - SparkExpectationsMiscException, # expected result - 3, # input count - 2, # error count - 1, # output count - None, # source_agg_result - None, # final_agg_result - # final_agg_result - None, # source_query_dq_res - [{"description": "max of col1 value must be greater than 10", - "rule": "max_col1_threshold", - "rule_type": "query_dq", 
"action_if_failed": "fail", "tag": "strict"}], - # final_query_dq_res - {"rules": {"num_dq_rules": 3, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 2, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 2}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Failed"} # status - ), - ( - # Test case 20 - # In this test case, dq run set for query_dq source_query_dq & - # final_query_dq(ignore, fail) - # with action_if_failed (ignore, fail) for query_dq - # collect stats in the test_stats_table, error into error_table - spark.createDataFrame( - [ - # min of col1 must be greater than 10(ignore) - source_query_dq - # max of col1 must be greater than 100(ignore) - final_query_dq - # min of col3 must be greater than 0(fail) - final_query_dq - {"col1": 1, "col2": "a", "col3": 4}, - # row meets all row_dq_expectations(drop) - {"col1": 2, "col2": "b", "col3": 5}, - # ow doesn't meet row_dq expectation1(drop) - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_mod_2", - "column_name": "col3", - "expectation": "(col3 % 2) = 0", - "action_if_failed": "drop", - "tag": "validity", - "description": "col3 mod must equals to 0", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "40", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "min_col1_threshold", - "column_name": "col1", - "expectation": "(select min(col1) from test_final_table_view) > 10", - "action_if_failed": "ignore", - "tag": "strict", - "description": "min of col1 value must be greater than 10", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "max_col1_threshold", - "column_name": "col1", - "expectation": "(select max(col1) from test_final_table_view) > 100", - "action_if_failed": "ignore", - "tag": "strict", - "description": "max of col1 value must be greater than 100", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "min_col3_threshold", - "column_name": "col3", - "expectation": "(select min(col3) from test_final_table_view) > 0", - "action_if_failed": "fail", - "tag": "validity", - "description": "min of col3 value must be greater than 0", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - False, # write to temp table - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 3, "col2": "c", "col3": 6}, - ] - ), # expected result - 3, # input count - 1, # error 
count - 2, # output count - None, # source_agg_result - None, # final_agg_result - # final_agg_result - [{"description": "min of col1 value must be greater than 10", - "rule": "min_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "strict"}], - # source_query_dq_res - [{"description": "max of col1 value must be greater than 100", - "rule": "max_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "strict"}], - # final_query_dq_res - {"rules": {"num_dq_rules": 4, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 2, "num_source_query_dq_rules": 1, - "num_query_dq_rules": 3}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Passed", "final_query_dq_status": "Passed"} # status - ), - ( - # Test case 21 - # In this test case, dq run set for query_dq source_query_dq & - # final_query_dq(ignore, fail) - # with action_if_failed (ignore, fail) for query_dq - # collect stats in the test_stats_table, error into error_table & raise the error - spark.createDataFrame( - [ - # min of col1 must be greater than 10(ignore) - source_query_dq - # max of col1 must be greater than 100(fail) - final_query_dq - # min of col3 must be greater than 0(fail) - final_query_dq - {"col1": 1, "col2": "a", "col3": 4}, - # row meets all row_dq_expectations(drop) - {"col1": 2, "col2": "b", "col3": 5}, - # ow doesn't meet row_dq expectation1(drop) - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_mod_2", - "column_name": "col3", - "expectation": "(col3 % 2) = 0", - "action_if_failed": "drop", - "tag": "validity", - "description": "col3 mod must equals to 0", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "10", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "min_col1_threshold", - "column_name": "col1", - "expectation": "(select min(col1) from test_final_table_view) > 10", - "action_if_failed": "ignore", - "tag": "strict", - "description": "min of col1 value must be greater than 10", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "max_col1_threshold", - "column_name": "col1", - "expectation": "(select max(col1) from test_final_table_view) > 100", - "action_if_failed": "fail", - "tag": "strict", - "description": "max of col1 value must be greater than 100", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "min_col3_threshold", - "column_name": "col3", - "expectation": "(select min(col3) from test_final_table_view) > 0", - "action_if_failed": "ignore", - "tag": "validity", - "description": "min of 
col3 value must be greater than 0", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - False, # write to temp table - SparkExpectationsMiscException, # expected result - 3, # input count - 1, # error count - 2, # output count - None, # source_agg_result - None, # final_agg_result - # final_agg_result - [{"description": "min of col1 value must be greater than 10", - "rule": "min_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "strict"}], - # source_query_dq_res - [{"description": "max of col1 value must be greater than 100", - "rule": "max_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "fail", "tag": "strict"}], - # final_query_dq_res - {"rules": {"num_dq_rules": 4, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 2, "num_source_query_dq_rules": 1, - "num_query_dq_rules": 3}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, - {"row_dq_status": "Passed", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Passed", "final_query_dq_status": "Failed"} # status - ), - ( - # Test case 22 - # In this test case, dq run set for query_dq source_query_dq & - # final_query_dq(ignore, fail) - # with action_if_failed (ignore, fail) for query_dq - # collect stats in the test_stats_table, error into error_table & raise the error - spark.createDataFrame( - [ - # min of col1 must be greater than 10(ignore) - source_query_dq - # max of col1 must be greater than 100(fail) - final_query_dq - # min of col3 must be greater than 0(fail) - final_query_dq - {"col1": 1, "col2": "a", "col3": 4}, - # row meets all row_dq_expectations(drop) - {"col1": 2, "col2": "b", "col3": 5}, - # ow doesn't meet row_dq expectation1(drop) - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "col3_max_value", - "column_name": "col3", - "expectation": "max(col3) > 1", - "action_if_failed": "fail", - "tag": "validity", - "description": "col3 mod must equals to 0", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "row_dq", - "rule": "col3_mod_2", - "column_name": "col3", - "expectation": "(col3 % 2) = 0", - "action_if_failed": "drop", - "tag": "validity", - "description": "col3 mod must equals to 0", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "100", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "count_col1_threshold", - "column_name": "col1", - "expectation": "(select count(col1) from test_final_table_view) > 3", - "action_if_failed": "ignore", - "tag": "strict", - "description": "count of col1 value must be greater than 3", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - 
"product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "col3_positive_threshold", - "column_name": "col1", - "expectation": "(select count(case when col3>0 then 1 else 0 end) from " - "test_final_table_view) > 10", - "action_if_failed": "ignore", - "tag": "strict", - "description": "count of col3 positive value must be greater than 10", - "enable_for_source_dq_validation": False, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - False, # write to temp table - spark.createDataFrame( - [ - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 3, "col2": "c", "col3": 6}, - ] - ), # expected result - 3, # input count - 1, # error count - 2, # output count - None, # source_agg_result - None, # final_agg_result - # final_agg_result - [{"description": "count of col1 value must be greater than 3", - "rule": "count_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "strict"}], - # source_query_dq_res - [{"description": "count of col3 positive value must be greater than 10", - "rule": "col3_positive_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "strict"}], - # final_query_dq_res - {"rules": {"num_dq_rules": 4, "num_row_dq_rules": 1}, - "query_dq_rules": {"num_final_query_dq_rules": 1, "num_source_query_dq_rules": 1, - "num_query_dq_rules": 2}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, - {"row_dq_status": "Passed", "source_agg_dq_status": "Passed", - "final_agg_dq_status": "Passed", "run_status": "Passed", - "source_query_dq_status": "Passed", "final_query_dq_status": "Passed"} # status - ), - ( - # Test case 23 - # In this test case, dq run set for query_dq source_query_dq and one of the rule is parameterized - # with action_if_failed (ignore) for query_dq - # collect stats in the test_stats_table & error into error_table - spark.createDataFrame( - [ - # sum of col1 must be greater than 10(ignore) - # standard deviation of col3 must be greater than 0(ignore) - {"col1": 1, "col2": "a", "col3": 4}, - # row doesn't meet row_dq expectation1(drop) - {"col1": 2, "col2": "b", "col3": 5}, - # row meets all row_dq_expectations - {"col1": 3, "col2": "c", "col3": 6}, - # row meets all row_dq_expectations - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "sum_col1_threshold", - "column_name": "col1", - "expectation": "(select sum(col1) from {table}) > 10", - "action_if_failed": "ignore", - "tag": "validity", - "description": "sum of col1 value must be greater than 10", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - }, - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "query_dq", - "rule": "stddev_col3_threshold", - "column_name": "col3", - "expectation": "(select stddev(col3) from test_table) > 0", - "action_if_failed": "ignore", - "tag": "validity", - "description": "stddev of col3 value must be greater than 0", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - False, # write to table - False, # write to temp table - None, # expected 
result - 3, # input count - 0, # error count - 0, # output count - None, # source_agg_result - None, # final_agg_result - # final_agg_result - [{"description": "sum of col1 value must be greater than 10", - "rule": "sum_col1_threshold", - "rule_type": "query_dq", "action_if_failed": "ignore", "tag": "validity"}], - # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 2, "num_row_dq_rules": 0}, - "query_dq_rules": {"num_final_query_dq_rules": 2, "num_source_query_dq_rules": 2, - "num_query_dq_rules": 2}, # dq_rules - "agg_dq_rules": {"num_source_agg_dq_rules": 0, "num_agg_dq_rules": 0, - "num_final_agg_dq_rules": 0}}, - {"row_dq_status": "Skipped", "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", "run_status": "Passed", - "source_query_dq_status": "Passed", "final_query_dq_status": "Skipped"} # status - ), - ( - # Test case 24 - # In this test case, dq run set for source_agg_dq with action_if_failed fail - # with the sql syntax > lower_bound and < upper_bound - # collect stats in the test_stats_table - spark.createDataFrame( - [ - # avg of col3 is greater than 18 and not more than 25 - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "avg_col3_range", - "column_name": "col3", - "expectation": "avg(col3) > 18 and avg(col3) < 25", - "action_if_failed": "fail", - "tag": "strict", - "description": "avg col3 value must be greater than 18 and less than 25", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - True, # write to temp table - SparkExpectationsMiscException, # excepted result - 3, # input count - 0, # error count - 0, # output count - [{"description": "avg col3 value must be greater than 18 and less than 25", # source_agg_result - "rule": "avg_col3_range", - "rule_type": "agg_dq", "action_if_failed": "fail", "tag": "strict"}], - None, # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Skipped", "source_agg_dq_status": "Failed", # status - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ( - # Test case 25 - # In this test case, dq run set for source_agg_dq with action_if_failed fail - # with the sql syntax between lower_bound and upper_bound - # collect stats in the test_stats_table - spark.createDataFrame( - [ - # avg of col3 is greater than 18 and not more than 25 - {"col1": 1, "col2": "a", "col3": 4}, - {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, - ] - ), - [ - { - "product_id": "product1", - "table_name": "dq_spark.test_final_table", - "rule_type": "agg_dq", - "rule": "avg_col3_range", - "column_name": "col3", - "expectation": "avg(col3) between 18 and 25", - "action_if_failed": "fail", - "tag": "strict", - "description": "avg col3 value must be greater than 18 and less than 25", - "enable_for_source_dq_validation": True, - 
"enable_for_target_dq_validation": False, - "is_active": True, - "enable_error_drop_alert": False, - "error_drop_threshold": "20", - } - ], - True, # write to table - True, # write to temp table - SparkExpectationsMiscException, # excepted result - 3, # input count - 0, # error count - 0, # output count - [{"description": "avg col3 value must be greater than 18 and less than 25", # source_agg_result - "rule": "avg_col3_range", - "rule_type": "agg_dq", "action_if_failed": "fail", "tag": "strict"}], - None, # final_agg_result - None, # source_query_dq_res - None, # final_query_dq_res - {"rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, - "query_dq_rules": {"num_final_query_dq_rules": 0, "num_source_query_dq_rules": 0, - "num_query_dq_rules": 0}, - "agg_dq_rules": {"num_source_agg_dq_rules": 1, "num_agg_dq_rules": 1, - "num_final_agg_dq_rules": 1}}, # dq_rules - {"row_dq_status": "Skipped", "source_agg_dq_status": "Failed", # status - "final_agg_dq_status": "Skipped", "run_status": "Failed", - "source_query_dq_status": "Skipped", "final_query_dq_status": "Skipped"}, - ), - ]) -def test_with_expectations(input_df, - expectations, - write_to_table, - write_to_temp_table, - expected_output, - input_count, - error_count, - output_count, - source_agg_dq_res, - final_agg_dq_res, - source_query_dq_res, - final_query_dq_res, - dq_rules, - status, - _fixture_create_database, - _fixture_local_kafka_topic): +def test_spark_session_initialization(): + # Test if spark session is initialized even if dataframe.sparkSession is not accessible + with patch.object( + DataFrame, "sparkSession", new_callable=PropertyMock + ) as mock_sparkSession: + mock_sparkSession.side_effect = AttributeError( + "The 'sparkSession' attribute is not accessible" + ) + + rules_df = spark.createDataFrame([("Alice", 32)], ["name", "age"]) + + writer = WrappedDataFrameWriter().mode("append").format("parquet") + se = SparkExpectations( + product_id="product1", + rules_df=rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) + assert type(se.spark) == SparkSession + + # Test if exception is raised if sparkSession.getActiveSession() returns None + with patch.object( + DataFrame, "sparkSession", new_callable=PropertyMock + ) as mock_sparkSession: + mock_sparkSession.side_effect = AttributeError( + "The 'sparkSession' attribute is not accessible" + ) + with patch.object(SparkSession, "getActiveSession", return_value=None): + rules_df = spark.createDataFrame([("Alice", 32)], ["name", "age"]) + + writer = WrappedDataFrameWriter().mode("append").format("parquet") + + # expect it to raise an exception SparkExpectationsMiscException as spark session is not available + with pytest.raises(SparkExpectationsMiscException) as e: + se = SparkExpectations( + product_id="product1", + rules_df=rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) + assert se.spark is None + assert ( + str(e.value) + == "Spark session is not available, please initialize a spark session before calling SE" + ) + + +@pytest.mark.parametrize( + "input_df, " + "expectations, " + "write_to_table, " + "write_to_temp_table, " + "expected_output, " + "input_count, " + "error_count, " + "output_count, " + "source_agg_dq_res, " + "final_agg_dq_res, " + "source_query_dq_res, " + "final_query_dq_res, " + "dq_rules, " + "status", + [ + ( + # Note: where err: refers error table and fnl: final table 
+ # test case 0 + # In this test case, the action for failed rows is "ignore", + # so the function should return the input DataFrame with all rows. + # collect stats in the test_stats_table and + # log the error records into the error table. + spark.createDataFrame( + [ + {"col1": 1, "col2": "a"}, + # row doesn't meet expectations(ignore), log into err & fnl + {"col1": 2, "col2": "b"}, + # row meets expectations(ignore), log into final table + {"col1": 3, "col2": "c"}, + # row meets expectations(ignore), log into final table + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col1_threshold", + "column_name": "col1", + "expectation": "col1 > 1", + "action_if_failed": "ignore", + "tag": "validity", + "description": "col1 value must be greater than 1", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + } + ], + True, # write to table + True, # write to temp table + # expected res + spark.createDataFrame( + [ + {"col1": 1, "col2": "a"}, + {"col1": 2, "col2": "b"}, + {"col1": 3, "col2": "c"}, + ] + ), + 3, # input count + 1, # error count + 3, # output count + None, # source_agg_dq_res + None, # final_agg_dq_res + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + # status at different stages for given input + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # test case 1 + # In this test case, the action for failed rows is "drop", + # collect stats in the test_stats_table and + # log the error records into the error table. 
+ spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row meets expectations(drop), log into final table + {"col1": 2, "col2": "b", "col3": 5}, + # row doesn't meet expectations(drop), log into err + {"col1": 3, "col2": "c", "col3": 6}, + # row meets expectations(drop), log into final table + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col2_set", + "column_name": "col2", + "expectation": "col2 in ('a', 'c')", + "action_if_failed": "drop", + "tag": "strict", + "description": "col2 value must be in ('a', 'b')", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "5", + } + ], + True, # write to table + True, # write to temp table + # expected res + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row meets expectations(drop), log into final table + {"col1": 3, "col2": "c", "col3": 6}, + # row meets expectations(drop), log into final table + ] + ), + 3, # input count + 1, # error count + 2, # output count + None, # source_agg_dq_res + None, # final_agg_dq_res + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + # status at different stages for given input + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # test case 2 + # In this test case, the action for failed rows is "fail", + # spark expectations is expected to fail + # collect stats in the test_stats_table and + # log the error records into the error table.
+ spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet expectations(fail), log into err + {"col1": 2, "col2": "b", "col3": 5}, + # row doesn't meet expectations(fail), log into err + {"col1": 3, "col2": "c", "col3": 6}, + # row doesn't meet expectations(fail), log into err + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_threshold", + "column_name": "col3", + "expectation": "col3 > 6", + "action_if_failed": "fail", + "tag": "strict", + "description": "col3 value must be greater than 6", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "15", + } + ], + True, # write to table + True, # write to temp table + SparkExpectationsMiscException, # expected res + 3, # input count + 3, # error count + 0, # output count + None, # source_agg_dq_res + None, # final_agg_dq_res + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + # status at different stages for given input + { + "row_dq_status": "Failed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( # test case 3 + # In this test case, the action for failed rows is "ignore" & "drop", + # collect stats in the test_stats_table and + # log the error records into the error table.
+ spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet expectation1(ignore) & expectation2(drop), log into err + {"col1": 2, "col2": "b", "col3": 5}, + # row doesn't meet expectation1(ignore), log into err & fnl + {"col1": 3, "col2": "c", "col3": 6}, + # row doesn't meet expectation1(ignore), log into err & fnl + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_threshold", + "column_name": "col3", + "expectation": "col3 > 6", + "action_if_failed": "ignore", + "tag": "strict", + "description": "col3 value must be greater than 6", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "10", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col1_add_col3_threshold", + "column_name": "col1", + "expectation": "(col1+col3) > 6", + "action_if_failed": "drop", + "tag": "strict", + "description": "col1_add_col3 value must be greater than 6", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "15", + }, + ], + True, # write to table + True, # write to temp table + # expected res + spark.createDataFrame( + [ + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + 3, # input count + 3, # error count + 2, # output count + None, # source_agg_dq_res + None, # final_agg_dq_res + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + # status at different stages for given input + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # test case 4 + # In this test case, the action for failed rows is "ignore" & "fail", + # collect stats in the test_stats_table and + # log the error records into the error table.
+ spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet expectations1(ignore), log into err & fnl + {"col1": 2, "col2": "b", "col3": 5}, + # row meets expectations1(ignore), log into final table + {"col1": 3, "col2": "c", "col3": 6}, + # row meets expectations1(ignore), log into final table + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_threshold", + "column_name": "col3", + "expectation": "col3 > 6", + "action_if_failed": "ignore", + "tag": "strict", + "description": "col3 value must be greater than 6", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_minus_col1_threshold", + "column_name": "col1", + "expectation": "(col3-col1) > 1", + "action_if_failed": "fail", + "tag": "strict", + "description": "col3_minus_col1 value must be greater than 1", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "5", + }, + ], + True, # write to table + True, # write to temp table + # expected res + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + 3, # input count + 3, # error count + 3, # output count + None, # source_agg_dq_res + None, # final_agg_dq_res + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + # status at different stages for given input + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # Test case 5 + # In this test case, the action for failed rows is "drop" & "fail", + # collect stats in the test_stats_table and + # log the error records into the error table. 
+ spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet expectations1(drop) & 2(drop), log into err & fnl + {"col1": 2, "col2": "b", "col3": 5}, + # row meets expectations1(drop) & 2(fail), log into final table + {"col1": 3, "col2": "c", "col3": 6}, + # row meets expectations1(drop), & 2(fail) log into final table + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_threshold", + "column_name": "col3", + "expectation": "col3 > 6", + "action_if_failed": "drop", + "tag": "strict", + "description": "col3 value must be greater than 6", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "25", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_minus_col1_threshold", + "column_name": "col1", + "expectation": "(col3-col1) = 1", + "action_if_failed": "fail", + "tag": "strict", + "description": "col3_minus_col1 value must be equals to 1", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "25", + }, + ], + True, # write to table + True, # write to temp table + SparkExpectationsMiscException, # expected res + 3, # input count + 3, # error count + 0, # output count + None, # source_agg_dq_res + None, # final_agg_dq_res + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + # status at different stages for given input + { + "row_dq_status": "Failed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( # Test case 6 + # In this test case, the action for failed rows is "drop" & "fail", + # collect stats in the test_stats_table and + # log the error records into the error table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet expectations1(drop) & meets 2(fail), log into err & fnl + {"col1": 2, "col2": "b", "col3": 5}, + # row meets expectations1(drop) & meets 2(fail), log into final table + {"col1": 3, "col2": "c", "col3": 6}, + # row meets expectations1(drop) & meets 2(fail), log into final table + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_threshold", + "column_name": "col3", + "expectation": "col3 > 6", + "action_if_failed": "drop", + "tag": "strict", + "description": "col3 value must be greater than 6", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_mul_col1_threshold", + "column_name": "col1", + "expectation": "(col3*col1) > 1", + "action_if_failed": "fail", + "tag": "strict", + "description": "col3_mul_col1 value must be equals to 1", + 
"enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + }, + ], + True, # write to table + True, # write to temp table + # expected res + spark.createDataFrame( + [], + schema=StructType( + [ + StructField("col1", IntegerType()), + StructField("col2", StringType()), + StructField("col3", IntegerType()), + ] + ), + ), + 3, # input count + 3, # error count + 0, # output count + None, # source_agg_dq_res + None, # final_agg_dq_res + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 2}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + # status at different stages for given input + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # Test case 7 + # In this test case, the action for failed rows is "ignore", "drop" & "fail", + # collect stats in the test_stats_table and + # log the error records into the error table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet the expectation1(ignore) & expectation2(drop) + {"col1": 2, "col2": "b", "col3": 5}, + # row doesn't meet the expectation2(drop) + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all the expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col1_threshold", + "column_name": "col1", + "expectation": "col1 > 1", + "action_if_failed": "ignore", + "tag": "strict", + "description": "col1 value must be greater than 1", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "0", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_threshold", + "column_name": "col3", + "expectation": "col3 > 5", + "action_if_failed": "drop", + "tag": "strict", + "description": "col3 value must be greater than 5", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_mul_col1_threshold", + "column_name": "col1", + "expectation": "(col3*col1) > 1", + "action_if_failed": "fail", + "tag": "strict", + "description": "col3_mul_col1 value must be equals to 1", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + True, # write_to_temp_table + spark.createDataFrame( # expected output + [ + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + 3, # input count + 2, # error count + 1, # output count + None, # source_agg_result + None, # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 3, "num_row_dq_rules": 3}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, 
+ "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", # status + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # Test case 8 + # In this test case, dq run set for source_agg_dq + # collect stats in the test_stats_table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "sum_col3_threshold", + "column_name": "col3", + "expectation": "sum(col3) > 20", + "action_if_failed": "ignore", + "tag": "strict", + "description": "sum col3 value must be greater than 20", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + } + ], + True, # write to table + True, # write to temp table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), # expected result + 3, # input count + 0, # error count + 0, # output count + [ + { + "description": "sum col3 value must be greater than 20", # source_ag_result + "rule": "sum_col3_threshold", + "rule_type": "agg_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + None, # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Skipped", + "source_agg_dq_status": "Passed", # status + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # Test case 9 + # In this test case, dq run set for source_agg_dq with action_if_failed fail + # collect stats in the test_stats_table + spark.createDataFrame( + [ + # avg of col3 is not more than 25 + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "avg_col3_threshold", + "column_name": "col3", + "expectation": "avg(col3) > 25", + "action_if_failed": "fail", + "tag": "strict", + "description": "avg col3 value must be greater than 25", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + } + ], + True, # write to table + True, # write to temp table + SparkExpectationsMiscException, # excepted result + 3, # input count + 0, # error count + 0, # output count + [ + { + "description": "avg col3 value must be greater than 25", # source_agg_result + "rule": "avg_col3_threshold", + "rule_type": "agg_dq", + "action_if_failed": "fail", + "tag": "strict", + } + ], + None, # final_agg_result + None, # source_query_dq_res + None, # 
final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Skipped", + "source_agg_dq_status": "Failed", # status + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # Test case 10 + # In this test case, dq run set for final_agg_dq with action_if_failed ignore + # collect stats in the test_stats_table + spark.createDataFrame( + [ + # minimum of col1 must be greater than 10 + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "min_col1_threshold", + "column_name": "col1", + "expectation": "min(col1) > 10", + "action_if_failed": "ignore", + "tag": "strict", + "description": "min col1 value must be greater than 10", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col2_set", + "column_name": "col2", + "expectation": "col2 in ('a', 'c')", + "action_if_failed": "drop", + "tag": "strict", + "description": "col2 value must be in ('a', 'b')", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "0", + }, + ], + True, # write to table + True, # write to temp table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), # expected result but row_dq set to false + 3, # input count + 1, # error count + 2, # output count + None, # source_agg-result + [ + { + "description": "min col1 value must be greater than 10", + "rule": "min_col1_threshold", + "rule_type": "agg_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Passed", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # Test case 11 + # In this test case, dq run set for row_dq & final_agg_dq + # with action_if_failed drop(row), fail(agg) + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + # standard deviation of col3 must be greater than 10 + [ + {"col1": 1, "col2": "a", "col3": 4}, + # row meet expectation + {"col1": 2, "col2": "b", "col3": 5}, + # row doesn't meet expectation + {"col1": 3, "col2": "c", "col3": 6}, + # row meets expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + 
"rule": "std_col3_threshold", + "column_name": "col3", + "expectation": "stddev(col3) > 10", + "action_if_failed": "fail", + "tag": "strict", + "description": "std col3 value must be greater than 10", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col2_set", + "column_name": "col2", + "expectation": "col2 in ('a', 'c')", + "action_if_failed": "drop", + "tag": "strict", + "description": "col2 value must be in ('a', 'b')", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "15", + }, + ], + True, # write to table + True, # write temp table + SparkExpectationsMiscException, # expected result + 3, # input count + 1, # error count + 2, # output count + None, # source_agg_result + [ + { + "description": "std col3 value must be greater than 10", + "rule": "std_col3_threshold", + "rule_type": "agg_dq", + "action_if_failed": "fail", + "tag": "strict", + } + ], + # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Failed", + "run_status": "Failed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + # status + ), + ( + # Test case 12 + # In this test case, dq run set for row_dq + # with action_if_failed drop + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a"}, + # row doesn't meet the expectations + {"col1": 2, "col2": "b"}, + # row meets the expectation + {"col1": 3, "col2": "c"}, + # row meets the expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col1_threshold", + "column_name": "col1", + "expectation": "col1 > 1", + "action_if_failed": "drop", + "tag": "validity", + "description": "col1 value must be greater than 1", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "0", + } + ], + True, # write to table + True, # write to temp table + spark.createDataFrame( + [{"col1": 2, "col2": "b"}, {"col1": 3, "col2": "c"}] # expected_output + ), # expected result + 3, # input count + 1, # error count + 2, # output count + None, # source_agg_result + None, # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, # dq_rules + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + 
"final_query_dq_status": "Skipped", + }, # status + ), + ( + # Test case 13 + # In this test case, dq run set for row_dq & source_agg_dq + # with action_if_failed (ignore, drop) and ignore(agg_dq) + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + [ + # count of distinct element in col2 must be greater than 2 + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet expectation1(ignore) + {"col1": 2, "col2": "b", "col3": 5}, + # row meets all row_dq expectations + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col1_threshold_1", + "column_name": "col1", + "expectation": "col1 > 1", + "action_if_failed": "ignore", + "tag": "validity", + "description": "col1 value must be greater than 1", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col2_set", + "column_name": "col2", + "expectation": "col2 in ('a', 'b', 'c')", + "action_if_failed": "drop", + "tag": "validity", + "description": "col1 value must be greater than 2", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "distinct_col2_threshold", + "column_name": "col2", + "expectation": "count(distinct col2) > 4", + "action_if_failed": "ignore", + "tag": "validity", + "description": "distinct of col2 value must be greater than 4", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + True, # write to temp table + spark.createDataFrame( + [ # expected_output + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + 3, # input count + 1, # error count + 3, # output count + [ + { + "description": "distinct of col2 value must be greater than 4", + "rule": "distinct_col2_threshold", + "rule_type": "agg_dq", + "action_if_failed": "ignore", + "tag": "validity", + } + ], + # source_agg_result + None, # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 3, "num_row_dq_rules": 2}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Passed", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, # status + ), + ( + # Test case 14 + # In this test case, dq run set for row_dq, source_agg_dq & final_agg_dq + # with action_if_failed r(ignore, drop) for row_dq and (ignore) for agg_dq + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + [ + # avg of col1 must be greater than 4(ignore) for agg_dq + {"col1": 1, "col2": "a", 
"col3": 4}, + # row doesn't meet row_dq expectation1(ignore) + {"col1": 2, "col2": "b", "col3": 5}, + # row doesn't meet row_dq expectation2(drop) + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row-dq expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_threshold_4", + "column_name": "col3", + "expectation": "col3 > 4", + "action_if_failed": "drop", + "tag": "validity", + "description": "col3 value must be greater than 4", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col2_set", + "column_name": "col2", + "expectation": "col2 in ('a', 'b')", + "action_if_failed": "ignore", + "tag": "validity", + "description": "col2 value must be in (a, b)", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "2", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "avg_col1_threshold", + "column_name": "col1", + "expectation": "avg(col1) > 4", + "action_if_failed": "ignore", + "tag": "accuracy", + "description": "avg of col1 value must be greater than 4", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + True, # write to temp table + spark.createDataFrame( + [ # expected_output + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), # expected result + 3, # input count + 2, # error count + 2, # output count + [ + { + "description": "avg of col1 value must be greater than 4", + "rule": "avg_col1_threshold", + "rule_type": "agg_dq", + "action_if_failed": "ignore", + "tag": "accuracy", + } + ], + # source_agg_dq + [ + { + "description": "avg of col1 value must be greater than 4", + "rule": "avg_col1_threshold", + "rule_type": "agg_dq", + "action_if_failed": "ignore", + "tag": "accuracy", + } + ], + # final_agg_dq + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 3, "num_row_dq_rules": 2}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Passed", + "final_agg_dq_status": "Passed", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, # status + ), + ( + # Test case 15 + # In this test case, dq run set for row_dq, source_agg_dq & final_agg_dq + # with action_if_failed (ignore, drop) for row_dq and (ignore, fail) for agg_dq + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + [ + # average of col1 must be greater than 4(ignore) + # standard deviation of col1 must be greater than 0(fail) + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet row_dq expectation1(drop) and expectation2(ignore) + {"col1": 2, "col2": "b", "col3": 5}, + # row meets all row_dq_expectations + {"col1": 3, "col2": "c", "col3": 6}, + 
# row meets all row_dq_expectations + {"col1": 2, "col2": "d", "col3": 7}, + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_and_col1_threshold_4", + "column_name": "col3, col1", + "expectation": "((col3 * col1) - col3) > 5", + "action_if_failed": "drop", + "tag": "validity", + "description": "col3 and col1 operation value must be greater than 3", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "25", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col2_set", + "column_name": "col2", + "expectation": "col2 in ('b', 'c')", + "action_if_failed": "ignore", + "tag": "validity", + "description": "col2 value must be in (b, c)", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "30", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "avg_col1_threshold", + "column_name": "col1", + "expectation": "avg(col1) > 4", + "action_if_failed": "ignore", + "tag": "validity", + "description": "avg of col1 value must be greater than 4", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "stddev_col3_threshold", + "column_name": "col3", + "expectation": "stddev(col3) > 1", + "action_if_failed": "ignore", + "tag": "validity", + "description": "stddev of col3 value must be greater than one", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + True, # write to temp table + spark.createDataFrame( + [ # expected_output + {"col1": 3, "col2": "c", "col3": 6}, + {"col1": 2, "col2": "d", "col3": 7}, + ] + ), # expected result + 4, # input count + 3, # error count + 2, # output count + [ + { + "description": "avg of col1 value must be greater than 4", + "rule": "avg_col1_threshold", + "rule_type": "agg_dq", + "action_if_failed": "ignore", + "tag": "validity", + } + ], + # source_agg_result + [ + { + "action_if_failed": "ignore", + "description": "avg of col1 value must be greater than 4", + "rule": "avg_col1_threshold", + "rule_type": "agg_dq", + "tag": "validity", + }, + { + "action_if_failed": "ignore", + "description": "stddev of col3 value must be greater than one", + "rule": "stddev_col3_threshold", + "rule_type": "agg_dq", + "tag": "validity", + }, + ], + # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 5, "num_row_dq_rules": 2}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 2, + "num_agg_dq_rules": 3, + "num_final_agg_dq_rules": 2, + }, + }, + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Passed", + "final_agg_dq_status": "Passed", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, # status + ), + ( + # Test 
case 16 + # In this test case, dq run set for query_dq source_query_dq + # with action_if_failed (ignore) for query_dq + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + [ + # sum of col1 must be greater than 10(ignore) + # standard deviation of col3 must be greater than 0(ignore) + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet row_dq expectation1(drop) + {"col1": 2, "col2": "b", "col3": 5}, + # row meets all row_dq_expectations + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "sum_col1_threshold", + "column_name": "col1", + "expectation": "(select sum(col1) from test_table) > 10", + "action_if_failed": "ignore", + "tag": "validity", + "description": "sum of col1 value must be greater than 10", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "stddev_col3_threshold", + "column_name": "col3", + "expectation": "(select stddev(col3) from test_table) > 0", + "action_if_failed": "ignore", + "tag": "validity", + "description": "stddev of col3 value must be greater than 0", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + False, # write to table + False, # write to temp table + None, # expected result + 3, # input count + 0, # error count + 0, # output count + None, # source_agg_result + None, # final_agg_result + # final_agg_result + [ + { + "description": "sum of col1 value must be greater than 10", + "rule": "sum_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "validity", + } + ], + # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 0}, + "query_dq_rules": { + "num_final_query_dq_rules": 2, + "num_source_query_dq_rules": 2, + "num_query_dq_rules": 2, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, + { + "row_dq_status": "Skipped", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Passed", + "final_query_dq_status": "Skipped", + }, # status + ), + ( + # Test case 17 + # In this test case, dq run set for query_dq final_query_dq + # with action_if_failed (ignore) for query_dq + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + [ + # max of col1 must be greater than 10(ignore) + # min of col3 must be greater than 0(ignore) + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet row_dq expectation1(drop) and expectation2(ignore) + {"col1": 2, "col2": "b", "col3": 5}, + # row meets all row_dq_expectations + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_and_col1_threshold_4", + "column_name": "col3, col1", + "expectation": "((col3 * col1) - col3) > 5", + "action_if_failed": "drop", + "tag": "validity", + "description": "col3 and col1 operation value must be greater 
than 3", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "max_col1_threshold", + "column_name": "col1", + "expectation": "(select max(col1) from test_final_table_view) > 10", + "action_if_failed": "ignore", + "tag": "strict", + "description": "max of col1 value must be greater than 10", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "min_col3_threshold", + "column_name": "col3", + "expectation": "(select min(col3) from test_final_table_view) > 0", + "action_if_failed": "ignore", + "tag": "validity", + "description": "min of col3 value must be greater than 0", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + False, # write to temp table + spark.createDataFrame( + [ + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), # expected result + 3, # input count + 2, # error count + 1, # output count + None, # source_agg_result + None, # final_agg_result + # final_agg_result + None, # source_query_dq_res + [ + { + "description": "max of col1 value must be greater than 10", + "rule": "max_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + # final_query_dq_res + { + "rules": {"num_dq_rules": 3, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 2, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 2, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Passed", + }, # status + ), + ( + # Test case 18 + # In this test case, dq run set for query_dq source_query_dq(ignore, fail) + # with action_if_failed (fail) for query_dq + # collect stats in the test_stats_table, error into error_table & raises the error + spark.createDataFrame( + [ + # min of col1 must be greater than 10(fail) + # standard deviation of col3 must be greater than 0(ignore) + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet row_dq expectation1(drop) + {"col1": 2, "col2": "b", "col3": 5}, + # row meets all row_dq_expectations + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "min_col1_threshold", + "column_name": "col1", + "expectation": "(select min(col1) from test_final_table_view) > 10", + "action_if_failed": "fail", + "tag": "validity", + "description": "min of col1 value must be greater than 10", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": 
"query_dq", + "rule": "stddev_col3_threshold", + "column_name": "col3", + "expectation": "(select stddev(col3) from test_final_table_view) > 0", + "action_if_failed": "ignore", + "tag": "validity", + "description": "stddev of col3 value must be greater than 0", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + False, # write to table + False, # write to temp table + SparkExpectationsMiscException, # expected result + 3, # input count + 0, # error count + 0, # output count + None, # source_agg_result + None, # final_agg_result + [ + { + "action_if_failed": "fail", + "description": "min of col1 value must be greater than 10", + "rule": "min_col1_threshold", + "rule_type": "query_dq", + "tag": "validity", + }, + { + "action_if_failed": "ignore", + "description": "stddev of col3 value must be greater than 0", + "rule": "stddev_col3_threshold", + "rule_type": "query_dq", + "tag": "validity", + }, + ], # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 0}, + "query_dq_rules": { + "num_final_query_dq_rules": 2, + "num_source_query_dq_rules": 2, + "num_query_dq_rules": 2, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, + { + "row_dq_status": "Skipped", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Failed", + "final_query_dq_status": "Skipped", + }, # status + ), + ( + # Test case 19 + # In this test case, dq run set for query_dq final_query_dq(ignore, fail) + # with action_if_failed (ignore, fail) for query_dq + # collect stats in the test_stats_table, error into error_table & raises error + spark.createDataFrame( + [ + # max of col1 must be greater than 10(ignore) + # min of col3 must be greater than 0(ignore) + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet row_dq expectation1(drop) and expectation2(ignore) + {"col1": 2, "col2": "b", "col3": 5}, + # row meets all row_dq_expectations + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_and_col1_threshold_4", + "column_name": "col3, col1", + "expectation": "((col3 * col1) - col3) > 5", + "action_if_failed": "drop", + "tag": "validity", + "description": "col3 and col1 operation value must be greater than 3", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "25", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "max_col1_threshold", + "column_name": "col1", + "expectation": "(select max(col1) from test_final_table_view) > 10", + "action_if_failed": "fail", + "tag": "strict", + "description": "max of col1 value must be greater than 10", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "min_col3_threshold", + "column_name": "col3", + "expectation": "(select min(col3) from test_final_table_view) > 0", + 
"action_if_failed": "ignore", + "tag": "validity", + "description": "min of col3 value must be greater than 0", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + False, # write to temp table + SparkExpectationsMiscException, # expected result + 3, # input count + 2, # error count + 1, # output count + None, # source_agg_result + None, # final_agg_result + # final_agg_result + None, # source_query_dq_res + [ + { + "description": "max of col1 value must be greater than 10", + "rule": "max_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "fail", + "tag": "strict", + } + ], + # final_query_dq_res + { + "rules": {"num_dq_rules": 3, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 2, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 2, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Failed", + }, # status + ), + ( + # Test case 20 + # In this test case, dq run set for query_dq source_query_dq & + # final_query_dq(ignore, fail) + # with action_if_failed (ignore, fail) for query_dq + # collect stats in the test_stats_table, error into error_table + spark.createDataFrame( + [ + # min of col1 must be greater than 10(ignore) - source_query_dq + # max of col1 must be greater than 100(ignore) - final_query_dq + # min of col3 must be greater than 0(fail) - final_query_dq + {"col1": 1, "col2": "a", "col3": 4}, + # row meets all row_dq_expectations(drop) + {"col1": 2, "col2": "b", "col3": 5}, + # ow doesn't meet row_dq expectation1(drop) + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_mod_2", + "column_name": "col3", + "expectation": "(col3 % 2) = 0", + "action_if_failed": "drop", + "tag": "validity", + "description": "col3 mod must equals to 0", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "40", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "min_col1_threshold", + "column_name": "col1", + "expectation": "(select min(col1) from test_final_table_view) > 10", + "action_if_failed": "ignore", + "tag": "strict", + "description": "min of col1 value must be greater than 10", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "max_col1_threshold", + "column_name": "col1", + "expectation": "(select max(col1) from test_final_table_view) > 100", + "action_if_failed": "ignore", + "tag": "strict", + "description": "max of col1 value must be greater than 100", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, 
+ { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "min_col3_threshold", + "column_name": "col3", + "expectation": "(select min(col3) from test_final_table_view) > 0", + "action_if_failed": "fail", + "tag": "validity", + "description": "min of col3 value must be greater than 0", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + False, # write to temp table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), # expected result + 3, # input count + 1, # error count + 2, # output count + None, # source_agg_result + None, # final_agg_result + # final_agg_result + [ + { + "description": "min of col1 value must be greater than 10", + "rule": "min_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + # source_query_dq_res + [ + { + "description": "max of col1 value must be greater than 100", + "rule": "max_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + # final_query_dq_res + { + "rules": {"num_dq_rules": 4, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 2, + "num_source_query_dq_rules": 1, + "num_query_dq_rules": 3, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Passed", + "final_query_dq_status": "Passed", + }, # status + ), + ( + # Test case 21 + # In this test case, dq run set for query_dq source_query_dq & + # final_query_dq(ignore, fail) + # with action_if_failed (ignore, fail) for query_dq + # collect stats in the test_stats_table, error into error_table & raise the error + spark.createDataFrame( + [ + # min of col1 must be greater than 10(ignore) - source_query_dq + # max of col1 must be greater than 100(fail) - final_query_dq + # min of col3 must be greater than 0(fail) - final_query_dq + {"col1": 1, "col2": "a", "col3": 4}, + # row meets all row_dq_expectations(drop) + {"col1": 2, "col2": "b", "col3": 5}, + # ow doesn't meet row_dq expectation1(drop) + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "row_dq", + "rule": "col3_mod_2", + "column_name": "col3", + "expectation": "(col3 % 2) = 0", + "action_if_failed": "drop", + "tag": "validity", + "description": "col3 mod must equals to 0", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "10", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "min_col1_threshold", + "column_name": "col1", + "expectation": "(select min(col1) from test_final_table_view) > 10", + "action_if_failed": "ignore", + "tag": "strict", + "description": "min of col1 value must be greater than 10", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + 
"product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "max_col1_threshold", + "column_name": "col1", + "expectation": "(select max(col1) from test_final_table_view) > 100", + "action_if_failed": "fail", + "tag": "strict", + "description": "max of col1 value must be greater than 100", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "min_col3_threshold", + "column_name": "col3", + "expectation": "(select min(col3) from test_final_table_view) > 0", + "action_if_failed": "ignore", + "tag": "validity", + "description": "min of col3 value must be greater than 0", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + False, # write to temp table + SparkExpectationsMiscException, # expected result + 3, # input count + 1, # error count + 2, # output count + None, # source_agg_result + None, # final_agg_result + # final_agg_result + [ + { + "description": "min of col1 value must be greater than 10", + "rule": "min_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + # source_query_dq_res + [ + { + "description": "max of col1 value must be greater than 100", + "rule": "max_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "fail", + "tag": "strict", + } + ], + # final_query_dq_res + { + "rules": {"num_dq_rules": 4, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 2, + "num_source_query_dq_rules": 1, + "num_query_dq_rules": 3, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Passed", + "final_query_dq_status": "Failed", + }, # status + ), + ( + # Test case 22 + # In this test case, dq run set for query_dq source_query_dq & + # final_query_dq(ignore, fail) + # with action_if_failed (ignore, fail) for query_dq + # collect stats in the test_stats_table, error into error_table & raise the error + spark.createDataFrame( + [ + # min of col1 must be greater than 10(ignore) - source_query_dq + # max of col1 must be greater than 100(fail) - final_query_dq + # min of col3 must be greater than 0(fail) - final_query_dq + {"col1": 1, "col2": "a", "col3": 4}, + # row meets all row_dq_expectations(drop) + {"col1": 2, "col2": "b", "col3": 5}, + # ow doesn't meet row_dq expectation1(drop) + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "col3_max_value", + "column_name": "col3", + "expectation": "max(col3) > 1", + "action_if_failed": "fail", + "tag": "validity", + "description": "col3 mod must equals to 0", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": 
"row_dq", + "rule": "col3_mod_2", + "column_name": "col3", + "expectation": "(col3 % 2) = 0", + "action_if_failed": "drop", + "tag": "validity", + "description": "col3 mod must equals to 0", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "100", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "count_col1_threshold", + "column_name": "col1", + "expectation": "(select count(col1) from test_final_table_view) > 3", + "action_if_failed": "ignore", + "tag": "strict", + "description": "count of col1 value must be greater than 3", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "col3_positive_threshold", + "column_name": "col1", + "expectation": "(select count(case when col3>0 then 1 else 0 end) from " + "test_final_table_view) > 10", + "action_if_failed": "ignore", + "tag": "strict", + "description": "count of col3 positive value must be greater than 10", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + True, # write to table + False, # write to temp table + spark.createDataFrame( + [ + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), # expected result + 3, # input count + 1, # error count + 2, # output count + None, # source_agg_result + None, # final_agg_result + # final_agg_result + [ + { + "description": "count of col1 value must be greater than 3", + "rule": "count_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + # source_query_dq_res + [ + { + "description": "count of col3 positive value must be greater than 10", + "rule": "col3_positive_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "strict", + } + ], + # final_query_dq_res + { + "rules": {"num_dq_rules": 4, "num_row_dq_rules": 1}, + "query_dq_rules": { + "num_final_query_dq_rules": 1, + "num_source_query_dq_rules": 1, + "num_query_dq_rules": 2, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, + { + "row_dq_status": "Passed", + "source_agg_dq_status": "Passed", + "final_agg_dq_status": "Passed", + "run_status": "Passed", + "source_query_dq_status": "Passed", + "final_query_dq_status": "Passed", + }, # status + ), + ( + # Test case 23 + # In this test case, dq run set for query_dq source_query_dq and one of the rule is parameterized + # with action_if_failed (ignore) for query_dq + # collect stats in the test_stats_table & error into error_table + spark.createDataFrame( + [ + # sum of col1 must be greater than 10(ignore) + # standard deviation of col3 must be greater than 0(ignore) + {"col1": 1, "col2": "a", "col3": 4}, + # row doesn't meet row_dq expectation1(drop) + {"col1": 2, "col2": "b", "col3": 5}, + # row meets all row_dq_expectations + {"col1": 3, "col2": "c", "col3": 6}, + # row meets all row_dq_expectations + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "sum_col1_threshold", + 
"column_name": "col1", + "expectation": "(select sum(col1) from {table}) > 10", + "action_if_failed": "ignore", + "tag": "validity", + "description": "sum of col1 value must be greater than 10", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "query_dq", + "rule": "stddev_col3_threshold", + "column_name": "col3", + "expectation": "(select stddev(col3) from test_table) > 0", + "action_if_failed": "ignore", + "tag": "validity", + "description": "stddev of col3 value must be greater than 0", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + }, + ], + False, # write to table + False, # write to temp table + None, # expected result + 3, # input count + 0, # error count + 0, # output count + None, # source_agg_result + None, # final_agg_result + # final_agg_result + [ + { + "description": "sum of col1 value must be greater than 10", + "rule": "sum_col1_threshold", + "rule_type": "query_dq", + "action_if_failed": "ignore", + "tag": "validity", + } + ], + # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 2, "num_row_dq_rules": 0}, + "query_dq_rules": { + "num_final_query_dq_rules": 2, + "num_source_query_dq_rules": 2, + "num_query_dq_rules": 2, + }, # dq_rules + "agg_dq_rules": { + "num_source_agg_dq_rules": 0, + "num_agg_dq_rules": 0, + "num_final_agg_dq_rules": 0, + }, + }, + { + "row_dq_status": "Skipped", + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "run_status": "Passed", + "source_query_dq_status": "Passed", + "final_query_dq_status": "Skipped", + }, # status + ), + ( + # Test case 24 + # In this test case, dq run set for source_agg_dq with action_if_failed fail + # with the sql syntax > lower_bound and < upper_bound + # collect stats in the test_stats_table + spark.createDataFrame( + [ + # avg of col3 is greater than 18 and not more than 25 + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "avg_col3_range", + "column_name": "col3", + "expectation": "avg(col3) > 18 and avg(col3) < 25", + "action_if_failed": "fail", + "tag": "strict", + "description": "avg col3 value must be greater than 18 and less than 25", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + } + ], + True, # write to table + True, # write to temp table + SparkExpectationsMiscException, # excepted result + 3, # input count + 0, # error count + 0, # output count + [ + { + "description": "avg col3 value must be greater than 18 and less than 25", # source_agg_result + "rule": "avg_col3_range", + "rule_type": "agg_dq", + "action_if_failed": "fail", + "tag": "strict", + } + ], + None, # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 
1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Skipped", + "source_agg_dq_status": "Failed", # status + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ( + # Test case 25 + # In this test case, dq run set for source_agg_dq with action_if_failed fail + # with the sql syntax between lower_bound and upper_bound + # collect stats in the test_stats_table + spark.createDataFrame( + [ + # avg of col3 is greater than 18 and not more than 25 + {"col1": 1, "col2": "a", "col3": 4}, + {"col1": 2, "col2": "b", "col3": 5}, + {"col1": 3, "col2": "c", "col3": 6}, + ] + ), + [ + { + "product_id": "product1", + "table_name": "dq_spark.test_final_table", + "rule_type": "agg_dq", + "rule": "avg_col3_range", + "column_name": "col3", + "expectation": "avg(col3) between 18 and 25", + "action_if_failed": "fail", + "tag": "strict", + "description": "avg col3 value must be greater than 18 and less than 25", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": False, + "is_active": True, + "enable_error_drop_alert": False, + "error_drop_threshold": "20", + } + ], + True, # write to table + True, # write to temp table + SparkExpectationsMiscException, # excepted result + 3, # input count + 0, # error count + 0, # output count + [ + { + "description": "avg col3 value must be greater than 18 and less than 25", # source_agg_result + "rule": "avg_col3_range", + "rule_type": "agg_dq", + "action_if_failed": "fail", + "tag": "strict", + } + ], + None, # final_agg_result + None, # source_query_dq_res + None, # final_query_dq_res + { + "rules": {"num_dq_rules": 1, "num_row_dq_rules": 0}, + "query_dq_rules": { + "num_final_query_dq_rules": 0, + "num_source_query_dq_rules": 0, + "num_query_dq_rules": 0, + }, + "agg_dq_rules": { + "num_source_agg_dq_rules": 1, + "num_agg_dq_rules": 1, + "num_final_agg_dq_rules": 1, + }, + }, # dq_rules + { + "row_dq_status": "Skipped", + "source_agg_dq_status": "Failed", # status + "final_agg_dq_status": "Skipped", + "run_status": "Failed", + "source_query_dq_status": "Skipped", + "final_query_dq_status": "Skipped", + }, + ), + ], +) +def test_with_expectations( + input_df, + expectations, + write_to_table, + write_to_temp_table, + expected_output, + input_count, + error_count, + output_count, + source_agg_dq_res, + final_agg_dq_res, + source_query_dq_res, + final_query_dq_res, + dq_rules, + status, + _fixture_create_database, + _fixture_local_kafka_topic, +): input_df.createOrReplaceTempView("test_table") spark.conf.set("spark.sql.session.timeZone", "Etc/UTC") - rules_df = spark.createDataFrame(expectations) if len(expectations) > 0 else expectations + rules_df = ( + spark.createDataFrame(expectations) if len(expectations) > 0 else expectations + ) rules_df.show(truncate=False) if len(expectations) > 0 else None writer = WrappedDataFrameWriter().mode("append").format("parquet") - se = SparkExpectations(product_id="product1", - rules_df=rules_df, - stats_table="dq_spark.test_dq_stats_table", - stats_table_writer=writer, - target_and_error_table_writer=writer, - debugger=False, - ) + se = SparkExpectations( + product_id="product1", + rules_df=rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) se._context._run_date = "2022-12-27 10:00:00" se._context._env = "local" se._context.set_input_count(100) @@ -2280,13 +2825,15 @@ def 
test_with_expectations(input_df, se._context.set_error_count(0) se._context._run_id = "product1_run_test" - # Decorate the mock function with required args @se.with_expectations( "dq_spark.test_final_table", - user_conf={user_config.se_notifications_on_fail: False, user_config.se_dq_rules_params: {'table' : 'test_table'} }, + user_conf={ + user_config.se_notifications_on_fail: False, + user_config.se_dq_rules_params: {"table": "test_table"}, + }, write_to_table=write_to_table, - write_to_temp_table=write_to_temp_table + write_to_temp_table=write_to_temp_table, ) def get_dataset() -> DataFrame: return input_df @@ -2294,10 +2841,16 @@ def get_dataset() -> DataFrame: input_df.show(truncate=False) if isinstance(expected_output, type) and issubclass(expected_output, Exception): - with pytest.raises(expected_output, match=r"error occurred while processing spark expectations .*"): + with pytest.raises( + expected_output, + match=r"error occurred while processing spark expectations .*", + ): get_dataset() # decorated_func() - if status.get("final_agg_dq_status") == 'Failed' or status.get("final_query_dq_status") == 'Failed': + if ( + status.get("final_agg_dq_status") == "Failed" + or status.get("final_query_dq_status") == "Failed" + ): try: spark.table("dq_spark.test_final_table") assert False @@ -2308,11 +2861,15 @@ def get_dataset() -> DataFrame: get_dataset() # decorated_func() if write_to_table is True: - expected_output_df = expected_output.withColumn("run_id", lit("product1_run_test")) \ - .withColumn("run_date", to_timestamp(lit("2022-12-27 10:00:00"))) + expected_output_df = expected_output.withColumn( + "run_id", lit("product1_run_test") + ).withColumn("run_date", to_timestamp(lit("2022-12-27 10:00:00"))) result_df = spark.table("dq_spark.test_final_table") - assert result_df.orderBy("col2").collect() == expected_output_df.orderBy("col2").collect() + assert ( + result_df.orderBy("col2").collect() + == expected_output_df.orderBy("col2").collect() + ) if spark.catalog.tableExists("dq_spark.test_final_table_error"): error_table = spark.table("dq_spark.test_final_table_error") @@ -2341,14 +2898,19 @@ def get_dataset() -> DataFrame: assert row.meta_dq_run_datetime == datetime.datetime(2022, 12, 27, 10, 00, 00) assert len(stats_table.columns) == 20 - assert spark.read.format("kafka").option( - "kafka.bootstrap.servers", "localhost:9092" - ).option("subscribe", "dq-sparkexpectations-stats").option( - "startingOffsets", "earliest" - ).option( - "endingOffsets", "latest" - ).load().orderBy(col('timestamp').desc()).limit(1).selectExpr( - "cast(value as string) as value").collect() == stats_table.selectExpr("to_json(struct(*)) AS value").collect() + assert ( + spark.read.format("kafka") + .option("kafka.bootstrap.servers", "localhost:9092") + .option("subscribe", "dq-sparkexpectations-stats") + .option("startingOffsets", "earliest") + .option("endingOffsets", "latest") + .load() + .orderBy(col("timestamp").desc()) + .limit(1) + .selectExpr("cast(value as string) as value") + .collect() + == stats_table.selectExpr("to_json(struct(*)) AS value").collect() + ) # spark.sql("select * from dq_spark.test_final_table").show(truncate=False) # spark.sql("select * from dq_spark.test_final_table_error").show(truncate=False) @@ -2363,14 +2925,20 @@ def get_dataset() -> DataFrame: @patch("spark_expectations.core.expectations.SparkExpectationsWriter.write_error_stats") -def test_with_expectations_patch(_write_error_stats, - _fixture_create_database, - _fixture_spark_expectations, - _fixture_df, - 
_fixture_rules_df): +def test_with_expectations_patch( + _write_error_stats, + _fixture_create_database, + _fixture_spark_expectations, + _fixture_df, + _fixture_rules_df, +): decorated_func = _fixture_spark_expectations.with_expectations( "dq_spark.test_final_table", - user_conf={user_config.se_notifications_on_fail: False,user_config.se_enable_query_dq_detailed_result: True, user_config.se_enable_agg_dq_detailed_result: True}, + user_conf={ + user_config.se_notifications_on_fail: False, + user_config.se_enable_query_dq_detailed_result: True, + user_config.se_enable_agg_dq_detailed_result: True, + }, )(Mock(return_value=_fixture_df)) decorated_func() @@ -2379,34 +2947,42 @@ def test_with_expectations_patch(_write_error_stats, def test_with_expectations_overwrite_writers( - _fixture_create_database, - _fixture_spark_expectations, - _fixture_df, - _fixture_rules_df): + _fixture_create_database, + _fixture_spark_expectations, + _fixture_df, + _fixture_rules_df, +): modified_writer = WrappedDataFrameWriter().mode("overwrite").format("iceberg") _fixture_spark_expectations.with_expectations( "dq_spark.test_final_table", user_conf={user_config.se_notifications_on_fail: False}, - target_and_error_table_writer=modified_writer + target_and_error_table_writer=modified_writer, )(Mock(return_value=_fixture_df)) - assert _fixture_spark_expectations._context.get_target_and_error_table_writer_config == modified_writer.build() + assert ( + _fixture_spark_expectations._context.get_target_and_error_table_writer_config + == modified_writer.build() + ) -def test_with_expectations_dataframe_not_returned_exception(_fixture_create_database, - _fixture_spark_expectations, - _fixture_df, - _fixture_rules_df, - _fixture_local_kafka_topic): +def test_with_expectations_dataframe_not_returned_exception( + _fixture_create_database, + _fixture_spark_expectations, + _fixture_df, + _fixture_rules_df, + _fixture_local_kafka_topic, +): partial_func = _fixture_spark_expectations.with_expectations( "dq_spark.test_final_table", user_conf={user_config.se_notifications_on_fail: False}, ) - with pytest.raises(SparkExpectationsMiscException, - match=r"error occurred while processing spark expectations error occurred while" - r" processing spark " - r"expectations due to given dataframe is not type of dataframe"): + with pytest.raises( + SparkExpectationsMiscException, + match=r"error occurred while processing spark expectations error occurred while" + r" processing spark " + r"expectations due to given dataframe is not type of dataframe", + ): # Create a mock object with a rdd return value mock_func = Mock(return_value=_fixture_df.rdd) @@ -2419,43 +2995,47 @@ def test_with_expectations_dataframe_not_returned_exception(_fixture_create_data spark.sql("CLEAR CACHE") -def test_with_expectations_exception(_fixture_create_database, - _fixture_spark_expectations, - _fixture_local_kafka_topic): - rules_dict = [{ - "product_id": "product1", - "table_name": "dq_spark.test_table", - "rule_type": "row_dq", - "rule": "col1_threshold", - "column_name": "col1", - "expectation": "col1 > 1", - "action_if_failed": "ignore", - "tag": "validity", - "description": "col1 value must be greater than 1", - "enable_for_source_dq_validation": True, - "enable_for_target_dq_validation": True, - "is_active": True, - "enable_error_drop_alert": True, - "error_drop_threshold": "10" - } - ] +def test_with_expectations_exception( + _fixture_create_database, _fixture_spark_expectations, _fixture_local_kafka_topic +): + rules_dict = [ + { + "product_id": "product1", 
+ "table_name": "dq_spark.test_table", + "rule_type": "row_dq", + "rule": "col1_threshold", + "column_name": "col1", + "expectation": "col1 > 1", + "action_if_failed": "ignore", + "tag": "validity", + "description": "col1 value must be greater than 1", + "enable_for_source_dq_validation": True, + "enable_for_target_dq_validation": True, + "is_active": True, + "enable_error_drop_alert": True, + "error_drop_threshold": "10", + } + ] rules_df = spark.createDataFrame(rules_dict) writer = WrappedDataFrameWriter().mode("append").format("delta") - se = SparkExpectations(product_id="product1", - rules_df=rules_df, - stats_table="dq_spark.test_dq_stats_table", - stats_table_writer=writer, - target_and_error_table_writer=writer, - debugger=False, - ) + se = SparkExpectations( + product_id="product1", + rules_df=rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) partial_func = se.with_expectations( "dq_spark.test_final_table", - user_conf={user_config.se_notifications_on_fail: False} + user_conf={user_config.se_notifications_on_fail: False}, ) - with pytest.raises(SparkExpectationsMiscException, - match=r"error occurred while processing spark expectations .*"): + with pytest.raises( + SparkExpectationsMiscException, + match=r"error occurred while processing spark expectations .*", + ): # Create a mock object with a list return value mock_func = Mock(return_value=["apple", "banana", "pineapple", "orange"]) @@ -2468,9 +3048,10 @@ def test_with_expectations_exception(_fixture_create_database, spark.sql(f"DROP DATABASE {db.name} CASCADE") spark.sql("CLEAR CACHE") -def test_with_expectations_negative_parameter(_fixture_create_database, - _fixture_spark_expectations, - _fixture_local_kafka_topic): + +def test_with_expectations_negative_parameter( + _fixture_create_database, _fixture_spark_expectations, _fixture_local_kafka_topic +): rules_dict = [ { "product_id": "product1", @@ -2488,24 +3069,27 @@ def test_with_expectations_negative_parameter(_fixture_create_database, "enable_error_drop_alert": False, "error_drop_threshold": "20", } - ] + ] rules_df = spark.createDataFrame(rules_dict) writer = WrappedDataFrameWriter().mode("append").format("delta") - se = SparkExpectations(product_id="product1", - rules_df=rules_df, - stats_table="dq_spark.test_dq_stats_table", - stats_table_writer=writer, - target_and_error_table_writer=writer, - debugger=False, - ) + se = SparkExpectations( + product_id="product1", + rules_df=rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) partial_func = se.with_expectations( "dq_spark.test_final_table", - user_conf={user_config.se_notifications_on_fail: False} + user_conf={user_config.se_notifications_on_fail: False}, ) - with pytest.raises(SparkExpectationsMiscException, - match=r"error occurred while retrieving rules list from the table 'table2'"): + with pytest.raises( + SparkExpectationsMiscException, + match=r"error occurred while retrieving rules list from the table 'table2'", + ): # Create a mock object with a list return value mock_func = Mock(return_value=["apple", "banana", "pineapple", "orange"]) @@ -2517,20 +3101,22 @@ def test_with_expectations_negative_parameter(_fixture_create_database, if db.name != "default": spark.sql(f"DROP DATABASE {db.name} CASCADE") spark.sql("CLEAR CACHE") + + # @patch('spark_expectations.core.expectations.SparkExpectationsNotify', autospec=True, # 
spec_set=True) # @patch('spark_expectations.notifications.push.spark_expectations_notify._notification_hook', autospec=True, # spec_set=True) def test_error_threshold_breach( - # _mock_notification_hook, _mock_spark_expectations_notify, - _fixture_create_database, - _fixture_local_kafka_topic + # _mock_notification_hook, _mock_spark_expectations_notify, + _fixture_create_database, + _fixture_local_kafka_topic, ): input_df = spark.createDataFrame( [ {"col1": 1, "col2": "a", "col3": 4}, {"col1": 2, "col2": "b", "col3": 5}, - {"col1": 3, "col2": "c", 'col3': 6}, + {"col1": 3, "col2": "c", "col3": 6}, ] ) @@ -2549,7 +3135,7 @@ def test_error_threshold_breach( "enable_for_target_dq_validation": True, "is_active": True, "enable_error_drop_alert": True, - "error_drop_threshold": "25" + "error_drop_threshold": "25", }, { "product_id": "product1", @@ -2565,8 +3151,9 @@ def test_error_threshold_breach( "enable_for_target_dq_validation": True, "is_active": True, "enable_error_drop_alert": True, - "error_drop_threshold": "10" - }] + "error_drop_threshold": "10", + }, + ] # create a PySpark DataFrame from the list of dictionaries rules_df = spark.createDataFrame(rules) @@ -2574,18 +3161,22 @@ def test_error_threshold_breach( writer = WrappedDataFrameWriter().mode("append").format("delta") with patch( - 'spark_expectations.notifications.push.spark_expectations_notify.SparkExpectationsNotify' - '.notify_on_exceeds_of_error_threshold', - autospec=True, spec_set=True) as _mock_notification_hook: - se = SparkExpectations(product_id="product1", - rules_df=rules_df, - stats_table="dq_spark.test_dq_stats_table", - stats_table_writer=writer, - target_and_error_table_writer=writer, - debugger=False, - ) + "spark_expectations.notifications.push.spark_expectations_notify.SparkExpectationsNotify" + ".notify_on_exceeds_of_error_threshold", + autospec=True, + spec_set=True, + ) as _mock_notification_hook: + se = SparkExpectations( + product_id="product1", + rules_df=rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) from spark_expectations.config.user_config import Constants as user_config + conf = { user_config.se_notifications_on_fail: True, user_config.se_notifications_on_error_drop_exceeds_threshold_breach: True, @@ -2595,7 +3186,7 @@ def test_error_threshold_breach( @se.with_expectations( target_table="dq_spark.test_final_table", write_to_table=True, - user_conf=conf + user_conf=conf, ) def get_dataset() -> DataFrame: return input_df @@ -2609,8 +3200,9 @@ def get_dataset() -> DataFrame: spark.sql("CLEAR CACHE") -def test_target_table_view_exception(_fixture_create_database, - _fixture_local_kafka_topic): +def test_target_table_view_exception( + _fixture_create_database, _fixture_local_kafka_topic +): rules = [ { "product_id": "product1", @@ -2626,7 +3218,7 @@ def test_target_table_view_exception(_fixture_create_database, "enable_for_target_dq_validation": True, "is_active": True, "enable_error_drop_alert": True, - "error_drop_threshold": "10" + "error_drop_threshold": "10", }, { "product_id": "product1", @@ -2642,8 +3234,9 @@ def test_target_table_view_exception(_fixture_create_database, "enable_for_target_dq_validation": True, "is_active": True, "enable_error_drop_alert": True, - "error_drop_threshold": "10" - }] + "error_drop_threshold": "10", + }, + ] input_df = spark.createDataFrame( [ @@ -2664,24 +3257,27 @@ def test_target_table_view_exception(_fixture_create_database, 
rules_df.createOrReplaceTempView("test_table") writer = WrappedDataFrameWriter().mode("append").format("delta") - se = SparkExpectations(product_id="product1", - rules_df=rules_df, - stats_table="dq_spark.test_dq_stats_table", - stats_table_writer=writer, - target_and_error_table_writer=writer, - debugger=False, - ) + se = SparkExpectations( + product_id="product1", + rules_df=rules_df, + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) @se.with_expectations( target_table="dq_spark.test_final_table", write_to_table=True, target_table_view="test_table", - ) def get_dataset() -> DataFrame: return input_df - with pytest.raises(SparkExpectationsMiscException, match=r"error occurred while processing spark expectations .*"): + with pytest.raises( + SparkExpectationsMiscException, + match=r"error occurred while processing spark expectations .*", + ): get_dataset() # decorated_func() for db in spark.catalog.listDatabases(): @@ -2692,18 +3288,22 @@ def get_dataset() -> DataFrame: def test_spark_expectations_exception(): writer = WrappedDataFrameWriter().mode("append").format("parquet") - with pytest.raises(SparkExpectationsMiscException, match=r"Input rules_df is not of dataframe type"): - SparkExpectations(product_id="product1", - rules_df=[], - stats_table="dq_spark.test_dq_stats_table", - stats_table_writer=writer, - target_and_error_table_writer=writer, - debugger=False, - ) + with pytest.raises( + SparkExpectationsMiscException, match=r"Input rules_df is not of dataframe type" + ): + SparkExpectations( + product_id="product1", + rules_df=[], + stats_table="dq_spark.test_dq_stats_table", + stats_table_writer=writer, + target_and_error_table_writer=writer, + debugger=False, + ) # [UnitTests for WrappedDataFrameWriter class] + def reset_wrapped_dataframe_writer(): writer = WrappedDataFrameWriter() writer._mode = None @@ -2723,36 +3323,51 @@ def test_format(): def test_partitionBy(): - assert WrappedDataFrameWriter().partitionBy("date", "region")._partition_by == ["date", "region"] + assert WrappedDataFrameWriter().partitionBy("date", "region")._partition_by == [ + "date", + "region", + ] def test_option(): - assert WrappedDataFrameWriter().option("compression", "gzip")._options == {"compression": "gzip"} + assert WrappedDataFrameWriter().option("compression", "gzip")._options == { + "compression": "gzip" + } def test_options(): - assert WrappedDataFrameWriter().options(path="/path/to/output", inferSchema="true")._options == { - "path": "/path/to/output", "inferSchema": "true"} + assert WrappedDataFrameWriter().options( + path="/path/to/output", inferSchema="true" + )._options == {"path": "/path/to/output", "inferSchema": "true"} def test_bucketBy(): - assert WrappedDataFrameWriter().bucketBy(4, "country", "city")._bucket_by == {"num_buckets": 4, - "columns": ("country", "city")} + assert WrappedDataFrameWriter().bucketBy(4, "country", "city")._bucket_by == { + "num_buckets": 4, + "columns": ("country", "city"), + } def test_build(): - writer = WrappedDataFrameWriter().mode("overwrite") \ - .format("parquet") \ - .partitionBy("date", "region") \ - .option("compression", "gzip") \ - .options(path="/path/to/output", inferSchema="true") \ - .bucketBy(4, "country", "city") \ + writer = ( + WrappedDataFrameWriter() + .mode("overwrite") + .format("parquet") + .partitionBy("date", "region") + .option("compression", "gzip") + .options(path="/path/to/output", inferSchema="true") + .bucketBy(4, "country", "city") 
.sortBy("col1", "col2") + ) expected_config = { "mode": "overwrite", "format": "parquet", "partitionBy": ["date", "region"], - "options": {"compression": "gzip", "path": "/path/to/output", "inferSchema": "true"}, + "options": { + "compression": "gzip", + "path": "/path/to/output", + "inferSchema": "true", + }, "bucketBy": {"num_buckets": 4, "columns": ("country", "city")}, "sortBy": ["col1", "col2"], } @@ -2768,12 +3383,17 @@ def test_build_some_values(): "partitionBy": [], "options": {}, "bucketBy": {}, - "sortBy": [] + "sortBy": [], } assert writer.build() == expected_config def test_delta_bucketby_exception(): - writer = WrappedDataFrameWriter().mode("append").format("delta").bucketBy(10, "a", "b") - with pytest.raises(SparkExpectationsMiscException, match=r"Bucketing is not supported for delta tables yet"): + writer = ( + WrappedDataFrameWriter().mode("append").format("delta").bucketBy(10, "a", "b") + ) + with pytest.raises( + SparkExpectationsMiscException, + match=r"Bucketing is not supported for delta tables yet", + ): writer.build() diff --git a/tests/sinks/utils/test_writer.py b/tests/sinks/utils/test_writer.py index 0cc9d300..775d80a3 100644 --- a/tests/sinks/utils/test_writer.py +++ b/tests/sinks/utils/test_writer.py @@ -1,5 +1,6 @@ import os import unittest.mock +from datetime import datetime from unittest.mock import MagicMock, patch, Mock import pytest @@ -21,17 +22,18 @@ def fixture_mock_context(): # fixture for mock context mock_object = Mock(spec=SparkExpectationsContext) - + mock_object.get_dq_expectations = { - 'rule': 'table_row_count_gt_1', - 'description': 'table count should be greater than 1', - 'rule_type': 'query_dq', - 'tag': 'validity', - 'action_if_failed': 'ignore', + "rule": "table_row_count_gt_1", + "description": "table count should be greater than 1", + "rule_type": "query_dq", + "tag": "validity", + "action_if_failed": "ignore", } - + return mock_object + @pytest.fixture(name="_fixture_local_kafka_topic") def fixture_setup_local_kafka_topic(): current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -358,32 +360,44 @@ def test_write_df_to_table( _fixture_writer, _fixture_employee, table_name, options ) -@pytest.mark.parametrize("input_record", [ - ({ - 'row_dq_rules': {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', - 'rule_type': 'row_dq', 'rule': 'sales_greater_than_zero', - 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', - 'enable_for_source_dq_validation': False, - 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', - 'enable_error_drop_alert': False, 'error_drop_threshold': 0 - }, - }) -]) -def test_get_row_dq_detailed_stats_exception( - input_record,_fixture_writer): - + +@pytest.mark.parametrize( + "input_record", + [ + ( + { + "row_dq_rules": { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "sales_greater_than_zero", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + } + ) + ], +) +def test_get_row_dq_detailed_stats_exception(input_record, _fixture_writer): _mock_context = Mock(spec=SparkExpectationsContext) setattr(_mock_context, "get_dq_expectations", 
input_record.get("row_dq_rules")) _mock_context.spark = spark _fixture_writer = SparkExpectationsWriter(_mock_context) # faulty user input is given to test the exception functionality of the agg_dq_result - with pytest.raises(SparkExpectationsMiscException, - match=r"error occurred while fetching the stats from get_row_dq_detailed_stats .*"): + with pytest.raises( + SparkExpectationsMiscException, + match=r"error occurred while fetching the stats from get_row_dq_detailed_stats .*", + ): _fixture_writer.get_row_dq_detailed_stats() - - @pytest.mark.parametrize( "input_record, expected_result, writer_config", [ @@ -392,95 +406,228 @@ def test_get_row_dq_detailed_stats_exception( "input_count": 100, "error_count": 10, "output_count": 90, - - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': True, 'target_agg_dq': True, 'target_query_dq': True}, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": True, + "target_agg_dq": True, + "target_query_dq": True, + }, "agg_dq_detailed_stats_status": True, "query_dq_detailed_stats_status": True, - "source_agg_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1988, '>10000', 5,0,5), - ], - - "target_agg_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1030, '>10000', 4,0,4), - ], - - "source_query_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - ], - - "target_query_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - - ], - - "source_query_dq_output": [('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_source_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_source_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ], - "target_query_dq_output": [('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', 
'_target_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_target_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39') - ], - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", + "source_agg_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1988, + ">10000", + 5, + 0, + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "target_agg_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1030, + ">10000", + 4, + 0, + 4, + "2024-03-14 01:00:00", + "2024-03-14 01:10:00", + ), + ], + "source_query_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 02:00:00", + "2024-03-14 02:10:00", + ) + ], + "target_query_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 03:00:00", + "2024-03-14 03:10:00", + ) + ], + "source_query_dq_output": [ + ( + "product1_run_test", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_source_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "product1_run_test", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_source_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + 
'{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "target_query_dq_output": [ + ( + "product1_run_test", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_target_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "product1_run_test", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_target_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', - 'rule': 'rule1', 'column_name': 'sales', - 'expectation': 'sales > 2', 'action_if_failed': 'drop', - 'enable_for_source_dq_validation': False, - 'enable_for_target_dq_validation': True, - 'tag': 'accuracy', 'description': 'sales value should be greater than zero', - 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', - 'rule': 'rule2', 'column_name': 'sales', - 'expectation': 'sales > 2', 'action_if_failed': 'drop', - 'enable_for_source_dq_validation': False, - 'enable_for_target_dq_validation': True, - 'tag': 'accuracy', 'description': 'sales value should be greater than zero', - 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', - 'rule': 'rule3', 'column_name': 'sales', - 'expectation': 'sales > 2', 'action_if_failed': 'drop', - 'enable_for_source_dq_validation': False, - 'enable_for_target_dq_validation': True, - 'tag': 'accuracy', 'description': 'sales value should be greater than zero', - 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule1", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, }, - - - - - - - + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule2", + "column_name": 
"sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule3", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + ], + }, "source_agg_results": [ { "rule_name": "rule1", @@ -520,7 +667,6 @@ def test_get_row_dq_detailed_stats_exception( "rule_type": "row_dq", "failed_count": 10, "failed_row_count": 10, - }, { "rule_name": "rule2", @@ -594,6 +740,7 @@ def test_get_row_dq_detailed_stats_exception( "source_query_dq": "Passed", "final_query_dq": "Passed", }, + "job_metadata": {"dag": "dag1", "task": "task1", "team": "my_squad"}, }, { "output_percentage": 90.0, @@ -608,46 +755,99 @@ def test_get_row_dq_detailed_stats_exception( "error_count": 10, "output_count": 95, "source_agg_results": None, - - - - "rules_execution_settings_config": - - {'row_dq': True, 'source_agg_dq': False, 'source_query_dq': False, 'target_agg_dq': True, 'target_query_dq': False}, - + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": False, + "source_query_dq": False, + "target_agg_dq": True, + "target_query_dq": False, + }, "agg_dq_detailed_stats_status": True, "query_dq_detailed_stats_status": False, "source_agg_dq_detailed_stats": [], - - "target_agg_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1030, '>10000', 4,0,4), - ], - - "source_query_dq_detailed_stats": [ - ], - - "target_query_dq_detailed_stats": [ - - ], - - - "source_query_dq_output": [ ], + "target_agg_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1030, + ">10000", + 4, + 0, + 4, + "2024-03-14 01:00:00", + "2024-03-14 01:10:00", + ), + ], + "source_query_dq_detailed_stats": [], + "target_query_dq_detailed_stats": [], + "source_query_dq_output": [], "target_query_dq_output": [], - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule1', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 
'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule2', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule3', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule1", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, }, - - - - - - + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule2", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule3", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + ], + }, "final_agg_results": [ { "rule_name": "rule2", @@ -677,7 +877,7 @@ def test_get_row_dq_detailed_stats_exception( }, { "rule_name": "rule3", - "rule": "rule3", + "rule": "rule3", "action_if_failed": "ignore", "rule_type": "row_dq", "failed_count": 8, @@ -747,67 +947,159 @@ def test_get_row_dq_detailed_stats_exception( }, None, ), - ( { "input_count": 100, "error_count": 100, "output_count": 100, - - - - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': False, 'target_agg_dq': False, 'target_query_dq': True}, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": False, + "target_agg_dq": False, + "target_query_dq": True, + }, "agg_dq_detailed_stats_status": True, "query_dq_detailed_stats_status": True, - "source_agg_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1988, '>10000', 5,0,5), - ], - + "source_agg_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + 
"agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1988, + ">10000", + 5, + 0, + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], "target_agg_dq_detailed_stats": [], - - "source_query_dq_detailed_stats": [ - ], - - "target_query_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - - ], - - + "source_query_dq_detailed_stats": [], + "target_query_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 03:00:00", + "2024-03-14 03:10:00", + ) + ], "source_query_dq_output": [], - "target_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_target_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_target_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39') - ], - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", + "target_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_target_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_target_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + 
'{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule1', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule2', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule3', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule1", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, }, - - - - - - + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule2", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule3", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + ], + }, "source_agg_results": [ { "rule_name": "rule2", @@ -845,11 +1137,11 @@ def test_get_row_dq_detailed_stats_exception( }, { "rule_name": "rule3", - "rule": "rule3", + "rule": "rule3", "action_if_failed": "ignore", "rule_type": "row_dq", "failed_count": 8, - "failed_row_count": 8, + "failed_row_count": 8, }, ], "row_dq_error_threshold": [ @@ -920,40 +1212,99 @@ def 
test_get_row_dq_detailed_stats_exception( "input_count": 100, "error_count": 100, "output_count": 0, - - - - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': False, 'target_agg_dq': False, 'target_query_dq': False}, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": False, + "target_agg_dq": False, + "target_query_dq": False, + }, "agg_dq_detailed_stats_status": True, "query_dq_detailed_stats_status": False, - "source_agg_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1988, '>10000', 5,0,5), - ], - - "target_agg_dq_detailed_stats": [ ], - + "source_agg_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1988, + ">10000", + 5, + 0, + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "target_agg_dq_detailed_stats": [], "source_query_dq_detailed_stats": [], - "target_query_dq_detailed_stats": [], - - "source_query_dq_output": [], "target_query_dq_output": [], - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule1', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule2', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule3', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule1", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + 
"description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, }, - - - - - + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule2", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule3", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + ], + }, "source_agg_results": [ { "rule_name": "rule2", @@ -988,7 +1339,7 @@ def test_get_row_dq_detailed_stats_exception( "action_if_failed": "ignore", "rule_type": "row_dq", "failed_count": 8, - "failed_row_count": 8, + "failed_row_count": 8, }, ], "row_dq_error_threshold": [ @@ -1059,76 +1410,209 @@ def test_get_row_dq_detailed_stats_exception( "input_count": 100, "error_count": 100, "output_count": 0, - - - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': True, 'target_agg_dq': False, 'target_query_dq': True}, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": True, + "target_agg_dq": False, + "target_query_dq": True, + }, "agg_dq_detailed_stats_status": True, "query_dq_detailed_stats_status": True, - "source_agg_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1988, '>10000', 5,0,5), - ], - + "source_agg_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1988, + ">10000", + 5, + 0, + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], "target_agg_dq_detailed_stats": [], - - "source_query_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - ], - - "target_query_dq_detailed_stats": [('product1_run_test', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - - ], - - - "source_query_dq_output": [('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_source_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - 
'{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_source_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ], - "target_query_dq_output": [('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_target_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('product1_run_test', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_target_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39') - ], - - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", + "source_query_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 02:00:00", + "2024-03-14 02:10:00", + ) + ], + "target_query_dq_detailed_stats": [ + ( + "product1_run_test", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 03:00:00", + "2024-03-14 03:10:00", + ) + ], + "source_query_dq_output": [ + ( + "product1_run_test", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_source_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "product1_run_test", + "your_product", + 
"dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_source_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "target_query_dq_output": [ + ( + "product1_run_test", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_target_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "product1_run_test", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_target_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule1', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule2', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule3', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], - }, - - - - + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule1", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", 
+ "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule2", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule3", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + ], + }, "source_agg_results": [ { "rule_name": "rule2", @@ -1248,39 +1732,95 @@ def test_get_row_dq_detailed_stats_exception( "input_count": 100, "error_count": 100, "output_count": 0, - - - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': False, 'source_query_dq': False, 'target_agg_dq': False, 'target_query_dq': False}, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": False, + "source_query_dq": False, + "target_agg_dq": False, + "target_query_dq": False, + }, "agg_dq_detailed_stats_status": False, "query_dq_detailed_stats_status": False, "source_agg_dq_detailed_stats": [], - "target_agg_dq_detailed_stats": [], - "source_query_dq_detailed_stats": [], - "target_query_dq_detailed_stats": [], - - "source_query_dq_output": [], "target_query_dq_output": [], - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule1', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule2', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule3', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 
'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}, - {'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'rule4', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule1", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule2", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, }, - - - - - + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule3", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "rule4", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + }, + ], + }, "source_agg_results": None, "final_agg_results": None, "source_query_dq_results": None, @@ -1304,7 +1844,7 @@ def test_get_row_dq_detailed_stats_exception( }, { "rule_name": "rule3", - "rule": "rule3", + "rule": "rule3", "action_if_failed": "ignore", "rule_type": "row_dq", "failed_count": 88, @@ -1316,7 +1856,7 @@ def test_get_row_dq_detailed_stats_exception( "action_if_failed": "fail", "rule_type": "row_dq", "failed_count": 60, - "failed_row_count": 60, + "failed_row_count": 60, }, ], "row_dq_error_threshold": [ @@ -1397,7 +1937,6 @@ def test_get_row_dq_detailed_stats_exception( "options": {"mergeSchema": "true"}, }, ), - ], ) def test_write_error_stats( @@ -1416,7 +1955,6 @@ def test_write_error_stats( setattr(_mock_context, "get_run_id_name", "meta_dq_run_id") setattr(_mock_context, "get_run_id", "product1_run_test") - setattr( _mock_context, "get_dq_run_status", input_record.get("status").get("run_status") ) @@ -1560,22 +2098,84 @@ def test_write_error_stats( ) setattr(_mock_context, "get_dq_stats_table_name", "test_dq_stats_table") - setattr(_mock_context, "get_rules_execution_settings_config", 
input_record.get("rules_execution_settings_config")) - setattr(_mock_context, "get_agg_dq_detailed_stats_status", input_record.get("agg_dq_detailed_stats_status")) - setattr(_mock_context, "get_query_dq_detailed_stats_status", input_record.get("query_dq_detailed_stats_status")) - setattr(_mock_context, "get_source_agg_dq_detailed_stats", input_record.get("source_agg_dq_detailed_stats")) - setattr(_mock_context, "get_target_agg_dq_detailed_stats", input_record.get("target_agg_dq_detailed_stats")) - setattr(_mock_context, "get_target_query_dq_detailed_stats", input_record.get("target_query_dq_detailed_stats")) - setattr(_mock_context, "get_source_query_dq_detailed_stats", input_record.get("source_query_dq_detailed_stats")) - setattr(_mock_context, "get_detailed_stats_table_writer_config", input_record.get("detailed_stats_table_writer_config")) - setattr(_mock_context, "get_dq_detailed_stats_table_name", input_record.get("test_dq_detailed_stats_table")) - setattr(_mock_context, "get_query_dq_output_custom_table_name", input_record.get("test_querydq_output_custom_table_name")) - setattr(_mock_context, "get_source_query_dq_output", input_record.get("source_query_dq_output")) - setattr(_mock_context, "get_target_query_dq_output", input_record.get("target_query_dq_output")) + setattr( + _mock_context, + "get_rules_execution_settings_config", + input_record.get("rules_execution_settings_config"), + ) + setattr( + _mock_context, + "get_agg_dq_detailed_stats_status", + input_record.get("agg_dq_detailed_stats_status"), + ) + setattr( + _mock_context, + "get_query_dq_detailed_stats_status", + input_record.get("query_dq_detailed_stats_status"), + ) + setattr( + _mock_context, + "get_source_agg_dq_detailed_stats", + input_record.get("source_agg_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_target_agg_dq_detailed_stats", + input_record.get("target_agg_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_target_query_dq_detailed_stats", + input_record.get("target_query_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_source_query_dq_detailed_stats", + input_record.get("source_query_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_detailed_stats_table_writer_config", + input_record.get("detailed_stats_table_writer_config"), + ) + setattr( + _mock_context, + "get_dq_detailed_stats_table_name", + input_record.get("test_dq_detailed_stats_table"), + ) + setattr( + _mock_context, + "get_query_dq_output_custom_table_name", + input_record.get("test_querydq_output_custom_table_name"), + ) + setattr( + _mock_context, + "get_source_query_dq_output", + input_record.get("source_query_dq_output"), + ) + setattr( + _mock_context, + "get_target_query_dq_output", + input_record.get("target_query_dq_output"), + ) setattr(_mock_context, "product_id", "product_1") setattr(_mock_context, "get_dq_expectations", input_record.get("dq_expectations")) - - + setattr( + _mock_context, + "get_row_dq_start_time", + datetime.strptime("2024-03-14 00:00:00", "%Y-%m-%d %H:%M:%S"), + ) + setattr( + _mock_context, + "get_row_dq_end_time", + datetime.strptime("2024-03-14 00:10:00", "%Y-%m-%d %H:%M:%S"), + ) + setattr( + _mock_context, + "get_job_metadata", + '{"dag": "dag1", "task": "task1", "team": "my_squad"}', + ) + if writer_config is None: setattr( _mock_context, @@ -1634,404 +2234,908 @@ def test_write_error_stats( assert row.dq_status == input_record.get("status") assert row.meta_dq_run_id == "product1_run_test" + assert ( + spark.read.format("kafka") + .option("kafka.bootstrap.servers", 
"localhost:9092") + .option("subscribe", "dq-sparkexpectations-stats") + .option("startingOffsets", "earliest") + .option("endingOffsets", "latest") + .load() + .orderBy(col("timestamp").desc()) + .limit(1) + .selectExpr("cast(value as string) as value") + .collect() + == stats_table.selectExpr("to_json(struct(*)) AS value").collect() + ) - assert spark.read.format("kafka").option( - "kafka.bootstrap.servers", "localhost:9092" - ).option("subscribe", "dq-sparkexpectations-stats").option( - "startingOffsets", "earliest" - ).option( - "endingOffsets", "latest" - ).load().orderBy(col('timestamp').desc()).limit(1).selectExpr( - "cast(value as string) as value").collect() == stats_table.selectExpr( - "to_json(struct(*)) AS value").collect() - - - -@pytest.mark.parametrize("input_record, expected_result,dq_check, writer_config", [ - - ({ - "input_count": 100, - "error_count": 10, - "output_count": 90, - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': True, 'target_agg_dq': True, 'target_query_dq': True}, - "agg_dq_detailed_stats_status": True, - "source_agg_dq_status": "Passed", - "final_agg_dq_status": "Passed", - "query_dq_detailed_stats_status": False, - "source_query_dq_status": "Passed", - "final_query_dq_status": "Passed", - "row_dq_status": "Passed", - "summarised_row_dq_res" : [{'rule_type':"row_dq", "rule" : "sales_greater_than_zero", "description" : "sales value should be greater than zero", "failed_row_count": 1, "tag" :"validity", "action_if_failed" : "drop"}], - "run_id" : "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", - "input_count" : 5, - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'sales_greater_than_zero', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], - }, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", - - "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - - "rowdq_detailed_stats" : [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1','dq_spark_local.customer_order', 'row_dq', 'sales_greater_than_zero', 'sales > 2', 'accuracy', 'sales value should be greater than zero', 'fail', None, None, None, 4,0,4), - ], - - "source_agg_dq_detailed_stats": [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1988, '>10000', 5,0,5), - ], - - "target_agg_dq_detailed_stats": [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1030, '>10000', 4,0,4), - ], - - "source_query_dq_detailed_stats": [('product_1_52fed65a-d670-11ee-8dfb-ae03267c3341', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct 
product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - ], - - "target_query_dq_detailed_stats": [('product_1_52fed65a-d670-11ee-8dfb-ae03267c3341', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - - ], - - - "source_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_source_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_source_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ], - "target_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_target_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_target_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39') - ], - - - - - }, { - "product_id": "product_1", - "table_name": "dq_spark_local.customer_order", - "rule":"sum_of_sales", - "rule_type": "agg_dq", - "source_expectations": "sum(sales)>10000", - "source_dq_status": "fail", - "source_dq_actual_result": '1988', - "source_dq_row_count": '5', - "target_expectations": "sum(sales)>10000", - "target_dq_status": "fail", - "target_dq_actual_result": '1030', - "target_dq_row_count": '4', - "source_expectations": "sum(sales)>10000", - - }, 'agg_dq',None), - - ({ - "input_count": 100, - "error_count": 10, - "output_count": 90, - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': True, 'target_agg_dq': True, 'target_query_dq': True}, - "agg_dq_detailed_stats_status": True, - "source_agg_dq_status": "Passed", - "final_agg_dq_status": "Passed", - "query_dq_detailed_stats_status": False, - "source_query_dq_status": 
"Passed", - "final_query_dq_status": "Passed", - "row_dq_status": "Passed", - "summarised_row_dq_res" : [{'rule_type':"row_dq", "rule" : "sales_greater_than_zero", "description" : "sales value should be greater than zero", "failed_row_count": 1, "tag" :"validity", "action_if_failed" : "drop"}], - "run_id" : "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", - "input_count" : 5, - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'sales_greater_than_zero', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], - }, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", - - "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - - "rowdq_detailed_stats" : [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1','dq_spark_local.customer_order', 'row_dq', 'sales_greater_than_zero', 'sales > 2', 'accuracy', 'sales value should be greater than zero', 'fail', None, None, None, 4,0,4), - ], - - "source_agg_dq_detailed_stats": [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1988, '>10000', 5,0,5), - ], - - "target_agg_dq_detailed_stats": [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', 1030, '>10000', 4,0,4), - ], - - "source_query_dq_detailed_stats": [('product_1_52fed65a-d670-11ee-8dfb-ae03267c3341', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - ], - - "target_query_dq_detailed_stats": [('product_1_52fed65a-d670-11ee-8dfb-ae03267c3341', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - - ], - - - "source_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_source_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 
'product_missing_count_threshold', 'target_f1', '_source_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ], - "target_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_target_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_target_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39') - ], - - - - - }, { - "product_id": "product1", - "table_name": "dq_spark_local.customer_order", - "rule":"sales_greater_than_zero", - "rule_type": "row_dq", - "source_expectations": "sales > 2", - "source_dq_status": "fail", - "source_dq_actual_result": None, - "source_dq_row_count": '5', - "target_expectations": None, - "target_dq_status": None, - "target_dq_actual_result": None, - "target_dq_row_count": None, - - }, 'row_dq',None), - - ({ - "input_count": 100, - "error_count": 10, - "output_count": 90, - "rules_execution_settings_config": - {'row_dq': False, 'source_agg_dq': False, 'source_query_dq': True, 'target_agg_dq': False, 'target_query_dq': False}, - "agg_dq_detailed_stats_status": False, - "source_agg_dq_status": "Skipped", - "final_agg_dq_status": "Skipped", - "query_dq_detailed_stats_status": True, - "source_query_dq_status": "Passed", - "final_query_dq_status": "Skipped", - "row_dq_status": "Passed", - "summarised_row_dq_res" : [], - "run_id" : "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", - "input_count" : 5, - "dq_expectations" : { - 'row_dq_rules': [], - }, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", - - "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - - "rowdq_detailed_stats" : [ - ], - - "source_agg_dq_detailed_stats": [ - ], - - "target_agg_dq_detailed_stats": [ - ], - - "source_query_dq_detailed_stats": [('product_1_52fed65a-d670-11ee-8dfb-ae03267c3341', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - ], - - "target_query_dq_detailed_stats": [ - - ], - - - "source_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 
'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_source_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_source_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ], - "target_query_dq_output": [ ], - - - - - }, { - "product_id": "product_1", - "table_name": "dq_spark_local.customer_order", - "rule":"product_missing_count_threshold", - "rule_type": "query_dq", - "source_expectations": "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", - "source_dq_status": "pass", - "source_dq_actual_result": '1', - "source_dq_row_count": '5', - "target_expectations": None, - "target_dq_status": None, - "target_dq_actual_result": None, - "target_dq_row_count": None, - - }, 'query_dq',None), - - ({ - "input_count": 100, - "error_count": 10, - "output_count": 90, - "rules_execution_settings_config": - {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': True, 'target_agg_dq': True, 'target_query_dq': True}, - "agg_dq_detailed_stats_status": False, - "source_agg_dq_status": "Passed", - "final_agg_dq_status": "Passed", - "query_dq_detailed_stats_status": True, - "source_query_dq_status": "Passed", - "final_query_dq_status": "Passed", - "row_dq_status": "Passed", - "summarised_row_dq_res" : [{'rule_type':"row_dq", "rule" : "sales_greater_than_zero", "description" : "sales value should be greater than zero", "failed_row_count": 1, "tag" :"validity", "action_if_failed" : "drop"}], - "run_id" : "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", - "input_count" : 5, - "dq_expectations" : { - 'row_dq_rules': [{'product_id': 'your_product', 'table_name': 'dq_spark_local.customer_order', 'rule_type': 'row_dq', 'rule': 'sales_greater_than_zero', 'column_name': 'sales', 'expectation': 'sales > 2', 'action_if_failed': 'drop', 'enable_for_source_dq_validation': False, 'enable_for_target_dq_validation': True, 'tag': 'accuracy', 'description': 'sales value should be greater than zero', 'enable_error_drop_alert': False, 'error_drop_threshold': 0}], - }, - "test_dq_detailed_stats_table":"test_dq_detailed_stats_table", - - "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", - - "detailed_stats_table_writer_config" : {'mode': 'overwrite', "format": "delta", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}, - - "rowdq_detailed_stats" : [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1','dq_spark_local.customer_order', 'row_dq', 'sales_greater_than_zero', 'sales > 2', 'accuracy', 'sales value should be greater than zero', 'fail', None, None, None, 4), - ], - - "source_agg_dq_detailed_stats": [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1', 
'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', [1988], ['>10000'], 5), - ], - - "target_agg_dq_detailed_stats": [('product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109', 'product_1', 'dq_spark_local.customer_order', 'agg_dq', 'sum_of_sales', 'sum(sales)>10000', 'validity', 'regex format validation for quantity','fail', [1030], ['>10000'], 4), - ], - - "source_query_dq_detailed_stats": [('product_1_52fed65a-d670-11ee-8dfb-ae03267c3341', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 5,0,5) - ], - - "target_query_dq_detailed_stats": [('product_1_52fed65a-d670-11ee-8dfb-ae03267c3341', 'product_1', 'dq_spark_local.customer_order', 'query_dq', 'product_missing_count_threshold', '((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3', 'validity', 'row count threshold', 'pass', - 1, - '<3', 4,0,4) - ], - - "source_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_source_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_source_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ], - "target_query_dq_output": [('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'source_f1', '_target_dq', - {'source_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39'), - ('your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340', - 'your_product', 'dq_spark_local.customer_order', - 'product_missing_count_threshold', 'target_f1', '_target_dq', - {'target_f1': ['{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', - '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', - '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', - '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}']}, '2024-03-14 06:53:39') - ], - - - }, { - "product_id": "product_1", - "table_name": "dq_spark_local.customer_order", - "rule":"product_missing_count_threshold", - "rule_type": "query_dq", - 
"source_expectations": "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", - "source_dq_status": "pass", - "source_dq_actual_result": 5, - "source_dq_row_count": 5, - "target_expectations": "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", - "target_dq_status": "pass", - "target_dq_actual_result": 5, - "target_dq_row_count": 4, - - - },'query_dq', {'mode': 'append', "format": "bigquery", 'partitionBy': [], 'bucketBy': {}, 'sortBy': [], - 'options': {"mergeSchema": "true"}}), - -]) - -def test_write_detailed_stats(input_record, - expected_result,dq_check, - writer_config,) -> None: + +@pytest.mark.parametrize( + "input_record, expected_result,dq_check, writer_config", + [ + ( + { + "input_count": 100, + "error_count": 10, + "output_count": 90, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": True, + "target_agg_dq": True, + "target_query_dq": True, + }, + "agg_dq_detailed_stats_status": True, + "source_agg_dq_status": "Passed", + "final_agg_dq_status": "Passed", + "query_dq_detailed_stats_status": False, + "source_query_dq_status": "Passed", + "final_query_dq_status": "Passed", + "row_dq_status": "Passed", + "summarised_row_dq_res": [ + { + "rule_type": "row_dq", + "rule": "sales_greater_than_zero", + "description": "sales value should be greater than zero", + "failed_row_count": 1, + "tag": "validity", + "action_if_failed": "drop", + } + ], + "run_id": "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "input_count": 5, + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "sales_greater_than_zero", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + } + ], + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", + "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "rowdq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "row_dq", + "sales_greater_than_zero", + "sales > 2", + "accuracy", + "sales value should be greater than zero", + "fail", + None, + None, + None, + 4, + 0, + 4, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "source_agg_dq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1988, + ">10000", + 5, + 0, + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "target_agg_dq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for 
quantity", + "fail", + 1030, + ">10000", + 4, + 0, + 4, + "2024-03-14 01:00:00", + "2024-03-14 01:10:00", + ), + ], + "source_query_dq_detailed_stats": [ + ( + "product_1_52fed65a-d670-11ee-8dfb-ae03267c3341", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 02:00:00", + "2024-03-14 02:10:00", + ) + ], + "target_query_dq_detailed_stats": [ + ( + "product_1_52fed65a-d670-11ee-8dfb-ae03267c3341", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 03:00:00", + "2024-03-14 03:10:00", + ) + ], + "source_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_source_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_source_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "target_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_target_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_target_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + }, + { + "product_id": "product_1", + "table_name": "dq_spark_local.customer_order", + "rule": "sum_of_sales", + "rule_type": "agg_dq", + "source_expectations": "sum(sales)>10000", + "source_dq_status": 
"fail", + "source_dq_actual_result": "1988", + "source_dq_row_count": "5", + "target_expectations": "sum(sales)>10000", + "target_dq_status": "fail", + "target_dq_actual_result": "1030", + "target_dq_row_count": "4", + "source_expectations": "sum(sales)>10000", + }, + "agg_dq", + None, + ), + ( + { + "input_count": 100, + "error_count": 10, + "output_count": 90, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": True, + "target_agg_dq": True, + "target_query_dq": True, + }, + "agg_dq_detailed_stats_status": True, + "source_agg_dq_status": "Passed", + "final_agg_dq_status": "Passed", + "query_dq_detailed_stats_status": False, + "source_query_dq_status": "Passed", + "final_query_dq_status": "Passed", + "row_dq_status": "Passed", + "summarised_row_dq_res": [ + { + "rule_type": "row_dq", + "rule": "sales_greater_than_zero", + "description": "sales value should be greater than zero", + "failed_row_count": 1, + "tag": "validity", + "action_if_failed": "drop", + } + ], + "run_id": "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "input_count": 5, + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "sales_greater_than_zero", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + } + ], + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", + "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "rowdq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "row_dq", + "sales_greater_than_zero", + "sales > 2", + "accuracy", + "sales value should be greater than zero", + "fail", + None, + None, + None, + 4, + 0, + 4, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "source_agg_dq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1988, + ">10000", + 5, + 0, + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "target_agg_dq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + 1030, + ">10000", + 4, + 0, + 4, + "2024-03-14 01:00:00", + "2024-03-14 01:10:00", + ), + ], + "source_query_dq_detailed_stats": [ + ( + "product_1_52fed65a-d670-11ee-8dfb-ae03267c3341", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 02:00:00", + "2024-03-14 02:10:00", + ) + ], + 
"target_query_dq_detailed_stats": [ + ( + "product_1_52fed65a-d670-11ee-8dfb-ae03267c3341", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 03:00:00", + "2024-03-14 03:10:00", + ) + ], + "source_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_source_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_source_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "target_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_target_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_target_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + }, + { + "product_id": "product1", + "table_name": "dq_spark_local.customer_order", + "rule": "sales_greater_than_zero", + "rule_type": "row_dq", + "source_expectations": "sales > 2", + "source_dq_status": "fail", + "source_dq_actual_result": None, + "source_dq_row_count": "5", + "target_expectations": None, + "target_dq_status": None, + "target_dq_actual_result": None, + "target_dq_row_count": None, + }, + "row_dq", + None, + ), + ( + { + "input_count": 100, + "error_count": 10, + "output_count": 90, + "rules_execution_settings_config": { + "row_dq": False, + "source_agg_dq": False, + "source_query_dq": True, + "target_agg_dq": False, + "target_query_dq": False, + }, + "agg_dq_detailed_stats_status": False, + "source_agg_dq_status": "Skipped", + "final_agg_dq_status": "Skipped", + "query_dq_detailed_stats_status": 
True, + "source_query_dq_status": "Passed", + "final_query_dq_status": "Skipped", + "row_dq_status": "Passed", + "summarised_row_dq_res": [], + "run_id": "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "input_count": 5, + "dq_expectations": { + "row_dq_rules": [], + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", + "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "rowdq_detailed_stats": [], + "source_agg_dq_detailed_stats": [], + "target_agg_dq_detailed_stats": [], + "source_query_dq_detailed_stats": [ + ( + "product_1_52fed65a-d670-11ee-8dfb-ae03267c3341", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ) + ], + "target_query_dq_detailed_stats": [], + "source_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_source_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_source_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "target_query_dq_output": [], + }, + { + "product_id": "product_1", + "table_name": "dq_spark_local.customer_order", + "rule": "product_missing_count_threshold", + "rule_type": "query_dq", + "source_expectations": "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "source_dq_status": "pass", + "source_dq_actual_result": "1", + "source_dq_row_count": "5", + "target_expectations": None, + "target_dq_status": None, + "target_dq_actual_result": None, + "target_dq_row_count": None, + }, + "query_dq", + None, + ), + ( + { + "input_count": 100, + "error_count": 10, + "output_count": 90, + "rules_execution_settings_config": { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": True, + "target_agg_dq": True, + "target_query_dq": True, + }, + "agg_dq_detailed_stats_status": False, + "source_agg_dq_status": "Passed", + "final_agg_dq_status": "Passed", + "query_dq_detailed_stats_status": True, + "source_query_dq_status": "Passed", + "final_query_dq_status": "Passed", + "row_dq_status": "Passed", + "summarised_row_dq_res": [ + { + 
"rule_type": "row_dq", + "rule": "sales_greater_than_zero", + "description": "sales value should be greater than zero", + "failed_row_count": 1, + "tag": "validity", + "action_if_failed": "drop", + } + ], + "run_id": "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "input_count": 5, + "dq_expectations": { + "row_dq_rules": [ + { + "product_id": "your_product", + "table_name": "dq_spark_local.customer_order", + "rule_type": "row_dq", + "rule": "sales_greater_than_zero", + "column_name": "sales", + "expectation": "sales > 2", + "action_if_failed": "drop", + "enable_for_source_dq_validation": False, + "enable_for_target_dq_validation": True, + "tag": "accuracy", + "description": "sales value should be greater than zero", + "enable_error_drop_alert": False, + "error_drop_threshold": 0, + } + ], + }, + "test_dq_detailed_stats_table": "test_dq_detailed_stats_table", + "test_querydq_output_custom_table_name": "test_querydq_output_custom_table_name", + "detailed_stats_table_writer_config": { + "mode": "overwrite", + "format": "delta", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + "rowdq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "row_dq", + "sales_greater_than_zero", + "sales > 2", + "accuracy", + "sales value should be greater than zero", + "fail", + None, + None, + None, + 4, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "source_agg_dq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + [1988], + [">10000"], + 5, + "2024-03-14 00:00:00", + "2024-03-14 00:10:00", + ), + ], + "target_agg_dq_detailed_stats": [ + ( + "product_1_01450932-d5c2-11ee-a9ca-88e9fe5a7109", + "product_1", + "dq_spark_local.customer_order", + "agg_dq", + "sum_of_sales", + "sum(sales)>10000", + "validity", + "regex format validation for quantity", + "fail", + [1030], + [">10000"], + 4, + "2024-03-14 01:00:00", + "2024-03-14 01:10:00", + ), + ], + "source_query_dq_detailed_stats": [ + ( + "product_1_52fed65a-d670-11ee-8dfb-ae03267c3341", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 5, + 0, + 5, + "2024-03-14 02:00:00", + "2024-03-14 02:10:00", + ) + ], + "target_query_dq_detailed_stats": [ + ( + "product_1_52fed65a-d670-11ee-8dfb-ae03267c3341", + "product_1", + "dq_spark_local.customer_order", + "query_dq", + "product_missing_count_threshold", + "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "validity", + "row count threshold", + "pass", + 1, + "<3", + 4, + 0, + 4, + "2024-03-14 03:00:00", + "2024-03-14 03:10:00", + ) + ], + "source_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_source_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + 
'{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_source_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + "target_query_dq_output": [ + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "source_f1", + "_target_dq", + { + "source_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"OFF-ST-10000760","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ( + "your_product_96bb003e-e1cf-11ee-9a59-ae03267c3340", + "your_product", + "dq_spark_local.customer_order", + "product_missing_count_threshold", + "target_f1", + "_target_dq", + { + "target_f1": [ + '{"product_id":"FUR-TA-10000577","order_id":"US-2015-108966"}', + '{"product_id":"FUR-CH-10000454","order_id":"CA-2016-152156"}', + '{"product_id":"FUR-BO-10001798","order_id":"CA-2016-152156"}', + '{"product_id":"OFF-LA-10000240","order_id":"CA-2016-138688"}', + ] + }, + "2024-03-14 06:53:39", + ), + ], + }, + { + "product_id": "product_1", + "table_name": "dq_spark_local.customer_order", + "rule": "product_missing_count_threshold", + "rule_type": "query_dq", + "source_expectations": "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "source_dq_status": "pass", + "source_dq_actual_result": 5, + "source_dq_row_count": 5, + "target_expectations": "((select count(*) from (select distinct product_id,order_id from order_source) a) - (select count(*) from (select distinct product_id,order_id from order_target) b) ) < 3", + "target_dq_status": "pass", + "target_dq_actual_result": 5, + "target_dq_row_count": 4, + }, + "query_dq", + { + "mode": "append", + "format": "bigquery", + "partitionBy": [], + "bucketBy": {}, + "sortBy": [], + "options": {"mergeSchema": "true"}, + }, + ), + ], +) +def test_write_detailed_stats( + input_record, + expected_result, + dq_check, + writer_config, +) -> None: """ This functions writes the detailed stats for all rule type into the detailed stats table @@ -2044,63 +3148,152 @@ def test_write_detailed_stats(input_record, """ _mock_context = Mock(spec=SparkExpectationsContext) - setattr(_mock_context, "get_rules_execution_settings_config", input_record.get("rules_execution_settings_config")) - setattr(_mock_context, "get_agg_dq_detailed_stats_status", input_record.get("agg_dq_detailed_stats_status")) - setattr(_mock_context, "get_source_agg_dq_status", input_record.get("source_agg_dq_status")) - setattr(_mock_context, "get_final_agg_dq_status", 
input_record.get("final_agg_dq_status")) - setattr(_mock_context, "get_query_dq_detailed_stats_status", input_record.get("query_dq_detailed_stats_status")) - setattr(_mock_context, "get_source_query_dq_status", input_record.get("source_query_dq_status")) - setattr(_mock_context, "get_final_query_dq_status", input_record.get("final_query_dq_status")) + setattr( + _mock_context, + "get_rules_execution_settings_config", + input_record.get("rules_execution_settings_config"), + ) + setattr( + _mock_context, + "get_agg_dq_detailed_stats_status", + input_record.get("agg_dq_detailed_stats_status"), + ) + setattr( + _mock_context, + "get_source_agg_dq_status", + input_record.get("source_agg_dq_status"), + ) + setattr( + _mock_context, + "get_final_agg_dq_status", + input_record.get("final_agg_dq_status"), + ) + setattr( + _mock_context, + "get_query_dq_detailed_stats_status", + input_record.get("query_dq_detailed_stats_status"), + ) + setattr( + _mock_context, + "get_source_query_dq_status", + input_record.get("source_query_dq_status"), + ) + setattr( + _mock_context, + "get_final_query_dq_status", + input_record.get("final_query_dq_status"), + ) setattr(_mock_context, "get_row_dq_status", input_record.get("row_dq_status")) - setattr(_mock_context, "get_summarized_row_dq_res", input_record.get("summarised_row_dq_res")) - setattr(_mock_context, "get_source_agg_dq_detailed_stats", input_record.get("source_agg_dq_detailed_stats")) - setattr(_mock_context, "get_target_agg_dq_detailed_stats", input_record.get("target_agg_dq_detailed_stats")) - setattr(_mock_context, "get_target_query_dq_detailed_stats", input_record.get("target_query_dq_detailed_stats")) - setattr(_mock_context, "get_source_query_dq_detailed_stats", input_record.get("source_query_dq_detailed_stats")) - setattr(_mock_context, "get_detailed_stats_table_writer_config", input_record.get("detailed_stats_table_writer_config")) - setattr(_mock_context, "get_dq_detailed_stats_table_name", input_record.get("test_dq_detailed_stats_table")) - setattr(_mock_context, "get_query_dq_output_custom_table_name", input_record.get("test_querydq_output_custom_table_name")) - - setattr(_mock_context, "get_source_query_dq_output", input_record.get("source_query_dq_output")) - setattr(_mock_context, "get_target_query_dq_output", input_record.get("target_query_dq_output")) - - - + setattr( + _mock_context, + "get_summarized_row_dq_res", + input_record.get("summarised_row_dq_res"), + ) + setattr( + _mock_context, + "get_source_agg_dq_detailed_stats", + input_record.get("source_agg_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_target_agg_dq_detailed_stats", + input_record.get("target_agg_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_target_query_dq_detailed_stats", + input_record.get("target_query_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_source_query_dq_detailed_stats", + input_record.get("source_query_dq_detailed_stats"), + ) + setattr( + _mock_context, + "get_detailed_stats_table_writer_config", + input_record.get("detailed_stats_table_writer_config"), + ) + setattr( + _mock_context, + "get_dq_detailed_stats_table_name", + input_record.get("test_dq_detailed_stats_table"), + ) + setattr( + _mock_context, + "get_query_dq_output_custom_table_name", + input_record.get("test_querydq_output_custom_table_name"), + ) - #setattr(_mock_context, "get_row_dq_detailed_stats", input_record.get("rowdq_detailed_stats")) + setattr( + _mock_context, + "get_source_query_dq_output", + input_record.get("source_query_dq_output"), + ) + 
setattr(
+        _mock_context,
+        "get_target_query_dq_output",
+        input_record.get("target_query_dq_output"),
+    )
     setattr(_mock_context, "get_run_id", input_record.get("run_id"))
     setattr(_mock_context, "product_id", "product_1")
     setattr(_mock_context, "get_table_name", "dq_spark_local.customer_order")
     setattr(_mock_context, "get_input_count", input_record.get("input_count"))
     setattr(_mock_context, "get_dq_expectations", input_record.get("dq_expectations"))
-
+    setattr(
+        _mock_context,
+        "get_row_dq_start_time",
+        datetime.strptime("2024-03-14 00:00:00", "%Y-%m-%d %H:%M:%S"),
+    )
+    setattr(
+        _mock_context,
+        "get_row_dq_end_time",
+        datetime.strptime("2024-03-14 00:10:00", "%Y-%m-%d %H:%M:%S"),
+    )
+    setattr(
+        _mock_context,
+        "get_job_metadata",
+        '{"dag": "dag1", "task": "task1", "team": "my_squad"}',
+    )
     if writer_config is None:
-        setattr(_mock_context, "_stats_table_writer_config",
-                WrappedDataFrameWriter().mode("overwrite").format("delta").build())
-        setattr(_mock_context, 'get_stats_table_writer_config',
-                WrappedDataFrameWriter().mode("overwrite").format("delta").build())
+        setattr(
+            _mock_context,
+            "_stats_table_writer_config",
+            WrappedDataFrameWriter().mode("overwrite").format("delta").build(),
+        )
+        setattr(
+            _mock_context,
+            "get_stats_table_writer_config",
+            WrappedDataFrameWriter().mode("overwrite").format("delta").build(),
+        )
     else:
         setattr(_mock_context, "_stats_table_writer_config", writer_config)
-        setattr(_mock_context, 'get_detailed_stats_table_writer_config', writer_config)
+        setattr(_mock_context, "get_detailed_stats_table_writer_config", writer_config)
     _mock_context.spark = spark
-    _mock_context.product_id = 'product1'
+    _mock_context.product_id = "product1"
     _fixture_writer = SparkExpectationsWriter(_mock_context)
-    if writer_config and writer_config['format'] == 'bigquery':
-        patcher = patch('pyspark.sql.DataFrameWriter.save')
+    if writer_config and writer_config["format"] == "bigquery":
+        patcher = patch("pyspark.sql.DataFrameWriter.save")
         mock_bq = patcher.start()
-        setattr(_mock_context, 'get_se_streaming_stats_dict', {'se.enable.streaming': False})
+        setattr(
+            _mock_context, "get_se_streaming_stats_dict", {"se.enable.streaming": False}
+        )
         _fixture_writer.write_detailed_stats()
         mock_bq.assert_called_with()
     else:
-        setattr(_mock_context, 'get_se_streaming_stats_dict', {'se.enable.streaming': True})
+        setattr(
+            _mock_context, "get_se_streaming_stats_dict", {"se.enable.streaming": True}
+        )
         _fixture_writer.write_detailed_stats()
-
-    stats_table = spark.sql(f"select * from test_dq_detailed_stats_table where rule_type = '{dq_check}'")
+
+    stats_table = spark.sql(
+        f"select * from test_dq_detailed_stats_table where rule_type = '{dq_check}'"
+    )
     assert stats_table.count() == 1
     row = stats_table.first()
     assert row.product_id == expected_result.get("product_id")
@@ -2109,15 +3302,18 @@ def test_write_detailed_stats(input_record,
     assert row.rule == expected_result.get("rule")
     assert row.source_expectations == expected_result.get("source_expectations")
     assert row.source_dq_status == expected_result.get("source_dq_status")
-    assert row.source_dq_actual_outcome == expected_result.get("source_dq_actual_result")
+    assert row.source_dq_actual_outcome == expected_result.get(
+        "source_dq_actual_result"
+    )
     assert row.source_dq_row_count == expected_result.get("source_dq_row_count")
     assert row.target_expectations == expected_result.get("target_expectations")
     assert row.target_dq_status == expected_result.get("target_dq_status")
-    assert row.target_dq_actual_outcome ==
expected_result.get("target_dq_actual_result") + assert row.target_dq_actual_outcome == expected_result.get( + "target_dq_actual_result" + ) assert row.target_dq_row_count == expected_result.get("target_dq_row_count") - def test_write_detailed_stats_exception() -> None: """ This functions writes the detailed stats for all rule type into the detailed stats table @@ -2131,35 +3327,32 @@ def test_write_detailed_stats_exception() -> None: """ _mock_context = Mock(spec=SparkExpectationsContext) - setattr(_mock_context, "get_rules_execution_settings_config", {'row_dq': True, 'source_agg_dq': True, 'source_query_dq': True, 'target_agg_dq': True, 'target_query_dq': True}) - setattr(_mock_context, "get_agg_dq_detailed_stats_status", True) + setattr( + _mock_context, + "get_rules_execution_settings_config", + { + "row_dq": True, + "source_agg_dq": True, + "source_query_dq": True, + "target_agg_dq": True, + "target_query_dq": True, + }, + ) + setattr(_mock_context, "get_agg_dq_detailed_stats_status", True) setattr(_mock_context, "get_source_agg_dq_status", "Passed") - _mock_context.spark = spark - _mock_context.product_id = 'product1' + _mock_context.product_id = "product1" _fixture_writer = SparkExpectationsWriter(_mock_context) - with pytest.raises(SparkExpectationsMiscException, - match=r"error occurred while saving the data into the stats table .*"): + with pytest.raises( + SparkExpectationsMiscException, + match=r"error occurred while saving the data into the stats table .*", + ): _fixture_writer.write_detailed_stats() - - - - - - - - - - - - - - @pytest.mark.parametrize("table_name, rule_type", [("test_error_table", "row_dq")]) @patch( "spark_expectations.sinks.utils.writer.SparkExpectationsWriter.save_df_as_table", @@ -2202,36 +3395,14 @@ def test_write_error_records_final_dependent( ) - - - - - - - - - - - - - - - - - - - -@pytest.mark.parametrize('table_name, rule_type', - [('test_error_table', - 'row_dq' - ) - ]) -def test_write_error_records_final(table_name, - rule_type, - _fixture_dq_dataset, - _fixture_expected_dq_dataset, - _fixture_writer): - +@pytest.mark.parametrize("table_name, rule_type", [("test_error_table", "row_dq")]) +def test_write_error_records_final( + table_name, + rule_type, + _fixture_dq_dataset, + _fixture_expected_dq_dataset, + _fixture_writer, +): config = WrappedDataFrameWriter().mode("overwrite").format("delta").build() setattr( @@ -2318,7 +3489,7 @@ def test_write_error_records_final_dependent( }, ], "meta_dq_run_id": "run_id", - "meta_dq_run_date": "2022-12-27 10:39:44" + "meta_dq_run_date": "2022-12-27 10:39:44", }, { "meta_row_dq_results": [ @@ -2331,7 +3502,7 @@ def test_write_error_records_final_dependent( } ], "meta_dq_run_id": "run_id", - "meta_dq_run_date": "2022-12-27 10:39:44" + "meta_dq_run_date": "2022-12-27 10:39:44", }, { "meta_row_dq_results": [ @@ -2344,7 +3515,7 @@ def test_write_error_records_final_dependent( } ], "meta_dq_run_id": "run_id", - "meta_dq_run_date": "2022-12-27 10:39:44" + "meta_dq_run_date": "2022-12-27 10:39:44", }, ], [ @@ -2379,7 +3550,7 @@ def test_write_error_records_final_dependent( } ], "meta_dq_run_id": "run_id", - "meta_dq_run_date": "2022-12-27 10:39:44" + "meta_dq_run_date": "2022-12-27 10:39:44", }, { "meta_row_dq_results": [ @@ -2392,7 +3563,7 @@ def test_write_error_records_final_dependent( } ], "meta_dq_run_id": "run_id", - "meta_dq_run_date": "2022-12-27 10:39:44" + "meta_dq_run_date": "2022-12-27 10:39:44", }, ], [