diff --git a/dbt_project.yml b/dbt_project.yml index d501fdaa..0071b2c8 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -24,7 +24,7 @@ clean-targets: # directories to be removed by `dbt clean` - "target" - "dbt_modules" vars: - re_data:max_columns_in_query: 12 + re_data:max_columns_in_query: 10 re_data:time_window_end: '{{ run_started_at.strftime("%Y-%m-%d 00:00:00") }}' re_data:time_window_start: '{{ (run_started_at - modules.datetime.timedelta(1)).strftime("%Y-%m-%d 00:00:00") }}' re_data:anomaly_detection_look_back_days: 30 @@ -66,7 +66,7 @@ vars: # as tables. These settings can be overridden in the individual model files # using the `{{ config(...) }}` macro. -require-dbt-version: [">=0.20.0", "<0.21.0"] +require-dbt-version: [">=0.20.0", "<0.22.0"] models: re_data: diff --git a/integration_tests/data/expected/expected_sample_data_metrics.csv b/integration_tests/data/expected/expected_sample_data_metrics.csv new file mode 100644 index 00000000..07a44e31 --- /dev/null +++ b/integration_tests/data/expected/expected_sample_data_metrics.csv @@ -0,0 +1,91 @@ +table_name,column_name,metric,value,time_window_start,time_window_end,interval_length_sec +"""postgres"".""dq_raw"".""sample_table""",value1,min,100,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,max,200,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,avg,127.5,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,stddev,48.562674281111555,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,variance,2358.3333333333335,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,nulls_count,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,nulls_percent,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,diff,100,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,min,109,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,max,209,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,avg,180.75,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,stddev,47.97481978427155,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,variance,2301.5833333333335,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,nulls_count,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,nulls_percent,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,diff,100,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,min,,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,max,,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,avg,,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,stddev,,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,variance,,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,nulls_count,4,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,nulls_percent,100,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,diff,,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,min_length,3,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,max_length,4,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,avg_length,3.25,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,nulls_count,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,nulls_percent,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,missing_percent,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,missing_count,0,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,regex_test,4,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,match_regex,1,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,match_regex_percent,25,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,not_match_regex,1,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,not_match_regex_percent,25,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,distinct_values,2,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,duplicate_values,1,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,duplicate_rows,3,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,unique_rows,1,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",row_count,4,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",freshness,40765,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",buy_count,3,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",my_custom_table_metric,1000,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",distinct_table_rows,4,2021-05-01 00:00:00,2021-05-02 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,min,100,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,max,210,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,avg,142.5,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,stddev,49.91659710623979,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,variance,2491.6666666666665,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,nulls_count,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,nulls_percent,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value1,diff,110,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,min,109,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,max,209,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,avg,180.75,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,stddev,47.97481978427155,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,variance,2301.5833333333335,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,nulls_count,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,nulls_percent,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",value2,diff,100,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,min,,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,max,,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,avg,,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,stddev,,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,variance,,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,nulls_count,4,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,nulls_percent,100,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",null_value,diff,,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,min_length,3,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,max_length,3,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,avg_length,3,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,nulls_count,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,nulls_percent,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,missing_percent,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,missing_count,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,regex_test,4,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,match_regex,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,match_regex_percent,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,not_match_regex,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,not_match_regex_percent,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,distinct_values,1,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,duplicate_values,1,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,duplicate_rows,4,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""",event_type,unique_rows,0,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",row_count,4,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",freshness,40765,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",buy_count,4,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",my_custom_table_metric,1000,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 +"""postgres"".""dq_raw"".""sample_table""","",distinct_table_rows,4,2021-05-02 00:00:00,2021-05-03 00:00:00,86400 diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 89d42048..fe99a5c6 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -117,3 +117,8 @@ seeds: expected_sample_data_z_score: +column_types: time_window_end: "TIMESTAMP" + + expected_sample_data_metrics: + +column_types: + time_window_start: "TIMESTAMP" + time_window_end: "TIMESTAMP" \ No newline at end of file diff --git a/integration_tests/models/expected/schema.yml b/integration_tests/models/expected/schema.yml index 155f0e8e..4bca58b7 100644 --- a/integration_tests/models/expected/schema.yml +++ b/integration_tests/models/expected/schema.yml @@ -29,6 +29,7 @@ models: compare_model: ref('expected_sample_data_max') compare_columns: - right(upper(replace(replace(table_name, '"', ''), '`', '')), 10) as table_name + - coalesce(upper(column_name), '') as column_name - metric - value - time_window_start @@ -41,6 +42,7 @@ models: compare_model: ref('expected_sample_data_max_length') compare_columns: - right(upper(replace(replace(table_name, '"', ''), '`', '')), 10) as table_name + - coalesce(upper(column_name), '') as column_name - metric - value - time_window_start @@ -53,6 +55,7 @@ models: compare_model: ref('expected_sample_data_z_score') compare_columns: - right(upper(replace(replace(table_name, '"', ''), '`', '')), 10) as table_name + - coalesce(upper(column_name), '') as column_name - metric - cast (z_score_value * 1000 as integer) as z_score_value - time_window_end @@ -61,3 +64,15 @@ models: - cast (last_stddev * 1000 as integer) as last_stddev - interval_length_sec + - name: test_re_data_metrics + tests: + - dbt_utils.equality: + compare_model: ref('expected_sample_data_metrics') + compare_columns: + - right(upper(replace(replace(table_name, '"', ''), '`', '')), 10) as table_name + - coalesce(upper(column_name), '') as column_name + - metric + - cast (value * 1000 as integer) as value + - time_window_start + - time_window_end + - interval_length_sec diff --git a/integration_tests/models/expected/test_re_data_metrics.sql b/integration_tests/models/expected/test_re_data_metrics.sql new file mode 100644 index 00000000..9cc4cc8b --- /dev/null +++ b/integration_tests/models/expected/test_re_data_metrics.sql @@ -0,0 +1,6 @@ +{% set table_name = re_data.full_table_name_values( + "sample_table", target.schema + "_raw", re_data.get_target_database()) %} + +select * from {{ ref('re_data_metrics')}} +where table_name = {{ table_name }} + diff --git a/integration_tests/test_dbs.py b/integration_tests/test_dbs.py index 3b95ff7b..cead954e 100644 --- a/integration_tests/test_dbs.py +++ b/integration_tests/test_dbs.py @@ -62,11 +62,7 @@ def test_postgres(): _test_generic('postgres') def test_snowflake(): - dbt_vars = copy.deepcopy(DBT_VARS) - schemas = dbt_vars['re_data:schemas'] - schemas = [el.upper() for el in schemas] - dbt_vars['re_data:schemas'] = schemas - _test_generic('snowflake', dbt_vars) + _test_generic('snowflake') def test_redshift(): _test_generic('redshift') diff --git a/macros/meta/schema_name.sql b/macros/meta/schema_name.sql new file mode 100644 index 00000000..b7f4baf9 --- /dev/null +++ b/macros/meta/schema_name.sql @@ -0,0 +1,13 @@ + +{% macro schema_name(name) %} + {% set result = adapter.dispatch('schema_name', 're_data')(name) %} + {{ return(result) }} +{% endmacro %} + +{% macro default__schema_name(name) %} + {{ return (name) }} +{% endmacro %} + +{% macro snowflake__schema_name(name) %} + {{ return (name.upper()) }} +{% endmacro %} \ No newline at end of file diff --git a/macros/metrics/base/build_in/optional_column_metrics.sql b/macros/metrics/base/build_in/optional_column_metrics.sql index e987fb56..f57e472f 100644 --- a/macros/metrics/base/build_in/optional_column_metrics.sql +++ b/macros/metrics/base/build_in/optional_column_metrics.sql @@ -41,12 +41,11 @@ {% macro postgres__distinct_values(context) %} {# /* In postgres, its faster to count distinct values in a column by selecting then counting in separate steps */ #} - ( with temp_table as ( + with temp_table as ( select distinct {{ context.column_name }} from {{ context.table_name }} where {{ in_time_window(context.time_filter) }} ) - select coalesce(count(*), 0) from temp_table - ) + select coalesce(count(*), 0) from temp_table {% endmacro %} {% macro re_data_metric_approx_distinct_values(context) %} @@ -75,7 +74,6 @@ {% endmacro %} {% macro re_data_metric_duplicate_values(context) %} - ( with temp_table as ( select {{ context.column_name }} from {{ context.table_name }} where {{ in_time_window(context.time_filter) }} @@ -83,11 +81,9 @@ having count(1) > 1 ) select coalesce(count(*), 0) from temp_table - ) {% endmacro %} {% macro re_data_metric_duplicate_rows(context) %} - ( with temp_table as ( select {{ context.column_name }}, count(1) as row_count from {{ context.table_name }} where {{ in_time_window(context.time_filter) }} @@ -95,11 +91,9 @@ having count(1) > 1 ) select coalesce(sum(row_count), 0) from temp_table - ) {% endmacro %} {% macro re_data_metric_unique_rows(context) %} - ( with temp_table as ( select {{ context.column_name }}, count(1) as row_count from {{ context.table_name }} where {{ in_time_window(context.time_filter) }} @@ -107,5 +101,4 @@ having count(1) = 1 ) select coalesce(sum(row_count), 0) from temp_table - ) {% endmacro %} \ No newline at end of file diff --git a/macros/metrics/base/build_in/optional_table_metrics.sql b/macros/metrics/base/build_in/optional_table_metrics.sql index a561260f..0ddb8893 100644 --- a/macros/metrics/base/build_in/optional_table_metrics.sql +++ b/macros/metrics/base/build_in/optional_table_metrics.sql @@ -1,8 +1,7 @@ {% macro re_data_metric_distinct_table_rows(context) %} - ( with temp_table AS ( + with temp_table AS ( select distinct * from {{ context.table_name }} where {{ in_time_window(context.time_filter) }} ) - select coalesce(count(*), 0) FROM temp_table - ) + select coalesce(count(*), 0) FROM temp_table {% endmacro %} diff --git a/macros/metrics/base/queries.sql b/macros/metrics/base/queries.sql index d49a7c24..ef93d614 100644 --- a/macros/metrics/base/queries.sql +++ b/macros/metrics/base/queries.sql @@ -64,7 +64,7 @@ with temp_table_metrics as ( select {%- for col_expr in col_exprs %} - {{ col_expr.expr }} as {{ col_expr.col_name + '___' + col_expr.metric }} + ( {{ col_expr.expr }} ) as {{ col_expr.col_name + '___' + col_expr.metric }} {%- if not loop.last %},{%- endif %} {% endfor %} from diff --git a/macros/utils/get_database.sql b/macros/utils/get_database.sql new file mode 100644 index 00000000..407e0b4e --- /dev/null +++ b/macros/utils/get_database.sql @@ -0,0 +1,16 @@ + +{% macro get_target_database() %} + {{- adapter.dispatch('get_target_database', 're_data')() -}} +{% endmacro %} + +{% macro default__get_target_database() %} + {{- return (target.dbname) -}} +{% endmacro %} + +{% macro bigquery__get_target_database() %} + {{- return (target.project) -}} +{% endmacro %} + +{% macro snowflake__get_target_database() %} + {{- return (target.database) -}} +{% endmacro %} \ No newline at end of file diff --git a/models/meta/re_data_columns.sql b/models/meta/re_data_columns.sql index d1f6ddfc..3b69e764 100644 --- a/models/meta/re_data_columns.sql +++ b/models/meta/re_data_columns.sql @@ -8,7 +8,8 @@ {% set schemas = var('re_data:schemas') %} with columns_froms_select as ( {% for for_schema in schemas %} - {{ get_monitored_columns(for_schema) }} + {% set schema_name = re_data.schema_name(for_schema) %} + {{ get_monitored_columns(schema_name) }} {%- if not loop.last %} union all {%- endif %} {% endfor %} )