From b93fa4974ebc24e8a2c0725696dec8739aecb8a6 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Fri, 19 Apr 2024 16:56:11 +1200 Subject: [PATCH 01/59] Add macro for new hash-based comparison strategy --- .vscode/settings.json | 21 +++ dbt_project.yml | 2 + integration_tests/dbt_project.yml | 5 + .../unit_test_model_a.sql | 1 + .../unit_test_model_b.sql | 1 + .../unit_compare_queries.sql | 8 ++ .../unit_reworked_compare.sql | 9 ++ .../models/unit_test_wrappers/unit_tests.yml | 124 ++++++++++++++++++ macros/get_comparison_bounds.sql | 22 ++++ macros/reworked_compare.sql | 112 ++++++++++++++++ 10 files changed, 305 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_compare_queries.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_tests.yml create mode 100644 macros/get_comparison_bounds.sql create mode 100644 macros/reworked_compare.sql diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..437dcba6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "yaml.schemas": { + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_yml_files-latest.json": [ + "/**/*.yml", + "!profiles.yml", + "!dbt_project.yml", + "!packages.yml", + "!selectors.yml", + "!profile_template.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_project-latest.json": [ + "dbt_project.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/selectors-latest.json": [ + "selectors.yml" + ], + 
"https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/packages-latest.json": [ + "packages.yml" + ] + }, +} \ No newline at end of file diff --git a/dbt_project.yml b/dbt_project.yml index 987e03d1..e6fb7460 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -4,6 +4,8 @@ config-version: 2 require-dbt-version: [">=1.2.0", "<2.0.0"] +profile: joel_ska + target-path: "target" clean-targets: ["target", "dbt_packages"] macro-paths: ["macros"] diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 07120e4c..13646b9e 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -17,3 +17,8 @@ clean-targets: # directories to be removed by `dbt clean` seeds: +quote_columns: false + +vars: + compare_queries_summarize: true + reworked_compare__columns: [] + reworked_compare__event_time: \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql new file mode 100644 index 00000000..55a6c71e --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql @@ -0,0 +1 @@ +select 1 as id, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql new file mode 100644 index 00000000..55a6c71e --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql @@ -0,0 +1 @@ +select 1 as id, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql new file mode 100644 index 00000000..c589ee53 --- /dev/null +++ 
b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql @@ -0,0 +1,8 @@ + +{{ + audit_helper.compare_queries( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + summarize = var('compare_queries_summarize') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql new file mode 100644 index 00000000..157826a8 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.reworked_compare( + ref('unit_test_model_a'), + ref('unit_test_model_b'), + primary_key='id', + columns=var('reworked_compare__columns'), + event_time=var('reworked_compare__event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_tests.yml b/integration_tests/models/unit_test_wrappers/unit_tests.yml new file mode 100644 index 00000000..b067950d --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_tests.yml @@ -0,0 +1,124 @@ +version: 2 + +unit_tests: + - name: identical_records_compare_queries + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"in_a": true, "in_b": true} + + model: unit_compare_queries + description: The world's most basic unit test. 
+ overrides: + vars: + compare_queries_summarize: true + + - name: identical_records_compare_queries_no_summarize + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: [] + + model: unit_compare_queries + description: The world's second most basic unit test. + overrides: + vars: + compare_queries_summarize: false + + - name: reworked_compare_identical_tables + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"status": 'identical', 'id': 1, num_in_status: 3} + - {"status": 'identical', 'id': 2, num_in_status: 3} + - {"status": 'identical', 'id': 3, num_in_status: 3} + + - name: reworked_compare_identical_tables_event_time_filter + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2', 'created_at'] + reworked_compare__event_time: 'created_at' + macros: + audit_helper.get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": 
'2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"status": 'identical', 'id': 2, num_in_status: 2} + - {"status": 'identical', 'id': 3, num_in_status: 2} + + - name: reworked_compare_all_statuses + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"status": 'identical', 'id': 1, num_in_status: 1} + - {"status": 'modified', 'id': 2, num_in_status: 1} + - {"status": 'modified', 'id': 2, num_in_status: 1} + - {"status": 'removed', 'id': 3, num_in_status: 1} + - {"status": 'added', 'id': 4, num_in_status: 1} diff --git a/macros/get_comparison_bounds.sql b/macros/get_comparison_bounds.sql new file mode 100644 index 00000000..07903c19 --- /dev/null +++ b/macros/get_comparison_bounds.sql @@ -0,0 +1,22 @@ +{% macro get_comparison_bounds(a_relation, b_relation, event_time) %} + {% set min_max_queries %} + with min_maxes as ( + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from {{ a_relation }} + union all + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from {{ b_relation }} + ) + select max(min_event_time) as "min_event_time", min(max_event_time) as "max_event_time" + from min_maxes + {% endset %} + + {% set query_response = dbt_utils.get_query_results_as_dict(min_max_queries) %} + + {% set min_max_event_time_results = {} %} + 
{% for k in query_response.keys() %} + {% do min_max_event_time_results.update({k: query_response[k][0]}) %} + {% endfor %} + + {% do return(min_max_event_time_results) %} +{% endmacro %} \ No newline at end of file diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql new file mode 100644 index 00000000..8d5c476e --- /dev/null +++ b/macros/reworked_compare.sql @@ -0,0 +1,112 @@ +{% macro reworked_compare(a_relation, b_relation, primary_key=[], columns=[], event_time=None, sample_limit=20) %} + + {% set joined_cols = columns | join(", ") %} + + {% if event_time %} + {% set min_max_event_time_results = audit_helper.get_comparison_bounds(a_relation, b_relation, event_time) %} + {% set min_event_time = min_max_event_time_results["min_event_time"] %} + {% set max_event_time = min_max_event_time_results["max_event_time"] %} + {% endif %} + + with a as ( + select + *, + hash({{ joined_cols }}) as dbt_compare_row_hash + from {{ a_relation }} + {% if min_event_time and max_event_time %} + where {{ event_time }} >= '{{ min_event_time }}' + and {{ event_time }} <= '{{ max_event_time }}' + {% endif %} + ), + + b as ( + select + *, + hash({{ joined_cols }}) as dbt_compare_row_hash + from {{ b_relation }} + {% if min_event_time and max_event_time %} + where {{ event_time }} >= '{{ min_event_time }}' + and {{ event_time }} <= '{{ max_event_time }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_compare_row_hash in (select b.dbt_compare_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_compare_row_hash not in (select b.dbt_compare_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_compare_row_hash not in (select a.dbt_compare_row_hash from a) + + ), + + all_records as ( + + select + *, + true as in_a, + true as in_b, + from a_intersect_b + + union all + + select + *, + true as in_a, + false as in_b + from a_except_b + + union all + + select + *, + false as in_a, + true as in_b + 
from b_except_a + + ), + + + classified as ( + + select + *, + case + when in_a and in_b then 'identical' + when {{ dbt.bool_or('in_a') }} over (partition by {{ primary_key }}) + and {{ dbt.bool_or('in_b') }} over (partition by {{ primary_key }}) + then 'modified' + when in_a then 'removed' + when in_b then 'added' + end as status + from all_records + order by {{ primary_key ~ ", " if primary_key is not none }} in_a desc, in_b desc + + ), + + final as ( + select + *, + count(distinct {{ primary_key }}) over (partition by status) as num_in_status, + dense_rank() over (partition by status order by {{ primary_key }}) as sample_number + from classified + ) + + select * from final + {% if sample_limit %} + where sample_number <= {{ sample_limit }} + {% endif %} + order by status, sample_number + +{% endmacro %} \ No newline at end of file From d3dfa77f277c51507d83ffd5601d5a77a5c9706d Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Fri, 19 Apr 2024 20:52:02 +1200 Subject: [PATCH 02/59] split out SF-focused version of macro --- macros/get_comparison_bounds.sql | 4 +- macros/reworked_compare.sql | 145 ++++++++++++++++++++++--------- 2 files changed, 106 insertions(+), 43 deletions(-) diff --git a/macros/get_comparison_bounds.sql b/macros/get_comparison_bounds.sql index 07903c19..e5f50f63 100644 --- a/macros/get_comparison_bounds.sql +++ b/macros/get_comparison_bounds.sql @@ -7,7 +7,7 @@ select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time from {{ b_relation }} ) - select max(min_event_time) as "min_event_time", min(max_event_time) as "max_event_time" + select max(min_event_time) as min_event_time, min(max_event_time) as max_event_time from min_maxes {% endset %} @@ -15,7 +15,7 @@ {% set min_max_event_time_results = {} %} {% for k in query_response.keys() %} - {% do min_max_event_time_results.update({k: query_response[k][0]}) %} + {% do min_max_event_time_results.update({k | lower: query_response[k][0]}) %} {% endfor %} {% do 
return(min_max_event_time_results) %} diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 8d5c476e..6f595659 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -6,50 +6,18 @@ {% set min_max_event_time_results = audit_helper.get_comparison_bounds(a_relation, b_relation, event_time) %} {% set min_event_time = min_max_event_time_results["min_event_time"] %} {% set max_event_time = min_max_event_time_results["max_event_time"] %} + {% set event_time_props = { + "event_time": event_time, + "min_event_time": min_event_time, + "max_event_time": max_event_time + } %} {% endif %} - with a as ( - select - *, - hash({{ joined_cols }}) as dbt_compare_row_hash - from {{ a_relation }} - {% if min_event_time and max_event_time %} - where {{ event_time }} >= '{{ min_event_time }}' - and {{ event_time }} <= '{{ max_event_time }}' - {% endif %} - ), - - b as ( - select - *, - hash({{ joined_cols }}) as dbt_compare_row_hash - from {{ b_relation }} - {% if min_event_time and max_event_time %} - where {{ event_time }} >= '{{ min_event_time }}' - and {{ event_time }} <= '{{ max_event_time }}' - {% endif %} - ), - - a_intersect_b as ( + with - select * from a - where a.dbt_compare_row_hash in (select b.dbt_compare_row_hash from b) - - ), - - a_except_b as ( - - select * from a - where a.dbt_compare_row_hash not in (select b.dbt_compare_row_hash from b) - - ), - - b_except_a as ( - - select * from b - where b.dbt_compare_row_hash not in (select a.dbt_compare_row_hash from a) - - ), + {{ generate_set_results(a_relation, b_relation, columns, event_time_props)}} + + , all_records as ( @@ -109,4 +77,99 @@ {% endif %} order by status, sample_number +{% endmacro %} + +{% macro generate_set_results(a_relation, b_relation, columns, event_time_props=None) %} + {{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_relation, b_relation, columns, event_time_props)) }} +{% endmacro %} + +{% macro 
default__generate_set_results(a_relation, b_relation, columns, event_time_props) %} +{% set columns_joined = columns | join(", ") %} + + a as ( + select {{ columns_joined }} + from {{ a_relation }} + {% if event_time_props %} + where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b as ( + select {{ columns_joined }} + from {{ b_relation }} + {% if event_time_props %} + where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + {{ dbt.intersect() }} + select * from b + + ), + + a_except_b as ( + + select * from a + {{ dbt.except() }} + select * from b + + ), + + b_except_a as ( + + select * from b + {{ dbt.except() }} + select * from a + + ) +{% endmacro %} + +{% macro snowflake__generate_set_results(a_relation, b_relation, columns, event_time_props) %} + a as ( + select + *, + hash({{ joined_cols }}) as dbt_compare_row_hash + from {{ a_relation }} + {% if event_time_props %} + where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b as ( + select + *, + hash({{ joined_cols }}) as dbt_compare_row_hash + from {{ b_relation }} + {% if event_time_props %} + where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_compare_row_hash in (select b.dbt_compare_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_compare_row_hash not in (select b.dbt_compare_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_compare_row_hash not in (select a.dbt_compare_row_hash from a) + + ) {% endmacro %} \ No newline at end of file 
From 1a6c35fbe34732173fd3080ca3c6a437673c8150 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 23 Apr 2024 17:44:35 +1200 Subject: [PATCH 03/59] Fix change to complex object --- macros/reworked_compare.sql | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 6f595659..977546fb 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -15,7 +15,7 @@ with - {{ generate_set_results(a_relation, b_relation, columns, event_time_props)}} + {{ audit_helper.generate_set_results(a_relation, b_relation, columns, event_time_props)}} , @@ -84,23 +84,23 @@ {% endmacro %} {% macro default__generate_set_results(a_relation, b_relation, columns, event_time_props) %} -{% set columns_joined = columns | join(", ") %} + {% set joined_cols = columns | join(", ") %} a as ( - select {{ columns_joined }} + select {{ joined_cols }} from {{ a_relation }} {% if event_time_props %} - where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' {% endif %} ), b as ( - select {{ columns_joined }} + select {{ joined_cols }} from {{ b_relation }} {% if event_time_props %} - where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' {% endif %} ), @@ -130,14 +130,15 @@ {% endmacro %} {% macro snowflake__generate_set_results(a_relation, b_relation, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} a as ( select *, hash({{ joined_cols }}) as 
dbt_compare_row_hash from {{ a_relation }} {% if event_time_props %} - where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' {% endif %} ), @@ -147,8 +148,8 @@ hash({{ joined_cols }}) as dbt_compare_row_hash from {{ b_relation }} {% if event_time_props %} - where {{ event_time }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time }} <= '{{ event_time_props["max_event_time"] }}' + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' {% endif %} ), From 4a7f1201306b596a1c574a1c66f0878c114b66b5 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 23 Apr 2024 17:51:01 +1200 Subject: [PATCH 04/59] Fix overuse of star --- .../unit_test_model_b_more_cols.sql | 1 + ...worked_compare_column_details_mismatch.sql | 9 +++++++ .../models/unit_test_wrappers/unit_tests.yml | 26 +++++++++++++++++++ macros/reworked_compare.sql | 4 +-- package-lock.yml | 4 +++ 5 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql create mode 100644 package-lock.yml diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql new file mode 100644 index 00000000..f0105eae --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql @@ -0,0 +1 @@ +select 1 as id, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as 
created_at, 'pineapple' as pizza \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql new file mode 100644 index 00000000..e1752942 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.reworked_compare( + ref('unit_test_model_a'), + ref('unit_test_model_b_more_cols'), + primary_key='id', + columns=var('reworked_compare__columns'), + event_time=var('reworked_compare__event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_tests.yml b/integration_tests/models/unit_test_wrappers/unit_tests.yml index b067950d..d5013c72 100644 --- a/integration_tests/models/unit_test_wrappers/unit_tests.yml +++ b/integration_tests/models/unit_test_wrappers/unit_tests.yml @@ -122,3 +122,29 @@ unit_tests: - {"status": 'modified', 'id': 2, num_in_status: 1} - {"status": 'removed', 'id': 3, num_in_status: 1} - {"status": 'added', 'id': 4, num_in_status: 1} + + - name: reworked_compare_all_statuses_different_columns + model: unit_reworked_compare_column_details_mismatch + overrides: + vars: + reworked_compare__columns: ['id', 'col1'] + reworked_compare__event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b_more_cols') + rows: + - { "id": 1, "col1": "abc" } + - { "id": 2, "col1": "changed" } + - { "id": 4, "col1": "nop" } + + expect: + rows: + - {"status": 'identical', 'id': 1, num_in_status: 1} + - {"status": 'modified', 'id': 2, num_in_status: 1} + - {"status": 'modified', 'id': 2, num_in_status: 1} + - {"status": 'removed', 'id': 3, num_in_status: 1} + - {"status": 'added', 'id': 4, 
num_in_status: 1} diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 977546fb..a0cb217a 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -133,7 +133,7 @@ {% set joined_cols = columns | join(", ") %} a as ( select - *, + {{ joined_cols }}, hash({{ joined_cols }}) as dbt_compare_row_hash from {{ a_relation }} {% if event_time_props %} @@ -144,7 +144,7 @@ b as ( select - *, + {{ joined_cols }}, hash({{ joined_cols }}) as dbt_compare_row_hash from {{ b_relation }} {% if event_time_props %} diff --git a/package-lock.yml b/package-lock.yml new file mode 100644 index 00000000..32c6ccc0 --- /dev/null +++ b/package-lock.yml @@ -0,0 +1,4 @@ +packages: + - package: dbt-labs/dbt_utils + version: 1.1.1 +sha1_hash: 106400343ad0c92a7417f5156d0d6c3893bb2429 From 87afbe92e7e85a16fed1ea4a14746e8daef1aa8e Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 25 Apr 2024 16:41:30 +1200 Subject: [PATCH 05/59] switch from compare rels to compare queries --- .../unit_reworked_compare.sql | 4 ++-- ...worked_compare_column_details_mismatch.sql | 4 ++-- macros/reworked_compare.sql | 22 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql index 157826a8..b2947a17 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -1,7 +1,7 @@ {{ audit_helper.reworked_compare( - ref('unit_test_model_a'), - ref('unit_test_model_b'), + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), primary_key='id', columns=var('reworked_compare__columns'), event_time=var('reworked_compare__event_time') diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql 
b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql index e1752942..0e4a17f1 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql @@ -1,7 +1,7 @@ {{ audit_helper.reworked_compare( - ref('unit_test_model_a'), - ref('unit_test_model_b_more_cols'), + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b_more_cols'), primary_key='id', columns=var('reworked_compare__columns'), event_time=var('reworked_compare__event_time') diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index a0cb217a..febdd4d4 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -1,9 +1,9 @@ -{% macro reworked_compare(a_relation, b_relation, primary_key=[], columns=[], event_time=None, sample_limit=20) %} +{% macro reworked_compare(a_query, b_query, primary_key=[], columns=[], event_time=None, sample_limit=20) %} {% set joined_cols = columns | join(", ") %} {% if event_time %} - {% set min_max_event_time_results = audit_helper.get_comparison_bounds(a_relation, b_relation, event_time) %} + {% set min_max_event_time_results = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} {% set min_event_time = min_max_event_time_results["min_event_time"] %} {% set max_event_time = min_max_event_time_results["max_event_time"] %} {% set event_time_props = { @@ -15,7 +15,7 @@ with - {{ audit_helper.generate_set_results(a_relation, b_relation, columns, event_time_props)}} + {{ audit_helper.generate_set_results(a_query, b_query, columns, event_time_props)}} , @@ -79,16 +79,16 @@ {% endmacro %} -{% macro generate_set_results(a_relation, b_relation, columns, event_time_props=None) %} - {{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_relation, b_relation, columns, event_time_props)) }} +{% macro 
generate_set_results(a_query, b_query, columns, event_time_props=None) %} + {{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_query, b_query, columns, event_time_props)) }} {% endmacro %} -{% macro default__generate_set_results(a_relation, b_relation, columns, event_time_props) %} +{% macro default__generate_set_results(a_query, b_query, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( select {{ joined_cols }} - from {{ a_relation }} + from {{ a_query }} {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' @@ -97,7 +97,7 @@ b as ( select {{ joined_cols }} - from {{ b_relation }} + from {{ b_query }} {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' @@ -129,13 +129,13 @@ ) {% endmacro %} -{% macro snowflake__generate_set_results(a_relation, b_relation, columns, event_time_props) %} +{% macro snowflake__generate_set_results(a_query, b_query, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( select {{ joined_cols }}, hash({{ joined_cols }}) as dbt_compare_row_hash - from {{ a_relation }} + from {{ a_query }} {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' @@ -146,7 +146,7 @@ select {{ joined_cols }}, hash({{ joined_cols }}) as dbt_compare_row_hash - from {{ b_relation }} + from {{ b_query }} {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' From e754ab74a3be7d2c11bd1267c0ea3898182587ed Mon 
Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 25 Apr 2024 16:48:27 +1200 Subject: [PATCH 06/59] provide wrapping parens --- macros/reworked_compare.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index febdd4d4..7e075598 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -88,7 +88,7 @@ a as ( select {{ joined_cols }} - from {{ a_query }} + from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' @@ -97,7 +97,7 @@ b as ( select {{ joined_cols }} - from {{ b_query }} + from ( {{- b_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' @@ -135,7 +135,7 @@ select {{ joined_cols }}, hash({{ joined_cols }}) as dbt_compare_row_hash - from {{ a_query }} + from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' @@ -146,7 +146,7 @@ select {{ joined_cols }}, hash({{ joined_cols }}) as dbt_compare_row_hash - from {{ b_query }} + from ( {{- b_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' From e6be75cf86225fba9f9050fc64770d0398d8fe9c Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 25 Apr 2024 17:23:41 +1200 Subject: [PATCH 07/59] switch to array of columns for PK --- macros/reworked_compare.sql | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/macros/reworked_compare.sql 
b/macros/reworked_compare.sql index 7e075598..eb87a5e7 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -1,6 +1,7 @@ -{% macro reworked_compare(a_query, b_query, primary_key=[], columns=[], event_time=None, sample_limit=20) %} - +{% macro reworked_compare(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} + {% set joined_cols = columns | join(", ") %} + {% set primary_key = primary_key_columns | join(", ") } {% if event_time %} {% set min_max_event_time_results = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} @@ -59,7 +60,7 @@ when in_b then 'added' end as status from all_records - order by {{ primary_key ~ ", " if primary_key is not none }} in_a desc, in_b desc + order by {{ primary_key }}, in_a desc, in_b desc ), From 60fe426357d82ed2e21beb0d4d90bbd2427466d6 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 25 Apr 2024 17:24:03 +1200 Subject: [PATCH 08/59] split unit tests into own files, change unit tests to array pk --- .../unit_compare_queries.yml | 47 +++++++++++ .../unit_reworked_compare.sql | 2 +- ...it_tests.yml => unit_reworked_compare.yml} | 83 ++----------------- ...worked_compare_column_details_mismatch.sql | 2 +- ...worked_compare_column_details_mismatch.yml | 26 ++++++ 5 files changed, 81 insertions(+), 79 deletions(-) create mode 100644 integration_tests/models/unit_test_wrappers/unit_compare_queries.yml rename integration_tests/models/unit_test_wrappers/{unit_tests.yml => unit_reworked_compare.yml} (55%) create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml new file mode 100644 index 00000000..0308e509 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml @@ -0,0 +1,47 @@ +unit_tests: + - name: 
identical_records_compare_queries + model: unit_compare_queries + description: The world's most basic unit test. + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"in_a": true, "in_b": true} + + overrides: + vars: + compare_queries_summarize: true + + - name: identical_records_compare_queries_no_summarize + model: unit_compare_queries + description: The world's second most basic unit test. + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: [] + + overrides: + vars: + compare_queries_summarize: false diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql index b2947a17..fdcebaf7 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -2,7 +2,7 @@ audit_helper.reworked_compare( "select * from " ~ ref('unit_test_model_a'), "select * from " ~ ref('unit_test_model_b'), - primary_key='id', + primary_key=['id'], columns=var('reworked_compare__columns'), event_time=var('reworked_compare__event_time') ) diff --git a/integration_tests/models/unit_test_wrappers/unit_tests.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml similarity index 55% rename from 
integration_tests/models/unit_test_wrappers/unit_tests.yml rename to integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index d5013c72..49d3b394 100644 --- a/integration_tests/models/unit_test_wrappers/unit_tests.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -1,57 +1,7 @@ -version: 2 - unit_tests: - - name: identical_records_compare_queries - given: - - input: ref('unit_test_model_a') - rows: - - { "id": 1, "col1": "abc", "col2": "def" } - - { "id": 2, "col1": "hij", "col2": "klm" } - - { "id": 3, "col1": "nop", "col2": "qrs" } - - input: ref('unit_test_model_b') - rows: - - { "id": 1, "col1": "abc", "col2": "def" } - - { "id": 2, "col1": "hij", "col2": "klm" } - - { "id": 3, "col1": "nop", "col2": "qrs" } - - expect: - rows: - - {"in_a": true, "in_b": true} - - model: unit_compare_queries - description: The world's most basic unit test. - overrides: - vars: - compare_queries_summarize: true - - - name: identical_records_compare_queries_no_summarize - given: - - input: ref('unit_test_model_a') - rows: - - { "id": 1, "col1": "abc", "col2": "def" } - - { "id": 2, "col1": "hij", "col2": "klm" } - - { "id": 3, "col1": "nop", "col2": "qrs" } - - input: ref('unit_test_model_b') - rows: - - { "id": 1, "col1": "abc", "col2": "def" } - - { "id": 2, "col1": "hij", "col2": "klm" } - - { "id": 3, "col1": "nop", "col2": "qrs" } - - expect: - rows: [] - - model: unit_compare_queries - description: The world's second most basic unit test. 
- overrides: - vars: - compare_queries_summarize: false - - name: reworked_compare_identical_tables model: unit_reworked_compare - overrides: - vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: + given: - input: ref('unit_test_model_a') rows: @@ -70,6 +20,11 @@ unit_tests: - {"status": 'identical', 'id': 2, num_in_status: 3} - {"status": 'identical', 'id': 3, num_in_status: 3} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + - name: reworked_compare_identical_tables_event_time_filter model: unit_reworked_compare overrides: @@ -122,29 +77,3 @@ unit_tests: - {"status": 'modified', 'id': 2, num_in_status: 1} - {"status": 'removed', 'id': 3, num_in_status: 1} - {"status": 'added', 'id': 4, num_in_status: 1} - - - name: reworked_compare_all_statuses_different_columns - model: unit_reworked_compare_column_details_mismatch - overrides: - vars: - reworked_compare__columns: ['id', 'col1'] - reworked_compare__event_time: - given: - - input: ref('unit_test_model_a') - rows: - - { "id": 1, "col1": "abc", "col2": "def" } - - { "id": 2, "col1": "hij", "col2": "klm" } - - { "id": 3, "col1": "nop", "col2": "qrs" } - - input: ref('unit_test_model_b_more_cols') - rows: - - { "id": 1, "col1": "abc" } - - { "id": 2, "col1": "changed" } - - { "id": 4, "col1": "nop" } - - expect: - rows: - - {"status": 'identical', 'id': 1, num_in_status: 1} - - {"status": 'modified', 'id': 2, num_in_status: 1} - - {"status": 'modified', 'id': 2, num_in_status: 1} - - {"status": 'removed', 'id': 3, num_in_status: 1} - - {"status": 'added', 'id': 4, num_in_status: 1} diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql index 0e4a17f1..2762c083 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql +++ 
b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql @@ -2,7 +2,7 @@ audit_helper.reworked_compare( "select * from " ~ ref('unit_test_model_a'), "select * from " ~ ref('unit_test_model_b_more_cols'), - primary_key='id', + primary_key=['id'], columns=var('reworked_compare__columns'), event_time=var('reworked_compare__event_time') ) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml new file mode 100644 index 00000000..923d2584 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml @@ -0,0 +1,26 @@ +unit_tests: + - name: reworked_compare_all_statuses_different_columns + model: unit_reworked_compare_column_details_mismatch + overrides: + vars: + reworked_compare__columns: ['id', 'col1'] + reworked_compare__event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b_more_cols') + rows: + - { "id": 1, "col1": "abc" } + - { "id": 2, "col1": "changed" } + - { "id": 4, "col1": "nop" } + + expect: + rows: + - {"status": 'identical', 'id': 1, num_in_status: 1} + - {"status": 'modified', 'id': 2, num_in_status: 1} + - {"status": 'modified', 'id': 2, num_in_status: 1} + - {"status": 'removed', 'id': 3, num_in_status: 1} + - {"status": 'added', 'id': 4, num_in_status: 1} From 886728dae1e3fd5d59aff33b66257706f3d1c914 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 25 Apr 2024 18:41:50 +1200 Subject: [PATCH 09/59] tidy up get_comp_bounds --- macros/get_comparison_bounds.sql | 6 +++--- macros/reworked_compare.sql | 11 ++--------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/macros/get_comparison_bounds.sql 
b/macros/get_comparison_bounds.sql index e5f50f63..85f8fcdc 100644 --- a/macros/get_comparison_bounds.sql +++ b/macros/get_comparison_bounds.sql @@ -13,10 +13,10 @@ {% set query_response = dbt_utils.get_query_results_as_dict(min_max_queries) %} - {% set min_max_event_time_results = {} %} + {% set event_time_props = {"event_time": event_time} %} {% for k in query_response.keys() %} - {% do min_max_event_time_results.update({k | lower: query_response[k][0]}) %} + {% do event_time_props.update({k | lower: query_response[k][0]}) %} {% endfor %} - {% do return(min_max_event_time_results) %} + {% do return(event_time_props) %} {% endmacro %} \ No newline at end of file diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index eb87a5e7..3378dbf0 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -1,17 +1,10 @@ {% macro reworked_compare(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} {% set joined_cols = columns | join(", ") %} - {% set primary_key = primary_key_columns | join(", ") } + {% set primary_key = primary_key_columns | join(", ") %} {% if event_time %} - {% set min_max_event_time_results = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} - {% set min_event_time = min_max_event_time_results["min_event_time"] %} - {% set max_event_time = min_max_event_time_results["max_event_time"] %} - {% set event_time_props = { - "event_time": event_time, - "min_event_time": min_event_time, - "max_event_time": max_event_time - } %} + {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} {% endif %} with From b53db5822e51082c077ff9726d278c2402e6383e Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 25 Apr 2024 18:42:10 +1200 Subject: [PATCH 10/59] fix arg rename --- .../models/unit_test_wrappers/unit_reworked_compare.sql | 2 +- .../unit_reworked_compare_column_details_mismatch.sql | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql index fdcebaf7..38960022 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -2,7 +2,7 @@ audit_helper.reworked_compare( "select * from " ~ ref('unit_test_model_a'), "select * from " ~ ref('unit_test_model_b'), - primary_key=['id'], + primary_key_columns=['id'], columns=var('reworked_compare__columns'), event_time=var('reworked_compare__event_time') ) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql index 2762c083..d8ed546a 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql @@ -2,7 +2,7 @@ audit_helper.reworked_compare( "select * from " ~ ref('unit_test_model_a'), "select * from " ~ ref('unit_test_model_b_more_cols'), - primary_key=['id'], + primary_key_columns=['id'], columns=var('reworked_compare__columns'), event_time=var('reworked_compare__event_time') ) From 0d766d67ccf36f8de99882d09d52336378aed49e Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 25 Apr 2024 18:42:26 +1200 Subject: [PATCH 11/59] add quick_are_queries_identical and unit tests --- .../unit_quick_are_queries_identical.sql | 8 +++ .../unit_quick_are_queries_identical.yml | 72 +++++++++++++++++++ macros/quick_are_queries_identical.sql | 42 +++++++++++ 3 files changed, 122 insertions(+) create mode 100644 integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml create mode 100644 
macros/quick_are_queries_identical.sql diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql new file mode 100644 index 00000000..72fd9e72 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -0,0 +1,8 @@ +{{ + audit_helper.quick_are_queries_identical( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + columns=var('quick_are_queries_identical_cols'), + event_time=var('quick_are_queries_identical_event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml new file mode 100644 index 00000000..3a43e843 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -0,0 +1,72 @@ +unit_tests: + - name: quick_are_queries_identical_identical_tables + model: quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: + + - name: quick_are_queries_identical_identical_tables_event_time_filter + model: quick_are_queries_identical + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] + quick_are_queries_identical_event_time: 'created_at' + macros: + audit_helper.get_comparison_bounds: + "min_event_time": 
"2024-01-02" + "max_event_time": "2024-01-03" + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"are_tables_identical": true} + + - name: quick_are_queries_identical_differences + model: unit_reworked_compare + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": false} \ No newline at end of file diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql new file mode 100644 index 00000000..218c78bf --- /dev/null +++ b/macros/quick_are_queries_identical.sql @@ -0,0 +1,42 @@ +{% macro quick_are_queries_identical(query_a, query_b, columns=[], event_time=None) %} + {{ return (adapter.dispatch('quick_are_queries_identical', 'audit_helper')(query_a, query_b, columns, event_time)) }} +{% endmacro %} + +{% macro default__quick_are_queries_identical(query_a, query_b, columns, event_time) %} + {% set joined_cols = columns | join(", ") %} + {% if event_time %} + {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% endif %} + + select count(hash_result) = 1 as 
are_tables_identical + from ( + select hash_agg(joined_cols) as hash_result + from ({{ query_a }}) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + + union + + select hash_agg(joined_cols) as hash_result + from analytics_dev.dbt_jlabes.fct_dbt_invocations + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + + ) as hashes +{% endmacro %} + +{% macro is_quick_are_queries_identical_supported() %} + {{ return (adapter.dispatch('is_quick_are_queries_identical_supported', 'audit_helper')()) }} +{% endmacro %} + +{% macro default__is_quick_are_queries_identical_supported() %} + {{ return (False) }} +{% endmacro %} + +{% macro snowflake__is_quick_are_queries_identical_supported() %} + {{ return (True) }} +{% endmacro %} \ No newline at end of file From c8ccf596004f7b9438f5ff65f9524097cb812a38 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 6 May 2024 16:30:34 +1200 Subject: [PATCH 12/59] Move data tests into own directory --- .../compare_all_columns_concat_pk_with_summary.sql | 0 .../compare_all_columns_concat_pk_without_summary.sql | 0 .../models/{ => data_tests}/compare_all_columns_where_clause.sql | 0 .../models/{ => data_tests}/compare_all_columns_with_summary.sql | 0 .../compare_all_columns_with_summary_and_exclude.sql | 0 .../{ => data_tests}/compare_all_columns_without_summary.sql | 0 integration_tests/models/{ => data_tests}/compare_queries.sql | 0 .../compare_queries_concat_pk_without_summary.sql | 0 .../models/{ => data_tests}/compare_queries_with_summary.sql | 0 .../models/{ => data_tests}/compare_queries_without_summary.sql | 0 .../models/{ => data_tests}/compare_relation_columns.sql | 0 
.../compare_relations_concat_pk_without_summary.sql | 0 .../models/{ => data_tests}/compare_relations_with_exclude.sql | 0 .../models/{ => data_tests}/compare_relations_with_summary.sql | 0 .../models/{ => data_tests}/compare_relations_without_exclude.sql | 0 .../models/{ => data_tests}/compare_relations_without_summary.sql | 0 integration_tests/models/{ => data_tests}/compare_row_counts.sql | 0 .../models/{ => data_tests}/compare_which_columns_differ.sql | 0 .../compare_which_columns_differ_exclude_cols.sql | 0 integration_tests/models/{ => data_tests}/schema.yml | 0 20 files changed, 0 insertions(+), 0 deletions(-) rename integration_tests/models/{ => data_tests}/compare_all_columns_concat_pk_with_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_concat_pk_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_where_clause.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_with_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_with_summary_and_exclude.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries_concat_pk_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries_with_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relation_columns.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_concat_pk_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_with_exclude.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_with_summary.sql (100%) rename integration_tests/models/{ => 
data_tests}/compare_relations_without_exclude.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_row_counts.sql (100%) rename integration_tests/models/{ => data_tests}/compare_which_columns_differ.sql (100%) rename integration_tests/models/{ => data_tests}/compare_which_columns_differ_exclude_cols.sql (100%) rename integration_tests/models/{ => data_tests}/schema.yml (100%) diff --git a/integration_tests/models/compare_all_columns_concat_pk_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_all_columns_where_clause.sql b/integration_tests/models/data_tests/compare_all_columns_where_clause.sql similarity index 100% rename from integration_tests/models/compare_all_columns_where_clause.sql rename to integration_tests/models/data_tests/compare_all_columns_where_clause.sql diff --git a/integration_tests/models/compare_all_columns_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_with_summary_and_exclude.sql 
b/integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary_and_exclude.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql diff --git a/integration_tests/models/compare_all_columns_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_without_summary.sql diff --git a/integration_tests/models/compare_queries.sql b/integration_tests/models/data_tests/compare_queries.sql similarity index 100% rename from integration_tests/models/compare_queries.sql rename to integration_tests/models/data_tests/compare_queries.sql diff --git a/integration_tests/models/compare_queries_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_queries_with_summary.sql b/integration_tests/models/data_tests/compare_queries_with_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_with_summary.sql rename to integration_tests/models/data_tests/compare_queries_with_summary.sql diff --git a/integration_tests/models/compare_queries_without_summary.sql b/integration_tests/models/data_tests/compare_queries_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_without_summary.sql rename to integration_tests/models/data_tests/compare_queries_without_summary.sql diff --git a/integration_tests/models/compare_relation_columns.sql 
b/integration_tests/models/data_tests/compare_relation_columns.sql similarity index 100% rename from integration_tests/models/compare_relation_columns.sql rename to integration_tests/models/data_tests/compare_relation_columns.sql diff --git a/integration_tests/models/compare_relations_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_relations_with_exclude.sql b/integration_tests/models/data_tests/compare_relations_with_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_with_exclude.sql rename to integration_tests/models/data_tests/compare_relations_with_exclude.sql diff --git a/integration_tests/models/compare_relations_with_summary.sql b/integration_tests/models/data_tests/compare_relations_with_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_with_summary.sql rename to integration_tests/models/data_tests/compare_relations_with_summary.sql diff --git a/integration_tests/models/compare_relations_without_exclude.sql b/integration_tests/models/data_tests/compare_relations_without_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_without_exclude.sql rename to integration_tests/models/data_tests/compare_relations_without_exclude.sql diff --git a/integration_tests/models/compare_relations_without_summary.sql b/integration_tests/models/data_tests/compare_relations_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_without_summary.sql diff --git a/integration_tests/models/compare_row_counts.sql 
b/integration_tests/models/data_tests/compare_row_counts.sql similarity index 100% rename from integration_tests/models/compare_row_counts.sql rename to integration_tests/models/data_tests/compare_row_counts.sql diff --git a/integration_tests/models/compare_which_columns_differ.sql b/integration_tests/models/data_tests/compare_which_columns_differ.sql similarity index 100% rename from integration_tests/models/compare_which_columns_differ.sql rename to integration_tests/models/data_tests/compare_which_columns_differ.sql diff --git a/integration_tests/models/compare_which_columns_differ_exclude_cols.sql b/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql similarity index 100% rename from integration_tests/models/compare_which_columns_differ_exclude_cols.sql rename to integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql diff --git a/integration_tests/models/schema.yml b/integration_tests/models/data_tests/schema.yml similarity index 100% rename from integration_tests/models/schema.yml rename to integration_tests/models/data_tests/schema.yml From 58751e6495118193fbb1386e8b6a97992aa41775 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 6 May 2024 16:34:07 +1200 Subject: [PATCH 13/59] Add test for multiple PKs --- integration_tests/dbt_project.yml | 4 ++- .../unit_test_model_a.sql | 2 +- .../unit_test_model_b.sql | 2 +- .../unit_test_model_b_more_cols.sql | 2 +- .../unit_reworked_compare.sql | 2 +- .../unit_reworked_compare.yml | 27 +++++++++++++++++++ 6 files changed, 34 insertions(+), 5 deletions(-) diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 13646b9e..f23704fa 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -20,5 +20,7 @@ seeds: vars: compare_queries_summarize: true + reworked_compare__primary_key_columns: [] reworked_compare__columns: [] - reworked_compare__event_time: \ No newline at end of file + 
reworked_compare__event_time: + quick_are_queries_identical_cols: [] \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql index 55a6c71e..183f26ca 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql @@ -1 +1 @@ -select 1 as id, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file +select 1 as id, 2 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql index 55a6c71e..183f26ca 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql @@ -1 +1 @@ -select 1 as id, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file +select 1 as id, 2 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql index f0105eae..11476a96 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql @@ -1 +1 @@ -select 1 as id, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at, 'pineapple' as pizza \ No newline at end of file +select 1 as id, 2 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at, 'pineapple' as pizza \ No newline at 
end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql index 38960022..37473546 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -2,7 +2,7 @@ audit_helper.reworked_compare( "select * from " ~ ref('unit_test_model_a'), "select * from " ~ ref('unit_test_model_b'), - primary_key_columns=['id'], + primary_key_columns=var('reworked_compare__primary_key_columns'), columns=var('reworked_compare__columns'), event_time=var('reworked_compare__event_time') ) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 49d3b394..1b3561a2 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -24,6 +24,7 @@ unit_tests: vars: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] - name: reworked_compare_identical_tables_event_time_filter model: unit_reworked_compare @@ -31,6 +32,7 @@ unit_tests: vars: reworked_compare__columns: ['id', 'col1', 'col2', 'created_at'] reworked_compare__event_time: 'created_at' + reworked_compare__primary_key_columns: ['id'] macros: audit_helper.get_comparison_bounds: "min_event_time": "2024-01-02" @@ -58,6 +60,7 @@ unit_tests: vars: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] given: - input: ref('unit_test_model_a') rows: @@ -77,3 +80,27 @@ unit_tests: - {"status": 'modified', 'id': 2, num_in_status: 1} - {"status": 'removed', 'id': 3, num_in_status: 1} - {"status": 'added', 'id': 4, num_in_status: 1} + + - name: reworked_compare_all_statuses_multiple_pk_cols + model: 
unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'id_2', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id', 'id_2'] + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + expect: + rows: + - {"status": 'identical', 'id': 12, "id_2": 3, "num_in_status": 3} + - {"status": 'identical', 'id': 1, "id_2": 23, "num_in_status": 3} + - {"status": 'identical', 'id': 3, "id_2": 4, "num_in_status": 3} \ No newline at end of file From 022b91b0dc2346b5716f498daddaea6a2e98e3c0 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 6 May 2024 16:44:14 +1200 Subject: [PATCH 14/59] fix incorrect unit test configs --- .../unit_test_wrappers/unit_quick_are_queries_identical.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml index 3a43e843..7690a3fb 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -1,6 +1,6 @@ unit_tests: - name: quick_are_queries_identical_identical_tables - model: quick_are_queries_identical + model: unit_quick_are_queries_identical given: - input: ref('unit_test_model_a') @@ -24,7 +24,7 @@ unit_tests: quick_are_queries_identical_event_time: - name: quick_are_queries_identical_identical_tables_event_time_filter - model: quick_are_queries_identical + model: unit_quick_are_queries_identical 
overrides: vars: quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] @@ -50,7 +50,7 @@ unit_tests: - {"are_tables_identical": true} - name: quick_are_queries_identical_differences - model: unit_reworked_compare + model: unit_quick_are_queries_identical overrides: vars: quick_are_queries_identical_cols: ['id', 'col1', 'col2'] From bef6e1838231054f4a1f57abbbb5a996bab498ae Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 6 May 2024 16:59:50 +1200 Subject: [PATCH 15/59] make data types for id and id_2 big enough nums --- .../models/unit_test_placeholder_models/unit_test_model_a.sql | 2 +- .../models/unit_test_placeholder_models/unit_test_model_b.sql | 2 +- .../unit_test_model_b_more_cols.sql | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql index 183f26ca..3c729df2 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql @@ -1 +1 @@ -select 1 as id, 2 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql index 183f26ca..3c729df2 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql @@ -1 +1 @@ -select 1 as id, 2 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No 
newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql index 11476a96..b9e425b7 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql @@ -1 +1 @@ -select 1 as id, 2 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at, 'pineapple' as pizza \ No newline at end of file +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at, 'pineapple' as pizza \ No newline at end of file From 0f1e09ece6e4146da012d6cefe1a07febefa9b75 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 6 May 2024 17:18:21 +1200 Subject: [PATCH 16/59] Mock event_time response --- .../unit_test_wrappers/unit_quick_are_queries_identical.yml | 1 + .../models/unit_test_wrappers/unit_reworked_compare.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml index 7690a3fb..32f458d5 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -33,6 +33,7 @@ unit_tests: audit_helper.get_comparison_bounds: "min_event_time": "2024-01-02" "max_event_time": "2024-01-03" + "event_time": 'created_at' given: - input: ref('unit_test_model_a') diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 1b3561a2..91cc7f4a 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -37,6 
+37,7 @@ unit_tests: audit_helper.get_comparison_bounds: "min_event_time": "2024-01-02" "max_event_time": "2024-01-03" + "event_time": 'created_at' given: - input: ref('unit_test_model_a') From 33e4c507585abfe4f27227d4de4ad6c1ef5959fe Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 6 May 2024 17:30:24 +1200 Subject: [PATCH 17/59] fix hardcoded value in quick_are_qs_identical --- integration_tests/dbt_project.yml | 3 ++- macros/quick_are_queries_identical.sql | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index f23704fa..39664887 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -23,4 +23,5 @@ vars: reworked_compare__primary_key_columns: [] reworked_compare__columns: [] reworked_compare__event_time: - quick_are_queries_identical_cols: [] \ No newline at end of file + quick_are_queries_identical_cols: [] + quick_are_queries_identical_event_time: \ No newline at end of file diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql index 218c78bf..466b4e6f 100644 --- a/macros/quick_are_queries_identical.sql +++ b/macros/quick_are_queries_identical.sql @@ -10,7 +10,7 @@ select count(hash_result) = 1 as are_tables_identical from ( - select hash_agg(joined_cols) as hash_result + select hash_agg({{ joined_cols }}) as hash_result from ({{ query_a }}) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -19,8 +19,8 @@ union - select hash_agg(joined_cols) as hash_result - from analytics_dev.dbt_jlabes.fct_dbt_invocations + select hash_agg({{ joined_cols }}) as hash_result + from ({{ query_b }}) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' From 0df1b6f3ad06bfbe550b1a684ed29ccb8b9fbaf3 
Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 7 May 2024 15:20:18 +1200 Subject: [PATCH 18/59] Add unit tests for null handling (still broken) --- .../unit_quick_are_queries_identical.yml | 26 +++++- .../unit_reworked_compare.yml | 88 ++++++++++++++++++- macros/reworked_compare.sql | 42 +++++---- 3 files changed, 135 insertions(+), 21 deletions(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml index 32f458d5..0d953506 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -70,4 +70,28 @@ unit_tests: expect: rows: - - {"are_tables_identical": false} \ No newline at end of file + - {"are_tables_identical": false} + + - name: quick_are_queries_identical_identical_tables_with_null_pks + model: unit_quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 91cc7f4a..a7b111ee 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -82,7 +82,7 @@ unit_tests: - {"status": 'removed', 'id': 3, num_in_status: 1} - {"status": 'added', 'id': 4, 
num_in_status: 1} - - name: reworked_compare_all_statuses_multiple_pk_cols + - name: reworked_compare_identical_tables_multiple_pk_cols model: unit_reworked_compare overrides: vars: @@ -104,4 +104,88 @@ unit_tests: rows: - {"status": 'identical', 'id': 12, "id_2": 3, "num_in_status": 3} - {"status": 'identical', 'id': 1, "id_2": 23, "num_in_status": 3} - - {"status": 'identical', 'id': 3, "id_2": 4, "num_in_status": 3} \ No newline at end of file + - {"status": 'identical', 'id': 3, "id_2": 4, "num_in_status": 3} + + - name: reworked_compare_identical_tables_single_null_pk + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"status": 'identical', 'id': , num_in_status: 3} + - {"status": 'identical', 'id': 2, num_in_status: 3} + - {"status": 'identical', 'id': 3, num_in_status: 3} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_identical_tables_multiple_null_pk + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"status": 'identical', 'id': 3, num_in_status: 3} + - {"status": 'identical', 'id': , num_in_status: 3} + - {"status": 'identical', 'id': , num_in_status: 3} + + overrides: + vars: + 
reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_identical_tables_multiple_null_pk_with_duplicate_rows + description: The two rows with a null ID are identical. They should both be returned as individual rows instead of being combined + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"status": 'identical', 'id': 3, num_in_status: 3} + - {"status": 'identical', 'id': , num_in_status: 3} + - {"status": 'identical', 'id': , num_in_status: 3} + - {"status": 'added', 'id': , num_in_status: 1} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] \ No newline at end of file diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 3378dbf0..792697a6 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -9,7 +9,7 @@ with - {{ audit_helper.generate_set_results(a_query, b_query, columns, event_time_props)}} + {{ audit_helper.generate_set_results(a_query, b_query, primary_key, columns, event_time_props)}} , @@ -46,8 +46,8 @@ *, case when in_a and in_b then 'identical' - when {{ dbt.bool_or('in_a') }} over (partition by {{ primary_key }}) - and {{ dbt.bool_or('in_b') }} over (partition by {{ primary_key }}) + when {{ dbt.bool_or('in_a') }} over (partition by {{ primary_key }}, dbt_audit_pk_row_num) + and {{ dbt.bool_or('in_b') }} over (partition by {{ primary_key }}, dbt_audit_pk_row_num) then 'modified' when 
in_a then 'removed' when in_b then 'added' @@ -60,8 +60,8 @@ final as ( select *, - count(distinct {{ primary_key }}) over (partition by status) as num_in_status, - dense_rank() over (partition by status order by {{ primary_key }}) as sample_number + count(distinct {{ primary_key }}, dbt_audit_pk_row_num) over (partition by status) as num_in_status, + dense_rank() over (partition by status order by {{ primary_key }}, dbt_audit_pk_row_num) as sample_number from classified ) @@ -73,15 +73,17 @@ {% endmacro %} -{% macro generate_set_results(a_query, b_query, columns, event_time_props=None) %} - {{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_query, b_query, columns, event_time_props)) }} +{% macro generate_set_results(a_query, b_query, primary_key, columns, event_time_props=None) %} + {{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_query, b_query, primary_key, columns, event_time_props)) }} {% endmacro %} -{% macro default__generate_set_results(a_query, b_query, columns, event_time_props) %} +{% macro default__generate_set_results(a_query, b_query, primary_key, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( - select {{ joined_cols }} + select + {{ joined_cols }}, + row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -90,7 +92,9 @@ ), b as ( - select {{ joined_cols }} + select + {{ joined_cols }}, + row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num from ( {{- b_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -123,12 +127,13 @@ ) {% endmacro %} -{% macro snowflake__generate_set_results(a_query, b_query, columns, event_time_props) %} +{% macro 
snowflake__generate_set_results(a_query, b_query, primary_key, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( select - {{ joined_cols }}, - hash({{ joined_cols }}) as dbt_compare_row_hash + {{ joined_cols }}, + row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -138,8 +143,9 @@ b as ( select - {{ joined_cols }}, - hash({{ joined_cols }}) as dbt_compare_row_hash + {{ joined_cols }}, + row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- b_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -150,21 +156,21 @@ a_intersect_b as ( select * from a - where a.dbt_compare_row_hash in (select b.dbt_compare_row_hash from b) + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) ), a_except_b as ( select * from a - where a.dbt_compare_row_hash not in (select b.dbt_compare_row_hash from b) + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) ), b_except_a as ( select * from b - where b.dbt_compare_row_hash not in (select a.dbt_compare_row_hash from a) + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) ) {% endmacro %} \ No newline at end of file From 9a75fc966552931bb6b2d3de87b17d0f92e95d32 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 7 May 2024 16:33:28 +1200 Subject: [PATCH 19/59] Rename columsn to be more unique --- .../unit_reworked_compare.yml | 54 ++++++++++--------- ...worked_compare_column_details_mismatch.yml | 10 ++-- macros/reworked_compare.sql | 13 ++--- 3 files changed, 41 insertions(+), 36 deletions(-) 
diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index a7b111ee..8cebfbcc 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -16,9 +16,9 @@ unit_tests: expect: rows: - - {"status": 'identical', 'id': 1, num_in_status: 3} - - {"status": 'identical', 'id': 2, num_in_status: 3} - - {"status": 'identical', 'id': 3, num_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} overrides: vars: @@ -52,9 +52,9 @@ unit_tests: expect: rows: - - {"status": 'identical', 'id': 2, num_in_status: 2} - - {"status": 'identical', 'id': 3, num_in_status: 2} - + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 2} + - name: reworked_compare_all_statuses model: unit_reworked_compare overrides: @@ -76,11 +76,11 @@ unit_tests: expect: rows: - - {"status": 'identical', 'id': 1, num_in_status: 1} - - {"status": 'modified', 'id': 2, num_in_status: 1} - - {"status": 'modified', 'id': 2, num_in_status: 1} - - {"status": 'removed', 'id': 3, num_in_status: 1} - - {"status": 'added', 'id': 4, num_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} - name: 
reworked_compare_identical_tables_multiple_pk_cols model: unit_reworked_compare @@ -102,9 +102,9 @@ unit_tests: - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } expect: rows: - - {"status": 'identical', 'id': 12, "id_2": 3, "num_in_status": 3} - - {"status": 'identical', 'id': 1, "id_2": 23, "num_in_status": 3} - - {"status": 'identical', 'id': 3, "id_2": 4, "num_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 12, "id_2": 3, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 1, "id_2": 23, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, "id_2": 4, "dbt_audit_num_rows_in_status": 3} - name: reworked_compare_identical_tables_single_null_pk model: unit_reworked_compare @@ -123,9 +123,9 @@ unit_tests: expect: rows: - - {"status": 'identical', 'id': , num_in_status: 3} - - {"status": 'identical', 'id': 2, num_in_status: 3} - - {"status": 'identical', 'id': 3, num_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} overrides: vars: @@ -150,9 +150,9 @@ unit_tests: expect: rows: - - {"status": 'identical', 'id': 3, num_in_status: 3} - - {"status": 'identical', 'id': , num_in_status: 3} - - {"status": 'identical', 'id': , num_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} overrides: vars: @@ -179,13 +179,17 @@ unit_tests: expect: rows: - - {"status": 'identical', 'id': 3, num_in_status: 3} - - {"status": 'identical', 'id': , num_in_status: 3} - - {"status": 'identical', 'id': , num_in_status: 3} - - {"status": 'added', 'id': , num_in_status: 1} + - 
{"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'added', 'id': , dbt_audit_num_rows_in_status: 1} overrides: vars: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] \ No newline at end of file + reworked_compare__primary_key_columns: ['id'] + + config: + tags: [bq_only] + \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml index 923d2584..f134aa24 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml @@ -19,8 +19,8 @@ unit_tests: expect: rows: - - {"status": 'identical', 'id': 1, num_in_status: 1} - - {"status": 'modified', 'id': 2, num_in_status: 1} - - {"status": 'modified', 'id': 2, num_in_status: 1} - - {"status": 'removed', 'id': 3, num_in_status: 1} - - {"status": 'added', 'id': 4, num_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 792697a6..09479ce3 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -51,7 +51,7 @@ then 'modified' when in_a then 'removed' when 
in_b then 'added' - end as status + end as dbt_audit_row_status from all_records order by {{ primary_key }}, in_a desc, in_b desc @@ -60,16 +60,16 @@ final as ( select *, - count(distinct {{ primary_key }}, dbt_audit_pk_row_num) over (partition by status) as num_in_status, - dense_rank() over (partition by status order by {{ primary_key }}, dbt_audit_pk_row_num) as sample_number + count(distinct {{ primary_key }}, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) as dbt_audit_num_rows_in_status, + dense_rank() over (partition by dbt_audit_row_status order by {{ primary_key }}, dbt_audit_pk_row_num) as dbt_audit_sample_number from classified ) select * from final {% if sample_limit %} - where sample_number <= {{ sample_limit }} + where dbt_audit_sample_number <= {{ sample_limit }} {% endif %} - order by status, sample_number + order by dbt_audit_row_status, dbt_audit_sample_number {% endmacro %} @@ -83,7 +83,8 @@ a as ( select {{ joined_cols }}, - row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num + row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num, + audit_helper.generate_surrogate_key(primary_keys + ) from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' From 815760075460827c507a10b5d962d5b80f548f1b Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 8 May 2024 13:21:21 +1200 Subject: [PATCH 20/59] Steal surrogate key macro from utils --- macros/utils/generate_null_safe_sk.sql | 25 ++++++++++++++++++++ macros/{ => utils}/get_comparison_bounds.sql | 0 2 files changed, 25 insertions(+) create mode 100644 macros/utils/generate_null_safe_sk.sql rename macros/{ => utils}/get_comparison_bounds.sql (100%) diff --git a/macros/utils/generate_null_safe_sk.sql b/macros/utils/generate_null_safe_sk.sql new file mode 100644 index 00000000..4078c334 --- /dev/null +++ 
b/macros/utils/generate_null_safe_sk.sql @@ -0,0 +1,25 @@ +{# Taken from https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql but without the option to treat nulls as empty strings #} + +{%- macro generate_null_safe_surrogate_key(field_list) -%} + {{ return(adapter.dispatch('generate_null_safe_surrogate_key', 'audit_helper')(field_list)) }} +{% endmacro %} + +{%- macro default__generate_null_safe_surrogate_key(field_list) -%} + +{%- set fields = [] -%} + +{%- for field in field_list -%} + + {%- do fields.append( + "coalesce(cast(" ~ field ~ " as " ~ dbt.type_string() ~ "), '_dbt_audit_helper_surrogate_key_null_')" + ) -%} + + {%- if not loop.last %} + {%- do fields.append("'-'") -%} + {%- endif -%} + +{%- endfor -%} + +{{ dbt.hash(dbt.concat(fields)) }} + +{%- endmacro -%} \ No newline at end of file diff --git a/macros/get_comparison_bounds.sql b/macros/utils/get_comparison_bounds.sql similarity index 100% rename from macros/get_comparison_bounds.sql rename to macros/utils/get_comparison_bounds.sql From 0e78f25af84d549cd204e17c53bba30bb6f421f2 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 16:00:49 +1200 Subject: [PATCH 21/59] Use generated surrogate key across the board in place of PK --- .../unit_reworked_compare.yml | 12 ++---- macros/reworked_compare.sql | 39 +++++++++++-------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 8cebfbcc..fc821eff 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -150,9 +150,9 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'identical', 
'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} overrides: vars: @@ -179,17 +179,13 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'added', 'id': , dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} - - {"dbt_audit_row_status": 'added', 'id': , dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} overrides: vars: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] - - config: - tags: [bq_only] - \ No newline at end of file + reworked_compare__primary_key_columns: ['id'] \ No newline at end of file diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 09479ce3..d3b1fb8e 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -1,15 +1,17 @@ {% macro reworked_compare(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} {% set joined_cols = columns | join(", ") %} - {% set primary_key = primary_key_columns | join(", ") %} {% if event_time %} {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} {% endif %} with - - {{ audit_helper.generate_set_results(a_query, b_query, primary_key, columns, event_time_props)}} + {#- + Set generation is dispatched because it's possible to get performance optimisations + on some platforms, while keeping the post-processing standardised + -#} + {{ audit_helper.generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} , @@ -46,22 +48,22 @@ *, case when in_a and in_b then 'identical' - when {{ dbt.bool_or('in_a') }} over (partition by {{ 
primary_key }}, dbt_audit_pk_row_num) - and {{ dbt.bool_or('in_b') }} over (partition by {{ primary_key }}, dbt_audit_pk_row_num) + when {{ dbt.bool_or('in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + and {{ dbt.bool_or('in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) then 'modified' when in_a then 'removed' when in_b then 'added' end as dbt_audit_row_status from all_records - order by {{ primary_key }}, in_a desc, in_b desc + order by dbt_audit_surrogate_key, in_a desc, in_b desc ), final as ( select *, - count(distinct {{ primary_key }}, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) as dbt_audit_num_rows_in_status, - dense_rank() over (partition by dbt_audit_row_status order by {{ primary_key }}, dbt_audit_pk_row_num) as dbt_audit_sample_number + count(distinct dbt_audit_surrogate_key, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) as dbt_audit_num_rows_in_status, + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number from classified ) @@ -73,18 +75,18 @@ {% endmacro %} -{% macro generate_set_results(a_query, b_query, primary_key, columns, event_time_props=None) %} - {{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_query, b_query, primary_key, columns, event_time_props)) }} +{% macro generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props=None) %} + {{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time_props)) }} {% endmacro %} -{% macro default__generate_set_results(a_query, b_query, primary_key, columns, event_time_props) %} +{% macro default__generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( select {{ joined_cols }}, - row_number() over (partition by {{ primary_key }} order by {{ 
primary_key}} ) as dbt_audit_pk_row_num, - audit_helper.generate_surrogate_key(primary_keys + ) + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -95,7 +97,8 @@ b as ( select {{ joined_cols }}, - row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num from ( {{- b_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -128,12 +131,13 @@ ) {% endmacro %} -{% macro snowflake__generate_set_results(a_query, b_query, primary_key, columns, event_time_props) %} +{% macro snowflake__generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( select {{ joined_cols }}, - row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- a_query -}} ) {% if event_time_props %} @@ -145,7 +149,8 @@ b as ( select {{ joined_cols }}, - row_number() over (partition by {{ primary_key }} order by {{ primary_key}} ) as dbt_audit_pk_row_num, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as 
dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- b_query -}} ) {% if event_time_props %} From f59b4110e6a03a4f88a21c43e863222810fd500d Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 16:02:21 +1200 Subject: [PATCH 22/59] rm my profile reference --- dbt_project.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/dbt_project.yml b/dbt_project.yml index e6fb7460..987e03d1 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -4,8 +4,6 @@ config-version: 2 require-dbt-version: [">=1.2.0", "<2.0.0"] -profile: joel_ska - target-path: "target" clean-targets: ["target", "dbt_packages"] macro-paths: ["macros"] From ab7d8b9d853c05c7126b7527e822e713c60e9239 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 16:06:06 +1200 Subject: [PATCH 23/59] Update quick_are_queries_identical.sql --- macros/quick_are_queries_identical.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql index 466b4e6f..fecf2d83 100644 --- a/macros/quick_are_queries_identical.sql +++ b/macros/quick_are_queries_identical.sql @@ -8,7 +8,7 @@ {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} {% endif %} - select count(hash_result) = 1 as are_tables_identical + select count(distinct hash_result) = 1 as are_tables_identical from ( select hash_agg({{ joined_cols }}) as hash_result from ({{ query_a }}) @@ -17,7 +17,7 @@ and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' {% endif %} - union + union all select hash_agg({{ joined_cols }}) as hash_result from ({{ query_b }}) From 120ac18df8e0550fc5c6e548efda1387a575133f Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 16:31:31 +1200 Subject: [PATCH 24/59] Add diagram 
explaining comparison bounds --- macros/utils/get_comparison_bounds.sql | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/macros/utils/get_comparison_bounds.sql b/macros/utils/get_comparison_bounds.sql index 85f8fcdc..4f224f5f 100644 --- a/macros/utils/get_comparison_bounds.sql +++ b/macros/utils/get_comparison_bounds.sql @@ -1,3 +1,23 @@ +/* +The idea here is that if the event_time is set, we will only compare records enclosed in both models. +This improves performance and allows us to compare apples to apples, instead of detecting millions/billions +of "deletions" identified due to prod having all data while CI only has a few days' worth. + +In the diagram below, the cross-hatched section is the comparison bounds. You can think of it as + + greatest(model_a.min_value, model_b.min_value) + least(model_a.max_value, model_b.max_value) + + ┌────────────────────────────┐ + a min_value │ a max_value │ + └──► ┌───────┼────────────────────┐ ◄───┘ │ + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ +model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ model_b + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ + └───────┼────────────────────┘ │ + ┌──► └────────────────────────────┘ ◄────┐ + b min_value b max_value +*/ {% macro get_comparison_bounds(a_relation, b_relation, event_time) %} {% set min_max_queries %} with min_maxes as ( From c275056b5cc0720283232cc499527763c746ae7f Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 16:39:42 +1200 Subject: [PATCH 25/59] Add comments explaining warehouse-specific optimisations --- macros/quick_are_queries_identical.sql | 9 +++++++++ macros/reworked_compare.sql | 1 + 2 files changed, 10 insertions(+) diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql index fecf2d83..1f57c3d7 100644 --- a/macros/quick_are_queries_identical.sql +++ b/macros/quick_are_queries_identical.sql @@ -1,3 +1,12 @@ +/* +As described by the Infinite Lambda team here: https://infinitelambda.com/data-validation-refactoring-snowflake/ + +Some
platforms let you take a hash of the whole table, which can be very very fast compared to comparing each row. + +If you run this and it returns false, you still have to run the more in-depth queries to find out what specific changes there are, +but it's a good way to quickly verify identical results if that's what you're expecting. +*/ + {% macro quick_are_queries_identical(query_a, query_b, columns=[], event_time=None) %} {{ return (adapter.dispatch('quick_are_queries_identical', 'audit_helper')(query_a, query_b, columns, event_time)) }} {% endmacro %} diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index d3b1fb8e..8b178cb6 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -10,6 +10,7 @@ {#- Set generation is dispatched because it's possible to get performance optimisations on some platforms, while keeping the post-processing standardised + See https://infinitelambda.com/data-validation-refactoring-snowflake/ for an example and background -#} {{ audit_helper.generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} From 311fbdcd6ef256344540ba35bf61ccc7dd386083 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 16:48:30 +1200 Subject: [PATCH 26/59] cross-db support --- .../models/unit_test_placeholder_models/unit_test_model_a.sql | 2 +- .../models/unit_test_placeholder_models/unit_test_model_b.sql | 2 +- .../unit_test_model_b_more_cols.sql | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql index 3c729df2..a4bc3985 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql @@ -1 +1 @@ -select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline 
at end of file +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql index 3c729df2..a4bc3985 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql @@ -1 +1 @@ -select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at \ No newline at end of file +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql index b9e425b7..816accc5 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql @@ -1 +1 @@ -select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, getdate() as created_at, 'pineapple' as pizza \ No newline at end of file +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at, 'pineapple' as pizza \ No newline at end of file From ac635218123c9422421b6eae1c2df82e62d3f467 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 16:54:06 +1200 Subject: [PATCH 27/59] subq --- macros/quick_are_queries_identical.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql index 1f57c3d7..add26638 100644 --- a/macros/quick_are_queries_identical.sql +++ 
b/macros/quick_are_queries_identical.sql @@ -20,7 +20,7 @@ but it's a good way to quickly verify identical results if that's what you're ex select count(distinct hash_result) = 1 as are_tables_identical from ( select hash_agg({{ joined_cols }}) as hash_result - from ({{ query_a }}) + from ({{ query_a }}) query_a_subq {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' @@ -29,7 +29,7 @@ but it's a good way to quickly verify identical results if that's what you're ex union all select hash_agg({{ joined_cols }}) as hash_result - from ({{ query_b }}) + from ({{ query_b }}) query_b_subq {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' From ffae04fbd3581cdda4187df40641cd195feac47d Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 17:05:01 +1200 Subject: [PATCH 28/59] no postgres or redshift for a sec --- .circleci/config.yml | 56 ++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1701f721..79daed08 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,34 +38,34 @@ jobs: mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml - - run: - name: "Run Tests - Postgres" - environment: - POSTGRES_TEST_HOST: localhost - POSTGRES_TEST_USER: root - POSTGRES_TEST_PASS: '' - POSTGRES_TEST_PORT: 5432 - POSTGRES_TEST_DBNAME: circle_test - command: | - . dbt_venv/bin/activate - cd integration_tests - dbt deps --target postgres - dbt seed --target postgres --full-refresh - dbt compile --target postgres - dbt run --target postgres - dbt test --target postgres - - - run: - name: "Run Tests - Redshift" - command: | - . 
dbt_venv/bin/activate - echo `pwd` - cd integration_tests - dbt deps --target redshift - dbt seed --target redshift --full-refresh - dbt compile --target redshift - dbt run --target redshift - dbt test --target redshift + # - run: + # name: "Run Tests - Postgres" + # environment: + # POSTGRES_TEST_HOST: localhost + # POSTGRES_TEST_USER: root + # POSTGRES_TEST_PASS: '' + # POSTGRES_TEST_PORT: 5432 + # POSTGRES_TEST_DBNAME: circle_test + # command: | + # . dbt_venv/bin/activate + # cd integration_tests + # dbt deps --target postgres + # dbt seed --target postgres --full-refresh + # dbt compile --target postgres + # dbt run --target postgres + # dbt test --target postgres + + # - run: + # name: "Run Tests - Redshift" + # command: | + # . dbt_venv/bin/activate + # echo `pwd` + # cd integration_tests + # dbt deps --target redshift + # dbt seed --target redshift --full-refresh + # dbt compile --target redshift + # dbt run --target redshift + # dbt test --target redshift - run: name: "Run Tests - Snowflake" From 3ff4f7d4a1665a84b50614733f2fa11644c7d0cb Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 17:18:55 +1200 Subject: [PATCH 29/59] add default var values for compare wrappers --- integration_tests/dbt_project.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 39664887..66c943e4 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -20,8 +20,8 @@ seeds: vars: compare_queries_summarize: true - reworked_compare__primary_key_columns: [] - reworked_compare__columns: [] + reworked_compare__primary_key_columns: ['col1'] + reworked_compare__columns: ['col1'] reworked_compare__event_time: - quick_are_queries_identical_cols: [] + quick_are_queries_identical_cols: [col1] quick_are_queries_identical_event_time: \ No newline at end of file From 8865b4a900d9df08a70428ef86da9df2284f78f7 Mon Sep 17 00:00:00 2001 From: Joel Labes 
Date: Tue, 14 May 2024 17:35:21 +1200 Subject: [PATCH 30/59] avoid lateral alias reference for BQ --- macros/reworked_compare.sql | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 8b178cb6..c56dad0f 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -83,11 +83,10 @@ {% macro default__generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} - a as ( + a_base as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -95,11 +94,10 @@ {% endif %} ), - b as ( + b_base as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key from ( {{- b_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -107,6 +105,20 @@ {% endif %} ), + a as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from a_base + ), + + b as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from b_base + ), + a_intersect_b as ( select * from 
a From 683c8827314d632193e3761fa93d581c736ad2ff Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 20:00:33 +1200 Subject: [PATCH 31/59] BQ doesn't support count(arg1, arg2) --- macros/reworked_compare.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index c56dad0f..bcf95523 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -63,7 +63,7 @@ final as ( select *, - count(distinct dbt_audit_surrogate_key, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) as dbt_audit_num_rows_in_status, + count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) as dbt_audit_num_rows_in_status, dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number from classified ) From 93e2b251788b80e398b5a908064e4e4f85800126 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 20:10:24 +1200 Subject: [PATCH 32/59] re-enable redshift --- .circleci/config.yml | 22 +++++++++++----------- integration_tests/ci/sample.profiles.yml | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 79daed08..cb3d5aae 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -55,17 +55,17 @@ jobs: # dbt run --target postgres # dbt test --target postgres - # - run: - # name: "Run Tests - Redshift" - # command: | - # . dbt_venv/bin/activate - # echo `pwd` - # cd integration_tests - # dbt deps --target redshift - # dbt seed --target redshift --full-refresh - # dbt compile --target redshift - # dbt run --target redshift - # dbt test --target redshift + - run: + name: "Run Tests - Redshift" + command: | + . 
dbt_venv/bin/activate + echo `pwd` + cd integration_tests + dbt deps --target redshift + dbt seed --target redshift --full-refresh + dbt compile --target redshift + dbt run --target redshift + dbt test --target redshift - run: name: "Run Tests - Snowflake" diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml index 843d659e..167e8a8a 100644 --- a/integration_tests/ci/sample.profiles.yml +++ b/integration_tests/ci/sample.profiles.yml @@ -27,7 +27,7 @@ integration_tests: dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}" port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}" schema: audit_helper_integration_tests_redshift - threads: 1 + threads: 8 bigquery: type: bigquery @@ -35,7 +35,7 @@ integration_tests: keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}" project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}" schema: audit_helper_integration_tests_bigquery - threads: 1 + threads: 8 snowflake: type: snowflake @@ -46,4 +46,4 @@ integration_tests: database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}" warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" schema: audit_helper_integration_tests_snowflake - threads: 1 + threads: 8 From 981768dd396e285de61331790e162173226e7942 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 14 May 2024 20:17:11 +1200 Subject: [PATCH 33/59] Alias subq for redshift --- macros/reworked_compare.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index bcf95523..42e8d854 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -87,7 +87,7 @@ select {{ joined_cols }}, {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key - from ( {{- a_query -}} ) + from ( {{- a_query -}} ) a_base_subq {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ 
event_time_props["max_event_time"] }}' @@ -98,7 +98,7 @@ select {{ joined_cols }}, {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key - from ( {{- b_query -}} ) + from ( {{- b_query -}} ) b_base_subq {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' From 7e3e1714de9f32029b686dce09462eb4568f7d08 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Fri, 17 May 2024 11:03:10 +1200 Subject: [PATCH 34/59] remove extra comma --- macros/reworked_compare.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 42e8d854..c9eaa261 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -21,7 +21,7 @@ select *, true as in_a, - true as in_b, + true as in_b from a_intersect_b union all From df95fcae469a998b8098d3429daa3a6ed99d4e28 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Fri, 17 May 2024 11:50:52 +1200 Subject: [PATCH 35/59] add row status of nonunique_pk --- .../unit_reworked_compare.yml | 17 +++++++++-------- macros/reworked_compare.sql | 1 + 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index fc821eff..dd69783b 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -108,6 +108,7 @@ unit_tests: - name: reworked_compare_identical_tables_single_null_pk model: unit_reworked_compare + description: "`nonunique_pk` status checks whether a PK is unique. It's intended to avoid arbitrary comparisons, not protect against null records (that's what constraints or tests are for)." 
given: - input: ref('unit_test_model_a') @@ -150,9 +151,9 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} - - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} - - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} overrides: vars: @@ -161,7 +162,7 @@ unit_tests: reworked_compare__primary_key_columns: ['id'] - name: reworked_compare_identical_tables_multiple_null_pk_with_duplicate_rows - description: The two rows with a null ID are identical. They should both be returned as individual rows instead of being combined + description: All rows with a null ID are identical. They should both be returned as individual rows instead of being combined model: unit_reworked_compare given: @@ -179,10 +180,10 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'added', 'id': , dbt_audit_num_rows_in_status: 1} - - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} - - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} - - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} overrides: vars: diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index c9eaa261..9a058c0b 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -48,6 +48,7 @@ select *, 
case + when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' when in_a and in_b then 'identical' when {{ dbt.bool_or('in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) and {{ dbt.bool_or('in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) From 9523db8675d112896abad65f613cab51962e45c3 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Fri, 17 May 2024 12:55:22 +1200 Subject: [PATCH 36/59] remove redundant test and wrapper model --- .../unit_test_model_b_more_cols.sql | 1 - .../unit_reworked_compare.yml | 40 ++++++++++++++++--- ...worked_compare_column_details_mismatch.sql | 9 ----- ...worked_compare_column_details_mismatch.yml | 26 ------------ 4 files changed, 35 insertions(+), 41 deletions(-) delete mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql delete mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql delete mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql deleted file mode 100644 index 816accc5..00000000 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b_more_cols.sql +++ /dev/null @@ -1 +0,0 @@ -select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at, 'pineapple' as pizza \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index dd69783b..68b632ec 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -17,8 
+17,8 @@ unit_tests: expect: rows: - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} - - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} overrides: vars: @@ -76,8 +76,8 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} @@ -162,7 +162,7 @@ unit_tests: reworked_compare__primary_key_columns: ['id'] - name: reworked_compare_identical_tables_multiple_null_pk_with_duplicate_rows - description: All rows with a null ID are identical. They should both be returned as individual rows instead of being combined + description: All rows with a null ID are identical. 
They should be returned as individual rows instead of being combined model: unit_reworked_compare given: @@ -180,13 +180,43 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} overrides: vars: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] \ No newline at end of file + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_struct + model: my_model + + - name: reworked_compare_all_statuses_different_column_set + model: unit_reworked_compare + overrides: + vars: + reworked_compare__primary_key_columns: ['id'] + reworked_compare__columns: ['id', 'col1'] + reworked_compare__event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc" } + - { "id": 2, "col1": "ddd" } + - { "id": 4, "col1": "nop" } + + expect: + rows: + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql 
b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql deleted file mode 100644 index d8ed546a..00000000 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.sql +++ /dev/null @@ -1,9 +0,0 @@ -{{ - audit_helper.reworked_compare( - "select * from " ~ ref('unit_test_model_a'), - "select * from " ~ ref('unit_test_model_b_more_cols'), - primary_key_columns=['id'], - columns=var('reworked_compare__columns'), - event_time=var('reworked_compare__event_time') - ) -}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml deleted file mode 100644 index f134aa24..00000000 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_column_details_mismatch.yml +++ /dev/null @@ -1,26 +0,0 @@ -unit_tests: - - name: reworked_compare_all_statuses_different_columns - model: unit_reworked_compare_column_details_mismatch - overrides: - vars: - reworked_compare__columns: ['id', 'col1'] - reworked_compare__event_time: - given: - - input: ref('unit_test_model_a') - rows: - - { "id": 1, "col1": "abc", "col2": "def" } - - { "id": 2, "col1": "hij", "col2": "klm" } - - { "id": 3, "col1": "nop", "col2": "qrs" } - - input: ref('unit_test_model_b_more_cols') - rows: - - { "id": 1, "col1": "abc" } - - { "id": 2, "col1": "changed" } - - { "id": 4, "col1": "nop" } - - expect: - rows: - - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} - - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} From 
a506d72a199da92586d8a6ee141a1bc1f8422dd5 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 05:55:44 +1200 Subject: [PATCH 37/59] Create json-y tests for snowflake --- .../unit_test_struct_model_a.sql | 1 + .../unit_test_struct_model_b.sql | 1 + .../unit_reworked_compare.yml | 3 - .../unit_reworked_compare_struct.sql | 9 ++ .../unit_reworked_compare_struct.yml | 130 ++++++++++++++++++ 5 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql new file mode 100644 index 00000000..220f0767 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -0,0 +1 @@ +select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql new file mode 100644 index 00000000..220f0767 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -0,0 +1 @@ +select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 
68b632ec..a82a9b7e 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -191,9 +191,6 @@ unit_tests: reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] - - name: reworked_compare_struct - model: my_model - - name: reworked_compare_all_statuses_different_column_set model: unit_reworked_compare overrides: diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql new file mode 100644 index 00000000..7aab2177 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.reworked_compare( + "select * from " ~ ref('unit_test_struct_model_a'), + "select * from " ~ ref('unit_test_struct_model_b'), + primary_key_columns=var('reworked_compare__primary_key_columns'), + columns=var('reworked_compare__columns'), + event_time=var('reworked_compare__event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml new file mode 100644 index 00000000..309eb6e8 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml @@ -0,0 +1,130 @@ +unit_tests: + - name: reworked_compare_struct + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", 
"dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'works_on_my_machine' if (target.type in ['bq', 'redshift']) else 'skip' }}" + + - name: unit_reworked_compare_struct_identical_values_different_order + model: unit_reworked_compare_struct + description: Snowflake sorts objects' keys alphabetically, so sort order is ignored. + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('state', 'CA', 'street', '123 Main St', 'city', 'Anytown') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: unit_reworked_compare_struct_removed_key + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'state', 'CA') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_complex_struct + model: 
unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + + - name: reworked_compare_complex_struct_different_values + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.smith@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + 
overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: unit_reworked_compare_complex_struct_identical_values_different_order + model: unit_reworked_compare_struct + description: Snowflake sorts objects' keys alphabetically, but respects the order items are added to arrays so differences are detected. + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'work', 'number', '987-654-3210'), object_construct('type', 'home', 'number', '123-456-7890'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + From a7542a8986e58b185c166e54e7c9b2585d2f7b13 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 12:14:32 +1200 Subject: [PATCH 38/59] Add workaround for redshift to support count num rows in status --- .../unit_reworked_compare.yml | 4 ++-- macros/reworked_compare.sql | 2 +- macros/utils/_count_num_rows_in_status.sql | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 macros/utils/_count_num_rows_in_status.sql diff --git 
a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index a82a9b7e..3346b9d9 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -76,9 +76,9 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} @@ -180,10 +180,10 @@ unit_tests: expect: rows: + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} - - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} overrides: vars: diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 9a058c0b..69537d92 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -64,7 +64,7 @@ final as ( select *, - count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) as dbt_audit_num_rows_in_status, + {{ audit_helper._count_num_rows_in_status() }} as dbt_audit_num_rows_in_status, dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number from classified ) diff --git 
a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql new file mode 100644 index 00000000..c7d14e08 --- /dev/null +++ b/macros/utils/_count_num_rows_in_status.sql @@ -0,0 +1,17 @@ +{% macro _count_num_rows_in_status() %} + {{ return(adapter.dispatch('_count_num_rows_in_status', 'audit_helper')()) }} +{% endmacro %} + +{%- macro default___count_num_rows_in_status() -%} + count(distinct dbt_audit_surrogate_key, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro bigquery___count_num_rows_in_status() -%} + count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro redshift___count_num_rows_in_status() -%} + {#- Redshift doesn't support count(distinct) inside of window functions :( -#} + {#- modified rows are the only ones that return two rows per PK/row num pairing, so just need to be halved -#} + (count(*) over (partition by dbt_audit_row_status)) / case when dbt_audit_row_status = 'modified' then 2 else 1 end +{% endmacro %} \ No newline at end of file From eb2cfcdd2e3993456c44d865d4c4926331217700 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 12:25:31 +1200 Subject: [PATCH 39/59] skip incompatible tests --- .circleci/config.yml | 4 ++-- .../unit_test_placeholder_models/unit_test_struct_model_a.sql | 2 ++ .../unit_test_placeholder_models/unit_test_struct_model_b.sql | 2 ++ .../unit_test_wrappers/unit_quick_are_queries_identical.sql | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cb3d5aae..cffe4dd4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -64,8 +64,8 @@ jobs: dbt deps --target redshift dbt seed --target redshift --full-refresh dbt compile --target redshift - dbt run --target redshift - dbt test --target redshift + dbt run --target redshift --exclude tag:skip+ + dbt test --target redshift 
--exclude tag:skip+ - run: name: "Run Tests - Snowflake" diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql index 220f0767..8ae63364 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -1 +1,3 @@ +{{ config(tags=['skip' if target.type in ('redshift') else 'runnable']) }} + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql index 220f0767..8ae63364 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -1 +1,3 @@ +{{ config(tags=['skip' if target.type in ('redshift') else 'runnable']) }} + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql index 72fd9e72..85a9b618 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -1,3 +1,5 @@ +{{ config(tags=['skip' if target.type in ('redshift') else 'runnable']) }} + {{ audit_helper.quick_are_queries_identical( "select * from " ~ ref('unit_test_model_a'), From 10392b0201f4536681f707ada9db388e206fb3eb Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 
15:59:13 +1200 Subject: [PATCH 40/59] Fix redshift lack of bool_or support in window funcs --- macros/reworked_compare.sql | 139 +------------------- macros/utils/_classify_audit_row_status.sql | 28 ++++ macros/utils/_generate_set_results.sql | 123 +++++++++++++++++ 3 files changed, 154 insertions(+), 136 deletions(-) create mode 100644 macros/utils/_classify_audit_row_status.sql create mode 100644 macros/utils/_generate_set_results.sql diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index 69537d92..e7838833 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -7,12 +7,8 @@ {% endif %} with - {#- - Set generation is dispatched because it's possible to get performance optimisations - on some platforms, while keeping the post-processing standardised - See https://infinitelambda.com/data-validation-refactoring-snowflake/ for an example and background - -#} - {{ audit_helper.generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} + + {{ audit_helper._generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} , @@ -44,21 +40,10 @@ classified as ( - select *, - case - when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' - when in_a and in_b then 'identical' - when {{ dbt.bool_or('in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) - and {{ dbt.bool_or('in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) - then 'modified' - when in_a then 'removed' - when in_b then 'added' - end as dbt_audit_row_status + {{ audit_helper._classify_audit_row_status() }} as dbt_audit_row_status from all_records - order by dbt_audit_surrogate_key, in_a desc, in_b desc - ), final as ( @@ -75,122 +60,4 @@ {% endif %} order by dbt_audit_row_status, dbt_audit_sample_number -{% endmacro %} - -{% macro generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props=None) %} - 
{{ return(adapter.dispatch('generate_set_results', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time_props)) }} -{% endmacro %} - -{% macro default__generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} - {% set joined_cols = columns | join(", ") %} - - a_base as ( - select - {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key - from ( {{- a_query -}} ) a_base_subq - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} - ), - - b_base as ( - select - {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key - from ( {{- b_query -}} ) b_base_subq - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} - ), - - a as ( - select - *, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num - from a_base - ), - - b as ( - select - *, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num - from b_base - ), - - a_intersect_b as ( - - select * from a - {{ dbt.intersect() }} - select * from b - - ), - - a_except_b as ( - - select * from a - {{ dbt.except() }} - select * from b - - ), - - b_except_a as ( - - select * from b - {{ dbt.except() }} - select * from a - - ) -{% endmacro %} - -{% macro snowflake__generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} - {% set joined_cols = columns | join(", ") %} - a as ( - select - {{ joined_cols }}, - {{ 
audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, - hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash - from ( {{- a_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} - ), - - b as ( - select - {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, - hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash - from ( {{- b_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} - ), - - a_intersect_b as ( - - select * from a - where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) - - ), - - a_except_b as ( - - select * from a - where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) - - ), - - b_except_a as ( - - select * from b - where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) - - ) {% endmacro %} \ No newline at end of file diff --git a/macros/utils/_classify_audit_row_status.sql b/macros/utils/_classify_audit_row_status.sql new file mode 100644 index 00000000..e28e3f4e --- /dev/null +++ b/macros/utils/_classify_audit_row_status.sql @@ -0,0 +1,28 @@ +{% macro _classify_audit_row_status() %} + {{ return(adapter.dispatch('_classify_audit_row_status', 'audit_helper')()) }} +{% endmacro %} + +{%- macro default___classify_audit_row_status() -%} + case + when max(dbt_audit_pk_row_num) over 
(partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' + when in_a and in_b then 'identical' + when {{ dbt.bool_or('in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + and {{ dbt.bool_or('in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + then 'modified' + when in_a then 'removed' + when in_b then 'added' + end +{% endmacro %} + + +{%- macro redshift___classify_audit_row_status() -%} + {#- Redshift doesn't support bitwise operations (e.g. bool_or) inside of a window function :( -#} + case + when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' + when in_a and in_b then 'identical' + when max(case when in_a then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + and max(case when in_b then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + then 'modified' + when in_a then 'removed' + when in_b then 'added' + end{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql new file mode 100644 index 00000000..848ff9e2 --- /dev/null +++ b/macros/utils/_generate_set_results.sql @@ -0,0 +1,123 @@ +{#- + Set generation is dispatched because it's possible to get performance optimisations + on some platforms, while keeping the post-processing standardised + See https://infinitelambda.com/data-validation-refactoring-snowflake/ for an example and background +-#} + +{% macro _generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props=None) %} + {{ return(adapter.dispatch('_generate_set_results', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time_props)) }} +{% endmacro %} + +{% macro default___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + + a_base as ( + select + {{ joined_cols }}, + {{ 
audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- a_query -}} ) a_base_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b_base as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- b_query -}} ) b_base_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from a_base + ), + + b as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from b_base + ), + + a_intersect_b as ( + + select * from a + {{ dbt.intersect() }} + select * from b + + ), + + a_except_b as ( + + select * from a + {{ dbt.except() }} + select * from b + + ), + + b_except_a as ( + + select * from b + {{ dbt.except() }} + select * from a + + ) +{% endmacro %} + +{% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + a as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- a_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ 
event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- b_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} \ No newline at end of file From 8c9690cdbdf8eed7e64f78965e1d51ad0607c4ee Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 15:59:23 +1200 Subject: [PATCH 41/59] add skip exclusions for everything else --- .circleci/config.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cffe4dd4..200699ff 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -52,8 +52,8 @@ jobs: # dbt deps --target postgres # dbt seed --target postgres --full-refresh # dbt compile --target postgres - # dbt run --target postgres - # dbt test --target postgres + # dbt run --target postgres --exclude tag:skip+ + # dbt test --target postgres --exclude tag:skip+ - run: name: "Run Tests - Redshift" @@ -76,8 +76,8 @@ jobs: dbt deps --target snowflake dbt seed --target snowflake --full-refresh dbt compile 
--target snowflake - dbt run --target snowflake - dbt test --target snowflake + dbt run --target snowflake --exclude tag:skip+ + dbt test --target snowflake --exclude tag:skip+ - run: name: "Run Tests - BigQuery" @@ -91,8 +91,8 @@ jobs: dbt deps --target bigquery dbt seed --target bigquery --full-refresh dbt compile --target bigquery - dbt run --target bigquery --full-refresh - dbt test --target bigquery + dbt run --target bigquery --full-refresh --exclude tag:skip+ + dbt test --target bigquery --exclude tag:skip+ - save_cache: From 1cf18874b4ef703a83db14e8ebf88edbb2d02508 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 16:32:34 +1200 Subject: [PATCH 42/59] fix incorrect skip tag application --- .../unit_test_placeholder_models/unit_test_struct_model_a.sql | 2 +- .../unit_test_placeholder_models/unit_test_struct_model_b.sql | 2 +- .../unit_test_wrappers/unit_quick_are_queries_identical.sql | 2 +- .../models/unit_test_wrappers/unit_reworked_compare_struct.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql index 8ae63364..3d7e9999 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if target.type in ('redshift') else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql index 8ae63364..3d7e9999 100644 --- 
a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if target.type in ('redshift') else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql index 85a9b618..c66828f5 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -1,4 +1,4 @@ -{{ config(tags=['skip' if target.type in ('redshift') else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift']) else 'runnable']) }} {{ audit_helper.quick_are_queries_identical( diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml index 309eb6e8..3139a570 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml @@ -19,7 +19,7 @@ unit_tests: reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] config: - tags: "{{ 'works_on_my_machine' if (target.type in ['bq', 'redshift']) else 'skip' }}" + tags: "{{ 'skip' if (target.type in ['redshift']) else 'runnable' }}" - name: unit_reworked_compare_struct_identical_values_different_order model: unit_reworked_compare_struct From 319a967c227a68d1f4602124867d6d869819a7be Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 16:32:46 +1200 Subject: [PATCH 43/59] Move user 
configs to project.yml from profiles --- integration_tests/ci/sample.profiles.yml | 4 ---- integration_tests/dbt_project.yml | 8 ++++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml index 167e8a8a..66eac960 100644 --- a/integration_tests/ci/sample.profiles.yml +++ b/integration_tests/ci/sample.profiles.yml @@ -2,10 +2,6 @@ # HEY! This file is used in the dbt-audit-helper integrations tests with CircleCI. # You should __NEVER__ check credentials into version control. Thanks for reading :) -config: - send_anonymous_usage_stats: False - use_colors: True - integration_tests: target: postgres outputs: diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 66c943e4..ef906729 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -23,5 +23,9 @@ vars: reworked_compare__primary_key_columns: ['col1'] reworked_compare__columns: ['col1'] reworked_compare__event_time: - quick_are_queries_identical_cols: [col1] - quick_are_queries_identical_event_time: \ No newline at end of file + quick_are_queries_identical_cols: ['col1'] + quick_are_queries_identical_event_time: + +flags: + send_anonymous_usage_stats: False + use_colors: True \ No newline at end of file From 698aa997d9f0240bcb95db313ee9377b0dd13f8a Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 17:15:54 +1200 Subject: [PATCH 44/59] Temporarily disable unpassable redshift tests --- .../models/unit_test_wrappers/unit_reworked_compare.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 3346b9d9..27cc867d 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -133,6 +133,8 @@ 
unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_identical_tables_multiple_null_pk model: unit_reworked_compare @@ -160,6 +162,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_identical_tables_multiple_null_pk_with_duplicate_rows description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined @@ -190,6 +194,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_all_statuses_different_column_set model: unit_reworked_compare From a255d43ff6053c5f80e791316728de36ed4df257 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 17:18:29 +1200 Subject: [PATCH 45/59] add temp skip to circle's config.yml --- .circleci/config.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 200699ff..00367f6b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -52,8 +52,8 @@ jobs: # dbt deps --target postgres # dbt seed --target postgres --full-refresh # dbt compile --target postgres - # dbt run --target postgres --exclude tag:skip+ - # dbt test --target postgres --exclude tag:skip+ + # dbt run --target postgres --exclude tag:skip+ temporary_skip+ + # dbt test 
--target postgres --exclude tag:skip+ temporary_skip+ - run: name: "Run Tests - Redshift" @@ -64,8 +64,8 @@ jobs: dbt deps --target redshift dbt seed --target redshift --full-refresh dbt compile --target redshift - dbt run --target redshift --exclude tag:skip+ - dbt test --target redshift --exclude tag:skip+ + dbt run --target redshift --exclude tag:skip+ temporary_skip+ + dbt test --target redshift --exclude tag:skip+ temporary_skip+ - run: name: "Run Tests - Snowflake" @@ -76,8 +76,8 @@ jobs: dbt deps --target snowflake dbt seed --target snowflake --full-refresh dbt compile --target snowflake - dbt run --target snowflake --exclude tag:skip+ - dbt test --target snowflake --exclude tag:skip+ + dbt run --target snowflake --exclude tag:skip+ temporary_skip+ + dbt test --target snowflake --exclude tag:skip+ temporary_skip+ - run: name: "Run Tests - BigQuery" @@ -91,8 +91,8 @@ jobs: dbt deps --target bigquery dbt seed --target bigquery --full-refresh dbt compile --target bigquery - dbt run --target bigquery --full-refresh --exclude tag:skip+ - dbt test --target bigquery --exclude tag:skip+ + dbt run --target bigquery --full-refresh --exclude tag:skip+ temporary_skip+ + dbt test --target bigquery --exclude tag:skip+ temporary_skip+ - save_cache: From a9a47c131bc419446dfa2e22ed8629894373156e Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Sat, 18 May 2024 17:22:38 +1200 Subject: [PATCH 46/59] forgot tag: method --- .circleci/config.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 00367f6b..4aec9aa7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -52,8 +52,8 @@ jobs: # dbt deps --target postgres # dbt seed --target postgres --full-refresh # dbt compile --target postgres - # dbt run --target postgres --exclude tag:skip+ temporary_skip+ - # dbt test --target postgres --exclude tag:skip+ temporary_skip+ + # dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ 
+ # dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Redshift" @@ -64,8 +64,8 @@ jobs: dbt deps --target redshift dbt seed --target redshift --full-refresh dbt compile --target redshift - dbt run --target redshift --exclude tag:skip+ temporary_skip+ - dbt test --target redshift --exclude tag:skip+ temporary_skip+ + dbt run --target redshift --exclude tag:skip+ tag:temporary_skip+ + dbt test --target redshift --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Snowflake" @@ -76,8 +76,8 @@ jobs: dbt deps --target snowflake dbt seed --target snowflake --full-refresh dbt compile --target snowflake - dbt run --target snowflake --exclude tag:skip+ temporary_skip+ - dbt test --target snowflake --exclude tag:skip+ temporary_skip+ + dbt run --target snowflake --exclude tag:skip+ tag:temporary_skip+ + dbt test --target snowflake --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - BigQuery" @@ -91,8 +91,8 @@ jobs: dbt deps --target bigquery dbt seed --target bigquery --full-refresh dbt compile --target bigquery - dbt run --target bigquery --full-refresh --exclude tag:skip+ temporary_skip+ - dbt test --target bigquery --exclude tag:skip+ temporary_skip+ + dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+ + dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+ - save_cache: From ec2d1421ccf04f1d7e4cdebd6449018c43b1d3ec Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 22 May 2024 13:46:08 +1200 Subject: [PATCH 47/59] Temporarily skip reworked_compare_all_statuses_different_column_set --- .gitignore | 3 ++- .../models/unit_test_wrappers/unit_reworked_compare.yml | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a33e3f41..0606e5c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ target/ dbt_packages/ logs/ -logfile \ No newline at end of file +logfile +.DS_Store diff --git 
a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 27cc867d..cd20a672 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -223,3 +223,5 @@ unit_tests: - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 From fe91fd1b1b8fe3343aeb73d268977b138686229e Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 22 May 2024 16:37:44 +1200 Subject: [PATCH 48/59] Skip another test redshift --- .../models/unit_test_wrappers/unit_reworked_compare.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index cd20a672..61c7ee37 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -81,6 +81,8 @@ unit_tests: - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 - name: reworked_compare_identical_tables_multiple_pk_cols model: unit_reworked_compare From 77f6a50a19cde71a5d123c5e4f33bf0ca10be56b Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 22 May 2024 17:10:26 +1200 Subject: 
[PATCH 49/59] disable unsupported tests BQ --- .../unit_test_placeholder_models/unit_test_struct_model_a.sql | 2 +- .../unit_test_placeholder_models/unit_test_struct_model_b.sql | 2 +- .../unit_test_wrappers/unit_quick_are_queries_identical.sql | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql index 3d7e9999..918912e9 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if (target.type in ['redshift']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql index 3d7e9999..918912e9 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if (target.type in ['redshift']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql index c66828f5..d272dc50 100644 --- 
a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -1,4 +1,4 @@ -{{ config(tags=['skip' if (target.type in ['redshift']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery']) else 'runnable']) }} {{ audit_helper.quick_are_queries_identical( From df7300118d66f724fb2813adf22730691ac48e3a Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 14:24:28 +1200 Subject: [PATCH 50/59] postgres too? --- .circleci/config.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4aec9aa7..03718dbc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,22 +38,22 @@ jobs: mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml - # - run: - # name: "Run Tests - Postgres" - # environment: - # POSTGRES_TEST_HOST: localhost - # POSTGRES_TEST_USER: root - # POSTGRES_TEST_PASS: '' - # POSTGRES_TEST_PORT: 5432 - # POSTGRES_TEST_DBNAME: circle_test - # command: | - # . dbt_venv/bin/activate - # cd integration_tests - # dbt deps --target postgres - # dbt seed --target postgres --full-refresh - # dbt compile --target postgres - # dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ - # dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ + - run: + name: "Run Tests - Postgres" + environment: + POSTGRES_TEST_HOST: localhost + POSTGRES_TEST_USER: root + POSTGRES_TEST_PASS: '' + POSTGRES_TEST_PORT: 5432 + POSTGRES_TEST_DBNAME: circle_test + command: | + . 
dbt_venv/bin/activate + cd integration_tests + dbt deps --target postgres + dbt seed --target postgres --full-refresh + dbt compile --target postgres + dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ + dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Redshift" From 12e307dc8f59f94a9e3e0f61013601fa5c05b848 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 14:44:41 +1200 Subject: [PATCH 51/59] Fixes for postgres --- .../unit_test_struct_model_a.sql | 2 +- .../unit_test_struct_model_b.sql | 2 +- .../unit_quick_are_queries_identical.sql | 4 ++-- macros/utils/_count_num_rows_in_status.sql | 10 +++++++++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql index 918912e9..55f280a9 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql index 918912e9..55f280a9 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery']) else 'runnable']) }} +{{ 
config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql index d272dc50..1fc11811 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -1,4 +1,4 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres']) else 'runnable']) }} {{ audit_helper.quick_are_queries_identical( @@ -7,4 +7,4 @@ columns=var('quick_are_queries_identical_cols'), event_time=var('quick_are_queries_identical_event_time') ) -}} \ No newline at end of file +}} \ No newline at end of file diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql index c7d14e08..1f557316 100644 --- a/macros/utils/_count_num_rows_in_status.sql +++ b/macros/utils/_count_num_rows_in_status.sql @@ -7,11 +7,19 @@ {% endmacro %} {%- macro bigquery___count_num_rows_in_status() -%} - count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) + {{ _count_num_rows_in_status_concat() }} +{% endmacro %} + +{%- macro postgres___count_num_rows_in_status() -%} + {{ _count_num_rows_in_status_concat() }} {% endmacro %} {%- macro redshift___count_num_rows_in_status() -%} {#- Redshift doesn't support count(distinct) inside of window functions :( -#} {#- modified rows are the only ones that return two rows per PK/row num pairing, so just need to be halved -#} (count(*) over (partition by dbt_audit_row_status)) / case when 
dbt_audit_row_status = 'modified' then 2 else 1 end +{% endmacro %} + +{% macro _count_num_rows_in_status_concat() %} + count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) {% endmacro %} \ No newline at end of file From f2171681ce55f85e6fc1dc6189016ade020d55fe Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 14:48:55 +1200 Subject: [PATCH 52/59] namespace macros --- macros/utils/_count_num_rows_in_status.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql index 1f557316..574832c6 100644 --- a/macros/utils/_count_num_rows_in_status.sql +++ b/macros/utils/_count_num_rows_in_status.sql @@ -7,11 +7,11 @@ {% endmacro %} {%- macro bigquery___count_num_rows_in_status() -%} - {{ _count_num_rows_in_status_concat() }} + {{ audit_helper._count_num_rows_in_status_concat() }} {% endmacro %} {%- macro postgres___count_num_rows_in_status() -%} - {{ _count_num_rows_in_status_concat() }} + {{ audit_helper._count_num_rows_in_status_concat() }} {% endmacro %} {%- macro redshift___count_num_rows_in_status() -%} From 88f2be84545576cd6a1360f1f76d5fc79da31290 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 14:52:49 +1200 Subject: [PATCH 53/59] It's a postgres problem, not a redshift problem --- macros/utils/_count_num_rows_in_status.sql | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql index 574832c6..c3781815 100644 --- a/macros/utils/_count_num_rows_in_status.sql +++ b/macros/utils/_count_num_rows_in_status.sql @@ -7,19 +7,11 @@ {% endmacro %} {%- macro bigquery___count_num_rows_in_status() -%} - {{ audit_helper._count_num_rows_in_status_concat() }} + count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by 
dbt_audit_row_status) {% endmacro %} {%- macro postgres___count_num_rows_in_status() -%} - {{ audit_helper._count_num_rows_in_status_concat() }} -{% endmacro %} - -{%- macro redshift___count_num_rows_in_status() -%} - {#- Redshift doesn't support count(distinct) inside of window functions :( -#} + {#- Postgres/Redshift doesn't support count(distinct) inside of window functions :( -#} {#- modified rows are the only ones that return two rows per PK/row num pairing, so just need to be halved -#} (count(*) over (partition by dbt_audit_row_status)) / case when dbt_audit_row_status = 'modified' then 2 else 1 end {% endmacro %} - -{% macro _count_num_rows_in_status_concat() %} - count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) -{% endmacro %} \ No newline at end of file From ad6e9d84d0fe633f15cb257af2a2bf3a6f8310cf Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 15:00:20 +1200 Subject: [PATCH 54/59] Handle postgres 63 char limit --- .../models/unit_test_wrappers/unit_reworked_compare.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index 61c7ee37..fd2c0d02 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -167,7 +167,7 @@ unit_tests: config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - - name: reworked_compare_identical_tables_multiple_null_pk_with_duplicate_rows + - name: reworked_compare_identical_tables_multi_null_pk_dupe_rows description: All rows with a null ID are identical. 
They should be returned as individual rows instead of being combined model: unit_reworked_compare From 669bb69858c3aeff148978030c50bcecb11f7329 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 15:05:09 +1200 Subject: [PATCH 55/59] Add databricks --- .circleci/config.yml | 14 +++++++++++++- integration_tests/ci/sample.profiles.yml | 8 ++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 03718dbc..cde7c9e7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: . dbt_venv/bin/activate python -m pip install --upgrade pip setuptools - python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery + python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml @@ -94,6 +94,17 @@ jobs: dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+ dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+ + - run: + name: "Run Tests - Databricks" + command: | + . 
dbt_venv/bin/activate + echo `pwd` + cd integration_tests + dbt deps --target databricks + dbt seed --target databricks --full-refresh + dbt compile --target databricks + dbt run --target databricks --exclude tag:skip+ tag:temporary_skip+ + dbt test --target databricks --exclude tag:skip+ tag:temporary_skip+ - save_cache: key: deps1-{{ .Branch }} @@ -115,3 +126,4 @@ workflows: - profile-redshift - profile-snowflake - profile-bigquery + - profile-databricks diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml index 66eac960..ea8effc1 100644 --- a/integration_tests/ci/sample.profiles.yml +++ b/integration_tests/ci/sample.profiles.yml @@ -43,3 +43,11 @@ integration_tests: warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" schema: audit_helper_integration_tests_snowflake threads: 8 + + databricks: + type: databricks + schema: dbt_project_evaluator_integration_tests_databricks + host: "{{ env_var('DATABRICKS_TEST_HOST') }}" + http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}" + token: "{{ env_var('DATABRICKS_TEST_ACCESS_TOKEN') }}" + threads: 10 From 0c192a949efa9e88648c14967c1a99827377f242 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 15:15:14 +1200 Subject: [PATCH 56/59] Rename tests to data_tests --- .../models/data_tests/schema.yml | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/integration_tests/models/data_tests/schema.yml b/integration_tests/models/data_tests/schema.yml index 4bea9838..fbe74ff7 100644 --- a/integration_tests/models/data_tests/schema.yml +++ b/integration_tests/models/data_tests/schema.yml @@ -2,96 +2,96 @@ version: 2 models: - name: compare_queries - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_queries_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: 
compare_queries_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_queries_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_relations_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_with_exclude') - name: compare_relations_without_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_all_columns_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary') - name: compare_all_columns_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_without_summary') - name: compare_all_columns_concat_pk_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_with_summary') - name: compare_all_columns_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_without_summary') - name: compare_all_columns_with_summary_and_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary_and_exclude') - name: compare_all_columns_where_clause - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_where_clause') - name: compare_relation_columns - tests: + data_tests: - 
dbt_utils.equality: compare_model: ref('expected_results__compare_relation_columns') - name: compare_relations_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_which_columns_differ - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ') - name: compare_which_columns_differ_exclude_cols - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ_exclude_cols') - name: compare_row_counts - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_row_counts') From 317e4d706530879390d02a2d2b911e0f10e7363b Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 15:44:38 +1200 Subject: [PATCH 57/59] Found a better workaround for missing count distinct window --- macros/utils/_count_num_rows_in_status.sql | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql index c3781815..82730a2c 100644 --- a/macros/utils/_count_num_rows_in_status.sql +++ b/macros/utils/_count_num_rows_in_status.sql @@ -11,7 +11,18 @@ {% endmacro %} {%- macro postgres___count_num_rows_in_status() -%} - {#- Postgres/Redshift doesn't support count(distinct) inside of window functions :( -#} - {#- modified rows are the only ones that return two rows per PK/row num pairing, so just need to be halved -#} - (count(*) over (partition by dbt_audit_row_status)) / case when dbt_audit_row_status = 'modified' then 2 else 1 end + _count_num_rows_in_status_without_distinct_window_func() {% endmacro %} + +{%- macro databricks___count_num_rows_in_status() -%} + _count_num_rows_in_status_without_distinct_window_func() +{% endmacro %} + +{% macro _count_num_rows_in_status_without_distinct_window_func() %} + {#- Some platforms don't support count(distinct) 
inside of window functions -#} + {#- You can get the same outcome by dense_rank, assuming no nulls (we've already handled that) #} + {# https://stackoverflow.com/a/22347502 -#} + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key desc, dbt_audit_pk_row_num desc) + - 1 +{% endmacro %} \ No newline at end of file From 0d1a1de45b151bd5d2c3472430e0ec151bacf36d Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 15:47:51 +1200 Subject: [PATCH 58/59] actually call the macro --- macros/utils/_count_num_rows_in_status.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql index 82730a2c..fa81c591 100644 --- a/macros/utils/_count_num_rows_in_status.sql +++ b/macros/utils/_count_num_rows_in_status.sql @@ -11,11 +11,11 @@ {% endmacro %} {%- macro postgres___count_num_rows_in_status() -%} - _count_num_rows_in_status_without_distinct_window_func() + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} {% endmacro %} {%- macro databricks___count_num_rows_in_status() -%} - _count_num_rows_in_status_without_distinct_window_func() + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} {% endmacro %} {% macro _count_num_rows_in_status_without_distinct_window_func() %} From 559f8d51895a7af6e84f9c9002c469bda5f0a61c Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 15:59:31 +1200 Subject: [PATCH 59/59] disable syntax-failing tests on dbx --- .../unit_test_placeholder_models/unit_test_struct_model_a.sql | 2 +- .../unit_test_placeholder_models/unit_test_struct_model_b.sql | 2 +- .../unit_test_wrappers/unit_quick_are_queries_identical.sql | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql index 55f280a9..24d584e8 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql index 55f280a9..24d584e8 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -1,3 +1,3 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql index 1fc11811..e969b1e2 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -1,4 +1,4 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 
'bigquery', 'postgres']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} {{ audit_helper.quick_are_queries_identical(