From 9da3c5103349937535662113988d87b6bbedbee5 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 27 May 2024 16:17:41 +1200 Subject: [PATCH 01/13] Add new macros for diff calculation, and unit tests (#99) * Add macro for new hash-based comparison strategy * split out SF-focused version of macro * Fix change to complex object * Fix overuse of star * switch from compare rels to compare queries * provide wrapping parens * switch to array of columns for PK * split unit tests into own files, change unit tests to array pk * tidy up get_comp_bounds * fix arg rename * add quick_are_queries_identical and unit tests * Move data tests into own directory * Add test for multiple PKs * fix incorrect unit test configs * make data types for id and id_2 big enough nums * Mock event_time response * fix hardcoded value in quick_are_qs_identical * Add unit tests for null handling (still broken) * Rename columsn to be more unique * Steal surrogate key macro from utils * Use generated surrogate key across the board in place of PK * rm my profile reference * Update quick_are_queries_identical.sql * Add diagram explaining comparison bounds * Add comments explaining warehouse-specific optimisations * cross-db support * subq * no postgres or redshift for a sec * add default var values for compare wrappers * avoid lateral alias reference for BQ * BQ doesn't support count(arg1, arg2) * re-enable redshift * Alias subq for redshift * remove extra comma * add row status of nonunique_pk * remove redundant test and wrapper model * Create json-y tests for snowflake * Add workaround for redshift to support count num rows in status * skip incompatible tests * Fix redshift lack of bool_or support in window funcs * add skip exclusions for everything else * fix incorrect skip tag application * Move user configs to project.yml from profiles * Temporarily disable unpassable redshift tests * add temp skip to circle's config.yml * forgot tag: method * Temporarily skip reworked_compare_all_statuses_different_column_set * Skip another test redshift * disable unsupported tests BQ * postgres too? * Fixes for postgres * namespace macros * It's a postgres problem, not a redshift problem * Handle postgres 63 char limit * Add databricks * Rename tests to data_tests * Found a better workaround for missing count distinct window * actually call the macro * disable syntax-failing tests on dbx --- .circleci/config.yml | 30 ++- .gitignore | 3 +- .vscode/settings.json | 21 ++ integration_tests/ci/sample.profiles.yml | 18 +- integration_tests/dbt_project.yml | 12 + ...are_all_columns_concat_pk_with_summary.sql | 0 ..._all_columns_concat_pk_without_summary.sql | 0 .../compare_all_columns_where_clause.sql | 0 .../compare_all_columns_with_summary.sql | 0 ...e_all_columns_with_summary_and_exclude.sql | 0 .../compare_all_columns_without_summary.sql | 0 .../{ => data_tests}/compare_queries.sql | 0 ...pare_queries_concat_pk_without_summary.sql | 0 .../compare_queries_with_summary.sql | 0 .../compare_queries_without_summary.sql | 0 .../compare_relation_columns.sql | 0 ...re_relations_concat_pk_without_summary.sql | 0 .../compare_relations_with_exclude.sql | 0 .../compare_relations_with_summary.sql | 0 .../compare_relations_without_exclude.sql | 0 .../compare_relations_without_summary.sql | 0 .../{ => data_tests}/compare_row_counts.sql | 0 .../compare_which_columns_differ.sql | 0 ...pare_which_columns_differ_exclude_cols.sql | 0 .../models/{ => data_tests}/schema.yml | 38 +-- .../unit_test_model_a.sql | 1 + .../unit_test_model_b.sql | 1 + .../unit_test_struct_model_a.sql | 3 + .../unit_test_struct_model_b.sql | 3 + .../unit_compare_queries.sql | 8 + .../unit_compare_queries.yml | 47 ++++ .../unit_quick_are_queries_identical.sql | 10 + .../unit_quick_are_queries_identical.yml | 97 ++++++++ .../unit_reworked_compare.sql | 9 + .../unit_reworked_compare.yml | 229 ++++++++++++++++++ .../unit_reworked_compare_struct.sql | 9 + .../unit_reworked_compare_struct.yml | 130 ++++++++++ macros/quick_are_queries_identical.sql | 51 ++++ macros/reworked_compare.sql | 63 +++++ macros/utils/_classify_audit_row_status.sql | 28 +++ macros/utils/_count_num_rows_in_status.sql | 28 +++ macros/utils/_generate_set_results.sql | 123 ++++++++++ macros/utils/generate_null_safe_sk.sql | 25 ++ macros/utils/get_comparison_bounds.sql | 42 ++++ package-lock.yml | 4 + 45 files changed, 997 insertions(+), 36 deletions(-) create mode 100644 .vscode/settings.json rename integration_tests/models/{ => data_tests}/compare_all_columns_concat_pk_with_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_concat_pk_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_where_clause.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_with_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_with_summary_and_exclude.sql (100%) rename integration_tests/models/{ => data_tests}/compare_all_columns_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries_concat_pk_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries_with_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_queries_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relation_columns.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_concat_pk_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_with_exclude.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_with_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_without_exclude.sql (100%) rename integration_tests/models/{ => data_tests}/compare_relations_without_summary.sql (100%) rename integration_tests/models/{ => data_tests}/compare_row_counts.sql (100%) rename integration_tests/models/{ => data_tests}/compare_which_columns_differ.sql (100%) rename integration_tests/models/{ => data_tests}/compare_which_columns_differ_exclude_cols.sql (100%) rename integration_tests/models/{ => data_tests}/schema.yml (90%) create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql create mode 100644 integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_compare_queries.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_compare_queries.yml create mode 100644 integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml create mode 100644 macros/quick_are_queries_identical.sql create mode 100644 macros/reworked_compare.sql create mode 100644 macros/utils/_classify_audit_row_status.sql create mode 100644 macros/utils/_count_num_rows_in_status.sql create mode 100644 macros/utils/_generate_set_results.sql create mode 100644 macros/utils/generate_null_safe_sk.sql create mode 100644 macros/utils/get_comparison_bounds.sql create mode 100644 package-lock.yml diff --git a/.circleci/config.yml b/.circleci/config.yml index 1701f721..cde7c9e7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: . dbt_venv/bin/activate python -m pip install --upgrade pip setuptools - python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery + python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml @@ -52,8 +52,8 @@ jobs: dbt deps --target postgres dbt seed --target postgres --full-refresh dbt compile --target postgres - dbt run --target postgres - dbt test --target postgres + dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ + dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Redshift" @@ -64,8 +64,8 @@ jobs: dbt deps --target redshift dbt seed --target redshift --full-refresh dbt compile --target redshift - dbt run --target redshift - dbt test --target redshift + dbt run --target redshift --exclude tag:skip+ tag:temporary_skip+ + dbt test --target redshift --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Snowflake" @@ -76,8 +76,8 @@ jobs: dbt deps --target snowflake dbt seed --target snowflake --full-refresh dbt compile --target snowflake - dbt run --target snowflake - dbt test --target snowflake + dbt run --target snowflake --exclude tag:skip+ tag:temporary_skip+ + dbt test --target snowflake --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - BigQuery" @@ -91,9 +91,20 @@ jobs: dbt deps --target bigquery dbt seed --target bigquery --full-refresh dbt compile --target bigquery - dbt run --target bigquery --full-refresh - dbt test --target bigquery + dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+ + dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+ + - run: + name: "Run Tests - Databricks" + command: | + . dbt_venv/bin/activate + echo `pwd` + cd integration_tests + dbt deps --target databricks + dbt seed --target databricks --full-refresh + dbt compile --target databricks + dbt run --target databricks --exclude tag:skip+ tag:temporary_skip+ + dbt test --target databricks --exclude tag:skip+ tag:temporary_skip+ - save_cache: key: deps1-{{ .Branch }} @@ -115,3 +126,4 @@ workflows: - profile-redshift - profile-snowflake - profile-bigquery + - profile-databricks diff --git a/.gitignore b/.gitignore index a33e3f41..0606e5c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ target/ dbt_packages/ logs/ -logfile \ No newline at end of file +logfile +.DS_Store diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..437dcba6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "yaml.schemas": { + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_yml_files-latest.json": [ + "/**/*.yml", + "!profiles.yml", + "!dbt_project.yml", + "!packages.yml", + "!selectors.yml", + "!profile_template.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_project-latest.json": [ + "dbt_project.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/selectors-latest.json": [ + "selectors.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/packages-latest.json": [ + "packages.yml" + ] + }, +} \ No newline at end of file diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml index 843d659e..ea8effc1 100644 --- a/integration_tests/ci/sample.profiles.yml +++ b/integration_tests/ci/sample.profiles.yml @@ -2,10 +2,6 @@ # HEY! This file is used in the dbt-audit-helper integrations tests with CircleCI. # You should __NEVER__ check credentials into version control. Thanks for reading :) -config: - send_anonymous_usage_stats: False - use_colors: True - integration_tests: target: postgres outputs: @@ -27,7 +23,7 @@ integration_tests: dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}" port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}" schema: audit_helper_integration_tests_redshift - threads: 1 + threads: 8 bigquery: type: bigquery @@ -35,7 +31,7 @@ integration_tests: keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}" project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}" schema: audit_helper_integration_tests_bigquery - threads: 1 + threads: 8 snowflake: type: snowflake @@ -46,4 +42,12 @@ integration_tests: database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}" warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" schema: audit_helper_integration_tests_snowflake - threads: 1 + threads: 8 + + databricks: + type: databricks + schema: dbt_project_evaluator_integration_tests_databricks + host: "{{ env_var('DATABRICKS_TEST_HOST') }}" + http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}" + token: "{{ env_var('DATABRICKS_TEST_ACCESS_TOKEN') }}" + threads: 10 diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 07120e4c..ef906729 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -17,3 +17,15 @@ clean-targets: # directories to be removed by `dbt clean` seeds: +quote_columns: false + +vars: + compare_queries_summarize: true + reworked_compare__primary_key_columns: ['col1'] + reworked_compare__columns: ['col1'] + reworked_compare__event_time: + quick_are_queries_identical_cols: ['col1'] + quick_are_queries_identical_event_time: + +flags: + send_anonymous_usage_stats: False + use_colors: True \ No newline at end of file diff --git a/integration_tests/models/compare_all_columns_concat_pk_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_all_columns_where_clause.sql b/integration_tests/models/data_tests/compare_all_columns_where_clause.sql similarity index 100% rename from integration_tests/models/compare_all_columns_where_clause.sql rename to integration_tests/models/data_tests/compare_all_columns_where_clause.sql diff --git a/integration_tests/models/compare_all_columns_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_with_summary_and_exclude.sql b/integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary_and_exclude.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql diff --git a/integration_tests/models/compare_all_columns_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_without_summary.sql diff --git a/integration_tests/models/compare_queries.sql b/integration_tests/models/data_tests/compare_queries.sql similarity index 100% rename from integration_tests/models/compare_queries.sql rename to integration_tests/models/data_tests/compare_queries.sql diff --git a/integration_tests/models/compare_queries_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_queries_with_summary.sql b/integration_tests/models/data_tests/compare_queries_with_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_with_summary.sql rename to integration_tests/models/data_tests/compare_queries_with_summary.sql diff --git a/integration_tests/models/compare_queries_without_summary.sql b/integration_tests/models/data_tests/compare_queries_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_without_summary.sql rename to integration_tests/models/data_tests/compare_queries_without_summary.sql diff --git a/integration_tests/models/compare_relation_columns.sql b/integration_tests/models/data_tests/compare_relation_columns.sql similarity index 100% rename from integration_tests/models/compare_relation_columns.sql rename to integration_tests/models/data_tests/compare_relation_columns.sql diff --git a/integration_tests/models/compare_relations_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_relations_with_exclude.sql b/integration_tests/models/data_tests/compare_relations_with_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_with_exclude.sql rename to integration_tests/models/data_tests/compare_relations_with_exclude.sql diff --git a/integration_tests/models/compare_relations_with_summary.sql b/integration_tests/models/data_tests/compare_relations_with_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_with_summary.sql rename to integration_tests/models/data_tests/compare_relations_with_summary.sql diff --git a/integration_tests/models/compare_relations_without_exclude.sql b/integration_tests/models/data_tests/compare_relations_without_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_without_exclude.sql rename to integration_tests/models/data_tests/compare_relations_without_exclude.sql diff --git a/integration_tests/models/compare_relations_without_summary.sql b/integration_tests/models/data_tests/compare_relations_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_without_summary.sql diff --git a/integration_tests/models/compare_row_counts.sql b/integration_tests/models/data_tests/compare_row_counts.sql similarity index 100% rename from integration_tests/models/compare_row_counts.sql rename to integration_tests/models/data_tests/compare_row_counts.sql diff --git a/integration_tests/models/compare_which_columns_differ.sql b/integration_tests/models/data_tests/compare_which_columns_differ.sql similarity index 100% rename from integration_tests/models/compare_which_columns_differ.sql rename to integration_tests/models/data_tests/compare_which_columns_differ.sql diff --git a/integration_tests/models/compare_which_columns_differ_exclude_cols.sql b/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql similarity index 100% rename from integration_tests/models/compare_which_columns_differ_exclude_cols.sql rename to integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql diff --git a/integration_tests/models/schema.yml b/integration_tests/models/data_tests/schema.yml similarity index 90% rename from integration_tests/models/schema.yml rename to integration_tests/models/data_tests/schema.yml index 4bea9838..fbe74ff7 100644 --- a/integration_tests/models/schema.yml +++ b/integration_tests/models/data_tests/schema.yml @@ -2,96 +2,96 @@ version: 2 models: - name: compare_queries - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_queries_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_queries_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_queries_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_relations_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_with_exclude') - name: compare_relations_without_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_all_columns_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary') - name: compare_all_columns_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_without_summary') - name: compare_all_columns_concat_pk_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_with_summary') - name: compare_all_columns_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_without_summary') - name: compare_all_columns_with_summary_and_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary_and_exclude') - name: compare_all_columns_where_clause - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_where_clause') - name: compare_relation_columns - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relation_columns') - name: compare_relations_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_which_columns_differ - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ') - name: compare_which_columns_differ_exclude_cols - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ_exclude_cols') - name: compare_row_counts - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_row_counts') diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql new file mode 100644 index 00000000..a4bc3985 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql @@ -0,0 +1 @@ +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql new file mode 100644 index 00000000..a4bc3985 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql @@ -0,0 +1 @@ +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql new file mode 100644 index 00000000..24d584e8 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -0,0 +1,3 @@ +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} + +select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql new file mode 100644 index 00000000..24d584e8 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -0,0 +1,3 @@ +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} + +select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql new file mode 100644 index 00000000..c589ee53 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql @@ -0,0 +1,8 @@ + +{{ + audit_helper.compare_queries( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + summarize = var('compare_queries_summarize') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml new file mode 100644 index 00000000..0308e509 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml @@ -0,0 +1,47 @@ +unit_tests: + - name: identical_records_compare_queries + model: unit_compare_queries + description: The world's most basic unit test. + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"in_a": true, "in_b": true} + + overrides: + vars: + compare_queries_summarize: true + + - name: identical_records_compare_queries_no_summarize + model: unit_compare_queries + description: The world's second most basic unit test. + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: [] + + overrides: + vars: + compare_queries_summarize: false diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql new file mode 100644 index 00000000..e969b1e2 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -0,0 +1,10 @@ +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} + +{{ + audit_helper.quick_are_queries_identical( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + columns=var('quick_are_queries_identical_cols'), + event_time=var('quick_are_queries_identical_event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml new file mode 100644 index 00000000..0d953506 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -0,0 +1,97 @@ +unit_tests: + - name: quick_are_queries_identical_identical_tables + model: unit_quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: + + - name: quick_are_queries_identical_identical_tables_event_time_filter + model: unit_quick_are_queries_identical + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] + quick_are_queries_identical_event_time: 'created_at' + macros: + audit_helper.get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"are_tables_identical": true} + + - name: quick_are_queries_identical_differences + model: unit_quick_are_queries_identical + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": false} + + - name: quick_are_queries_identical_identical_tables_with_null_pks + model: unit_quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql new file mode 100644 index 00000000..37473546 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.reworked_compare( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + primary_key_columns=var('reworked_compare__primary_key_columns'), + columns=var('reworked_compare__columns'), + event_time=var('reworked_compare__event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml new file mode 100644 index 00000000..fd2c0d02 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -0,0 +1,229 @@ +unit_tests: + - name: reworked_compare_identical_tables + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_identical_tables_event_time_filter + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2', 'created_at'] + reworked_compare__event_time: 'created_at' + reworked_compare__primary_key_columns: ['id'] + macros: + audit_helper.get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 2} + + - name: reworked_compare_all_statuses + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + + - name: reworked_compare_identical_tables_multiple_pk_cols + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'id_2', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id', 'id_2'] + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 12, "id_2": 3, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 1, "id_2": 23, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, "id_2": 4, "dbt_audit_num_rows_in_status": 3} + + - name: reworked_compare_identical_tables_single_null_pk + model: unit_reworked_compare + description: "`nonunique_pk` status checks whether a PK is unique. It's intended to avoid arbitrary comparisons, not protect against null records (that's what constraints or tests are for)." + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: reworked_compare_identical_tables_multiple_null_pk + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: reworked_compare_identical_tables_multi_null_pk_dupe_rows + description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: reworked_compare_all_statuses_different_column_set + model: unit_reworked_compare + overrides: + vars: + reworked_compare__primary_key_columns: ['id'] + reworked_compare__columns: ['id', 'col1'] + reworked_compare__event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc" } + - { "id": 2, "col1": "ddd" } + - { "id": 4, "col1": "nop" } + + expect: + rows: + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql new file mode 100644 index 00000000..7aab2177 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.reworked_compare( + "select * from " ~ ref('unit_test_struct_model_a'), + "select * from " ~ ref('unit_test_struct_model_b'), + primary_key_columns=var('reworked_compare__primary_key_columns'), + columns=var('reworked_compare__columns'), + event_time=var('reworked_compare__event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml new file mode 100644 index 00000000..3139a570 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml @@ -0,0 +1,130 @@ +unit_tests: + - name: reworked_compare_struct + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['redshift']) else 'runnable' }}" + + - name: unit_reworked_compare_struct_identical_values_different_order + model: unit_reworked_compare_struct + description: Snowflake sorts objects' keys alphabetically, so sort order is ignored. + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('state', 'CA', 'street', '123 Main St', 'city', 'Anytown') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: unit_reworked_compare_struct_removed_key + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'state', 'CA') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_complex_struct + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + + - name: reworked_compare_complex_struct_different_values + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.smith@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: unit_reworked_compare_complex_struct_identical_values_different_order + model: unit_reworked_compare_struct + description: Snowflake sorts objects' keys alphabetically, but respects the order items are added to arrays so differences are detected. + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'work', 'number', '987-654-3210'), object_construct('type', 'home', 'number', '123-456-7890'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql new file mode 100644 index 00000000..add26638 --- /dev/null +++ b/macros/quick_are_queries_identical.sql @@ -0,0 +1,51 @@ +/* +As described by the Infinite Lambda team here: https://infinitelambda.com/data-validation-refactoring-snowflake/ + +Some platforms let you take a hash of the whole table, which can be very very fast compared to comparing each row. + +If you run this and it returns false, you still have to run the more in-depth queries to find out what specific changes there are, +but it's a good way to quickly verify identical results if that's what you're expecting. +*/ + +{% macro quick_are_queries_identical(query_a, query_b, columns=[], event_time=None) %} + {{ return (adapter.dispatch('quick_are_queries_identical', 'audit_helper')(query_a, query_b, columns, event_time)) }} +{% endmacro %} + +{% macro default__quick_are_queries_identical(query_a, query_b, columns, event_time) %} + {% set joined_cols = columns | join(", ") %} + {% if event_time %} + {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% endif %} + + select count(distinct hash_result) = 1 as are_tables_identical + from ( + select hash_agg({{ joined_cols }}) as hash_result + from ({{ query_a }}) query_a_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + + union all + + select hash_agg({{ joined_cols }}) as hash_result + from ({{ query_b }}) query_b_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + + ) as hashes +{% endmacro %} + +{% macro is_quick_are_queries_identical_supported() %} + {{ return (adapter.dispatch('is_quick_are_queries_identical_supported', 'audit_helper')()) }} +{% endmacro %} + +{% macro default__is_quick_are_queries_identical_supported() %} + {{ return (False) }} +{% endmacro %} + +{% macro snowflake__is_quick_are_queries_identical_supported() %} + {{ return (True) }} +{% endmacro %} \ No newline at end of file diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql new file mode 100644 index 00000000..e7838833 --- /dev/null +++ b/macros/reworked_compare.sql @@ -0,0 +1,63 @@ +{% macro reworked_compare(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} + + {% set joined_cols = columns | join(", ") %} + + {% if event_time %} + {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% endif %} + + with + + {{ audit_helper._generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} + + , + + all_records as ( + + select + *, + true as in_a, + true as in_b + from a_intersect_b + + union all + + select + *, + true as in_a, + false as in_b + from a_except_b + + union all + + select + *, + false as in_a, + true as in_b + from b_except_a + + ), + + + classified as ( + select + *, + {{ audit_helper._classify_audit_row_status() }} as dbt_audit_row_status + from all_records + ), + + final as ( + select + *, + {{ audit_helper._count_num_rows_in_status() }} as dbt_audit_num_rows_in_status, + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number + from classified + ) + + select * from final + {% if sample_limit %} + where dbt_audit_sample_number <= {{ sample_limit }} + {% endif %} + order by dbt_audit_row_status, dbt_audit_sample_number + +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_classify_audit_row_status.sql b/macros/utils/_classify_audit_row_status.sql new file mode 100644 index 00000000..e28e3f4e --- /dev/null +++ b/macros/utils/_classify_audit_row_status.sql @@ -0,0 +1,28 @@ +{% macro _classify_audit_row_status() %} + {{ return(adapter.dispatch('_classify_audit_row_status', 'audit_helper')()) }} +{% endmacro %} + +{%- macro default___classify_audit_row_status() -%} + case + when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' + when in_a and in_b then 'identical' + when {{ dbt.bool_or('in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + and {{ dbt.bool_or('in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + then 'modified' + when in_a then 'removed' + when in_b then 'added' + end +{% endmacro %} + + +{%- macro redshift___classify_audit_row_status() -%} + {#- Redshift doesn't support bitwise operations (e.g. bool_or) inside of a window function :( -#} + case + when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' + when in_a and in_b then 'identical' + when max(case when in_a then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + and max(case when in_b then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + then 'modified' + when in_a then 'removed' + when in_b then 'added' + end{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql new file mode 100644 index 00000000..fa81c591 --- /dev/null +++ b/macros/utils/_count_num_rows_in_status.sql @@ -0,0 +1,28 @@ +{% macro _count_num_rows_in_status() %} + {{ return(adapter.dispatch('_count_num_rows_in_status', 'audit_helper')()) }} +{% endmacro %} + +{%- macro default___count_num_rows_in_status() -%} + count(distinct dbt_audit_surrogate_key, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro bigquery___count_num_rows_in_status() -%} + count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro postgres___count_num_rows_in_status() -%} + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} +{% endmacro %} + +{%- macro databricks___count_num_rows_in_status() -%} + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} +{% endmacro %} + +{% macro _count_num_rows_in_status_without_distinct_window_func() %} + {#- Some platforms don't support count(distinct) inside of window functions -#} + {#- You can get the same outcome by dense_rank, assuming no nulls (we've already handled that) #} + {# https://stackoverflow.com/a/22347502 -#} + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key desc, dbt_audit_pk_row_num desc) + - 1 +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql new file mode 100644 index 00000000..848ff9e2 --- /dev/null +++ b/macros/utils/_generate_set_results.sql @@ -0,0 +1,123 @@ +{#- + Set generation is dispatched because it's possible to get performance optimisations + on some platforms, while keeping the post-processing standardised + See https://infinitelambda.com/data-validation-refactoring-snowflake/ for an example and background +-#} + +{% macro _generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props=None) %} + {{ return(adapter.dispatch('_generate_set_results', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time_props)) }} +{% endmacro %} + +{% macro default___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + + a_base as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- a_query -}} ) a_base_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b_base as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- b_query -}} ) b_base_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from a_base + ), + + b as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from b_base + ), + + a_intersect_b as ( + + select * from a + {{ dbt.intersect() }} + select * from b + + ), + + a_except_b as ( + + select * from a + {{ dbt.except() }} + select * from b + + ), + + b_except_a as ( + + select * from b + {{ dbt.except() }} + select * from a + + ) +{% endmacro %} + +{% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + a as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- a_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- b_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/generate_null_safe_sk.sql b/macros/utils/generate_null_safe_sk.sql new file mode 100644 index 00000000..4078c334 --- /dev/null +++ b/macros/utils/generate_null_safe_sk.sql @@ -0,0 +1,25 @@ +{# Taken from https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql but without the option to treat nulls as empty strings #} + +{%- macro generate_null_safe_surrogate_key(field_list) -%} + {{ return(adapter.dispatch('generate_null_safe_surrogate_key', 'audit_helper')(field_list)) }} +{% endmacro %} + +{%- macro default__generate_null_safe_surrogate_key(field_list) -%} + +{%- set fields = [] -%} + +{%- for field in field_list -%} + + {%- do fields.append( + "coalesce(cast(" ~ field ~ " as " ~ dbt.type_string() ~ "), '_dbt_audit_helper_surrogate_key_null_')" + ) -%} + + {%- if not loop.last %} + {%- do fields.append("'-'") -%} + {%- endif -%} + +{%- endfor -%} + +{{ dbt.hash(dbt.concat(fields)) }} + +{%- endmacro -%} \ No newline at end of file diff --git a/macros/utils/get_comparison_bounds.sql b/macros/utils/get_comparison_bounds.sql new file mode 100644 index 00000000..4f224f5f --- /dev/null +++ b/macros/utils/get_comparison_bounds.sql @@ -0,0 +1,42 @@ +/* +The idea here is that if the event_time is set, we will only compare records enclosed in both models. +This improves performance and allows us to compare apples to apples, instead of detecting millions/billions +of "deletions" identified due to prod having all data while CI only has a few days' worth. + +In the diagram below, the thatched section is the comparison bounds. You can think of it as + + greatest(model_a.min_value, model_b.min_value) + least(model_a.max_value, model_b.max_value) + + ┌────────────────────────────┐ + a min_value │ a max_value │ + └──► ┌───────┼────────────────────┐ ◄───┘ │ + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ +model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ model_b + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ + └───────┼────────────────────┘ │ + ┌──► └────────────────────────────┘ ◄────┐ + b min_value b max_value +*/ +{% macro get_comparison_bounds(a_relation, b_relation, event_time) %} + {% set min_max_queries %} + with min_maxes as ( + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from {{ a_relation }} + union all + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from {{ b_relation }} + ) + select max(min_event_time) as min_event_time, min(max_event_time) as max_event_time + from min_maxes + {% endset %} + + {% set query_response = dbt_utils.get_query_results_as_dict(min_max_queries) %} + + {% set event_time_props = {"event_time": event_time} %} + {% for k in query_response.keys() %} + {% do event_time_props.update({k | lower: query_response[k][0]}) %} + {% endfor %} + + {% do return(event_time_props) %} +{% endmacro %} \ No newline at end of file diff --git a/package-lock.yml b/package-lock.yml new file mode 100644 index 00000000..32c6ccc0 --- /dev/null +++ b/package-lock.yml @@ -0,0 +1,4 @@ +packages: + - package: dbt-labs/dbt_utils + version: 1.1.1 +sha1_hash: 106400343ad0c92a7417f5156d0d6c3893bb2429 From d28f3e13c30d4b78e89e6ed21b3763e4819c9314 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 28 May 2024 10:24:37 +1200 Subject: [PATCH 02/13] try to install core from main to get sorting fix --- .circleci/config.yml | 3 ++- .../unit_reworked_compare.yml | 20 +++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cde7c9e7..fb1fc0e0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,8 @@ jobs: . dbt_venv/bin/activate python -m pip install --upgrade pip setuptools - python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks + python -m pip install --pre dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks + python -m pip install git+https://github.com/dbt-labs/dbt-core.git@main mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index fd2c0d02..e94a6176 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -81,8 +81,8 @@ unit_tests: - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} - config: - tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + # config: + # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 - name: reworked_compare_identical_tables_multiple_pk_cols model: unit_reworked_compare @@ -135,8 +135,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] - config: - tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + # config: + # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_identical_tables_multiple_null_pk model: unit_reworked_compare @@ -164,8 +164,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] - config: - tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + # config: + # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_identical_tables_multi_null_pk_dupe_rows description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined @@ -196,8 +196,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] - config: - tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + # config: + # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_all_statuses_different_column_set model: unit_reworked_compare @@ -225,5 +225,5 @@ unit_tests: - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - config: - tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + # config: + # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 From dfcdcc9bbafb2c284371a5555d7cc8222348b2e3 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 28 May 2024 10:40:56 +1200 Subject: [PATCH 03/13] Revert "try to install core from main to get sorting fix" This reverts commit d28f3e13c30d4b78e89e6ed21b3763e4819c9314. --- .circleci/config.yml | 3 +-- .../unit_reworked_compare.yml | 20 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fb1fc0e0..cde7c9e7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,8 +33,7 @@ jobs: . dbt_venv/bin/activate python -m pip install --upgrade pip setuptools - python -m pip install --pre dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks - python -m pip install git+https://github.com/dbt-labs/dbt-core.git@main + python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index e94a6176..fd2c0d02 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -81,8 +81,8 @@ unit_tests: - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} - # config: - # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 - name: reworked_compare_identical_tables_multiple_pk_cols model: unit_reworked_compare @@ -135,8 +135,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] - # config: - # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_identical_tables_multiple_null_pk model: unit_reworked_compare @@ -164,8 +164,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] - # config: - # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_identical_tables_multi_null_pk_dupe_rows description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined @@ -196,8 +196,8 @@ unit_tests: reworked_compare__columns: ['id', 'col1', 'col2'] reworked_compare__event_time: reworked_compare__primary_key_columns: ['id'] - # config: - # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: reworked_compare_all_statuses_different_column_set model: unit_reworked_compare @@ -225,5 +225,5 @@ unit_tests: - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} - # config: - # tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 From 9de9bb137699a6a604344a941b9e28de3aa040d3 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 28 May 2024 22:26:03 +1200 Subject: [PATCH 04/13] Audit helper code review changes * add BQ support for qucik are queries identical * explain why using dense_rank * remove the compile step to avoid compilation error * Don't throw incompatible quick compare error during parse * add where clause to check we're not assuming its absence * enable first basic struct tests * Skip raising exception during parsing * json_build_object doesn't work on rs * changed behaviour redshift * skip complex structs on rs for now * temp disable all complex structs * skip some currently failoing bq tests * Properly exclude tests to skip, add comments * dbx too * rename reworked_compare to compare_and_classify_query_results --- .circleci/config.yml | 5 -- integration_tests/dbt_project.yml | 6 +- .../unit_tests/struct_generation_macros.sql | 12 +++ .../unit_test_struct_model_a.sql | 17 +++- .../unit_test_struct_model_b.sql | 17 +++- .../unit_compare_queries.sql | 4 +- .../unit_quick_are_queries_identical.sql | 6 +- .../unit_reworked_compare.sql | 12 +-- .../unit_reworked_compare.yml | 64 +++++++------- .../unit_reworked_compare_struct.sql | 12 +-- .../unit_reworked_compare_struct.yml | 83 ++++++++++--------- .../tests/fixtures/simple_struct.sql | 14 ++++ .../simple_struct_different_order.sql | 14 ++++ .../fixtures/simple_struct_removed_key.sql | 14 ++++ macros/quick_are_queries_identical.sql | 55 +++++++++--- macros/reworked_compare.sql | 5 +- 16 files changed, 224 insertions(+), 116 deletions(-) create mode 100644 integration_tests/macros/unit_tests/struct_generation_macros.sql create mode 100644 integration_tests/tests/fixtures/simple_struct.sql create mode 100644 integration_tests/tests/fixtures/simple_struct_different_order.sql create mode 100644 integration_tests/tests/fixtures/simple_struct_removed_key.sql diff --git a/.circleci/config.yml b/.circleci/config.yml index cde7c9e7..950a6cfd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -51,7 +51,6 @@ jobs: cd integration_tests dbt deps --target postgres dbt seed --target postgres --full-refresh - dbt compile --target postgres dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ @@ -63,7 +62,6 @@ jobs: cd integration_tests dbt deps --target redshift dbt seed --target redshift --full-refresh - dbt compile --target redshift dbt run --target redshift --exclude tag:skip+ tag:temporary_skip+ dbt test --target redshift --exclude tag:skip+ tag:temporary_skip+ @@ -75,7 +73,6 @@ jobs: cd integration_tests dbt deps --target snowflake dbt seed --target snowflake --full-refresh - dbt compile --target snowflake dbt run --target snowflake --exclude tag:skip+ tag:temporary_skip+ dbt test --target snowflake --exclude tag:skip+ tag:temporary_skip+ @@ -90,7 +87,6 @@ jobs: cd integration_tests dbt deps --target bigquery dbt seed --target bigquery --full-refresh - dbt compile --target bigquery dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+ dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+ @@ -102,7 +98,6 @@ jobs: cd integration_tests dbt deps --target databricks dbt seed --target databricks --full-refresh - dbt compile --target databricks dbt run --target databricks --exclude tag:skip+ tag:temporary_skip+ dbt test --target databricks --exclude tag:skip+ tag:temporary_skip+ diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index ef906729..873057e5 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -20,9 +20,9 @@ seeds: vars: compare_queries_summarize: true - reworked_compare__primary_key_columns: ['col1'] - reworked_compare__columns: ['col1'] - reworked_compare__event_time: + compare_classify__primary_key_columns: ['col1'] + compare_classify__columns: ['col1'] + compare_classify__event_time: quick_are_queries_identical_cols: ['col1'] quick_are_queries_identical_event_time: diff --git a/integration_tests/macros/unit_tests/struct_generation_macros.sql b/integration_tests/macros/unit_tests/struct_generation_macros.sql new file mode 100644 index 00000000..187c64b4 --- /dev/null +++ b/integration_tests/macros/unit_tests/struct_generation_macros.sql @@ -0,0 +1,12 @@ +{%- macro _basic_json_function() -%} + {%- if target.type == 'snowflake' -%} + object_construct + {%- elif target.type == 'bigquery' -%} + json_object + {%- elif target.type == 'databricks' -%} + map + {%- elif execute -%} + {# Only raise exception if it's actually being called, not during parsing #} + {%- do exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'")-%} + {%- endif -%} +{%- endmacro -%} \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql index 24d584e8..1cfabba6 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -1,3 +1,16 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['postgres']) else 'runnable']) }} -select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql index 24d584e8..1cfabba6 100644 --- a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -1,3 +1,16 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['postgres']) else 'runnable']) }} -select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql index c589ee53..03272c9f 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql @@ -1,8 +1,8 @@ {{ audit_helper.compare_queries( - "select * from " ~ ref('unit_test_model_a'), - "select * from " ~ ref('unit_test_model_b'), + "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", summarize = var('compare_queries_summarize') ) }} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql index e969b1e2..5eea8d81 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -1,9 +1,9 @@ -{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} +{{ config(tags=['skip' if (target.type in ['redshift', 'postgres', 'databricks']) else 'runnable']) }} {{ audit_helper.quick_are_queries_identical( - "select * from " ~ ref('unit_test_model_a'), - "select * from " ~ ref('unit_test_model_b'), + "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", columns=var('quick_are_queries_identical_cols'), event_time=var('quick_are_queries_identical_event_time') ) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql index 37473546..fbef7dcb 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -1,9 +1,9 @@ {{ - audit_helper.reworked_compare( - "select * from " ~ ref('unit_test_model_a'), - "select * from " ~ ref('unit_test_model_b'), - primary_key_columns=var('reworked_compare__primary_key_columns'), - columns=var('reworked_compare__columns'), - event_time=var('reworked_compare__event_time') + audit_helper.compare_and_classify_query_results( + "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + primary_key_columns=var('compare_classify__primary_key_columns'), + columns=var('compare_classify__columns'), + event_time=var('compare_classify__event_time') ) }} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml index fd2c0d02..ffa30c9a 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -1,5 +1,5 @@ unit_tests: - - name: reworked_compare_identical_tables + - name: compare_classify_identical_tables model: unit_reworked_compare given: @@ -22,17 +22,17 @@ unit_tests: overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] - - name: reworked_compare_identical_tables_event_time_filter + - name: compare_classify_identical_tables_event_time_filter model: unit_reworked_compare overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2', 'created_at'] - reworked_compare__event_time: 'created_at' - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2', 'created_at'] + compare_classify__event_time: 'created_at' + compare_classify__primary_key_columns: ['id'] macros: audit_helper.get_comparison_bounds: "min_event_time": "2024-01-02" @@ -55,13 +55,13 @@ unit_tests: - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 2} - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 2} - - name: reworked_compare_all_statuses + - name: compare_classify_all_statuses model: unit_reworked_compare overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] given: - input: ref('unit_test_model_a') rows: @@ -84,13 +84,13 @@ unit_tests: config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 - - name: reworked_compare_identical_tables_multiple_pk_cols + - name: compare_classify_identical_tables_multiple_pk_cols model: unit_reworked_compare overrides: vars: - reworked_compare__columns: ['id', 'id_2', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id', 'id_2'] + compare_classify__columns: ['id', 'id_2', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id', 'id_2'] given: - input: ref('unit_test_model_a') rows: @@ -108,7 +108,7 @@ unit_tests: - {"dbt_audit_row_status": 'identical', 'id': 1, "id_2": 23, "dbt_audit_num_rows_in_status": 3} - {"dbt_audit_row_status": 'identical', 'id': 3, "id_2": 4, "dbt_audit_num_rows_in_status": 3} - - name: reworked_compare_identical_tables_single_null_pk + - name: compare_classify_identical_tables_single_null_pk model: unit_reworked_compare description: "`nonunique_pk` status checks whether a PK is unique. It's intended to avoid arbitrary comparisons, not protect against null records (that's what constraints or tests are for)." @@ -132,13 +132,13 @@ unit_tests: overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - - name: reworked_compare_identical_tables_multiple_null_pk + - name: compare_classify_identical_tables_multiple_null_pk model: unit_reworked_compare given: @@ -161,13 +161,13 @@ unit_tests: overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - - name: reworked_compare_identical_tables_multi_null_pk_dupe_rows + - name: compare_classify_identical_tables_multi_null_pk_dupe_rows description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined model: unit_reworked_compare @@ -193,19 +193,19 @@ unit_tests: overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - - name: reworked_compare_all_statuses_different_column_set + - name: compare_classify_all_statuses_different_column_set model: unit_reworked_compare overrides: vars: - reworked_compare__primary_key_columns: ['id'] - reworked_compare__columns: ['id', 'col1'] - reworked_compare__event_time: + compare_classify__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1'] + compare_classify__event_time: given: - input: ref('unit_test_model_a') rows: diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql index 7aab2177..82b04ea4 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql @@ -1,9 +1,9 @@ {{ - audit_helper.reworked_compare( - "select * from " ~ ref('unit_test_struct_model_a'), - "select * from " ~ ref('unit_test_struct_model_b'), - primary_key_columns=var('reworked_compare__primary_key_columns'), - columns=var('reworked_compare__columns'), - event_time=var('reworked_compare__event_time') + audit_helper.compare_and_classify_query_results( + "select * from " ~ ref('unit_test_struct_model_a') ~ " where 1=1", + "select * from " ~ ref('unit_test_struct_model_b') ~ " where 1=1", + primary_key_columns=var('compare_classify__primary_key_columns'), + columns=var('compare_classify__columns'), + event_time=var('compare_classify__event_time') ) }} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml index 3139a570..ac4cf8f3 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml @@ -1,69 +1,67 @@ unit_tests: - - name: reworked_compare_struct + - name: compare_classify_simple_struct model: unit_reworked_compare_struct given: - input: ref('unit_test_struct_model_a') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + fixture: simple_struct - input: ref('unit_test_struct_model_b') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + fixture: simple_struct expect: rows: - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['redshift']) else 'runnable' }}" + tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - - name: unit_reworked_compare_struct_identical_values_different_order + - name: unit_compare_classify_struct_identical_values_different_order model: unit_reworked_compare_struct - description: Snowflake sorts objects' keys alphabetically, so sort order is ignored. + description: Objects' keys are sorted alphabetically, so sort order is ignored. given: - input: ref('unit_test_struct_model_a') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + fixture: simple_struct - input: ref('unit_test_struct_model_b') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('state', 'CA', 'street', '123 Main St', 'city', 'Anytown') as col2 + fixture: simple_struct_different_order expect: rows: - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - - name: unit_reworked_compare_struct_removed_key + - name: unit_compare_classify_struct_removed_key model: unit_reworked_compare_struct given: - input: ref('unit_test_struct_model_a') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + fixture: simple_struct - input: ref('unit_test_struct_model_b') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'state', 'CA') as col2 + fixture: simple_struct_removed_key expect: rows: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - - name: reworked_compare_complex_struct + - name: compare_classify_complex_struct model: unit_reworked_compare_struct given: - input: ref('unit_test_struct_model_a') @@ -79,12 +77,13 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] - + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - - name: reworked_compare_complex_struct_different_values + - name: compare_classify_complex_struct_different_values model: unit_reworked_compare_struct given: - input: ref('unit_test_struct_model_a') @@ -101,11 +100,13 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - - name: unit_reworked_compare_complex_struct_identical_values_different_order + - name: unit_compare_classify_complex_struct_identical_values_different_order model: unit_reworked_compare_struct description: Snowflake sorts objects' keys alphabetically, but respects the order items are added to arrays so differences are detected. given: @@ -123,8 +124,8 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} overrides: vars: - reworked_compare__columns: ['id', 'col1', 'col2'] - reworked_compare__event_time: - reworked_compare__primary_key_columns: ['id'] - - + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet diff --git a/integration_tests/tests/fixtures/simple_struct.sql b/integration_tests/tests/fixtures/simple_struct.sql new file mode 100644 index 00000000..006e62f6 --- /dev/null +++ b/integration_tests/tests/fixtures/simple_struct.sql @@ -0,0 +1,14 @@ +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/integration_tests/tests/fixtures/simple_struct_different_order.sql b/integration_tests/tests/fixtures/simple_struct_different_order.sql new file mode 100644 index 00000000..ee89fb70 --- /dev/null +++ b/integration_tests/tests/fixtures/simple_struct_different_order.sql @@ -0,0 +1,14 @@ +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}( 'state', 'CA', 'street', '123 Main St', 'city', 'Anytown') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"state": "CA", "street": "123 Main St", "city": "Anytown"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/integration_tests/tests/fixtures/simple_struct_removed_key.sql b/integration_tests/tests/fixtures/simple_struct_removed_key.sql new file mode 100644 index 00000000..ae3084bd --- /dev/null +++ b/integration_tests/tests/fixtures/simple_struct_removed_key.sql @@ -0,0 +1,14 @@ +{% if target.name != 'redshift' %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'state', 'CA') as col2 + +{% else %} + +select + 1 AS id, + 'John Doe' AS col1, + json_parse('{"street": "123 Main St", "state": "CA"}') AS col2 +{% endif %} \ No newline at end of file diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql index add26638..0a049ad4 100644 --- a/macros/quick_are_queries_identical.sql +++ b/macros/quick_are_queries_identical.sql @@ -12,6 +12,49 @@ but it's a good way to quickly verify identical results if that's what you're ex {% endmacro %} {% macro default__quick_are_queries_identical(query_a, query_b, columns, event_time) %} + {% if execute %} + {# Need to only throw this error when the macro is actually trying to be used, not during intial parse phase #} + {# if/when unit tests get support for `enabled` config, this check can be removed as they won't be supplied for parse anyway #} + {% do exceptions.raise_compiler_error("quick_are_queries_identical() is not implemented for adapter '"~ target.type ~ "'" ) %} + {% endif %} +{% endmacro %} + +{% macro bigquery__quick_are_queries_identical(query_a, query_b, columns, event_time) %} + {% set joined_cols = columns | join(", ") %} + {% if event_time %} + {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% endif %} + + with query_a as ( + select {{ joined_cols }} + from ({{ query_a }}) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + query_b as ( + select {{ joined_cols }} + from ({{ query_b }}) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ) + + select count(distinct hash_result) = 1 as are_tables_identical + from ( + select bit_xor(farm_fingerprint(to_json_string(query_a))) as hash_result + from query_a + + union all + + select bit_xor(farm_fingerprint(to_json_string(query_b))) as hash_result + from query_b + ) as hashes +{% endmacro %} + +{% macro snowflake__quick_are_queries_identical(query_a, query_b, columns, event_time) %} {% set joined_cols = columns | join(", ") %} {% if event_time %} {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} @@ -36,16 +79,4 @@ but it's a good way to quickly verify identical results if that's what you're ex {% endif %} ) as hashes -{% endmacro %} - -{% macro is_quick_are_queries_identical_supported() %} - {{ return (adapter.dispatch('is_quick_are_queries_identical_supported', 'audit_helper')()) }} -{% endmacro %} - -{% macro default__is_quick_are_queries_identical_supported() %} - {{ return (False) }} -{% endmacro %} - -{% macro snowflake__is_quick_are_queries_identical_supported() %} - {{ return (True) }} {% endmacro %} \ No newline at end of file diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql index e7838833..0025bf21 100644 --- a/macros/reworked_compare.sql +++ b/macros/reworked_compare.sql @@ -1,4 +1,4 @@ -{% macro reworked_compare(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} +{% macro compare_and_classify_query_results(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} {% set joined_cols = columns | join(", ") %} @@ -38,7 +38,6 @@ ), - classified as ( select *, @@ -50,6 +49,8 @@ select *, {{ audit_helper._count_num_rows_in_status() }} as dbt_audit_num_rows_in_status, + -- using dense_rank so that modified rows (which have a full row for both the left and right side) both get picked up in the sample. + -- For every other type this is equivalent to a row_number() dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number from classified ) From 98d0c8ab36afa3323f9f1fac998b53ca764ad5a4 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 28 May 2024 22:40:38 +1200 Subject: [PATCH 05/13] Rename files --- ...ked_compare.sql => unit_compare_classify.sql} | 0 ...ked_compare.yml => unit_compare_classify.yml} | 16 ++++++++-------- ...ruct.sql => unit_compare_classify_struct.sql} | 0 ...ruct.yml => unit_compare_classify_struct.yml} | 12 ++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) rename integration_tests/models/unit_test_wrappers/{unit_reworked_compare.sql => unit_compare_classify.sql} (100%) rename integration_tests/models/unit_test_wrappers/{unit_reworked_compare.yml => unit_compare_classify.yml} (97%) rename integration_tests/models/unit_test_wrappers/{unit_reworked_compare_struct.sql => unit_compare_classify_struct.sql} (100%) rename integration_tests/models/unit_test_wrappers/{unit_reworked_compare_struct.yml => unit_compare_classify_struct.yml} (96%) diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql similarity index 100% rename from integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql rename to integration_tests/models/unit_test_wrappers/unit_compare_classify.sql diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml similarity index 97% rename from integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml rename to integration_tests/models/unit_test_wrappers/unit_compare_classify.yml index ffa30c9a..4dbf6c59 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml @@ -1,6 +1,6 @@ unit_tests: - name: compare_classify_identical_tables - model: unit_reworked_compare + model: unit_compare_classify given: - input: ref('unit_test_model_a') @@ -27,7 +27,7 @@ unit_tests: compare_classify__primary_key_columns: ['id'] - name: compare_classify_identical_tables_event_time_filter - model: unit_reworked_compare + model: unit_compare_classify overrides: vars: compare_classify__columns: ['id', 'col1', 'col2', 'created_at'] @@ -56,7 +56,7 @@ unit_tests: - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 2} - name: compare_classify_all_statuses - model: unit_reworked_compare + model: unit_compare_classify overrides: vars: compare_classify__columns: ['id', 'col1', 'col2'] @@ -85,7 +85,7 @@ unit_tests: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 - name: compare_classify_identical_tables_multiple_pk_cols - model: unit_reworked_compare + model: unit_compare_classify overrides: vars: compare_classify__columns: ['id', 'id_2', 'col1', 'col2'] @@ -109,7 +109,7 @@ unit_tests: - {"dbt_audit_row_status": 'identical', 'id': 3, "id_2": 4, "dbt_audit_num_rows_in_status": 3} - name: compare_classify_identical_tables_single_null_pk - model: unit_reworked_compare + model: unit_compare_classify description: "`nonunique_pk` status checks whether a PK is unique. It's intended to avoid arbitrary comparisons, not protect against null records (that's what constraints or tests are for)." given: @@ -139,7 +139,7 @@ unit_tests: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: compare_classify_identical_tables_multiple_null_pk - model: unit_reworked_compare + model: unit_compare_classify given: - input: ref('unit_test_model_a') @@ -169,7 +169,7 @@ unit_tests: - name: compare_classify_identical_tables_multi_null_pk_dupe_rows description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined - model: unit_reworked_compare + model: unit_compare_classify given: - input: ref('unit_test_model_a') @@ -200,7 +200,7 @@ unit_tests: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 - name: compare_classify_all_statuses_different_column_set - model: unit_reworked_compare + model: unit_compare_classify overrides: vars: compare_classify__primary_key_columns: ['id'] diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql similarity index 100% rename from integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql rename to integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml similarity index 96% rename from integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml rename to integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml index ac4cf8f3..f44d5e1a 100644 --- a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml @@ -1,6 +1,6 @@ unit_tests: - name: compare_classify_simple_struct - model: unit_reworked_compare_struct + model: unit_compare_classify_struct given: - input: ref('unit_test_struct_model_a') format: sql @@ -20,7 +20,7 @@ unit_tests: tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: unit_compare_classify_struct_identical_values_different_order - model: unit_reworked_compare_struct + model: unit_compare_classify_struct description: Objects' keys are sorted alphabetically, so sort order is ignored. given: - input: ref('unit_test_struct_model_a') @@ -41,7 +41,7 @@ unit_tests: tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: unit_compare_classify_struct_removed_key - model: unit_reworked_compare_struct + model: unit_compare_classify_struct given: - input: ref('unit_test_struct_model_a') format: sql @@ -62,7 +62,7 @@ unit_tests: tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: compare_classify_complex_struct - model: unit_reworked_compare_struct + model: unit_compare_classify_struct given: - input: ref('unit_test_struct_model_a') format: sql @@ -84,7 +84,7 @@ unit_tests: tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - name: compare_classify_complex_struct_different_values - model: unit_reworked_compare_struct + model: unit_compare_classify_struct given: - input: ref('unit_test_struct_model_a') format: sql @@ -107,7 +107,7 @@ unit_tests: tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - name: unit_compare_classify_complex_struct_identical_values_different_order - model: unit_reworked_compare_struct + model: unit_compare_classify_struct description: Snowflake sorts objects' keys alphabetically, but respects the order items are added to arrays so differences are detected. given: - input: ref('unit_test_struct_model_a') From d744647897914b75f507c55e88ec3159803fafbc Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Tue, 28 May 2024 22:41:03 +1200 Subject: [PATCH 06/13] rename macro file --- ...eworked_compare.sql => compare_and_classify_query_results.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename macros/{reworked_compare.sql => compare_and_classify_query_results.sql} (100%) diff --git a/macros/reworked_compare.sql b/macros/compare_and_classify_query_results.sql similarity index 100% rename from macros/reworked_compare.sql rename to macros/compare_and_classify_query_results.sql From 6835ec90d1d9a998bb5b29ca5cc63b81331beab2 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 29 May 2024 13:48:29 +1200 Subject: [PATCH 07/13] Add relation_focused macros --- macros/compare_and_classify_relation_rows.sql | 16 ++++++++++++++++ macros/quick_are_relations_identical.sql | 14 ++++++++++++++ .../_get_intersecting_columns_from_relations.sql | 13 +++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 macros/compare_and_classify_relation_rows.sql create mode 100644 macros/quick_are_relations_identical.sql create mode 100644 macros/utils/_get_intersecting_columns_from_relations.sql diff --git a/macros/compare_and_classify_relation_rows.sql b/macros/compare_and_classify_relation_rows.sql new file mode 100644 index 00000000..d7c6f0f3 --- /dev/null +++ b/macros/compare_and_classify_relation_rows.sql @@ -0,0 +1,16 @@ +{% macro compare_and_classify_relation_rows(a_relation, b_relation, primary_key_columns=[], columns=None, event_time=None, sample_limit=20) %} + {%- if not columns -%} + {%- set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) -%} + {%- endif -%} + + {{ + audit_helper.compare_and_classify_query_results( + "select * from " ~ a_relation, + "select * from " ~ b_relation, + primary_key_columns, + columns, + event_time, + sample_limit + ) + }} +{% endmacro %} \ No newline at end of file diff --git a/macros/quick_are_relations_identical.sql b/macros/quick_are_relations_identical.sql new file mode 100644 index 00000000..2b3173da --- /dev/null +++ b/macros/quick_are_relations_identical.sql @@ -0,0 +1,14 @@ +{% macro quick_are_relations_identical(a_relation, b_relation, columns=None, event_time=None) %} + {% if not columns %} + {% set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) %} + {% endif %} + + {{ + audit_helper.quick_are_queries_identical( + "select * from " ~ a_relation, + "select * from " ~ b_relation, + columns, + event_time + ) + }} +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_get_intersecting_columns_from_relations.sql b/macros/utils/_get_intersecting_columns_from_relations.sql new file mode 100644 index 00000000..18d2ccb1 --- /dev/null +++ b/macros/utils/_get_intersecting_columns_from_relations.sql @@ -0,0 +1,13 @@ +{% macro _get_intersecting_columns_from_relations(a_relation, b_relation) %} + {%- set a_cols = dbt_utils.get_filtered_columns_in_relation(a_relation) -%} + {%- set b_cols = dbt_utils.get_filtered_columns_in_relation(b_relation) -%} + + {%- set intersection = [] -%} + {%- for col in a_cols -%} + {%- if col in b_cols -%} + {%- do intersection.append(col) -%} + {%- endif -%} + {%- endfor -%} + + {% do return(intersection) %} +{% endmacro %} \ No newline at end of file From da194ec1f65f74fb2a237c56c2d11a8dac101834 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 5 Jun 2024 15:46:19 +1200 Subject: [PATCH 08/13] Add BQ-specific generate_set_results for hashes, enable json tests --- .../unit_compare_classify_struct.yml | 12 ++-- macros/utils/_generate_set_results.sql | 64 +++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml index f44d5e1a..b7f82a48 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml @@ -17,7 +17,7 @@ unit_tests: compare_classify__event_time: compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: unit_compare_classify_struct_identical_values_different_order model: unit_compare_classify_struct @@ -38,7 +38,7 @@ unit_tests: compare_classify__event_time: compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: unit_compare_classify_struct_removed_key model: unit_compare_classify_struct @@ -59,7 +59,7 @@ unit_tests: compare_classify__event_time: compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['bigquery', 'databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: compare_classify_complex_struct model: unit_compare_classify_struct @@ -81,7 +81,7 @@ unit_tests: compare_classify__event_time: compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - name: compare_classify_complex_struct_different_values model: unit_compare_classify_struct @@ -104,7 +104,7 @@ unit_tests: compare_classify__event_time: compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - name: unit_compare_classify_complex_struct_identical_values_different_order model: unit_compare_classify_struct @@ -128,4 +128,4 @@ unit_tests: compare_classify__event_time: compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['redshift', 'bigquery', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql index 848ff9e2..a5400714 100644 --- a/macros/utils/_generate_set_results.sql +++ b/macros/utils/_generate_set_results.sql @@ -72,6 +72,70 @@ ) {% endmacro %} + + +{% macro bigquery___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + subset_columns_a as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num + from ( {{- a_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + subset_columns_b as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num + from ( {{- b_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a as ( + select + *, + farm_fingerprint(to_json_string(subset_columns_a)) as dbt_audit_row_hash + from subset_columns_a + ), + + b as ( + select + *, + farm_fingerprint(to_json_string(subset_columns_b)) as dbt_audit_row_hash + from subset_columns_b + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} + {% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( From eb1e6227b8d77e6a6edcad587c7fa3444325176f Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 6 Jun 2024 13:38:54 +1200 Subject: [PATCH 09/13] Implement hash comparisons for BQ and DBX (#103) * disable tests for unrelated adapters * Avoid lateral column aliasing * First cross-db complex struct fixture * Add final fixtures * Initial work on dbx compatibility * remove lateral column alias dbx * cast everything as string before hashing * add comment, enable all tests again * rename to dbt_audit_in_a instead of in_a * Protect against missing PK columns * gitignore package-lock.yml * add dbx variant of simple structs --- .gitignore | 2 + .../unit_tests/struct_generation_macros.sql | 18 ++++- .../unit_compare_classify.yml | 29 +++++++- .../unit_compare_classify_struct.yml | 67 +++++++++++------- .../tests/fixtures/complex_struct.sql | 8 +++ .../complex_struct_different_order.sql | 8 +++ .../complex_struct_different_value.sql | 8 +++ macros/compare_and_classify_query_results.sql | 13 ++-- macros/utils/_classify_audit_row_status.sql | 20 +++--- .../_ensure_all_pks_are_in_column_set.sql | 19 ++++++ macros/utils/_generate_set_results.sql | 68 +++++++++++++++++-- 11 files changed, 211 insertions(+), 49 deletions(-) create mode 100644 integration_tests/tests/fixtures/complex_struct.sql create mode 100644 integration_tests/tests/fixtures/complex_struct_different_order.sql create mode 100644 integration_tests/tests/fixtures/complex_struct_different_value.sql create mode 100644 macros/utils/_ensure_all_pks_are_in_column_set.sql diff --git a/.gitignore b/.gitignore index 0606e5c3..20eb2532 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ dbt_packages/ logs/ logfile .DS_Store +package-lock.yml +integration_tests/package-lock.yml diff --git a/integration_tests/macros/unit_tests/struct_generation_macros.sql b/integration_tests/macros/unit_tests/struct_generation_macros.sql index 187c64b4..2c6767e4 100644 --- a/integration_tests/macros/unit_tests/struct_generation_macros.sql +++ b/integration_tests/macros/unit_tests/struct_generation_macros.sql @@ -7,6 +7,20 @@ map {%- elif execute -%} {# Only raise exception if it's actually being called, not during parsing #} - {%- do exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'")-%} + {%- do exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'") -%} {%- endif -%} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} + +{% macro _complex_json_function(json) %} + + {% if target.type == 'redshift' %} + json_parse({{ json }}) + {% elif target.type == 'databricks' %} + from_json({{ json }}, schema_of_json({{ json }})) + {% elif target.type in ['snowflake', 'bigquery'] %} + parse_json({{ json }}) + {% elif execute %} + {# Only raise exception if it's actually being called, not during parsing #} + {%- do exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'") -%} + {% endif %} +{% endmacro %} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml index 4dbf6c59..c38753b1 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml @@ -155,9 +155,9 @@ unit_tests: expect: rows: - - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} overrides: vars: @@ -227,3 +227,30 @@ unit_tests: - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + + - name: compare_classify_identical_tables_without_pk_in_cols_list + model: unit_compare_classify + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + compare_classify__columns: ['col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml index b7f82a48..98f634d1 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml @@ -16,12 +16,12 @@ unit_tests: compare_classify__columns: ['id', 'col1', 'col2'] compare_classify__event_time: compare_classify__primary_key_columns: ['id'] - config: - tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + # config: + # tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: unit_compare_classify_struct_identical_values_different_order model: unit_compare_classify_struct - description: Objects' keys are sorted alphabetically, so sort order is ignored. + description: Objects' keys are generally sorted alphabetically, so sort order is ignored. given: - input: ref('unit_test_struct_model_a') format: sql @@ -38,7 +38,31 @@ unit_tests: compare_classify__event_time: compare_classify__primary_key_columns: ['id'] config: - tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + #Databricks cares about the order and considers it a difference. We're not trying to have identical behaviour across warehouses so that's OK. + tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" + + - name: unit_compare_classify_struct_identical_values_different_order_dbx + model: unit_compare_classify_struct + description: Most platforms don't care about sort order. Databricks does. + given: + - input: ref('unit_test_struct_model_a') + format: sql + fixture: simple_struct + - input: ref('unit_test_struct_model_b') + format: sql + fixture: simple_struct_different_order + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + compare_classify__columns: ['id', 'col1', 'col2'] + compare_classify__event_time: + compare_classify__primary_key_columns: ['id'] + config: + #Only for databricks + tags: "{{ 'skip' if (target.type not in ['databricks']) else 'runnable' }}" - name: unit_compare_classify_struct_removed_key model: unit_compare_classify_struct @@ -58,20 +82,18 @@ unit_tests: compare_classify__columns: ['id', 'col1', 'col2'] compare_classify__event_time: compare_classify__primary_key_columns: ['id'] - config: - tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + # config: + # tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols - name: compare_classify_complex_struct model: unit_compare_classify_struct given: - input: ref('unit_test_struct_model_a') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + fixture: complex_struct - input: ref('unit_test_struct_model_b') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + fixture: complex_struct expect: rows: - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} @@ -80,20 +102,19 @@ unit_tests: compare_classify__columns: ['id', 'col1', 'col2'] compare_classify__event_time: compare_classify__primary_key_columns: ['id'] - config: - tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + # config: + # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - name: compare_classify_complex_struct_different_values model: unit_compare_classify_struct given: - input: ref('unit_test_struct_model_a') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + fixture: complex_struct - input: ref('unit_test_struct_model_b') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.smith@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + fixture: complex_struct_different_value + expect: rows: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} @@ -103,8 +124,8 @@ unit_tests: compare_classify__columns: ['id', 'col1', 'col2'] compare_classify__event_time: compare_classify__primary_key_columns: ['id'] - config: - tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + # config: + # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet - name: unit_compare_classify_complex_struct_identical_values_different_order model: unit_compare_classify_struct @@ -112,12 +133,10 @@ unit_tests: given: - input: ref('unit_test_struct_model_a') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + fixture: complex_struct - input: ref('unit_test_struct_model_b') format: sql - rows: | - select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'work', 'number', '987-654-3210'), object_construct('type', 'home', 'number', '123-456-7890'))) as col2 + fixture: complex_struct_different_order expect: rows: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} @@ -127,5 +146,5 @@ unit_tests: compare_classify__columns: ['id', 'col1', 'col2'] compare_classify__event_time: compare_classify__primary_key_columns: ['id'] - config: - tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet + # config: + # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet diff --git a/integration_tests/tests/fixtures/complex_struct.sql b/integration_tests/tests/fixtures/complex_struct.sql new file mode 100644 index 00000000..b96206c3 --- /dev/null +++ b/integration_tests/tests/fixtures/complex_struct.sql @@ -0,0 +1,8 @@ +{% set json %} + '{"emails":["john.doe@example.com","john.d@example.com"],"phones":[{"number":"123-456-7890","type":"home"},{"number":"987-654-3210","type":"work"}]}' +{% endset %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 \ No newline at end of file diff --git a/integration_tests/tests/fixtures/complex_struct_different_order.sql b/integration_tests/tests/fixtures/complex_struct_different_order.sql new file mode 100644 index 00000000..24ead4fc --- /dev/null +++ b/integration_tests/tests/fixtures/complex_struct_different_order.sql @@ -0,0 +1,8 @@ +{% set json %} + '{"emails":["john.doe@example.com","john.d@example.com"],"phones":[{"number":"987-654-3210","type":"work"}, {"number":"123-456-7890","type":"home"}]}' +{% endset %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 \ No newline at end of file diff --git a/integration_tests/tests/fixtures/complex_struct_different_value.sql b/integration_tests/tests/fixtures/complex_struct_different_value.sql new file mode 100644 index 00000000..5446b11a --- /dev/null +++ b/integration_tests/tests/fixtures/complex_struct_different_value.sql @@ -0,0 +1,8 @@ +{% set json %} +'{"emails":["john.smith@example.com","john.s@example.com"],"phones":[{"number":"123-456-7890","type":"home"},{"number":"987-654-3210","type":"work"}]}' +{% endset %} + +select + 1 as id, + 'John Doe' as col1, + {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 \ No newline at end of file diff --git a/macros/compare_and_classify_query_results.sql b/macros/compare_and_classify_query_results.sql index 0025bf21..b5b40c58 100644 --- a/macros/compare_and_classify_query_results.sql +++ b/macros/compare_and_classify_query_results.sql @@ -1,5 +1,6 @@ {% macro compare_and_classify_query_results(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} + {% set columns = audit_helper._ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} {% set joined_cols = columns | join(", ") %} {% if event_time %} @@ -16,24 +17,24 @@ select *, - true as in_a, - true as in_b + true as dbt_audit_in_a, + true as dbt_audit_in_b from a_intersect_b union all select *, - true as in_a, - false as in_b + true as dbt_audit_in_a, + false as dbt_audit_in_b from a_except_b union all select *, - false as in_a, - true as in_b + false as dbt_audit_in_a, + true as dbt_audit_in_b from b_except_a ), diff --git a/macros/utils/_classify_audit_row_status.sql b/macros/utils/_classify_audit_row_status.sql index e28e3f4e..73dd631d 100644 --- a/macros/utils/_classify_audit_row_status.sql +++ b/macros/utils/_classify_audit_row_status.sql @@ -5,12 +5,12 @@ {%- macro default___classify_audit_row_status() -%} case when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' - when in_a and in_b then 'identical' - when {{ dbt.bool_or('in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) - and {{ dbt.bool_or('in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + when dbt_audit_in_a and dbt_audit_in_b then 'identical' + when {{ dbt.bool_or('dbt_audit_in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + and {{ dbt.bool_or('dbt_audit_in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) then 'modified' - when in_a then 'removed' - when in_b then 'added' + when dbt_audit_in_a then 'removed' + when dbt_audit_in_b then 'added' end {% endmacro %} @@ -19,10 +19,10 @@ {#- Redshift doesn't support bitwise operations (e.g. bool_or) inside of a window function :( -#} case when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' - when in_a and in_b then 'identical' - when max(case when in_a then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 - and max(case when in_b then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + when dbt_audit_in_a and dbt_audit_in_b then 'identical' + when max(case when dbt_audit_in_a then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + and max(case when dbt_audit_in_b then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 then 'modified' - when in_a then 'removed' - when in_b then 'added' + when dbt_audit_in_a then 'removed' + when dbt_audit_in_b then 'added' end{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_ensure_all_pks_are_in_column_set.sql b/macros/utils/_ensure_all_pks_are_in_column_set.sql new file mode 100644 index 00000000..5c190541 --- /dev/null +++ b/macros/utils/_ensure_all_pks_are_in_column_set.sql @@ -0,0 +1,19 @@ +{# If someone forgot to include the PK columns in their main set of columns, fix it up for them #} +{# Assuming that the PKs are the most important columns, so they go to the front of the list #} + +{% macro _ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} + {% set lower_cols = columns | map('lower') | list %} + {% set missing_pks = [] %} + + {% for pk in primary_key_columns %} + {% if pk | lower not in lower_cols %} + {% do missing_pks.append(pk) %} + {% endif %} + {% endfor %} + + {% if missing_pks | length > 0 %} + {% set columns = missing_pks + columns %} + {% endif %} + + {% do return (columns) %} +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql index a5400714..2705061a 100644 --- a/macros/utils/_generate_set_results.sql +++ b/macros/utils/_generate_set_results.sql @@ -72,15 +72,14 @@ ) {% endmacro %} - - {% macro bigquery___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} + {% set surrogate_key = audit_helper.generate_null_safe_surrogate_key(primary_key_columns) %} subset_columns_a as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num from ( {{- a_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -91,8 +90,8 @@ subset_columns_b as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, - row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num from ( {{- b_query -}} ) {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -136,6 +135,63 @@ ) {% endmacro %} +{% macro databricks___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set cast_columns = [] %} + {# Map types can't be compared by default (you need to opt in to a legacy behaviour flag) #} + {# so everything needs to be cast as a string first :( #} + {% for col in columns %} + {% do cast_columns.append(dbt.cast(col, api.Column.translate_type("string"))) %} + {% endfor %} + {% set joined_cols = cast_columns | join(", ") %} + {% set surrogate_key = audit_helper.generate_null_safe_surrogate_key(primary_key_columns) %} + a as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, + xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- a_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, + xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- b_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} + {% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( From f0ebf2f7dcadc34f2a297dcd2968ca9e298b4f30 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 6 Jun 2024 14:11:39 +1200 Subject: [PATCH 10/13] Rename private macros to have _ prefix --- .../unit_compare_classify.yml | 2 +- .../unit_quick_are_queries_identical.yml | 2 +- macros/compare_and_classify_query_results.sql | 2 +- macros/quick_are_queries_identical.sql | 4 +- ...safe_sk.sql => _generate_null_safe_sk.sql} | 6 +- macros/utils/_generate_set_results.sql | 66 +++++++++++++++++-- ..._bounds.sql => _get_comparison_bounds.sql} | 2 +- 7 files changed, 71 insertions(+), 13 deletions(-) rename macros/utils/{generate_null_safe_sk.sql => _generate_null_safe_sk.sql} (70%) rename macros/utils/{get_comparison_bounds.sql => _get_comparison_bounds.sql} (97%) diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml index 4dbf6c59..2e8a24ff 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml @@ -34,7 +34,7 @@ unit_tests: compare_classify__event_time: 'created_at' compare_classify__primary_key_columns: ['id'] macros: - audit_helper.get_comparison_bounds: + audit_helper._get_comparison_bounds: "min_event_time": "2024-01-02" "max_event_time": "2024-01-03" "event_time": 'created_at' diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml index 0d953506..ddfddc2d 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -30,7 +30,7 @@ unit_tests: quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] quick_are_queries_identical_event_time: 'created_at' macros: - audit_helper.get_comparison_bounds: + audit_helper._get_comparison_bounds: "min_event_time": "2024-01-02" "max_event_time": "2024-01-03" "event_time": 'created_at' diff --git a/macros/compare_and_classify_query_results.sql b/macros/compare_and_classify_query_results.sql index 0025bf21..c81f8b58 100644 --- a/macros/compare_and_classify_query_results.sql +++ b/macros/compare_and_classify_query_results.sql @@ -3,7 +3,7 @@ {% set joined_cols = columns | join(", ") %} {% if event_time %} - {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% set event_time_props = audit_helper._get_comparison_bounds(a_query, b_query, event_time) %} {% endif %} with diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql index 0a049ad4..af00c8cd 100644 --- a/macros/quick_are_queries_identical.sql +++ b/macros/quick_are_queries_identical.sql @@ -22,7 +22,7 @@ but it's a good way to quickly verify identical results if that's what you're ex {% macro bigquery__quick_are_queries_identical(query_a, query_b, columns, event_time) %} {% set joined_cols = columns | join(", ") %} {% if event_time %} - {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% set event_time_props = audit_helper._get_comparison_bounds(a_query, b_query, event_time) %} {% endif %} with query_a as ( @@ -57,7 +57,7 @@ but it's a good way to quickly verify identical results if that's what you're ex {% macro snowflake__quick_are_queries_identical(query_a, query_b, columns, event_time) %} {% set joined_cols = columns | join(", ") %} {% if event_time %} - {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% set event_time_props = audit_helper._get_comparison_bounds(a_query, b_query, event_time) %} {% endif %} select count(distinct hash_result) = 1 as are_tables_identical diff --git a/macros/utils/generate_null_safe_sk.sql b/macros/utils/_generate_null_safe_sk.sql similarity index 70% rename from macros/utils/generate_null_safe_sk.sql rename to macros/utils/_generate_null_safe_sk.sql index 4078c334..26ed6450 100644 --- a/macros/utils/generate_null_safe_sk.sql +++ b/macros/utils/_generate_null_safe_sk.sql @@ -1,10 +1,10 @@ {# Taken from https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql but without the option to treat nulls as empty strings #} -{%- macro generate_null_safe_surrogate_key(field_list) -%} - {{ return(adapter.dispatch('generate_null_safe_surrogate_key', 'audit_helper')(field_list)) }} +{%- macro _generate_null_safe_surrogate_key(field_list) -%} + {{ return(adapter.dispatch('_generate_null_safe_surrogate_key', 'audit_helper')(field_list)) }} {% endmacro %} -{%- macro default__generate_null_safe_surrogate_key(field_list) -%} +{%- macro default___generate_null_safe_surrogate_key(field_list) -%} {%- set fields = [] -%} diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql index a5400714..0bc409f0 100644 --- a/macros/utils/_generate_set_results.sql +++ b/macros/utils/_generate_set_results.sql @@ -14,7 +14,7 @@ a_base as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key from ( {{- a_query -}} ) a_base_subq {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -25,7 +25,7 @@ b_base as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key from ( {{- b_query -}} ) b_base_subq {% if event_time_props %} where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' @@ -76,6 +76,7 @@ {% macro bigquery___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} + {% set surrogate_key = audit_helper._generate_null_safe_surrogate_key(primary_key_columns) %} subset_columns_a as ( select {{ joined_cols }}, @@ -136,12 +137,69 @@ ) {% endmacro %} +{% macro databricks___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set cast_columns = [] %} + {# Map types can't be compared by default (you need to opt in to a legacy behaviour flag) #} + {# so everything needs to be cast as a string first :( #} + {% for col in columns %} + {% do cast_columns.append(dbt.cast(col, api.Column.translate_type("string"))) %} + {% endfor %} + {% set joined_cols = cast_columns | join(", ") %} + {% set surrogate_key = audit_helper._generate_null_safe_surrogate_key(primary_key_columns) %} + a as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, + xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- a_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b as ( + select + {{ joined_cols }}, + {{ surrogate_key }} as dbt_audit_surrogate_key, + row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, + xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- b_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} + {% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} {% set joined_cols = columns | join(", ") %} a as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- a_query -}} ) @@ -154,7 +212,7 @@ b as ( select {{ joined_cols }}, - {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- b_query -}} ) diff --git a/macros/utils/get_comparison_bounds.sql b/macros/utils/_get_comparison_bounds.sql similarity index 97% rename from macros/utils/get_comparison_bounds.sql rename to macros/utils/_get_comparison_bounds.sql index 4f224f5f..75369071 100644 --- a/macros/utils/get_comparison_bounds.sql +++ b/macros/utils/_get_comparison_bounds.sql @@ -18,7 +18,7 @@ model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼ ┌──► └────────────────────────────┘ ◄────┐ b min_value b max_value */ -{% macro get_comparison_bounds(a_relation, b_relation, event_time) %} +{% macro _get_comparison_bounds(a_relation, b_relation, event_time) %} {% set min_max_queries %} with min_maxes as ( select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time From 593f2177c726b89508b03b9348eb708ac5e4163a Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 6 Jun 2024 15:17:06 +1200 Subject: [PATCH 11/13] Fix get comparison bounds (#104) * change to getting comparison bounds for queries not relations * add test for introspective queries --- .../data_tests/compare_and_classify_query_results.sql | 11 +++++++++++ macros/utils/_get_comparison_bounds.sql | 9 ++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 integration_tests/models/data_tests/compare_and_classify_query_results.sql diff --git a/integration_tests/models/data_tests/compare_and_classify_query_results.sql b/integration_tests/models/data_tests/compare_and_classify_query_results.sql new file mode 100644 index 00000000..747f146f --- /dev/null +++ b/integration_tests/models/data_tests/compare_and_classify_query_results.sql @@ -0,0 +1,11 @@ +-- this has no tests, it's just making sure that the introspecive queries for event_time actually run + +{{ + audit_helper.compare_and_classify_query_results( + a_query="select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + b_query="select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + primary_key_columns=['id'], + columns=['id', 'col1', 'col2'], + event_time='created_at' + ) +}} \ No newline at end of file diff --git a/macros/utils/_get_comparison_bounds.sql b/macros/utils/_get_comparison_bounds.sql index 75369071..c644f062 100644 --- a/macros/utils/_get_comparison_bounds.sql +++ b/macros/utils/_get_comparison_bounds.sql @@ -18,14 +18,14 @@ model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼ ┌──► └────────────────────────────┘ ◄────┐ b min_value b max_value */ -{% macro _get_comparison_bounds(a_relation, b_relation, event_time) %} +{% macro _get_comparison_bounds(a_query, b_query, event_time) %} {% set min_max_queries %} with min_maxes as ( select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time - from {{ a_relation }} + from ({{ a_query }}) a_subq union all select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time - from {{ b_relation }} + from ({{ b_query }}) b_subq ) select max(min_event_time) as min_event_time, min(max_event_time) as max_event_time from min_maxes @@ -34,6 +34,9 @@ model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼ {% set query_response = dbt_utils.get_query_results_as_dict(min_max_queries) %} {% set event_time_props = {"event_time": event_time} %} + + {# query_response.keys() are only `min_event_time` and `max_event_time`, but they have indeterminate capitalisation #} + {# hence the dynamic approach for what is otherwise just two well-known values #} {% for k in query_response.keys() %} {% do event_time_props.update({k | lower: query_response[k][0]}) %} {% endfor %} From dee003bdd9e3dc1ea86e45aa963493ebef8c653d Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 13 Jun 2024 16:18:07 +1200 Subject: [PATCH 12/13] Make compare query columns multi pk (#105) --- integration_tests/dbt_project.yml | 7 +- .../compare_which_columns_differ.sql | 4 +- ...pare_which_columns_differ_exclude_cols.sql | 13 +- .../unit_compare_classify.sql | 6 +- .../unit_compare_classify.yml | 54 ++++---- .../unit_compare_classify_struct.sql | 6 +- .../unit_compare_classify_struct.yml | 44 +++--- ...nit_compare_which_query_columns_differ.sql | 17 +++ ...nit_compare_which_query_columns_differ.yml | 124 +++++++++++++++++ .../unit_ensure_all_pks_are_in_column_set.sql | 19 +++ .../unit_ensure_all_pks_are_in_column_set.yml | 130 ++++++++++++++++++ .../unit_quick_are_queries_identical.sql | 2 +- .../unit_quick_are_queries_identical.yml | 8 +- macros/compare_which_columns_differ.sql | 46 ------- macros/compare_which_query_columns_differ.sql | 64 +++++++++ .../compare_which_relation_columns_differ.sql | 15 ++ macros/quick_are_queries_identical.sql | 20 +-- macros/utils/_generate_set_results.sql | 40 ++---- macros/utils/_get_comparison_bounds.sql | 7 + 19 files changed, 462 insertions(+), 164 deletions(-) create mode 100644 integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml create mode 100644 integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql create mode 100644 integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml delete mode 100644 macros/compare_which_columns_differ.sql create mode 100644 macros/compare_which_query_columns_differ.sql create mode 100644 macros/compare_which_relation_columns_differ.sql diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 873057e5..2f4cb84d 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -20,11 +20,10 @@ seeds: vars: compare_queries_summarize: true - compare_classify__primary_key_columns: ['col1'] - compare_classify__columns: ['col1'] - compare_classify__event_time: + primary_key_columns_var: ['col1'] + columns_var: ['col1'] + event_time_var: quick_are_queries_identical_cols: ['col1'] - quick_are_queries_identical_event_time: flags: send_anonymous_usage_stats: False diff --git a/integration_tests/models/data_tests/compare_which_columns_differ.sql b/integration_tests/models/data_tests/compare_which_columns_differ.sql index a68523d3..ef158803 100644 --- a/integration_tests/models/data_tests/compare_which_columns_differ.sql +++ b/integration_tests/models/data_tests/compare_which_columns_differ.sql @@ -9,9 +9,9 @@ select has_difference from ( - {{ audit_helper.compare_which_columns_differ( + {{ audit_helper.compare_which_relation_columns_differ( a_relation=a_relation, b_relation=b_relation, - primary_key="id" + primary_key_columns=["id"] ) }} ) as macro_output diff --git a/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql b/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql index 7630f549..8d2d5aa2 100644 --- a/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql +++ b/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql @@ -2,17 +2,24 @@ {% set b_relation=ref('data_compare_which_columns_differ_b') %} +{% set pk_cols = ['id'] %} +{% set cols = ['id','value_changes','becomes_not_null','does_not_change'] %} + +{% if target.type == 'snowflake' %} + {% set pk_cols = pk_cols | map("upper") | list %} + {% set cols = cols | map("upper") | list %} +{% endif %} select lower(column_name) as column_name, has_difference from ( - {{ audit_helper.compare_which_columns_differ( + {{ audit_helper.compare_which_relation_columns_differ( a_relation=a_relation, b_relation=b_relation, - primary_key="id", - exclude_columns=["becomes_null"] + primary_key_columns=pk_cols, + columns=cols ) }} ) as macro_output \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql b/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql index fbef7dcb..e2c707a8 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify.sql @@ -2,8 +2,8 @@ audit_helper.compare_and_classify_query_results( "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", - primary_key_columns=var('compare_classify__primary_key_columns'), - columns=var('compare_classify__columns'), - event_time=var('compare_classify__event_time') + primary_key_columns=var('primary_key_columns_var'), + columns=var('columns_var'), + event_time=var('event_time_var') ) }} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml index f808f7e8..759526fe 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify.yml @@ -22,17 +22,17 @@ unit_tests: overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] - name: compare_classify_identical_tables_event_time_filter model: unit_compare_classify overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2', 'created_at'] - compare_classify__event_time: 'created_at' - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2', 'created_at'] + event_time_var: 'created_at' + primary_key_columns_var: ['id'] macros: audit_helper._get_comparison_bounds: "min_event_time": "2024-01-02" @@ -59,9 +59,9 @@ unit_tests: model: unit_compare_classify overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] given: - input: ref('unit_test_model_a') rows: @@ -88,9 +88,9 @@ unit_tests: model: unit_compare_classify overrides: vars: - compare_classify__columns: ['id', 'id_2', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id', 'id_2'] + columns_var: ['id', 'id_2', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id', 'id_2'] given: - input: ref('unit_test_model_a') rows: @@ -132,9 +132,9 @@ unit_tests: overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 @@ -161,9 +161,9 @@ unit_tests: overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 @@ -193,9 +193,9 @@ unit_tests: overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] config: tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 @@ -203,9 +203,9 @@ unit_tests: model: unit_compare_classify overrides: vars: - compare_classify__primary_key_columns: ['id'] - compare_classify__columns: ['id', 'col1'] - compare_classify__event_time: + primary_key_columns_var: ['id'] + columns_var: ['id', 'col1'] + event_time_var: given: - input: ref('unit_test_model_a') rows: @@ -251,6 +251,6 @@ unit_tests: overrides: vars: - compare_classify__columns: ['col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql index 82b04ea4..4184f3dc 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql @@ -2,8 +2,8 @@ audit_helper.compare_and_classify_query_results( "select * from " ~ ref('unit_test_struct_model_a') ~ " where 1=1", "select * from " ~ ref('unit_test_struct_model_b') ~ " where 1=1", - primary_key_columns=var('compare_classify__primary_key_columns'), - columns=var('compare_classify__columns'), - event_time=var('compare_classify__event_time') + primary_key_columns=var('primary_key_columns_var'), + columns=var('columns_var'), + event_time=var('event_time_var') ) }} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml index 98f634d1..ab86ed44 100644 --- a/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml +++ b/integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml @@ -13,11 +13,9 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] - # config: - # tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] - name: unit_compare_classify_struct_identical_values_different_order model: unit_compare_classify_struct @@ -34,9 +32,9 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] config: #Databricks cares about the order and considers it a difference. We're not trying to have identical behaviour across warehouses so that's OK. tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" @@ -57,9 +55,9 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] config: #Only for databricks tags: "{{ 'skip' if (target.type not in ['databricks']) else 'runnable' }}" @@ -79,9 +77,9 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] # config: # tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols @@ -99,9 +97,9 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] # config: # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet @@ -121,9 +119,9 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] # config: # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet @@ -143,8 +141,8 @@ unit_tests: - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} overrides: vars: - compare_classify__columns: ['id', 'col1', 'col2'] - compare_classify__event_time: - compare_classify__primary_key_columns: ['id'] + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] # config: # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql new file mode 100644 index 00000000..d2c12fde --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql @@ -0,0 +1,17 @@ +{% set pk_cols = var('primary_key_columns_var') %} +{% set cols = var('columns_var') %} + +{% if target.type == 'snowflake' and flags.WHICH == 'run' %} + {% set pk_cols = pk_cols | map("upper") | list %} + {% set cols = cols | map("upper") | list %} +{% endif %} + +{{ + audit_helper.compare_which_query_columns_differ( + a_query = "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", + b_query = "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", + primary_key_columns = pk_cols, + columns = cols, + event_time = var('event_time_var') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml new file mode 100644 index 00000000..49007dd1 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml @@ -0,0 +1,124 @@ +unit_tests: + - name: compare_cols_identical_tables + model: unit_compare_which_query_columns_differ + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"column_name": 'id', 'has_difference': false} + - {"column_name": 'col1', 'has_difference': false} + - {"column_name": 'col2', 'has_difference': false} + + overrides: + vars: + columns_var: ['id', 'col1', 'col2'] + event_time_var: + primary_key_columns_var: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['snowflake']) else 'runnable' }}" #Case sensitivity + + - name: compare_cols_identical_tables_event_time_filter + model: unit_compare_which_query_columns_differ + overrides: + vars: + columns_var: ['id', 'col1', 'col2', 'created_at'] + event_time_var: 'created_at' + primary_key_columns_var: ['id'] + macros: + audit_helper._get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"column_name": 'id', "has_difference": false} + - {"column_name": 'col1', "has_difference": false} + - {"column_name": 'col2', "has_difference": false} + - {"column_name": 'created_at', "has_difference": false} + config: + tags: "{{ 'skip' if (target.type in ['snowflake']) else 'runnable' }}" #Case sensitivity + + - name: compare_cols_identical_tables_snowflake + model: unit_compare_which_query_columns_differ + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"column_name": 'ID', 'has_difference': false} + - {"column_name": 'COL1', 'has_difference': false} + - {"column_name": 'COL2', 'has_difference': false} + + overrides: + vars: + columns_var: ['ID', 'COL1', 'COL2'] + event_time_var: + primary_key_columns_var: ['ID'] + config: + tags: "{{ 'skip' if (target.type not in ['snowflake']) else 'runnable' }}" #Case sensitivity + + - name: compare_cols_identical_tables_event_time_filter_snowflake + model: unit_compare_which_query_columns_differ + overrides: + vars: + columns_var: ['ID', 'COL1', 'COL2', 'CREATED_AT'] + event_time_var: 'CREATED_AT' + primary_key_columns_var: ['ID'] + macros: + audit_helper._get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"column_name": 'ID', "has_difference": false} + - {"column_name": 'COL1', "has_difference": false} + - {"column_name": 'COL2', "has_difference": false} + - {"column_name": 'CREATED_AT', "has_difference": false} + config: + tags: "{{ 'skip' if (target.type not in ['snowflake']) else 'runnable' }}" #Case sensitivity \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql new file mode 100644 index 00000000..a76f30cd --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql @@ -0,0 +1,19 @@ +{% set results = + audit_helper._ensure_all_pks_are_in_column_set( + primary_key_columns=var('primary_key_columns_var', ['a_column_with_a_large_unwieldy_name']), + columns=var('columns_var', ['b_column_with_a_large_unwieldy_name']), + ) +%} + +{% if (var('primary_key_columns_var') | length == 0) and (var('columns_var') | length == 0) %} +-- need to still provide a table shape +select 'abcdefabcdef' as col, 1 as row_index +limit 0 +{% endif %} + +{% for result in results %} + select '{{ result }}' as col, {{ loop.index }} as row_index + {% if not loop.last %} + union all + {% endif %} +{% endfor %} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml new file mode 100644 index 00000000..35767c9d --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml @@ -0,0 +1,130 @@ +unit_tests: + - name: ensure_all_pks_in_columns + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: ['pk1', 'pk2', 'column_a', 'column_b'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'column_a', "row_index": 3} + - {"col": 'column_b', "row_index": 4} + + - name: ensure_all_pks_in_columns_pks_at_end + model: unit_ensure_all_pks_are_in_column_set + description: PKs are specified in `columns` so should be at end of list + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: ['column_a', 'column_b', 'pk1', 'pk2'] + + expect: + rows: + - {"col": 'column_a', "row_index": 1} + - {"col": 'column_b', "row_index": 2} + - {"col": 'pk1', "row_index": 3} + - {"col": 'pk2', "row_index": 4} + + - name: ensure_all_pks_in_columns_one_missing_pk + model: unit_ensure_all_pks_are_in_column_set + description: PK specified in `columns` should be at end of list, missing PK will be added at front + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: ['column_a', 'column_b', 'pk2'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'column_a', "row_index": 2} + - {"col": 'column_b', "row_index": 3} + - {"col": 'pk2', "row_index": 4} + + - name: ensure_all_pks_in_columns_empty_sets + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: [] + columns_var: [] + + expect: + rows: [] + + - name: ensure_all_pks_in_columns_no_pks + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: [] + columns_var: ['column_a', 'column_b'] + + expect: + rows: + - {"col": 'column_a', "row_index": 1} + - {"col": 'column_b', "row_index": 2} + + - name: ensure_all_pks_in_columns_no_cols + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk1', 'pk2'] + columns_var: [] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + + - name: ensure_all_pks_in_columns_caps_pk + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk2', 'PK1'] + columns_var: ['pk1', 'pk2', 'column_a', 'column_b'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'column_a', "row_index": 3} + - {"col": 'column_b', "row_index": 4} + + - name: ensure_all_pks_in_columns_caps_col + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk2', 'pk1'] + columns_var: ['pk1', 'pk2', 'COLUMN_A', 'column_b'] + + expect: + rows: + - {"col": 'pk1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'COLUMN_A', "row_index": 3} + - {"col": 'column_b', "row_index": 4} + + - name: ensure_all_pks_in_columns_caps_pk_in_both + model: unit_ensure_all_pks_are_in_column_set + given: [] + overrides: + vars: + primary_key_columns_var: ['pk2', 'PK1'] + columns_var: ['PK1', 'pk2', 'column_a', 'column_b'] + + expect: + rows: + - {"col": 'PK1', "row_index": 1} + - {"col": 'pk2', "row_index": 2} + - {"col": 'column_a', "row_index": 3} + - {"col": 'column_b', "row_index": 4} + \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql index 5eea8d81..92661c51 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -5,6 +5,6 @@ "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", columns=var('quick_are_queries_identical_cols'), - event_time=var('quick_are_queries_identical_event_time') + event_time=var('event_time_var') ) }} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml index ddfddc2d..66f56a27 100644 --- a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -21,14 +21,14 @@ unit_tests: overrides: vars: quick_are_queries_identical_cols: ['id', 'col1', 'col2'] - quick_are_queries_identical_event_time: + event_time_var: - name: quick_are_queries_identical_identical_tables_event_time_filter model: unit_quick_are_queries_identical overrides: vars: quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] - quick_are_queries_identical_event_time: 'created_at' + event_time_var: 'created_at' macros: audit_helper._get_comparison_bounds: "min_event_time": "2024-01-02" @@ -55,7 +55,7 @@ unit_tests: overrides: vars: quick_are_queries_identical_cols: ['id', 'col1', 'col2'] - quick_are_queries_identical_event_time: + event_time_var: given: - input: ref('unit_test_model_a') rows: @@ -94,4 +94,4 @@ unit_tests: overrides: vars: quick_are_queries_identical_cols: ['id', 'col1', 'col2'] - quick_are_queries_identical_event_time: + event_time_var: diff --git a/macros/compare_which_columns_differ.sql b/macros/compare_which_columns_differ.sql deleted file mode 100644 index bc7c16cc..00000000 --- a/macros/compare_which_columns_differ.sql +++ /dev/null @@ -1,46 +0,0 @@ -{% macro compare_which_columns_differ(a_relation, b_relation, primary_key, exclude_columns=[]) %} - {{ return(adapter.dispatch('compare_which_columns_differ', 'audit_helper')(a_relation, b_relation, primary_key, exclude_columns)) }} -{% endmacro %} - -{% macro default__compare_which_columns_differ(a_relation, b_relation, primary_key, exclude_columns=[]) %} - -{% set column_names = dbt_utils.get_filtered_columns_in_relation(from=a_relation, except=exclude_columns) %} - -with bool_or as ( - - select - true as anchor - {% for column in column_names %} - {% set column_name = adapter.quote(column) %} - {% set compare_statement %} - ((a.{{ column_name }} != b.{{ column_name }}) - or (a.{{ column_name }} is null and b.{{ column_name }} is not null) - or (a.{{ column_name }} is not null and b.{{ column_name }} is null)) - {% endset %} - - , {{ dbt.bool_or(compare_statement) }} as {{ column | lower }}_has_difference - - {% endfor %} - from {{ a_relation }} as a - inner join {{ b_relation }} as b - on a.{{ primary_key }} = b.{{ primary_key }} - -) - -{% for column in column_names %} - - select - '{{ column }}' as column_name, - {{ column | lower }}_has_difference as has_difference - - from bool_or - - {% if not loop.last %} - - union all - - {% endif %} - -{% endfor %} - -{% endmacro %} diff --git a/macros/compare_which_query_columns_differ.sql b/macros/compare_which_query_columns_differ.sql new file mode 100644 index 00000000..139b8c17 --- /dev/null +++ b/macros/compare_which_query_columns_differ.sql @@ -0,0 +1,64 @@ +{% macro compare_which_query_columns_differ(a_query, b_query, primary_key_columns=[], columns=[], event_time=None) %} + {{ return(adapter.dispatch('compare_which_query_columns_differ', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time)) }} +{% endmacro %} + +{% macro default__compare_which_query_columns_differ(a_query, b_query, primary_key_columns, columns, event_time) %} + {% set columns = audit_helper._ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} + {% if event_time %} + {% set event_time_props = audit_helper._get_comparison_bounds(event_time) %} + {% endif %} + + {% set joined_cols = columns | join (", ") %} + + with a as ( + select + {{ joined_cols }}, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ({{ a_query }}) as a_subq + {{ audit_helper.event_time_filter(event_time_props) }} + ), + b as ( + select + {{ joined_cols }}, + {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ({{ b_query }}) as b_subq + {{ audit_helper.event_time_filter(event_time_props) }} + ), + + calculated as ( + select + {% for column in columns %} + {% set quoted_column = adapter.quote(column) %} + {% set compare_statement %} + ( + (a.{{ quoted_column }} != b.{{ quoted_column }}) + or (a.{{ quoted_column }} is null and b.{{ quoted_column }} is not null) + or (a.{{ quoted_column }} is not null and b.{{ quoted_column }} is null) + ) + {% endset %} + + {{ dbt.bool_or(compare_statement) }} as {{ column | lower }}_has_difference + + {%- if not loop.last %}, {% endif %} + {% endfor %} + from a + inner join b on a.dbt_audit_surrogate_key = b.dbt_audit_surrogate_key + ) + + {% for column in columns %} + + select + '{{ column }}' as column_name, + {{ column | lower }}_has_difference as has_difference + + from calculated + + {% if not loop.last %} + + union all + + {% endif %} + + {% endfor %} + +{% endmacro %} diff --git a/macros/compare_which_relation_columns_differ.sql b/macros/compare_which_relation_columns_differ.sql new file mode 100644 index 00000000..ac6efe12 --- /dev/null +++ b/macros/compare_which_relation_columns_differ.sql @@ -0,0 +1,15 @@ +{% macro compare_which_relation_columns_differ(a_relation, b_relation, primary_key_columns=[], columns=[], event_time=None) %} + {%- if not columns -%} + {%- set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) -%} + {%- endif -%} + + {{ + audit_helper.compare_which_query_columns_differ( + "select * from " ~ a_relation, + "select * from " ~ b_relation, + primary_key_columns, + columns, + event_time + ) + }} +{% endmacro %} \ No newline at end of file diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql index af00c8cd..a0800c3d 100644 --- a/macros/quick_are_queries_identical.sql +++ b/macros/quick_are_queries_identical.sql @@ -28,18 +28,12 @@ but it's a good way to quickly verify identical results if that's what you're ex with query_a as ( select {{ joined_cols }} from ({{ query_a }}) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), query_b as ( select {{ joined_cols }} from ({{ query_b }}) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ) select count(distinct hash_result) = 1 as are_tables_identical @@ -64,19 +58,13 @@ but it's a good way to quickly verify identical results if that's what you're ex from ( select hash_agg({{ joined_cols }}) as hash_result from ({{ query_a }}) query_a_subq - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} union all select hash_agg({{ joined_cols }}) as hash_result from ({{ query_b }}) query_b_subq - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ) as hashes {% endmacro %} \ No newline at end of file diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql index b4fcc473..890cb9d1 100644 --- a/macros/utils/_generate_set_results.sql +++ b/macros/utils/_generate_set_results.sql @@ -16,10 +16,7 @@ {{ joined_cols }}, {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key from ( {{- a_query -}} ) a_base_subq - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), b_base as ( @@ -27,10 +24,7 @@ {{ joined_cols }}, {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key from ( {{- b_query -}} ) b_base_subq - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), a as ( @@ -81,10 +75,7 @@ {{ surrogate_key }} as dbt_audit_surrogate_key, row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num from ( {{- a_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), subset_columns_b as ( @@ -93,10 +84,7 @@ {{ surrogate_key }} as dbt_audit_surrogate_key, row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num from ( {{- b_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), a as ( @@ -151,10 +139,7 @@ row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- a_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), b as ( @@ -164,10 +149,7 @@ row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num, xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- b_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), a_intersect_b as ( @@ -201,10 +183,7 @@ row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- a_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), b as ( @@ -214,10 +193,7 @@ row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash from ( {{- b_query -}} ) - {% if event_time_props %} - where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' - and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' - {% endif %} + {{ audit_helper.event_time_filter(event_time_props) }} ), a_intersect_b as ( diff --git a/macros/utils/_get_comparison_bounds.sql b/macros/utils/_get_comparison_bounds.sql index c644f062..8ac05731 100644 --- a/macros/utils/_get_comparison_bounds.sql +++ b/macros/utils/_get_comparison_bounds.sql @@ -42,4 +42,11 @@ model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼ {% endfor %} {% do return(event_time_props) %} +{% endmacro %} + +{% macro event_time_filter(event_time_props) %} + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} {% endmacro %} \ No newline at end of file From 2c3cc94441939ea0ea078960d70b121c8e7ce2c4 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Fri, 14 Jun 2024 08:55:32 +1200 Subject: [PATCH 13/13] rm packagelock.yml --- package-lock.yml | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 package-lock.yml diff --git a/package-lock.yml b/package-lock.yml deleted file mode 100644 index 32c6ccc0..00000000 --- a/package-lock.yml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - - package: dbt-labs/dbt_utils - version: 1.1.1 -sha1_hash: 106400343ad0c92a7417f5156d0d6c3893bb2429