diff --git a/.circleci/config.yml b/.circleci/config.yml index 1701f721..cde7c9e7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: . dbt_venv/bin/activate python -m pip install --upgrade pip setuptools - python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery + python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks mkdir -p ~/.dbt cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml @@ -52,8 +52,8 @@ jobs: dbt deps --target postgres dbt seed --target postgres --full-refresh dbt compile --target postgres - dbt run --target postgres - dbt test --target postgres + dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ + dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Redshift" @@ -64,8 +64,8 @@ jobs: dbt deps --target redshift dbt seed --target redshift --full-refresh dbt compile --target redshift - dbt run --target redshift - dbt test --target redshift + dbt run --target redshift --exclude tag:skip+ tag:temporary_skip+ + dbt test --target redshift --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - Snowflake" @@ -76,8 +76,8 @@ jobs: dbt deps --target snowflake dbt seed --target snowflake --full-refresh dbt compile --target snowflake - dbt run --target snowflake - dbt test --target snowflake + dbt run --target snowflake --exclude tag:skip+ tag:temporary_skip+ + dbt test --target snowflake --exclude tag:skip+ tag:temporary_skip+ - run: name: "Run Tests - BigQuery" @@ -91,9 +91,20 @@ jobs: dbt deps --target bigquery dbt seed --target bigquery --full-refresh dbt compile --target bigquery - dbt run --target bigquery --full-refresh - dbt test --target bigquery + dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+ + dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+ + - run: + name: "Run Tests - Databricks" + command: | + . dbt_venv/bin/activate + echo `pwd` + cd integration_tests + dbt deps --target databricks + dbt seed --target databricks --full-refresh + dbt compile --target databricks + dbt run --target databricks --exclude tag:skip+ tag:temporary_skip+ + dbt test --target databricks --exclude tag:skip+ tag:temporary_skip+ - save_cache: key: deps1-{{ .Branch }} @@ -115,3 +126,4 @@ workflows: - profile-redshift - profile-snowflake - profile-bigquery + - profile-databricks diff --git a/.gitignore b/.gitignore index a33e3f41..0606e5c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ target/ dbt_packages/ logs/ -logfile \ No newline at end of file +logfile +.DS_Store diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..437dcba6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "yaml.schemas": { + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_yml_files-latest.json": [ + "/**/*.yml", + "!profiles.yml", + "!dbt_project.yml", + "!packages.yml", + "!selectors.yml", + "!profile_template.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_project-latest.json": [ + "dbt_project.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/selectors-latest.json": [ + "selectors.yml" + ], + "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/packages-latest.json": [ + "packages.yml" + ] + }, +} \ No newline at end of file diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml index 843d659e..ea8effc1 100644 --- a/integration_tests/ci/sample.profiles.yml +++ b/integration_tests/ci/sample.profiles.yml @@ -2,10 +2,6 @@ # HEY! This file is used in the dbt-audit-helper integrations tests with CircleCI. # You should __NEVER__ check credentials into version control. Thanks for reading :) -config: - send_anonymous_usage_stats: False - use_colors: True - integration_tests: target: postgres outputs: @@ -27,7 +23,7 @@ integration_tests: dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}" port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}" schema: audit_helper_integration_tests_redshift - threads: 1 + threads: 8 bigquery: type: bigquery @@ -35,7 +31,7 @@ integration_tests: keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}" project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}" schema: audit_helper_integration_tests_bigquery - threads: 1 + threads: 8 snowflake: type: snowflake @@ -46,4 +42,12 @@ integration_tests: database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}" warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" schema: audit_helper_integration_tests_snowflake - threads: 1 + threads: 8 + + databricks: + type: databricks + schema: dbt_project_evaluator_integration_tests_databricks + host: "{{ env_var('DATABRICKS_TEST_HOST') }}" + http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}" + token: "{{ env_var('DATABRICKS_TEST_ACCESS_TOKEN') }}" + threads: 10 diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 07120e4c..ef906729 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -17,3 +17,15 @@ clean-targets: # directories to be removed by `dbt clean` seeds: +quote_columns: false + +vars: + compare_queries_summarize: true + reworked_compare__primary_key_columns: ['col1'] + reworked_compare__columns: ['col1'] + reworked_compare__event_time: + quick_are_queries_identical_cols: ['col1'] + quick_are_queries_identical_event_time: + +flags: + send_anonymous_usage_stats: False + use_colors: True \ No newline at end of file diff --git a/integration_tests/models/compare_all_columns_concat_pk_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_all_columns_where_clause.sql b/integration_tests/models/data_tests/compare_all_columns_where_clause.sql similarity index 100% rename from integration_tests/models/compare_all_columns_where_clause.sql rename to integration_tests/models/data_tests/compare_all_columns_where_clause.sql diff --git a/integration_tests/models/compare_all_columns_with_summary.sql b/integration_tests/models/data_tests/compare_all_columns_with_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary.sql diff --git a/integration_tests/models/compare_all_columns_with_summary_and_exclude.sql b/integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql similarity index 100% rename from integration_tests/models/compare_all_columns_with_summary_and_exclude.sql rename to integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql diff --git a/integration_tests/models/compare_all_columns_without_summary.sql b/integration_tests/models/data_tests/compare_all_columns_without_summary.sql similarity index 100% rename from integration_tests/models/compare_all_columns_without_summary.sql rename to integration_tests/models/data_tests/compare_all_columns_without_summary.sql diff --git a/integration_tests/models/compare_queries.sql b/integration_tests/models/data_tests/compare_queries.sql similarity index 100% rename from integration_tests/models/compare_queries.sql rename to integration_tests/models/data_tests/compare_queries.sql diff --git a/integration_tests/models/compare_queries_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_queries_with_summary.sql b/integration_tests/models/data_tests/compare_queries_with_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_with_summary.sql rename to integration_tests/models/data_tests/compare_queries_with_summary.sql diff --git a/integration_tests/models/compare_queries_without_summary.sql b/integration_tests/models/data_tests/compare_queries_without_summary.sql similarity index 100% rename from integration_tests/models/compare_queries_without_summary.sql rename to integration_tests/models/data_tests/compare_queries_without_summary.sql diff --git a/integration_tests/models/compare_relation_columns.sql b/integration_tests/models/data_tests/compare_relation_columns.sql similarity index 100% rename from integration_tests/models/compare_relation_columns.sql rename to integration_tests/models/data_tests/compare_relation_columns.sql diff --git a/integration_tests/models/compare_relations_concat_pk_without_summary.sql b/integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_concat_pk_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql diff --git a/integration_tests/models/compare_relations_with_exclude.sql b/integration_tests/models/data_tests/compare_relations_with_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_with_exclude.sql rename to integration_tests/models/data_tests/compare_relations_with_exclude.sql diff --git a/integration_tests/models/compare_relations_with_summary.sql b/integration_tests/models/data_tests/compare_relations_with_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_with_summary.sql rename to integration_tests/models/data_tests/compare_relations_with_summary.sql diff --git a/integration_tests/models/compare_relations_without_exclude.sql b/integration_tests/models/data_tests/compare_relations_without_exclude.sql similarity index 100% rename from integration_tests/models/compare_relations_without_exclude.sql rename to integration_tests/models/data_tests/compare_relations_without_exclude.sql diff --git a/integration_tests/models/compare_relations_without_summary.sql b/integration_tests/models/data_tests/compare_relations_without_summary.sql similarity index 100% rename from integration_tests/models/compare_relations_without_summary.sql rename to integration_tests/models/data_tests/compare_relations_without_summary.sql diff --git a/integration_tests/models/compare_row_counts.sql b/integration_tests/models/data_tests/compare_row_counts.sql similarity index 100% rename from integration_tests/models/compare_row_counts.sql rename to integration_tests/models/data_tests/compare_row_counts.sql diff --git a/integration_tests/models/compare_which_columns_differ.sql b/integration_tests/models/data_tests/compare_which_columns_differ.sql similarity index 100% rename from integration_tests/models/compare_which_columns_differ.sql rename to integration_tests/models/data_tests/compare_which_columns_differ.sql diff --git a/integration_tests/models/compare_which_columns_differ_exclude_cols.sql b/integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql similarity index 100% rename from integration_tests/models/compare_which_columns_differ_exclude_cols.sql rename to integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql diff --git a/integration_tests/models/schema.yml b/integration_tests/models/data_tests/schema.yml similarity index 90% rename from integration_tests/models/schema.yml rename to integration_tests/models/data_tests/schema.yml index 4bea9838..fbe74ff7 100644 --- a/integration_tests/models/schema.yml +++ b/integration_tests/models/data_tests/schema.yml @@ -2,96 +2,96 @@ version: 2 models: - name: compare_queries - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_queries_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_queries_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_queries_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_with_summary') - name: compare_relations_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_relations_with_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_with_exclude') - name: compare_relations_without_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') - name: compare_all_columns_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary') - name: compare_all_columns_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_without_summary') - name: compare_all_columns_concat_pk_with_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_with_summary') - name: compare_all_columns_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_concat_pk_without_summary') - name: compare_all_columns_with_summary_and_exclude - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_with_summary_and_exclude') - name: compare_all_columns_where_clause - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_all_columns_where_clause') - name: compare_relation_columns - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relation_columns') - name: compare_relations_concat_pk_without_summary - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_without_summary') - name: compare_which_columns_differ - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ') - name: compare_which_columns_differ_exclude_cols - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_which_columns_differ_exclude_cols') - name: compare_row_counts - tests: + data_tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_row_counts') diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql new file mode 100644 index 00000000..a4bc3985 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql @@ -0,0 +1 @@ +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql new file mode 100644 index 00000000..a4bc3985 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql @@ -0,0 +1 @@ +select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql new file mode 100644 index 00000000..24d584e8 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql @@ -0,0 +1,3 @@ +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} + +select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql new file mode 100644 index 00000000..24d584e8 --- /dev/null +++ b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql @@ -0,0 +1,3 @@ +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} + +select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql new file mode 100644 index 00000000..c589ee53 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql @@ -0,0 +1,8 @@ + +{{ + audit_helper.compare_queries( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + summarize = var('compare_queries_summarize') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml new file mode 100644 index 00000000..0308e509 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml @@ -0,0 +1,47 @@ +unit_tests: + - name: identical_records_compare_queries + model: unit_compare_queries + description: The world's most basic unit test. + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"in_a": true, "in_b": true} + + overrides: + vars: + compare_queries_summarize: true + + - name: identical_records_compare_queries_no_summarize + model: unit_compare_queries + description: The world's second most basic unit test. + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: [] + + overrides: + vars: + compare_queries_summarize: false diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql new file mode 100644 index 00000000..e969b1e2 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql @@ -0,0 +1,10 @@ +{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }} + +{{ + audit_helper.quick_are_queries_identical( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + columns=var('quick_are_queries_identical_cols'), + event_time=var('quick_are_queries_identical_event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml new file mode 100644 index 00000000..0d953506 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml @@ -0,0 +1,97 @@ +unit_tests: + - name: quick_are_queries_identical_identical_tables + model: unit_quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: + + - name: quick_are_queries_identical_identical_tables_event_time_filter + model: unit_quick_are_queries_identical + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] + quick_are_queries_identical_event_time: 'created_at' + macros: + audit_helper.get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"are_tables_identical": true} + + - name: quick_are_queries_identical_differences + model: unit_quick_are_queries_identical + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": false} + + - name: quick_are_queries_identical_identical_tables_with_null_pks + model: unit_quick_are_queries_identical + + given: + - input: ref('unit_test_model_a') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id":, "col1": "abc", "col2": "def" } + - { "id":, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"are_tables_identical": true} + + overrides: + vars: + quick_are_queries_identical_cols: ['id', 'col1', 'col2'] + quick_are_queries_identical_event_time: diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql new file mode 100644 index 00000000..37473546 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.reworked_compare( + "select * from " ~ ref('unit_test_model_a'), + "select * from " ~ ref('unit_test_model_b'), + primary_key_columns=var('reworked_compare__primary_key_columns'), + columns=var('reworked_compare__columns'), + event_time=var('reworked_compare__event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml new file mode 100644 index 00000000..fd2c0d02 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare.yml @@ -0,0 +1,229 @@ +unit_tests: + - name: reworked_compare_identical_tables + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_identical_tables_event_time_filter + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2', 'created_at'] + reworked_compare__event_time: 'created_at' + reworked_compare__primary_key_columns: ['id'] + macros: + audit_helper.get_comparison_bounds: + "min_event_time": "2024-01-02" + "max_event_time": "2024-01-03" + "event_time": 'created_at' + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + - input: ref('unit_test_model_b') + rows: + - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } + - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 2} + + - name: reworked_compare_all_statuses + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "changed", "col2": "values" } + - { "id": 4, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 + + - name: reworked_compare_identical_tables_multiple_pk_cols + model: unit_reworked_compare + overrides: + vars: + reworked_compare__columns: ['id', 'id_2', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id', 'id_2'] + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } + - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } + - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 12, "id_2": 3, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 1, "id_2": 23, "dbt_audit_num_rows_in_status": 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, "id_2": 4, "dbt_audit_num_rows_in_status": 3} + + - name: reworked_compare_identical_tables_single_null_pk + model: unit_reworked_compare + description: "`nonunique_pk` status checks whether a PK is unique. It's intended to avoid arbitrary comparisons, not protect against null records (that's what constraints or tests are for)." + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: reworked_compare_identical_tables_multiple_null_pk + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: reworked_compare_identical_tables_multi_null_pk_dupe_rows + description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined + model: unit_reworked_compare + + given: + - input: ref('unit_test_model_a') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": , "col1": "abc", "col2": "def" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + + expect: + rows: + - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} + + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 + + - name: reworked_compare_all_statuses_different_column_set + model: unit_reworked_compare + overrides: + vars: + reworked_compare__primary_key_columns: ['id'] + reworked_compare__columns: ['id', 'col1'] + reworked_compare__event_time: + given: + - input: ref('unit_test_model_a') + rows: + - { "id": 1, "col1": "abc", "col2": "def" } + - { "id": 2, "col1": "hij", "col2": "klm" } + - { "id": 3, "col1": "nop", "col2": "qrs" } + - input: ref('unit_test_model_b') + rows: + - { "id": 1, "col1": "abc" } + - { "id": 2, "col1": "ddd" } + - { "id": 4, "col1": "nop" } + + expect: + rows: + - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} + - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} + config: + tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql new file mode 100644 index 00000000..7aab2177 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.sql @@ -0,0 +1,9 @@ +{{ + audit_helper.reworked_compare( + "select * from " ~ ref('unit_test_struct_model_a'), + "select * from " ~ ref('unit_test_struct_model_b'), + primary_key_columns=var('reworked_compare__primary_key_columns'), + columns=var('reworked_compare__columns'), + event_time=var('reworked_compare__event_time') + ) +}} \ No newline at end of file diff --git a/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml new file mode 100644 index 00000000..3139a570 --- /dev/null +++ b/integration_tests/models/unit_test_wrappers/unit_reworked_compare_struct.yml @@ -0,0 +1,130 @@ +unit_tests: + - name: reworked_compare_struct + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + config: + tags: "{{ 'skip' if (target.type in ['redshift']) else 'runnable' }}" + + - name: unit_reworked_compare_struct_identical_values_different_order + model: unit_reworked_compare_struct + description: Snowflake sorts objects' keys alphabetically, so sort order is ignored. + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('state', 'CA', 'street', '123 Main St', 'city', 'Anytown') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: unit_reworked_compare_struct_removed_key + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'state', 'CA') as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: reworked_compare_complex_struct + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + + - name: reworked_compare_complex_struct_different_values + model: unit_reworked_compare_struct + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.smith@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + - name: unit_reworked_compare_complex_struct_identical_values_different_order + model: unit_reworked_compare_struct + description: Snowflake sorts objects' keys alphabetically, but respects the order items are added to arrays so differences are detected. + given: + - input: ref('unit_test_struct_model_a') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'home', 'number', '123-456-7890'), object_construct('type', 'work', 'number', '987-654-3210'))) as col2 + - input: ref('unit_test_struct_model_b') + format: sql + rows: | + select 1 as id, 'John Doe' as col1, object_construct('emails', array_construct('john.doe@example.com', 'john.d@example.com'), 'phones', array_construct(object_construct('type', 'work', 'number', '987-654-3210'), object_construct('type', 'home', 'number', '123-456-7890'))) as col2 + expect: + rows: + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} + overrides: + vars: + reworked_compare__columns: ['id', 'col1', 'col2'] + reworked_compare__event_time: + reworked_compare__primary_key_columns: ['id'] + + diff --git a/macros/quick_are_queries_identical.sql b/macros/quick_are_queries_identical.sql new file mode 100644 index 00000000..add26638 --- /dev/null +++ b/macros/quick_are_queries_identical.sql @@ -0,0 +1,51 @@ +/* +As described by the Infinite Lambda team here: https://infinitelambda.com/data-validation-refactoring-snowflake/ + +Some platforms let you take a hash of the whole table, which can be very very fast compared to comparing each row. + +If you run this and it returns false, you still have to run the more in-depth queries to find out what specific changes there are, +but it's a good way to quickly verify identical results if that's what you're expecting. +*/ + +{% macro quick_are_queries_identical(query_a, query_b, columns=[], event_time=None) %} + {{ return (adapter.dispatch('quick_are_queries_identical', 'audit_helper')(query_a, query_b, columns, event_time)) }} +{% endmacro %} + +{% macro default__quick_are_queries_identical(query_a, query_b, columns, event_time) %} + {% set joined_cols = columns | join(", ") %} + {% if event_time %} + {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% endif %} + + select count(distinct hash_result) = 1 as are_tables_identical + from ( + select hash_agg({{ joined_cols }}) as hash_result + from ({{ query_a }}) query_a_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + + union all + + select hash_agg({{ joined_cols }}) as hash_result + from ({{ query_b }}) query_b_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + + ) as hashes +{% endmacro %} + +{% macro is_quick_are_queries_identical_supported() %} + {{ return (adapter.dispatch('is_quick_are_queries_identical_supported', 'audit_helper')()) }} +{% endmacro %} + +{% macro default__is_quick_are_queries_identical_supported() %} + {{ return (False) }} +{% endmacro %} + +{% macro snowflake__is_quick_are_queries_identical_supported() %} + {{ return (True) }} +{% endmacro %} \ No newline at end of file diff --git a/macros/reworked_compare.sql b/macros/reworked_compare.sql new file mode 100644 index 00000000..e7838833 --- /dev/null +++ b/macros/reworked_compare.sql @@ -0,0 +1,63 @@ +{% macro reworked_compare(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} + + {% set joined_cols = columns | join(", ") %} + + {% if event_time %} + {% set event_time_props = audit_helper.get_comparison_bounds(a_query, b_query, event_time) %} + {% endif %} + + with + + {{ audit_helper._generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} + + , + + all_records as ( + + select + *, + true as in_a, + true as in_b + from a_intersect_b + + union all + + select + *, + true as in_a, + false as in_b + from a_except_b + + union all + + select + *, + false as in_a, + true as in_b + from b_except_a + + ), + + + classified as ( + select + *, + {{ audit_helper._classify_audit_row_status() }} as dbt_audit_row_status + from all_records + ), + + final as ( + select + *, + {{ audit_helper._count_num_rows_in_status() }} as dbt_audit_num_rows_in_status, + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number + from classified + ) + + select * from final + {% if sample_limit %} + where dbt_audit_sample_number <= {{ sample_limit }} + {% endif %} + order by dbt_audit_row_status, dbt_audit_sample_number + +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_classify_audit_row_status.sql b/macros/utils/_classify_audit_row_status.sql new file mode 100644 index 00000000..e28e3f4e --- /dev/null +++ b/macros/utils/_classify_audit_row_status.sql @@ -0,0 +1,28 @@ +{% macro _classify_audit_row_status() %} + {{ return(adapter.dispatch('_classify_audit_row_status', 'audit_helper')()) }} +{% endmacro %} + +{%- macro default___classify_audit_row_status() -%} + case + when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' + when in_a and in_b then 'identical' + when {{ dbt.bool_or('in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + and {{ dbt.bool_or('in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + then 'modified' + when in_a then 'removed' + when in_b then 'added' + end +{% endmacro %} + + +{%- macro redshift___classify_audit_row_status() -%} + {#- Redshift doesn't support bitwise operations (e.g. bool_or) inside of a window function :( -#} + case + when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' + when in_a and in_b then 'identical' + when max(case when in_a then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + and max(case when in_b then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 + then 'modified' + when in_a then 'removed' + when in_b then 'added' + end{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_count_num_rows_in_status.sql b/macros/utils/_count_num_rows_in_status.sql new file mode 100644 index 00000000..fa81c591 --- /dev/null +++ b/macros/utils/_count_num_rows_in_status.sql @@ -0,0 +1,28 @@ +{% macro _count_num_rows_in_status() %} + {{ return(adapter.dispatch('_count_num_rows_in_status', 'audit_helper')()) }} +{% endmacro %} + +{%- macro default___count_num_rows_in_status() -%} + count(distinct dbt_audit_surrogate_key, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro bigquery___count_num_rows_in_status() -%} + count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status) +{% endmacro %} + +{%- macro postgres___count_num_rows_in_status() -%} + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} +{% endmacro %} + +{%- macro databricks___count_num_rows_in_status() -%} + {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }} +{% endmacro %} + +{% macro _count_num_rows_in_status_without_distinct_window_func() %} + {#- Some platforms don't support count(distinct) inside of window functions -#} + {#- You can get the same outcome by dense_rank, assuming no nulls (we've already handled that) #} + {# https://stackoverflow.com/a/22347502 -#} + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) + + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key desc, dbt_audit_pk_row_num desc) + - 1 +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/_generate_set_results.sql b/macros/utils/_generate_set_results.sql new file mode 100644 index 00000000..848ff9e2 --- /dev/null +++ b/macros/utils/_generate_set_results.sql @@ -0,0 +1,123 @@ +{#- + Set generation is dispatched because it's possible to get performance optimisations + on some platforms, while keeping the post-processing standardised + See https://infinitelambda.com/data-validation-refactoring-snowflake/ for an example and background +-#} + +{% macro _generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props=None) %} + {{ return(adapter.dispatch('_generate_set_results', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time_props)) }} +{% endmacro %} + +{% macro default___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + + a_base as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- a_query -}} ) a_base_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b_base as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key + from ( {{- b_query -}} ) b_base_subq + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from a_base + ), + + b as ( + select + *, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num + from b_base + ), + + a_intersect_b as ( + + select * from a + {{ dbt.intersect() }} + select * from b + + ), + + a_except_b as ( + + select * from a + {{ dbt.except() }} + select * from b + + ), + + b_except_a as ( + + select * from b + {{ dbt.except() }} + select * from a + + ) +{% endmacro %} + +{% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %} + {% set joined_cols = columns | join(", ") %} + a as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- a_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + b as ( + select + {{ joined_cols }}, + {{ audit_helper.generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key, + row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num, + hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash + from ( {{- b_query -}} ) + {% if event_time_props %} + where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}' + and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}' + {% endif %} + ), + + a_intersect_b as ( + + select * from a + where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b) + + ), + + a_except_b as ( + + select * from a + where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b) + + ), + + b_except_a as ( + + select * from b + where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a) + + ) +{% endmacro %} \ No newline at end of file diff --git a/macros/utils/generate_null_safe_sk.sql b/macros/utils/generate_null_safe_sk.sql new file mode 100644 index 00000000..4078c334 --- /dev/null +++ b/macros/utils/generate_null_safe_sk.sql @@ -0,0 +1,25 @@ +{# Taken from https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql but without the option to treat nulls as empty strings #} + +{%- macro generate_null_safe_surrogate_key(field_list) -%} + {{ return(adapter.dispatch('generate_null_safe_surrogate_key', 'audit_helper')(field_list)) }} +{% endmacro %} + +{%- macro default__generate_null_safe_surrogate_key(field_list) -%} + +{%- set fields = [] -%} + +{%- for field in field_list -%} + + {%- do fields.append( + "coalesce(cast(" ~ field ~ " as " ~ dbt.type_string() ~ "), '_dbt_audit_helper_surrogate_key_null_')" + ) -%} + + {%- if not loop.last %} + {%- do fields.append("'-'") -%} + {%- endif -%} + +{%- endfor -%} + +{{ dbt.hash(dbt.concat(fields)) }} + +{%- endmacro -%} \ No newline at end of file diff --git a/macros/utils/get_comparison_bounds.sql b/macros/utils/get_comparison_bounds.sql new file mode 100644 index 00000000..4f224f5f --- /dev/null +++ b/macros/utils/get_comparison_bounds.sql @@ -0,0 +1,42 @@ +/* +The idea here is that if the event_time is set, we will only compare records enclosed in both models. +This improves performance and allows us to compare apples to apples, instead of detecting millions/billions +of "deletions" identified due to prod having all data while CI only has a few days' worth. + +In the diagram below, the thatched section is the comparison bounds. You can think of it as + + greatest(model_a.min_value, model_b.min_value) + least(model_a.max_value, model_b.max_value) + + ┌────────────────────────────┐ + a min_value │ a max_value │ + └──► ┌───────┼────────────────────┐ ◄───┘ │ + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ +model_a │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ model_b + │ │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│ │ + └───────┼────────────────────┘ │ + ┌──► └────────────────────────────┘ ◄────┐ + b min_value b max_value +*/ +{% macro get_comparison_bounds(a_relation, b_relation, event_time) %} + {% set min_max_queries %} + with min_maxes as ( + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from {{ a_relation }} + union all + select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time + from {{ b_relation }} + ) + select max(min_event_time) as min_event_time, min(max_event_time) as max_event_time + from min_maxes + {% endset %} + + {% set query_response = dbt_utils.get_query_results_as_dict(min_max_queries) %} + + {% set event_time_props = {"event_time": event_time} %} + {% for k in query_response.keys() %} + {% do event_time_props.update({k | lower: query_response[k][0]}) %} + {% endfor %} + + {% do return(event_time_props) %} +{% endmacro %} \ No newline at end of file diff --git a/package-lock.yml b/package-lock.yml new file mode 100644 index 00000000..32c6ccc0 --- /dev/null +++ b/package-lock.yml @@ -0,0 +1,4 @@ +packages: + - package: dbt-labs/dbt_utils + version: 1.1.1 +sha1_hash: 106400343ad0c92a7417f5156d0d6c3893bb2429