Rationalize incremental materialization #141
Changes from 6 commits
@@ -1,5 +1,5 @@
 {% macro get_insert_overwrite_sql(source_relation, target_relation) %}

   {%- set dest_columns = adapter.get_columns_in_relation(target_relation) -%}
   {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%}
   insert overwrite table {{ target_relation }}
@@ -8,6 +8,17 @@
 {% endmacro %}


+{% macro get_insert_into_sql(source_relation, target_relation) %}
+
+  {%- set dest_columns = adapter.get_columns_in_relation(target_relation) -%}
+  {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%}
+  insert into table {{ target_relation }}
+  select {{dest_cols_csv}} from {{ source_relation.include(database=false, schema=false) }}
+
+{% endmacro %}
+
+
 {% macro dbt_spark_validate_get_file_format() %}
   {#-- Find and validate the file format #}
   {%- set file_format = config.get("file_format", default="parquet") -%}
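The new get_insert_into_sql macro backs the 'append' strategy introduced further down in this diff. For context, a model opting into it could look like the following sketch; the source name, column name, and model body are illustrative, not part of this PR:

    {{ config(
        materialized='incremental',
        incremental_strategy='append'
    ) }}

    -- the 'app.events' source and 'event_ts' column are hypothetical
    select * from {{ source('app', 'events') }}

    {% if is_incremental() %}
      -- on incremental runs, only pull rows newer than what the target already holds
      where event_ts > (select max(event_ts) from {{ this }})
    {% endif %}

Each incremental run then compiles to the insert into table statement above, so rows are only ever added, never updated or overwritten.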
@@ -24,59 +35,79 @@
   {% do return(file_format) %}
 {% endmacro %}


 {% macro dbt_spark_validate_get_incremental_strategy(file_format) %}
   {#-- Find and validate the incremental strategy #}
-  {%- set strategy = config.get("incremental_strategy", default="insert_overwrite") -%}
+  {%- set strategy = config.get("incremental_strategy", default="append") -%}

   {% set invalid_strategy_msg -%}
     Invalid incremental strategy provided: {{ strategy }}
-    Expected one of: 'merge', 'insert_overwrite'
+    Expected one of: 'append', 'merge', 'insert_overwrite'
   {%- endset %}

   {% set invalid_merge_msg -%}
     Invalid incremental strategy provided: {{ strategy }}
     You can only choose this strategy when file_format is set to 'delta'
   {%- endset %}

+  {% set invalid_insert_overwrite_delta_msg -%}
+    Invalid incremental strategy provided: {{ strategy }}
+    You cannot use this strategy when file_format is set to 'delta'
+    Use the 'append' or 'merge' strategy instead
+  {%- endset %}
+
+  {% set invalid_insert_overwrite_endpoint_msg -%}
+    Invalid incremental strategy provided: {{ strategy }}
+    You cannot use this strategy when connecting via endpoint
+    Use the 'append' or 'merge' strategy instead
+  {%- endset %}
+
-  {% if strategy not in ['merge', 'insert_overwrite'] %}
+  {% if strategy not in ['append', 'merge', 'insert_overwrite'] %}
     {% do exceptions.raise_compiler_error(invalid_strategy_msg) %}
   {%- else %}
     {% if strategy == 'merge' and file_format != 'delta' %}
       {% do exceptions.raise_compiler_error(invalid_merge_msg) %}
     {% endif %}
+    {% if strategy == 'insert_overwrite' and file_format == 'delta' %}
+      {% do exceptions.raise_compiler_error(invalid_insert_overwrite_delta_msg) %}
+    {% endif %}
+    {% if strategy == 'insert_overwrite' and target.endpoint %}
+      {% do exceptions.raise_compiler_error(invalid_insert_overwrite_endpoint_msg) %}
+    {% endif %}
   {% endif %}

   {% do return(strategy) %}
 {% endmacro %}
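To make the validation concrete: under these rules, a hypothetical model configured as below would fail at compile time with invalid_merge_msg, since 'merge' is only valid with the 'delta' file format. Switching file_format to 'delta', or the strategy to 'append', lets it pass. The key column and upstream model are illustrative:

    -- compile-time error: 'merge' requires file_format to be 'delta'
    {{ config(
        materialized='incremental',
        incremental_strategy='merge',
        file_format='parquet',
        unique_key='id'
    ) }}

    select * from {{ ref('stg_events') }}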
-{% macro dbt_spark_validate_merge(file_format) %}
-  {% set invalid_file_format_msg -%}
-    You can only choose the 'merge' incremental_strategy when file_format is set to 'delta'
-  {%- endset %}
-
-  {% if file_format != 'delta' %}
-    {% do exceptions.raise_compiler_error(invalid_file_format_msg) %}
-  {% endif %}
-
-{% endmacro %}
-
-
 {% macro spark__get_merge_sql(target, source, unique_key, dest_columns, predicates=none) %}
   {# ignore dest_columns - we will just use `*` #}

+  {% set merge_condition %}
+    {% if unique_key %}
+      on DBT_INTERNAL_SOURCE.{{ unique_key }} = DBT_INTERNAL_DEST.{{ unique_key }}
+    {% else %}
+      on false
+    {% endif %}
+  {% endset %}
+
   merge into {{ target }} as DBT_INTERNAL_DEST
   using {{ source.include(schema=false) }} as DBT_INTERNAL_SOURCE
-  on DBT_INTERNAL_SOURCE.{{ unique_key }} = DBT_INTERNAL_DEST.{{ unique_key }}
+  {{ merge_condition }}
   when matched then update set *
   when not matched then insert *
 {% endmacro %}


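Rendered with unique_key='id', the macro above now produces SQL along these lines; the target and temp relation names are illustrative:

    merge into analytics.my_model as DBT_INTERNAL_DEST
    using my_model__dbt_tmp as DBT_INTERNAL_SOURCE
    on DBT_INTERNAL_SOURCE.id = DBT_INTERNAL_DEST.id
    when matched then update set *
    when not matched then insert *

With no unique_key, merge_condition renders on false instead; that case is discussed in the review thread at the end of this diff.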
 {% macro dbt_spark_get_incremental_sql(strategy, source, target, unique_key) %}
-  {%- if strategy == 'insert_overwrite' -%}
+  {%- if strategy == 'append' -%}
+    {#-- insert new records into existing table, without updating or overwriting #}
+    {{ get_insert_into_sql(source, target) }}
+  {%- elif strategy == 'insert_overwrite' -%}
     {#-- insert statements don't like CTEs, so support them via a temp view #}
     {{ get_insert_overwrite_sql(source, target) }}
-  {%- else -%}
-    {#-- merge all columns with databricks delta - schema changes are handled for us #}
+  {%- elif strategy == 'merge' -%}
+    {#-- merge all columns with databricks delta - schema changes are handled for us #}
     {{ get_merge_sql(target, source, unique_key, dest_columns=none, predicates=none) }}
   {%- endif -%}
Review comment: Maybe raise an error if it doesn't match any of the cases?

Author reply: Good call. It's already raised earlier on, in the strategy validation above. Just the same, I'll add another explicit exception here, in case users override some macros but not others.
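A minimal sketch of that extra guard, as an explicit else branch at the end of dbt_spark_get_incremental_sql; the message variable name is hypothetical, following the conventions of the surrounding macros:

    {%- else -%}
      {% set no_sql_for_strategy_msg -%}
        No known SQL for the incremental strategy provided: {{ strategy }}
      {%- endset %}
      {%- do exceptions.raise_compiler_error(no_sql_for_strategy_msg) -%}
    {%- endif -%}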
@@ -85,31 +116,21 @@

 {% materialization incremental, adapter='spark' -%}
   {#-- Validate early so we don't run SQL if the file_format is invalid --#}
-  {% set file_format = dbt_spark_validate_get_file_format() -%}
+  {%- set file_format = dbt_spark_validate_get_file_format() -%}
   {#-- Validate early so we don't run SQL if the strategy is invalid --#}
-  {% set strategy = dbt_spark_validate_get_incremental_strategy(file_format) -%}
+  {%- set strategy = dbt_spark_validate_get_incremental_strategy(file_format) -%}
+  {%- set unique_key = config.get('unique_key', none) -%}

   {%- set full_refresh_mode = (flags.FULL_REFRESH == True) -%}

   {% set target_relation = this %}
   {% set existing_relation = load_relation(this) %}
   {% set tmp_relation = make_temp_relation(this) %}

-  {% if strategy == 'merge' %}
-    {%- set unique_key = config.require('unique_key') -%}
-    {% do dbt_spark_validate_merge(file_format) %}
+  {% if strategy == 'insert_overwrite' and config.get('partition_by') %}
+    set spark.sql.sources.partitionOverwriteMode = DYNAMIC
   {% endif %}

-  {% if config.get('partition_by') %}
-    {% call statement() %}
-      set spark.sql.sources.partitionOverwriteMode = DYNAMIC
-    {% endcall %}
-  {% endif %}
-
-  {% call statement() %}
-    set spark.sql.hive.convertMetastoreParquet = false
-  {% endcall %}

   {{ run_hooks(pre_hooks) }}

   {% if existing_relation is none %}

Review comment on the removed convertMetastoreParquet setting: Good to see this go :)
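For reference, the dynamic-overwrite path above is exercised by a model configured along these lines; the partition column and upstream model are hypothetical. With partitionOverwriteMode set to DYNAMIC, each run replaces only the partitions present in the incoming data and leaves all other partitions untouched:

    {{ config(
        materialized='incremental',
        incremental_strategy='insert_overwrite',
        partition_by=['date_day'],
        file_format='parquet'
    ) }}

    select date_day, count(*) as n_events
    from {{ ref('stg_events') }}
    group by 1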
Review comment: This feels awkward. Why not just use an INSERT INTO statement?

Author reply: This is in line with default dbt behavior here. As I see it, the only reason we implement spark__get_merge_sql is to benefit from the improved wildcard update * / insert *. I agree it feels a bit silly: without a unique_key, this is identical to the 'append' strategy / an INSERT INTO statement. The alternative is to keep requiring a unique_key with 'merge'. In any case, I'd rather err closer to the side of default behavior.
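To make the exchange concrete: when no unique_key is configured, merge_condition renders as on false, so no target row ever matches and the merge degenerates to pure inserts, which is exactly what an INSERT INTO would do. A sketch of the compiled SQL, with illustrative relation names:

    merge into analytics.my_model as DBT_INTERNAL_DEST
    using my_model__dbt_tmp as DBT_INTERNAL_SOURCE
    on false
    when matched then update set *    -- never fires: the condition is always false
    when not matched then insert *    -- every source row is inserted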