From 02cda181e4965c84cb9500de5cd4fc2c6d792671 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Tue, 11 Jul 2023 09:36:42 -0700 Subject: [PATCH] Contracts: Handle struct column specified both at root and nested levels + arrays of structs (#806) (cherry picked from commit 291713c92263f6ce8749bb2b9bc7ea75cae9cf3b) --- .../unreleased/Fixes-20230630-213112.yaml | 7 ++ dbt/adapters/bigquery/column.py | 77 +++++++++--- dbt/adapters/bigquery/impl.py | 2 +- .../macros/utils/get_columns_spec_ddl.sql | 10 ++ tests/unit/test_column.py | 111 ++++++++++++++++++ 5 files changed, 189 insertions(+), 18 deletions(-) create mode 100644 .changes/unreleased/Fixes-20230630-213112.yaml diff --git a/.changes/unreleased/Fixes-20230630-213112.yaml b/.changes/unreleased/Fixes-20230630-213112.yaml new file mode 100644 index 000000000..7238c0cb1 --- /dev/null +++ b/.changes/unreleased/Fixes-20230630-213112.yaml @@ -0,0 +1,7 @@ +kind: Fixes +body: 'Contracts: Handle struct column specified both at root and nested levels + + arrays of structs' +time: 2023-06-30T21:31:12.63257-04:00 +custom: + Author: michelleark + Issue: 781 782 diff --git a/dbt/adapters/bigquery/column.py b/dbt/adapters/bigquery/column.py index cfa298b82..a5a60cfc0 100644 --- a/dbt/adapters/bigquery/column.py +++ b/dbt/adapters/bigquery/column.py @@ -5,6 +5,8 @@ from google.cloud.bigquery import SchemaField +_PARENT_DATA_TYPE_KEY = "__parent_data_type" + Self = TypeVar("Self", bound="BigQueryColumn") @@ -131,7 +133,7 @@ def column_to_bq_schema(self) -> SchemaField: def get_nested_column_data_types( columns: Dict[str, Dict[str, Any]], constraints: Optional[Dict[str, str]] = None, -) -> Dict[str, Dict[str, str]]: +) -> Dict[str, Dict[str, Optional[str]]]: """ columns: * Dictionary where keys are of flat columns names and values are dictionary of column attributes @@ -159,16 +161,16 @@ def get_nested_column_data_types( """ constraints = constraints or {} - nested_column_data_types: Dict[str, Union[str, Dict]] = {} + nested_column_data_types: Dict[str, Optional[Union[str, Dict]]] = {} for column in columns.values(): _update_nested_column_data_types( column["name"], - column["data_type"], + column.get("data_type"), constraints.get(column["name"]), nested_column_data_types, ) - formatted_nested_column_data_types: Dict[str, Dict[str, str]] = {} + formatted_nested_column_data_types: Dict[str, Dict[str, Optional[str]]] = {} for column_name, unformatted_column_type in nested_column_data_types.items(): formatted_nested_column_data_types[column_name] = { "name": column_name, @@ -191,9 +193,9 @@ def get_nested_column_data_types( def _update_nested_column_data_types( column_name: str, - column_data_type: str, + column_data_type: Optional[str], column_rendered_constraint: Optional[str], - nested_column_data_types: Dict[str, Union[str, Dict]], + nested_column_data_types: Dict[str, Optional[Union[str, Dict]]], ) -> None: """ Recursively update nested_column_data_types given a column_name, column_data_type, and optional column_rendered_constraint. @@ -215,15 +217,38 @@ def _update_nested_column_data_types( if len(column_name_parts) == 1: # Base case: column is not nested - store its data_type concatenated with constraint if provided. - nested_column_data_types[root_column_name] = ( - column_data_type - if column_rendered_constraint is None - else f"{column_data_type} {column_rendered_constraint}" + column_data_type_and_constraints = ( + ( + column_data_type + if column_rendered_constraint is None + else f"{column_data_type} {column_rendered_constraint}" + ) + if column_data_type + else None ) + + if existing_nested_column_data_type := nested_column_data_types.get(root_column_name): + assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy + # entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY" + existing_nested_column_data_type.update( + {_PARENT_DATA_TYPE_KEY: column_data_type_and_constraints} + ) + else: + nested_column_data_types.update({root_column_name: column_data_type_and_constraints}) else: - # Initialize nested dictionary - if root_column_name not in nested_column_data_types: - nested_column_data_types[root_column_name] = {} + parent_data_type = nested_column_data_types.get(root_column_name) + if isinstance(parent_data_type, dict): + # nested dictionary already initialized + pass + elif parent_data_type is None: + # initialize nested dictionary + nested_column_data_types.update({root_column_name: {}}) + else: + # a parent specified its base type -- preserve its data_type and potential rendered constraints + # this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc + nested_column_data_types.update( + {root_column_name: {_PARENT_DATA_TYPE_KEY: parent_data_type}} + ) # Recursively process rest of remaining column name remaining_column_name = ".".join(column_name_parts[1:]) @@ -237,7 +262,9 @@ def _update_nested_column_data_types( ) -def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, Any]]) -> str: +def _format_nested_data_type( + unformatted_nested_data_type: Optional[Union[str, Dict[str, Any]]] +) -> Optional[str]: """ Recursively format a (STRUCT) data type given an arbitrarily nested data type structure. @@ -249,11 +276,27 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, >>> BigQueryAdapter._format_nested_data_type({'c': 'string not_null', 'd': {'e': 'string'}}) 'struct>' """ - if isinstance(unformatted_nested_data_type, str): + if unformatted_nested_data_type is None: + return None + elif isinstance(unformatted_nested_data_type, str): return unformatted_nested_data_type else: + parent_data_type, *parent_constraints = unformatted_nested_data_type.pop( + _PARENT_DATA_TYPE_KEY, "" + ).split() or [None] + formatted_nested_types = [ - f"{column_name} {_format_nested_data_type(column_type)}" + f"{column_name} {_format_nested_data_type(column_type) or ''}".strip() for column_name, column_type in unformatted_nested_data_type.items() ] - return f"""struct<{", ".join(formatted_nested_types)}>""" + + formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>""" + + if parent_data_type and parent_data_type.lower() == "array": + formatted_nested_type = f"""array<{formatted_nested_type}>""" + + if parent_constraints: + parent_constraints = " ".join(parent_constraints) + formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}""" + + return formatted_nested_type diff --git a/dbt/adapters/bigquery/impl.py b/dbt/adapters/bigquery/impl.py index b3d397722..528e0f799 100644 --- a/dbt/adapters/bigquery/impl.py +++ b/dbt/adapters/bigquery/impl.py @@ -276,7 +276,7 @@ def nest_column_data_types( cls, columns: Dict[str, Dict[str, Any]], constraints: Optional[Dict[str, str]] = None, - ) -> Dict[str, Dict[str, str]]: + ) -> Dict[str, Dict[str, Optional[str]]]: return get_nested_column_data_types(columns, constraints) def get_columns_in_relation(self, relation: BigQueryRelation) -> List[BigQueryColumn]: diff --git a/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql b/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql index a1247dcd6..1a4193c71 100644 --- a/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql +++ b/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql @@ -5,6 +5,16 @@ {%- endmacro -%} {% macro bigquery__get_empty_schema_sql(columns) %} + {%- set col_err = [] -%} + {% for col in columns.values() %} + {%- if col['data_type'] is not defined -%} + {{ col_err.append(col['name']) }} + {%- endif -%} + {%- endfor -%} + {%- if (col_err | length) > 0 -%} + {{ exceptions.column_type_missing(column_names=col_err) }} + {%- endif -%} + {%- set columns = adapter.nest_column_data_types(columns) -%} {{ return(dbt.default__get_empty_schema_sql(columns)) }} {% endmacro %} diff --git a/tests/unit/test_column.py b/tests/unit/test_column.py index 8c3c645bb..10f30594e 100644 --- a/tests/unit/test_column.py +++ b/tests/unit/test_column.py @@ -14,6 +14,12 @@ None, {"a": {"name": "a", "data_type": "string"}}, ), + # Flat column - missing data_type + ( + {"a": {"name": "a"}}, + None, + {"a": {"name": "a", "data_type": None}}, + ), # Flat column - with constraints ( {"a": {"name": "a", "data_type": "string"}}, @@ -32,18 +38,75 @@ None, {"b": {"name": "b", "data_type": "struct"}}, ), + # Single nested column, 1 level - missing data_type + ( + {"b.nested": {"name": "b.nested"}}, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), # Single nested column, 1 level - with constraints ( {"b.nested": {"name": "b.nested", "data_type": "string"}}, {"b.nested": "not null"}, {"b": {"name": "b", "data_type": "struct"}}, ), + # Single nested column, 1 level - with constraints, missing data_type (constraints not valid without data_type) + ( + {"b.nested": {"name": "b.nested"}}, + {"b.nested": "not null"}, + {"b": {"name": "b", "data_type": "struct"}}, + ), # Single nested column, 1 level - with constraints + other keys ( {"b.nested": {"name": "b.nested", "data_type": "string", "other": "unpreserved"}}, {"b.nested": "not null"}, {"b": {"name": "b", "data_type": "struct"}}, ), + # Single nested column, 1 level - with corresponding parent column + ( + { + "b": {"name": "b", "data_type": "struct"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with corresponding parent column specified last + ( + { + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b": {"name": "b", "data_type": "struct"}, + }, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with corresponding parent column + parent constraint + ( + { + "b": {"name": "b", "data_type": "struct"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + {"b": "not null"}, + {"b": {"name": "b", "data_type": "struct not null"}}, + ), + # Single nested column, 1 level - with corresponding parent column as array + ( + { + "b": {"name": "b", "data_type": "array"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + None, + {"b": {"name": "b", "data_type": "array>"}}, + ), + # Single nested column, 1 level - with corresponding parent column as array + constraint + ( + { + "b": {"name": "b", "data_type": "array"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + {"b": "not null"}, + {"b": {"name": "b", "data_type": "array> not null"}}, + ), # Multiple nested columns, 1 level ( { @@ -106,6 +169,28 @@ }, }, ), + # Nested columns, multiple levels - missing data_type + ( + { + "b.user.name.first": { + "name": "b.user.name.first", + "data_type": "string", + }, + "b.user.name.last": { + "name": "b.user.name.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country"}, # missing data_type + }, + None, + { + "b": { + "name": "b", + "data_type": "struct, id int64, country>>", + }, + }, + ), # Nested columns, multiple levels - with constraints! ( { @@ -128,6 +213,32 @@ }, }, ), + # Nested columns, multiple levels - with parent arrays and constraints! + ( + { + "b.user.names": { + "name": "b.user.names", + "data_type": "array", + }, + "b.user.names.first": { + "name": "b.user.names.first", + "data_type": "string", + }, + "b.user.names.last": { + "name": "b.user.names.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country", "data_type": "string"}, + }, + {"b.user.names.first": "not null", "b.user.id": "unique"}, + { + "b": { + "name": "b", + "data_type": "struct>, id int64 unique, country string>>", + }, + }, + ), ], ) def test_get_nested_column_data_types(columns, constraints, expected_nested_columns):