diff --git a/samples/load_table_dataframe_array_contains.py b/samples/load_table_dataframe_array_contains.py new file mode 100644 index 000000000..2df9d4fa8 --- /dev/null +++ b/samples/load_table_dataframe_array_contains.py @@ -0,0 +1,40 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def load_table_dataframe_array_contains(table_id): + + # [START bigquery_load_table_dataframe_array_contains] + + from google.cloud import bigquery + import pandas + + # Construct a BigQuery client object. + client = bigquery.Client() + + # TODO(developer): Set table_id to the ID of the table to create. + # table_id = "your-project.your_dataset.your_table_name" + + dataframe = pandas.DataFrame({"A": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}) + job = client.load_table_from_dataframe(dataframe, table_id) # Make an API request. + job.result() # Wait for the job to complete. + + table = client.get_table(table_id) # Make an API request. 
+ print( + "Loaded {} rows and {} columns to {}".format( + table.num_rows, len(table.schema), table_id + ) + ) + # [END bigquery_load_table_dataframe_array_contains] + return table diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index bf895a1ae..b61fdd8f3 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -6,5 +6,5 @@ ipython==7.16.1; python_version < '3.7' ipython==7.17.0; python_version >= '3.7' matplotlib==3.3.2 pandas==1.1.4 -pyarrow==1.0.1 +pyarrow==2.0.0 pytz==2020.1 diff --git a/samples/tests/test_load_table_dataframe_array_contains.py b/samples/tests/test_load_table_dataframe_array_contains.py new file mode 100644 index 000000000..094784487 --- /dev/null +++ b/samples/tests/test_load_table_dataframe_array_contains.py @@ -0,0 +1,34 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from .. 
import load_table_dataframe_array_contains + + +pandas = pytest.importorskip("pandas") +pyarrow = pytest.importorskip("pyarrow", minversion="2.0.0") + + +def test_load_table_dataframe_array_contains(capsys, random_table_id): + + table = load_table_dataframe_array_contains.load_table_dataframe_array_contains( + random_table_id + ) + out, _ = capsys.readouterr() + expected_column_names = ["A"] + assert "Loaded 3 rows and {} columns".format(len(expected_column_names)) in out + + column_names = [field.name for field in table.schema] + assert column_names == expected_column_names diff --git a/setup.py b/setup.py index 548ceac09..c6a8a6de1 100644 --- a/setup.py +++ b/setup.py @@ -46,12 +46,12 @@ # grpc.Channel.close() method isn't added until 1.32.0. # https://github.com/grpc/grpc/pull/15254 "grpcio >= 1.32.0, < 2.0dev", - "pyarrow >= 1.0.0, < 2.0dev", + "pyarrow >= 2.0.0, < 3.0dev", ], "pandas": [ "pandas>=0.23.0", - # pyarrow 1.0.0 is required for the use of timestamp_as_object keyword. - "pyarrow >= 1.0.0, < 2.0dev", + # pyarrow 2.0.0 is required to load dataframe columns that contain arrays.
+ "pyarrow >= 2.0.0, < 3.0dev", ], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/tests/system.py b/tests/system.py index 68fcb918c..9269f11cf 100644 --- a/tests/system.py +++ b/tests/system.py @@ -129,7 +129,7 @@ ) PANDAS_MINIMUM_VERSION = pkg_resources.parse_version("1.0.0") -PYARROW_MINIMUM_VERSION = pkg_resources.parse_version("0.17.0") +PYARROW_MINIMUM_VERSION = pkg_resources.parse_version("2.0.0") if pandas: PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version @@ -1086,9 +1086,9 @@ def test_load_table_from_dataframe_w_explicit_schema(self): @unittest.skipIf( pyarrow is None or PYARROW_INSTALLED_VERSION < PYARROW_MINIMUM_VERSION, - "Only `pyarrow version >=0.17.0` is supported", + "Only `pyarrow version >=2.0.0` is supported", ) @unittest.skipIf(pandas is None, "Requires `pandas`") def test_load_table_from_dataframe_w_struct_datatype(self): """Test that a DataFrame with struct datatype can be uploaded if a BigQuery schema is specified. @@ -1126,6 +1126,62 @@ def test_load_table_from_dataframe_w_struct_datatype(self): self.assertEqual(table.schema, table_schema) self.assertEqual(table.num_rows, 3) + @unittest.skipIf( + pyarrow is None or PYARROW_INSTALLED_VERSION < PYARROW_MINIMUM_VERSION, + "Only `pyarrow version >=2.0.0` is supported", + ) + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_load_table_from_dataframe_w_array_datatype(self): + """Test that a DataFrame containing array columns can be uploaded to + BigQuery without specifying a schema.
+ + https://github.com/googleapis/python-bigquery/issues/19 + """ + table_schema = [ + bigquery.SchemaField( + "A", + "RECORD", + "NULLABLE", + None, + ( + bigquery.SchemaField( + "list", + "RECORD", + "REPEATED", + None, + ( + bigquery.SchemaField( + "item", "INTEGER", "NULLABLE", None, (), None + ), + ), + None, + ), + ), + None, + ) + ] + dataset_id = _make_dataset_id("bq_load_test") + self.temp_dataset(dataset_id) + table_id = "{}.{}.load_table_from_dataframe_w_array_datatype".format( + Config.CLIENT.project, dataset_id + ) + + job_config = bigquery.LoadJobConfig(autodetect=True) + table = retry_403(Config.CLIENT.create_table)(Table(table_id)) + self.to_delete.insert(0, table) + + dataframe = pandas.DataFrame({"A": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}) + + load_job = Config.CLIENT.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = Config.CLIENT.get_table(table_id) + + self.assertEqual(table.schema, table_schema) + self.assertEqual(table.num_rows, 3) + def test_load_table_from_json_basic_use(self): table_schema = ( bigquery.SchemaField("name", "STRING", mode="REQUIRED"),