From 9175ffca0ae593edd8d75c5694f424a9765fc3ea Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Thu, 31 Oct 2019 11:03:15 +0200 Subject: [PATCH 1/4] test(bigquery): add insert_rows*() tests w/o row IDs --- bigquery/tests/unit/test_client.py | 104 +++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/bigquery/tests/unit/test_client.py b/bigquery/tests/unit/test_client.py index 91b9bc642187..ad29aca29ab5 100644 --- a/bigquery/tests/unit/test_client.py +++ b/bigquery/tests/unit/test_client.py @@ -4572,6 +4572,40 @@ def test_insert_rows_w_record_schema(self): method="POST", path="/%s" % PATH, data=SENT ) + def test_insert_rows_wo_explicit_insert_ids(self): + from google.cloud.bigquery.schema import SchemaField + from google.cloud.bigquery.table import Table + + PATH = "projects/{}/datasets/{}/tables/{}/insertAll".format( + self.PROJECT, self.DS_ID, self.TABLE_ID, + ) + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + conn = client._connection = make_connection({}) + schema = [ + SchemaField("full_name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + table = Table(self.TABLE_REF, schema=schema) + ROWS = [ + {"full_name": "Phred Phlyntstone", "age": 32}, + {"full_name": "Bharney Rhubble", "age": 33}, + ] + + def _row_data(row): + row["age"] = str(row["age"]) + return row + + SENT = {"rows": [{"json": _row_data(row), "insertId": None} for row in ROWS]} + + errors = client.insert_rows(table, ROWS, row_ids=[None] * len(ROWS)) + + self.assertEqual(len(errors), 0) + conn.api_request.assert_called_once_with( + method="POST", path="/{}".format(PATH), data=SENT + ) + def test_insert_rows_errors(self): from google.cloud.bigquery.table import Table @@ -4765,6 +4799,55 @@ def test_insert_rows_from_dataframe_many_columns(self): assert len(actual_calls) == 1 assert actual_calls[0] == expected_call + @unittest.skipIf(pandas is None, "Requires 
`pandas`") + def test_insert_rows_from_dataframe_wo_explicit_insert_ids(self): + from google.cloud.bigquery.table import SchemaField + from google.cloud.bigquery.table import Table + + API_PATH = "/projects/{}/datasets/{}/tables/{}/insertAll".format( + self.PROJECT, self.DS_ID, self.TABLE_REF.table_id + ) + + dataframe = pandas.DataFrame( + [ + {"name": u"Little One", "adult": False}, + {"name": u"Young Gun", "adult": True}, + ] + ) + + # create client + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + conn = client._connection = make_connection({}, {}) + + # create table + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("adult", "BOOLEAN", mode="REQUIRED"), + ] + table = Table(self.TABLE_REF, schema=schema) + + error_info = client.insert_rows_from_dataframe( + table, dataframe, row_ids=[None] * len(dataframe) + ) + + self.assertEqual(len(error_info), 1) + assert error_info[0] == [] # no chunk errors + + EXPECTED_SENT_DATA = { + "rows": [ + {"insertId": None, "json": {"name": "Little One", "adult": "false"}}, + {"insertId": None, "json": {"name": "Young Gun", "adult": "true"}}, + ] + } + + actual_calls = conn.api_request.call_args_list + assert len(actual_calls) == 1 + assert actual_calls[0] == mock.call( + method="POST", path=API_PATH, data=EXPECTED_SENT_DATA + ) + def test_insert_rows_json(self): from google.cloud.bigquery.table import Table, SchemaField from google.cloud.bigquery.dataset import DatasetReference @@ -4833,6 +4916,27 @@ def test_insert_rows_json_with_string_id(self): data=expected, ) + def test_insert_rows_json_wo_explicit_insert_ids(self): + rows = [{"col1": "val1"}, {"col2": "val2"}] + creds = _make_credentials() + http = object() + client = self._make_one( + project="default-project", credentials=creds, _http=http + ) + conn = client._connection = make_connection({}) + + errors = client.insert_rows_json( + "proj.dset.tbl", rows, 
row_ids=[None] * len(rows), + ) + + self.assertEqual(len(errors), 0) + expected = {"rows": [{"json": row, "insertId": None} for row in rows]} + conn.api_request.assert_called_once_with( + method="POST", + path="/projects/proj/datasets/dset/tables/tbl/insertAll", + data=expected, + ) + def test_list_partitions(self): from google.cloud.bigquery.table import Table From fad82389cabe033bc5d7a5094f40b335b7646caf Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Thu, 31 Oct 2019 11:43:15 +0200 Subject: [PATCH 2/4] Groom the insert_rows_json() method's docstring --- bigquery/google/cloud/bigquery/client.py | 35 +++++++++++++----------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/bigquery/google/cloud/bigquery/client.py b/bigquery/google/cloud/bigquery/client.py index 02bfc651af0d..bae4359300f8 100644 --- a/bigquery/google/cloud/bigquery/client.py +++ b/bigquery/google/cloud/bigquery/client.py @@ -2264,29 +2264,32 @@ def insert_rows_json( table (Union[ \ google.cloud.bigquery.table.Table \ google.cloud.bigquery.table.TableReference, \ - str, \ + str \ ]): The destination table for the row data, or a reference to it. json_rows (Sequence[Dict]): Row data to be inserted. Keys must match the table schema fields and values must be JSON-compatible representations. - row_ids (Sequence[str]): - (Optional) Unique ids, one per row being inserted. If omitted, - unique IDs are created. - skip_invalid_rows (bool): - (Optional) Insert all valid rows of a request, even if invalid - rows exist. The default value is False, which causes the entire - request to fail if any invalid rows exist. - ignore_unknown_values (bool): - (Optional) Accept rows that contain values that do not match the - schema. The unknown values are ignored. Default is False, which + row_ids (Optional[Sequence[Optional[str]]]): + Unique IDs, one per row being inserted. An ID can also be + ``None``, indicating that an explicit insert ID should **not** + be used for that row. 
If the argument is omitted altogether, + unique IDs are created automatically. + skip_invalid_rows (Optional[bool]): + Insert all valid rows of a request, even if invalid rows exist. + The default value is ``False``, which causes the entire request + to fail if any invalid rows exist. + ignore_unknown_values (Optional[bool]): + Accept rows that contain values that do not match the schema. + The unknown values are ignored. Default is ``False``, which treats unknown values as errors. - template_suffix (str): - (Optional) treat ``name`` as a template table and provide a suffix. - BigQuery will create the table ``<table> + <template_suffix>`` based - on the schema of the template table. See + template_suffix (Optional[str]): + Treat ``name`` as a template table and provide a suffix. + BigQuery will create the table ``<table> + <template_suffix>`` + based on the schema of the template table. See https://cloud.google.com/bigquery/streaming-data-into-bigquery#template-tables - retry (google.api_core.retry.Retry): (Optional) How to retry the RPC. + retry (Optional[google.api_core.retry.Retry]): + How to retry the RPC. 
Returns: Sequence[Mappings]: From b6d6f08596de3698c61092c0527904266e91024e Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Thu, 31 Oct 2019 12:08:16 +0200 Subject: [PATCH 3/4] docs: document how to achieve higher insert write limit --- bigquery/docs/usage/tables.rst | 14 ++++++++ .../table_insert_rows_no_explicit_row_ids.py | 36 +++++++++++++++++++ ...t_table_insert_rows_no_explicit_row_ids.py | 33 +++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 bigquery/samples/table_insert_rows_no_explicit_row_ids.py create mode 100644 bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py diff --git a/bigquery/docs/usage/tables.rst b/bigquery/docs/usage/tables.rst index 6a6cbd356639..896c5390a8ef 100644 --- a/bigquery/docs/usage/tables.rst +++ b/bigquery/docs/usage/tables.rst @@ -122,6 +122,20 @@ Insert rows into a table's data with the :start-after: [START bigquery_table_insert_rows] :end-before: [END bigquery_table_insert_rows] +Insert rows into a table's data with the +:func:`~google.cloud.bigquery.client.Client.insert_rows` method, achieving +higher write limit: + +.. literalinclude:: ../samples/table_insert_rows_no_explicit_row_ids.py + :language: python + :dedent: 4 + :start-after: [START bigquery_table_insert_rows_no_explicit_row_ids] + :end-before: [END bigquery_table_insert_rows_no_explicit_row_ids] + +Mind that inserting data without row insert IDs can come at the expense of more +duplicate inserts. See also: +`Streaming inserts <https://cloud.google.com/bigquery/quotas#streaming_inserts>`_. 
+ Add an empty column to the existing table with the :func:`~google.cloud.bigquery.update_table` method: diff --git a/bigquery/samples/table_insert_rows_no_explicit_row_ids.py b/bigquery/samples/table_insert_rows_no_explicit_row_ids.py new file mode 100644 index 000000000000..3e7eb742472b --- /dev/null +++ b/bigquery/samples/table_insert_rows_no_explicit_row_ids.py @@ -0,0 +1,36 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def table_insert_rows_no_explicit_row_ids(client, table_id): + + # [START bigquery_table_insert_rows_no_explicit_row_ids] + # TODO(developer): Import the client library. + # from google.cloud import bigquery + + # TODO(developer): Construct a BigQuery client object. + # client = bigquery.Client() + + # TODO(developer): Set table_id to the ID of the model to fetch. + # table_id = "your-project.your_dataset.your_table" + + table = client.get_table(table_id) # Make an API request. + rows_to_insert = [(u"Phred Phlyntstone", 32), (u"Wylma Phlyntstone", 29)] + + errors = client.insert_rows( + table, rows_to_insert, row_ids=[None] * len(rows_to_insert) + ) # Make an API request. 
+ if errors == []: + print("New rows have been added.") + # [END bigquery_table_insert_rows_no_explicit_row_ids] diff --git a/bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py b/bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py new file mode 100644 index 000000000000..7473f3a8669d --- /dev/null +++ b/bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py @@ -0,0 +1,33 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from google.cloud import bigquery + +from .. import table_insert_rows_no_explicit_row_ids as mut + + +def test_table_insert_rows_no_explicit_row_ids(capsys, client, random_table_id): + + schema = [ + bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + + table = bigquery.Table(random_table_id, schema=schema) + table = client.create_table(table) + + mut.table_insert_rows_no_explicit_row_ids(client, random_table_id) + out, err = capsys.readouterr() + assert "New rows have been added." 
in out From d14b1e949dd57619976544f3145ce8b3e6e34c05 Mon Sep 17 00:00:00 2001 From: Peter Lamut Date: Fri, 1 Nov 2019 10:48:06 +0200 Subject: [PATCH 4/4] Make method names less confusing for insert IDs --- bigquery/docs/usage/tables.rst | 10 +++++----- ...y => table_insert_rows_explicit_none_insert_ids.py} | 6 +++--- ...test_table_insert_rows_explicit_none_insert_ids.py} | 6 +++--- bigquery/tests/unit/test_client.py | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) rename bigquery/samples/{table_insert_rows_no_explicit_row_ids.py => table_insert_rows_explicit_none_insert_ids.py} (86%) rename bigquery/samples/tests/{test_table_insert_rows_no_explicit_row_ids.py => test_table_insert_rows_explicit_none_insert_ids.py} (80%) diff --git a/bigquery/docs/usage/tables.rst b/bigquery/docs/usage/tables.rst index 896c5390a8ef..d58dcc5d9ac4 100644 --- a/bigquery/docs/usage/tables.rst +++ b/bigquery/docs/usage/tables.rst @@ -126,14 +126,14 @@ Insert rows into a table's data with the :func:`~google.cloud.bigquery.client.Client.insert_rows` method, achieving higher write limit: -.. literalinclude:: ../samples/table_insert_rows_no_explicit_row_ids.py +.. literalinclude:: ../samples/table_insert_rows_explicit_none_insert_ids.py :language: python :dedent: 4 - :start-after: [START bigquery_table_insert_rows_no_explicit_row_ids] - :end-before: [END bigquery_table_insert_rows_no_explicit_row_ids] + :start-after: [START bigquery_table_insert_rows_explicit_none_insert_ids] + :end-before: [END bigquery_table_insert_rows_explicit_none_insert_ids] -Mind that inserting data without row insert IDs can come at the expense of more -duplicate inserts. See also: +Mind that inserting data with ``None`` row insert IDs can come at the expense of +more duplicate inserts. See also: `Streaming inserts <https://cloud.google.com/bigquery/quotas#streaming_inserts>`_. 
Add an empty column to the existing table with the diff --git a/bigquery/samples/table_insert_rows_no_explicit_row_ids.py b/bigquery/samples/table_insert_rows_explicit_none_insert_ids.py similarity index 86% rename from bigquery/samples/table_insert_rows_no_explicit_row_ids.py rename to bigquery/samples/table_insert_rows_explicit_none_insert_ids.py index 3e7eb742472b..953e7e210312 100644 --- a/bigquery/samples/table_insert_rows_no_explicit_row_ids.py +++ b/bigquery/samples/table_insert_rows_explicit_none_insert_ids.py @@ -13,9 +13,9 @@ # limitations under the License. -def table_insert_rows_no_explicit_row_ids(client, table_id): +def table_insert_rows_explicit_none_insert_ids(client, table_id): - # [START bigquery_table_insert_rows_no_explicit_row_ids] + # [START bigquery_table_insert_rows_explicit_none_insert_ids] # TODO(developer): Import the client library. # from google.cloud import bigquery @@ -33,4 +33,4 @@ def table_insert_rows_no_explicit_row_ids(client, table_id): ) # Make an API request. if errors == []: print("New rows have been added.") - # [END bigquery_table_insert_rows_no_explicit_row_ids] + # [END bigquery_table_insert_rows_explicit_none_insert_ids] diff --git a/bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py b/bigquery/samples/tests/test_table_insert_rows_explicit_none_insert_ids.py similarity index 80% rename from bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py rename to bigquery/samples/tests/test_table_insert_rows_explicit_none_insert_ids.py index 7473f3a8669d..6a59609baacf 100644 --- a/bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py +++ b/bigquery/samples/tests/test_table_insert_rows_explicit_none_insert_ids.py @@ -15,10 +15,10 @@ from google.cloud import bigquery -from .. import table_insert_rows_no_explicit_row_ids as mut +from .. 
import table_insert_rows_explicit_none_insert_ids as mut -def test_table_insert_rows_no_explicit_row_ids(capsys, client, random_table_id): +def test_table_insert_rows_explicit_none_insert_ids(capsys, client, random_table_id): schema = [ bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"), @@ -28,6 +28,6 @@ def test_table_insert_rows_no_explicit_row_ids(capsys, client, random_table_id): table = bigquery.Table(random_table_id, schema=schema) table = client.create_table(table) - mut.table_insert_rows_no_explicit_row_ids(client, random_table_id) + mut.table_insert_rows_explicit_none_insert_ids(client, random_table_id) out, err = capsys.readouterr() assert "New rows have been added." in out diff --git a/bigquery/tests/unit/test_client.py b/bigquery/tests/unit/test_client.py index ad29aca29ab5..b4e5e96f1e8e 100644 --- a/bigquery/tests/unit/test_client.py +++ b/bigquery/tests/unit/test_client.py @@ -4572,7 +4572,7 @@ def test_insert_rows_w_record_schema(self): method="POST", path="/%s" % PATH, data=SENT ) - def test_insert_rows_wo_explicit_insert_ids(self): + def test_insert_rows_w_explicit_none_insert_ids(self): from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table @@ -4800,7 +4800,7 @@ def test_insert_rows_from_dataframe_many_columns(self): assert actual_calls[0] == expected_call @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_insert_rows_from_dataframe_wo_explicit_insert_ids(self): + def test_insert_rows_from_dataframe_w_explicit_none_insert_ids(self): from google.cloud.bigquery.table import SchemaField from google.cloud.bigquery.table import Table @@ -4916,7 +4916,7 @@ def test_insert_rows_json_with_string_id(self): data=expected, ) - def test_insert_rows_json_wo_explicit_insert_ids(self): + def test_insert_rows_json_w_explicit_none_insert_ids(self): rows = [{"col1": "val1"}, {"col2": "val2"}] creds = _make_credentials() http = object()