From e9a84798e760dffdcc9c434beb7106b61f043f72 Mon Sep 17 00:00:00 2001
From: Alix Hamilton
Date: Fri, 17 Nov 2017 16:32:07 -0800
Subject: [PATCH] adds to_dataframe() helper to QueryJob

---
Reviewer notes (text between the "---" marker and the first "diff --git"
line is commentary and is not part of the commit):

* job.py: adds a to_dataframe() method that returns
  self.result().to_dataframe(), and changes the documented :rtype: of
  result() from page_iterator.Iterator to bigquery.table.RowIterator.
* test_job.py: adds test_to_dataframe (skipped when pandas is None); it
  checks the returned type, the row count (4) and the column names
  (['name', 'age']).

 bigquery/google/cloud/bigquery/job.py | 15 ++++++++++++++-
 bigquery/tests/unit/test_job.py       | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/bigquery/google/cloud/bigquery/job.py b/bigquery/google/cloud/bigquery/job.py
index 91301b1ed8d2b..9f66e3ec9ea0a 100644
--- a/bigquery/google/cloud/bigquery/job.py
+++ b/bigquery/google/cloud/bigquery/job.py
@@ -1929,7 +1929,7 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         :type retry: :class:`google.api_core.retry.Retry`
         :param retry: (Optional) How to retry the call that retrieves rows.
 
-        :rtype: :class:`~google.api_core.page_iterator.Iterator`
+        :rtype: :class:`~google.cloud.bigquery.table.RowIterator`
         :returns: Iterator of row data
             :class:`~google.cloud.bigquery.table.Row`-s. During each page, the
             iterator will have the ``total_rows``
@@ -1949,6 +1949,19 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         return self._client.list_rows(dest_table, selected_fields=schema,
                                       retry=retry)
 
+    def to_dataframe(self):
+        """Return a pandas DataFrame from a QueryJob
+
+        Returns:
+            A :class:`~pandas.DataFrame` populated with row data and column
+            headers from the query results. The column headers are derived
+            from the destination table's schema.
+
+        Raises:
+            ValueError: If the `pandas` library cannot be imported.
+        """
+        return self.result().to_dataframe()
+
     def __iter__(self):
         return iter(self.result())
 
diff --git a/bigquery/tests/unit/test_job.py b/bigquery/tests/unit/test_job.py
index f6ac61973ef05..2f141a4dc04d3 100644
--- a/bigquery/tests/unit/test_job.py
+++ b/bigquery/tests/unit/test_job.py
@@ -2724,6 +2724,41 @@ def test_reload_w_alternate_client(self):
         self.assertEqual(req['path'], PATH)
         self._verifyResourceProperties(job, RESOURCE)
 
+    @unittest.skipIf(pandas is None, 'Requires `pandas`')
+    def test_to_dataframe(self):
+        begun_resource = self._make_resource()
+        query_resource = {
+            'jobComplete': True,
+            'jobReference': {
+                'projectId': self.PROJECT,
+                'jobId': self.JOB_ID,
+            },
+            'schema': {
+                'fields': [
+                    {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
+                    {'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE'},
+                ],
+            },
+            'rows': [
+                {'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
+                {'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
+                {'f': [{'v': 'Wylma Phlyntstone'}, {'v': '29'}]},
+                {'f': [{'v': 'Bhettye Rhubble'}, {'v': '27'}]},
+            ],
+        }
+        done_resource = copy.deepcopy(begun_resource)
+        done_resource['status'] = {'state': 'DONE'}
+        connection = _Connection(
+            begun_resource, query_resource, done_resource, query_resource)
+        client = _make_client(project=self.PROJECT, connection=connection)
+        job = self._make_one(self.JOB_ID, self.QUERY, client)
+
+        df = job.to_dataframe()
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 4)  # verify the number of rows
+        self.assertEqual(list(df), ['name', 'age'])  # verify the column names
+
     def test_iter(self):
         import types
 