feat(bigquery): expose date_as_object parameter to users #150

Merged
merged 6 commits on Jun 29, 2020
Changes from 5 commits
14 changes: 11 additions & 3 deletions google/cloud/bigquery/job.py
@@ -3320,6 +3320,7 @@ def to_dataframe(
dtypes=None,
progress_bar_type=None,
create_bqstorage_client=True,
date_as_object=True,
):
"""Return a pandas DataFrame from a QueryJob

@@ -3350,16 +3351,22 @@ def to_dataframe(
for details.

.. versionadded:: 1.11.0
create_bqstorage_client (bool):
Optional. If ``True`` (default), create a BigQuery Storage API
client using the default API settings. The BigQuery Storage API
create_bqstorage_client (Optional[bool]):
If ``True`` (default), create a BigQuery Storage API client
using the default API settings. The BigQuery Storage API
is a faster way to fetch rows from BigQuery. See the
``bqstorage_client`` parameter for more information.

This argument does nothing if ``bqstorage_client`` is supplied.

.. versionadded:: 1.24.0

date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.

.. versionadded:: 1.26.0

Returns:
A :class:`~pandas.DataFrame` populated with row data and column
headers from the query results. The column headers are derived
@@ -3373,6 +3380,7 @@
dtypes=dtypes,
progress_bar_type=progress_bar_type,
create_bqstorage_client=create_bqstorage_client,
date_as_object=date_as_object,
)

def __iter__(self):
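For context, here is a minimal usage sketch of the flag this PR exposes (the query, column name, and client setup are illustrative, not taken from the diff):

# Illustrative usage of the new ``date_as_object`` parameter; project and
# credentials come from the default environment, the query is a placeholder.
from google.cloud import bigquery

client = bigquery.Client()
job = client.query("SELECT DATE '1999-12-01' AS d")

# Default behavior: DATE columns arrive as dtype ``object`` (datetime.date values).
df_obj = job.to_dataframe()

# With the new flag: DATE columns are converted to datetime64[ns].
df_dt = job.to_dataframe(date_as_object=False)
print(df_obj["d"].dtype, df_dt["d"].dtype)  # object datetime64[ns]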
17 changes: 13 additions & 4 deletions google/cloud/bigquery/table.py
@@ -1633,6 +1633,7 @@ def to_dataframe(
dtypes=None,
progress_bar_type=None,
create_bqstorage_client=True,
date_as_object=True,
):
"""Create a pandas DataFrame by loading all pages of a query.

@@ -1673,16 +1674,22 @@ def to_dataframe(
progress bar as a graphical dialog box.

.. versionadded:: 1.11.0
create_bqstorage_client (bool):
Optional. If ``True`` (default), create a BigQuery Storage API
client using the default API settings. The BigQuery Storage API
create_bqstorage_client (Optional[bool]):
If ``True`` (default), create a BigQuery Storage API client
using the default API settings. The BigQuery Storage API
is a faster way to fetch rows from BigQuery. See the
``bqstorage_client`` parameter for more information.

This argument does nothing if ``bqstorage_client`` is supplied.

.. versionadded:: 1.24.0

date_as_object (Optional[bool]):
If ``True`` (default), cast dates to objects. If ``False``, convert
to datetime64[ns] dtype.

.. versionadded:: 1.26.0

Returns:
pandas.DataFrame:
A :class:`~pandas.DataFrame` populated with row data and column
@@ -1722,7 +1729,7 @@ def to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=create_bqstorage_client,
)
df = record_batch.to_pandas()
df = record_batch.to_pandas(date_as_object=date_as_object)
for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column])
return df
@@ -1799,6 +1806,7 @@ def to_dataframe(
dtypes=None,
progress_bar_type=None,
create_bqstorage_client=True,
date_as_object=True,
):
"""Create an empty dataframe.

@@ -1807,6 +1815,7 @@
dtypes (Any): Ignored. Added for compatibility with RowIterator.
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
date_as_object (bool): Ignored. Added for compatibility with RowIterator.

Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
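The table.py change simply forwards the flag to pyarrow. A standalone sketch (not part of the PR) of the underlying pyarrow behavior being delegated to:

# Demonstrates RecordBatch.to_pandas(date_as_object=...), which is what
# RowIterator.to_dataframe now passes through to.
import datetime
import pyarrow

batch = pyarrow.record_batch(
    [pyarrow.array([datetime.date(1999, 12, 1)])], names=["date"]
)
print(batch.to_pandas(date_as_object=True)["date"].dtype)   # object
print(batch.to_pandas(date_as_object=False)["date"].dtype)  # datetime64[ns]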
73 changes: 72 additions & 1 deletion tests/unit/test_job.py
@@ -5504,7 +5504,15 @@ def test_to_dataframe_column_dtypes(self):
},
}
row_data = [
["1.4338368E9", "420", "1.1", "1.77", "Cash", "true", "1999-12-01"],
[
"1.4338368E9",
"420",
"1.1",
"1.77",
"Cto_dataframeash",
"true",
"1999-12-01",
],
["1.3878117E9", "2580", "17.7", "28.5", "Cash", "false", "1953-06-14"],
["1.3855653E9", "2280", "4.4", "7.1", "Credit", "true", "1981-11-04"],
]
@@ -5533,6 +5541,69 @@ def test_to_dataframe_column_dtypes(self):
self.assertEqual(df.complete.dtype.name, "bool")
self.assertEqual(df.date.dtype.name, "object")

@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
@unittest.skipIf(pandas is None, "Requires `pandas`")
def test_to_dataframe_column_date_dtypes(self):
begun_resource = self._make_resource()
query_resource = {
"jobComplete": True,
"jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
"totalRows": "1",
"schema": {"fields": [{"name": "date", "type": "DATE"}]},
}
row_data = [
["1999-12-01"],
]
rows = [{"f": [{"v": field} for field in row]} for row in row_data]
query_resource["rows"] = rows
done_resource = copy.deepcopy(begun_resource)
done_resource["status"] = {"state": "DONE"}
connection = _make_connection(
begun_resource, query_resource, done_resource, query_resource
)
client = _make_client(project=self.PROJECT, connection=connection)
job = self._make_one(self.JOB_ID, self.QUERY, client)
df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False)

self.assertIsInstance(df, pandas.DataFrame)
self.assertEqual(len(df), 1) # verify the number of rows
exp_columns = [field["name"] for field in query_resource["schema"]["fields"]]
self.assertEqual(list(df), exp_columns) # verify the column names

self.assertEqual(df.date.dtype.name, "datetime64[ns]")

@unittest.skipIf(pandas is None, "Requires `pandas`")
def test_to_dataframe_column_date_dtypes_wo_pyarrow(self):
begun_resource = self._make_resource()
query_resource = {
"jobComplete": True,
"jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
"totalRows": "1",
"schema": {"fields": [{"name": "date", "type": "DATE"}]},
}
row_data = [
["1999-12-01"],
]
rows = [{"f": [{"v": field} for field in row]} for row in row_data]
query_resource["rows"] = rows
done_resource = copy.deepcopy(begun_resource)
done_resource["status"] = {"state": "DONE"}
connection = _make_connection(
begun_resource, query_resource, done_resource, query_resource
)
client = _make_client(project=self.PROJECT, connection=connection)
job = self._make_one(self.JOB_ID, self.QUERY, client)

with mock.patch("google.cloud.bigquery.table.pyarrow", None):
df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False)

self.assertIsInstance(df, pandas.DataFrame)
self.assertEqual(len(df), 1) # verify the number of rows
exp_columns = [field["name"] for field in query_resource["schema"]["fields"]]
self.assertEqual(list(df), exp_columns) # verify the column names

self.assertEqual(df.date.dtype.name, "object")

@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(tqdm is None, "Requires `tqdm`")
@mock.patch("tqdm.tqdm")