From 7517dd0f4905f533209e31354f31c0e0268afea8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 28 Jan 2019 17:46:05 -0600 Subject: [PATCH 1/2] DOC: Document breaking change to read_csv Closes https://github.com/pandas-dev/pandas/issues/24987 --- doc/source/user_guide/io.rst | 30 +++++++++++++++++++++ doc/source/whatsnew/v0.24.0.rst | 46 +++++++++++++++++++++++++++++++++ pandas/io/parsers.py | 11 +++++--- 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 58e1b2370c7c8..b23a0f10e9e2b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -989,6 +989,36 @@ a single date rather than the entire array. os.remove('tmp.csv') + +.. _io.csv.mixed_timezones: + +Parsing a CSV with mixed Timezones +++++++++++++++++++++++++++++++++++ + +Pandas cannot natively represent a column or index with mixed timezones. If your CSV +file contains columns with a mixture of timezones, the default result will be +an object-dtype column with strings, even with ``parse_dates``. + + +.. ipython:: python + + content = """\ + a + 2000-01-01T00:00:00+05:00 + 2000-01-01T00:00:00+06:00""" + df = pd.read_csv(StringIO(content), parse_dates=['a']) + df['a'] + +To parse the mixed-timezone values as a datetime column, pass a partially-applied +:func:`to_datetime` with ``utc=True`` as the ``date_parser``. + +.. ipython:: python + + df = pd.read_csv(StringIO(content), parse_dates=['a'], + date_parser=lambda col: pd.to_datetime(col, utc=True)) + df['a'] + + .. _io.dayfirst: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 16319a3b83ca4..da7bb631550db 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -648,6 +648,52 @@ that the dates have been converted to UTC pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) + +.. _whatsnew_0240.api_breaking.read_csv_mixed_tz: + +Parsing mixed-timezones with :func:`read_csv` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_csv` no longer silently converts mixed-timezone columns to UTC (:issue:`24987`). + +*Previous Behavior* + +.. code-block:: python + + >>> import io + >>> content = """\ + ... a + ... 2000-01-01T00:00:00+05:00 + ... 2000-01-01T00:00:00+06:00""" + >>> df = pd.read_csv(io.StringIO(content), parse_dates=['a']) + >>> df.a + 0 1999-12-31 19:00:00 + 1 1999-12-31 18:00:00 + Name: a, dtype: datetime64[ns] + +*New Behavior* + +.. ipython:: python + + import io + content = """\ + a + 2000-01-01T00:00:00+05:00 + 2000-01-01T00:00:00+06:00""" + df = pd.read_csv(io.StringIO(content), parse_dates=['a']) + df.a + +As can be seen, the ``dtype`` is object; each value in the column is a string. +To convert the strings to an array of datetimes, the ``date_parser`` argument + +.. ipython:: python + + df = pd.read_csv(io.StringIO(content), parse_dates=['a'], + date_parser=lambda col: pd.to_datetime(col, utc=True) + df.a + +See :ref:`whatsnew_0240.api.timezone_offset_parsing` for more. + .. _whatsnew_0240.api_breaking.period_end_time: Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b31d3f665f47f..4163a571df800 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -203,9 +203,14 @@ * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call result 'foo' - If a column or index contains an unparseable date, the entire column or - index will be returned unaltered as an object data type. For non-standard - datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv`` + If a column or index cannot be represented as an array of datetimes, + say because of an unparseable value or a mixture of timezones, the column + or index will be returned unaltered as an object data type. For + non-standard datetime parsing, use ``pd.to_datetime`` after + ``pd.read_csv``. To parse an index or column with a mixture of timezones, + specify ``date_parser`` to be a partially-applied + :func:`pandas.to_datetime` with ``utc=True``. See + :ref:`io.csv.mixed_timezones` for more. Note: A fast-path exists for iso8601-formatted dates. infer_datetime_format : bool, default False From c26684d955c83c233d218e2e5bea4bd0fc6813a3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 28 Jan 2019 20:48:06 -0600 Subject: [PATCH 2/2] missing paren --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index da7bb631550db..a49ea2cf493a6 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -689,7 +689,7 @@ To convert the strings to an array of datetimes, the ``date_parser`` argument .. ipython:: python df = pd.read_csv(io.StringIO(content), parse_dates=['a'], - date_parser=lambda col: pd.to_datetime(col, utc=True) + date_parser=lambda col: pd.to_datetime(col, utc=True)) df.a See :ref:`whatsnew_0240.api.timezone_offset_parsing` for more.