Remove NotImplementedError for parse_dates keyword in read_excel

Rebase and update of PR pandas-dev#12051 Author: Joris Van den Bossche <jorisvandenbossche@gmail.com> Author: Graham R. Jeffries <graham.r.jeffries@gmail.com> This patch had conflicts when merged, resolved by Committer: Jeff Reback <jeff@reback.net> Closes pandas-dev#14326 from jorisvandenbossche/pr/12051 and squashes the following commits: 0b65a7a [Joris Van den Bossche] update wording 656ec44 [Joris Van den Bossche] Fix detection to raise warning b1c7f87 [Joris Van den Bossche] add whatsnew 925ce1b [Joris Van den Bossche] Update tests 0e10a9d [Graham R. Jeffries] remove read_excel kwd NotImplemented error, update documentation pandas-dev#11544
jreback · Mar 27, 2017 · 10d8c40 · 10d8c40
1 parent 686e9e0
commit 10d8c40
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 20 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2767,6 +2767,20 @@ indices to be parsed.
 
    read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3])
 
+
+Parsing Dates
++++++++++++++
+
+Datetime-like values are normally converted automatically to the appropriate
+dtype when reading the Excel file. But if you have a column of strings that
+*look* like dates (but are not actually formatted as dates in Excel), you can
+use the ``parse_dates`` keyword to parse those strings to datetimes:
+
+.. code-block:: python
+
+   read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings'])
+
+
 Cell Converters
 +++++++++++++++
 

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -517,13 +517,14 @@ Other enhancements
 - The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json <io.jsonl>` (:issue:`9180`)
 - :func:`read_excel` now supports the true_values and false_values keyword arguments (:issue:`13347`)
 - ``groupby()`` will now accept a scalar and a single-element list for specifying ``level`` on a non-``MultiIndex`` grouper. (:issue:`13907`)
 - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`).
 - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`)
 - ``pd.read_stata()`` can now handle some format 111 files, which are produced by SAS when generating Stata dta files (:issue:`11526`)
 - ``Series`` and ``Index`` now support ``divmod`` which will return a tuple of
   series or indices. This behaves like a standard binary operator with regards
   to broadcasting rules (:issue:`14208`).
 
+- Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`)
 
 .. _whatsnew_0190.api:
 

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -343,13 +343,10 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None,
         if 'chunksize' in kwds:
             raise NotImplementedError("chunksize keyword of read_excel "
                                       "is not implemented")
-        if parse_dates:
-            raise NotImplementedError("parse_dates keyword of read_excel "
-                                      "is not implemented")
 
-        if date_parser is not None:
-            raise NotImplementedError("date_parser keyword of read_excel "
-                                      "is not implemented")
+        if parse_dates is True and not index_col:
+            warn("The 'parse_dates=True' keyword of read_excel was provided"
+                 " without an 'index_col' keyword value.")
 
         import xlrd
         from xlrd import (xldate, XL_CELL_DATE,

diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -924,17 +924,27 @@ def test_read_excel_chunksize(self):
                           chunksize=100)
 
     def test_read_excel_parse_dates(self):
-        # GH 11544
-        with tm.assertRaises(NotImplementedError):
-            pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
-                          parse_dates=True)
+        # GH 11544, 12051
 
-    def test_read_excel_date_parser(self):
-        # GH 11544
-        with tm.assertRaises(NotImplementedError):
-            dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
-            pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
-                          date_parser=dateparse)
+        df = DataFrame(
+            {'col': [1, 2, 3],
+             'date_strings': pd.date_range('2012-01-01', periods=3)})
+        df2 = df.copy()
+        df2['date_strings'] = df2['date_strings'].dt.strftime('%m/%d/%Y')
+
+        with ensure_clean(self.ext) as pth:
+            df2.to_excel(pth)
+
+            res = read_excel(pth)
+            tm.assert_frame_equal(df2, res)
+
+            res = read_excel(pth, parse_dates=['date_strings'])
+            tm.assert_frame_equal(df, res)
+
+            dateparser = lambda x: pd.datetime.strptime(x, '%m/%d/%Y')
+            res = read_excel(pth, parse_dates=['date_strings'],
+                             date_parser=dateparser)
+            tm.assert_frame_equal(df, res)
 
     def test_read_excel_skiprows_list(self):
         # GH 4903
@@ -1382,8 +1392,7 @@ def test_to_excel_multiindex(self):
             # round trip
             frame.to_excel(path, 'test1', merge_cells=self.merge_cells)
             reader = ExcelFile(path)
-            df = read_excel(reader, 'test1', index_col=[0, 1],
-                            parse_dates=False)
+            df = read_excel(reader, 'test1', index_col=[0, 1])
             tm.assert_frame_equal(frame, df)
 
     # GH13511
@@ -1424,8 +1433,7 @@ def test_to_excel_multiindex_cols(self):
             frame.to_excel(path, 'test1', merge_cells=self.merge_cells)
             reader = ExcelFile(path)
             df = read_excel(reader, 'test1', header=header,
-                            index_col=[0, 1],
-                            parse_dates=False)
+                            index_col=[0, 1])
             if not self.merge_cells:
                 fm = frame.columns.format(sparsify=False,
                                           adjoin=False, names=False)