Openpyxl engine for reading excel files (#25092)

pandas-dev · Jun 28, 2019 · 1be0561 · 1be0561
1 parent be4b48e
commit 1be0561
Show file tree

Hide file tree

Showing 8 changed files with 125 additions and 4 deletions.
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -159,6 +159,7 @@ Other enhancements
 - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
 - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module>`` is a library implementing the pandas plotting API (:issue:`14130`)
 - :class:`pandas.offsets.BusinessHour` supports multiple opening hours intervals (:issue:`15481`)
+- :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`)
 
 .. _whatsnew_0250.api_breaking:
 

diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -24,3 +24,4 @@
 FilePathOrBuffer = Union[str, Path, IO[AnyStr]]
 
 FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame)
+Scalar = Union[str, int, float]
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -411,7 +411,43 @@ def use_inf_as_na_cb(key):
     cf.register_option('chained_assignment', 'warn', chained_assignment,
                        validator=is_one_of_factory([None, 'warn', 'raise']))
 
-# Set up the io.excel specific configuration.
+
+# Set up the io.excel specific reader configuration.
+reader_engine_doc = """
+: string
+    The default Excel reader engine for '{ext}' files. Available options:
+    auto, {others}.
+"""
+
+_xls_options = ['xlrd']
+_xlsm_options = ['xlrd', 'openpyxl']
+_xlsx_options = ['xlrd', 'openpyxl']
+
+
+with cf.config_prefix("io.excel.xls"):
+    cf.register_option("reader", "auto",
+                       reader_engine_doc.format(
+                           ext='xls',
+                           others=', '.join(_xls_options)),
+                       validator=str)
+
+with cf.config_prefix("io.excel.xlsm"):
+    cf.register_option("reader", "auto",
+                       reader_engine_doc.format(
+                           ext='xlsm',
+                           others=', '.join(_xlsm_options)),
+                       validator=str)
+
+
+with cf.config_prefix("io.excel.xlsx"):
+    cf.register_option("reader", "auto",
+                       reader_engine_doc.format(
+                           ext='xlsx',
+                           others=', '.join(_xlsx_options)),
+                       validator=str)
+
+
+# Set up the io.excel specific writer configuration.
 writer_engine_doc = """
 : string
     The default Excel writer engine for '{ext}' files. Available options:

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -422,7 +422,7 @@ def parse(self,
             data = self.get_sheet_data(sheet, convert_float)
             usecols = _maybe_convert_usecols(usecols)
 
-            if sheet.nrows == 0:
+            if not data:
                 output[asheetname] = DataFrame()
                 continue
 
@@ -769,9 +769,11 @@ class ExcelFile:
     """
 
     from pandas.io.excel._xlrd import _XlrdReader
+    from pandas.io.excel._openpyxl import _OpenpyxlReader
 
     _engines = {
         'xlrd': _XlrdReader,
+        'openpyxl': _OpenpyxlReader,
     }
 
     def __init__(self, io, engine=None):

diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -1,4 +1,12 @@
-from pandas.io.excel._base import ExcelWriter
+from typing import List
+
+import numpy as np
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas._typing import FilePathOrBuffer, Scalar
+
+from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
 from pandas.io.excel._util import _validate_freeze_panes
 
 
@@ -451,3 +459,67 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
                             xcell = wks.cell(column=col, row=row)
                             for k, v in style_kwargs.items():
                                 setattr(xcell, k, v)
+
+
+class _OpenpyxlReader(_BaseExcelReader):
+
+    def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
+        """Reader using openpyxl engine.
+
+        Parameters
+        ----------
+        filepath_or_buffer : string, path object or Workbook
+            Object to be parsed.
+        """
+        import_optional_dependency("openpyxl")
+        super().__init__(filepath_or_buffer)
+
+    @property
+    def _workbook_class(self):
+        from openpyxl import Workbook
+        return Workbook
+
+    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+        from openpyxl import load_workbook
+        return load_workbook(filepath_or_buffer,
+                             read_only=True, data_only=True)
+
+    @property
+    def sheet_names(self) -> List[str]:
+        return self.book.sheetnames
+
+    def get_sheet_by_name(self, name: str):
+        return self.book[name]
+
+    def get_sheet_by_index(self, index: int):
+        return self.book.worksheets[index]
+
+    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
+
+        # TODO: replace with openpyxl constants
+        if cell.is_date:
+            return cell.value
+        elif cell.data_type == 'e':
+            return np.nan
+        elif cell.data_type == 'b':
+            return bool(cell.value)
+        elif cell.value is None:
+            return ''  # compat with xlrd
+        elif cell.data_type == 'n':
+            # GH5394
+            if convert_float:
+                val = int(cell.value)
+                if val == cell.value:
+                    return val
+            else:
+                return float(cell.value)
+
+        return cell.value
+
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+        data = []  # type: List[List[Scalar]]
+        for row in sheet.rows:
+            data.append(
+                [self._convert_cell(cell, convert_float) for cell in row])
+
+        return data
diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm
diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -38,13 +38,17 @@ class TestReaders:
         # Add any engines to test here
         pytest.param('xlrd', marks=pytest.mark.skipif(
             not td.safe_import("xlrd"), reason="no xlrd")),
+        pytest.param('openpyxl', marks=pytest.mark.skipif(
+            not td.safe_import("openpyxl"), reason="no openpyxl")),
         pytest.param(None, marks=pytest.mark.skipif(
             not td.safe_import("xlrd"), reason="no xlrd")),
     ])
-    def cd_and_set_engine(self, request, datapath, monkeypatch):
+    def cd_and_set_engine(self, request, datapath, monkeypatch, read_ext):
         """
         Change directory and set engine for read_excel calls.
         """
+        if request.param == 'openpyxl' and read_ext == '.xls':
+            pytest.skip()
         func = partial(pd.read_excel, engine=request.param)
         monkeypatch.chdir(datapath("io", "data"))
         monkeypatch.setattr(pd, 'read_excel', func)
@@ -397,6 +401,9 @@ def test_date_conversion_overflow(self, read_ext):
                                  [1e+20, 'Timothy Brown']],
                                 columns=['DateColWithBigInt', 'StringCol'])
 
+        if pd.read_excel.keywords['engine'] == 'openpyxl':
+            pytest.xfail("Maybe not supported by openpyxl")
+
         result = pd.read_excel('testdateoverflow' + read_ext)
         tm.assert_frame_equal(result, expected)
 
@@ -724,6 +731,8 @@ class TestExcelFileRead:
         # Add any engines to test here
         pytest.param('xlrd', marks=pytest.mark.skipif(
             not td.safe_import("xlrd"), reason="no xlrd")),
+        pytest.param('openpyxl', marks=pytest.mark.skipif(
+            not td.safe_import("openpyxl"), reason="no openpyxl")),
         pytest.param(None, marks=pytest.mark.skipif(
             not td.safe_import("xlrd"), reason="no xlrd")),
     ])
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,3 +24,4 @@
		FilePathOrBuffer = Union[str, Path, IO[AnyStr]]

		FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame)
		Scalar = Union[str, int, float]