Skip to content

Commit

Permalink
Openpyxl engine for reading excel files (#25092)
Browse files Browse the repository at this point in the history
  • Loading branch information
tdamsma authored and WillAyd committed Jun 28, 2019
1 parent be4b48e commit 1be0561
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ Other enhancements
- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
- Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module>`` is a library implementing the pandas plotting API (:issue:`14130`)
- :class:`pandas.offsets.BusinessHour` supports multiple opening hours intervals (:issue:`15481`)
- :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`)

.. _whatsnew_0250.api_breaking:

Expand Down
1 change: 1 addition & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@
# File location or handle: a str, a Path, or an IO object.
FilePathOrBuffer = Union[str, Path, IO[AnyStr]]

# Type variable constrained to ABCSeries or ABCDataFrame.
FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame)
# Union of the basic scalar types: str, int and float.
Scalar = Union[str, int, float]
38 changes: 37 additions & 1 deletion pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,43 @@ def use_inf_as_na_cb(key):
cf.register_option('chained_assignment', 'warn', chained_assignment,
validator=is_one_of_factory([None, 'warn', 'raise']))

# Set up the io.excel specific configuration.

# Set up the io.excel specific reader configuration.
# Description template shared by every "reader" option below: ``{ext}`` is
# filled with the file extension and ``{others}`` with the comma-separated
# names of the engines listed for that extension.
reader_engine_doc = """
: string
The default Excel reader engine for '{ext}' files. Available options:
auto, {others}.
"""

# Reader engines advertised per file extension.
_xls_options = ['xlrd']
_xlsm_options = ['xlrd', 'openpyxl']
_xlsx_options = ['xlrd', 'openpyxl']


# Register "io.excel.<ext>.reader" for each extension, defaulting to "auto".
# The validator only checks that the value is a string.
for _reader_ext, _reader_engines in (('xls', _xls_options),
                                     ('xlsm', _xlsm_options),
                                     ('xlsx', _xlsx_options)):
    with cf.config_prefix("io.excel." + _reader_ext):
        cf.register_option(
            "reader", "auto",
            reader_engine_doc.format(ext=_reader_ext,
                                     others=', '.join(_reader_engines)),
            validator=str)
# Do not leave the loop variables behind in the module namespace.
del _reader_ext, _reader_engines


# Set up the io.excel specific writer configuration.
writer_engine_doc = """
: string
The default Excel writer engine for '{ext}' files. Available options:
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ def parse(self,
data = self.get_sheet_data(sheet, convert_float)
usecols = _maybe_convert_usecols(usecols)

if sheet.nrows == 0:
if not data:
output[asheetname] = DataFrame()
continue

Expand Down Expand Up @@ -769,9 +769,11 @@ class ExcelFile:
"""

from pandas.io.excel._xlrd import _XlrdReader
from pandas.io.excel._openpyxl import _OpenpyxlReader

_engines = {
'xlrd': _XlrdReader,
'openpyxl': _OpenpyxlReader,
}

def __init__(self, io, engine=None):
Expand Down
74 changes: 73 additions & 1 deletion pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
from pandas.io.excel._base import ExcelWriter
from typing import List

import numpy as np

from pandas.compat._optional import import_optional_dependency

from pandas._typing import FilePathOrBuffer, Scalar

from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
from pandas.io.excel._util import _validate_freeze_panes


Expand Down Expand Up @@ -451,3 +459,67 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
xcell = wks.cell(column=col, row=row)
for k, v in style_kwargs.items():
setattr(xcell, k, v)


class _OpenpyxlReader(_BaseExcelReader):
    """Excel reader that delegates workbook access to ``openpyxl``."""

    def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
        """
        Reader using openpyxl engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.
        """
        # Check that openpyxl is importable before the base class runs.
        import_optional_dependency("openpyxl")
        super().__init__(filepath_or_buffer)

    @property
    def _workbook_class(self):
        """Return the openpyxl ``Workbook`` class."""
        import openpyxl
        return openpyxl.Workbook

    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
        """
        Open a workbook with openpyxl.

        The workbook is loaded with ``read_only=True`` and
        ``data_only=True``.

        Parameters
        ----------
        filepath_or_buffer : string, path object or file-like
            Source handed straight to ``openpyxl.load_workbook``.

        Returns
        -------
        The object returned by ``openpyxl.load_workbook``.
        """
        import openpyxl
        return openpyxl.load_workbook(
            filepath_or_buffer, read_only=True, data_only=True)

    @property
    def sheet_names(self) -> List[str]:
        """Return the ``sheetnames`` of the loaded workbook."""
        return self.book.sheetnames

    def get_sheet_by_name(self, name: str):
        """Return the worksheet stored under ``name`` in the workbook."""
        return self.book[name]

    def get_sheet_by_index(self, index: int):
        """Return the worksheet at position ``index`` in the workbook."""
        return self.book.worksheets[index]

    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
        """
        Translate one openpyxl cell into a plain Python value.

        Parameters
        ----------
        cell : openpyxl cell
            Object exposing ``is_date``, ``data_type`` and ``value``.
        convert_float : bool
            If True, numeric values that equal their integer truncation
            are returned as ``int``; other numeric values are returned
            unchanged. If False, numeric values are returned as ``float``.

        Returns
        -------
        The converted value: the raw value for date cells, ``np.nan`` for
        cells of type ``'e'``, a ``bool`` for cells of type ``'b'``, ``''``
        for empty cells, and otherwise the numeric handling described
        above or the raw value.
        """
        # TODO: replace with openpyxl constants
        if cell.is_date:
            return cell.value

        kind = cell.data_type
        if kind == 'e':
            return np.nan
        if kind == 'b':
            return bool(cell.value)

        raw = cell.value
        if raw is None:
            return ''  # compat with xlrd

        if kind == 'n':
            # GH5394
            if not convert_float:
                return float(raw)
            truncated = int(raw)
            if truncated == raw:
                return truncated

        return raw

    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
        """
        Read every cell of ``sheet`` into a list of rows.

        Parameters
        ----------
        sheet : openpyxl worksheet
            Object whose ``rows`` attribute yields iterables of cells.
        convert_float : bool
            Forwarded to ``_convert_cell`` for each cell.

        Returns
        -------
        list of list
            One inner list of converted values per row, in sheet order.
        """
        convert = self._convert_cell
        return [[convert(cell, convert_float) for cell in row]
                for row in sheet.rows]
Binary file modified pandas/tests/io/data/test1.xlsm
Binary file not shown.
Binary file modified pandas/tests/io/data/test1.xlsx
Binary file not shown.
11 changes: 10 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,17 @@ class TestReaders:
# Add any engines to test here
pytest.param('xlrd', marks=pytest.mark.skipif(
not td.safe_import("xlrd"), reason="no xlrd")),
pytest.param('openpyxl', marks=pytest.mark.skipif(
not td.safe_import("openpyxl"), reason="no openpyxl")),
pytest.param(None, marks=pytest.mark.skipif(
not td.safe_import("xlrd"), reason="no xlrd")),
])
def cd_and_set_engine(self, request, datapath, monkeypatch):
def cd_and_set_engine(self, request, datapath, monkeypatch, read_ext):
"""
Change directory and set engine for read_excel calls.
"""
if request.param == 'openpyxl' and read_ext == '.xls':
pytest.skip()
func = partial(pd.read_excel, engine=request.param)
monkeypatch.chdir(datapath("io", "data"))
monkeypatch.setattr(pd, 'read_excel', func)
Expand Down Expand Up @@ -397,6 +401,9 @@ def test_date_conversion_overflow(self, read_ext):
[1e+20, 'Timothy Brown']],
columns=['DateColWithBigInt', 'StringCol'])

if pd.read_excel.keywords['engine'] == 'openpyxl':
pytest.xfail("Maybe not supported by openpyxl")

result = pd.read_excel('testdateoverflow' + read_ext)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -724,6 +731,8 @@ class TestExcelFileRead:
# Add any engines to test here
pytest.param('xlrd', marks=pytest.mark.skipif(
not td.safe_import("xlrd"), reason="no xlrd")),
pytest.param('openpyxl', marks=pytest.mark.skipif(
not td.safe_import("openpyxl"), reason="no openpyxl")),
pytest.param(None, marks=pytest.mark.skipif(
not td.safe_import("xlrd"), reason="no xlrd")),
])
Expand Down

0 comments on commit 1be0561

Please sign in to comment.