Skip to content

Commit

Permalink
Deprecate using xlrd engine and change default engine to read excel…
Browse files Browse the repository at this point in the history
… files to openpyxl
  • Loading branch information
cruzzoe authored and roberthdevries committed Aug 23, 2020
1 parent eca6068 commit 45e8193
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 20 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
Deprecations
~~~~~~~~~~~~
- Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`)
- The default engine of :func:`read_excel` is now "openpyxl" instead of "xlrd", because "xlrd" is deprecated; "xlrd" is still selected by default for ``.xls`` files and "odf" for ``.ods`` files (:issue:`28547`).
-
-

Expand Down
18 changes: 14 additions & 4 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
from textwrap import fill
from typing import Any, Mapping, Union
import warnings

from pandas._config import config

Expand Down Expand Up @@ -825,8 +826,7 @@ def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
class ExcelFile:
"""
Class for parsing tabular excel sheets into DataFrame objects.
Uses xlrd engine by default. See read_excel for more documentation
Uses xlrd, openpyxl or odf. See read_excel for more documentation
Parameters
----------
Expand All @@ -837,7 +837,7 @@ class ExcelFile:
engine : str, default None
If io is not a buffer or path, this must be set to identify io.
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
default ``xlrd``.
default ``openpyxl``, ``xlrd`` for .xls files, ``odf`` for .ods files.
Engine compatibility :
- ``xlrd`` supports most old/new Excel file formats.
- ``openpyxl`` supports newer Excel file formats.
Expand All @@ -861,14 +861,24 @@ def __init__(
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
if engine is None:
engine = "xlrd"
engine = "openpyxl"
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
if _is_ods_stream(path_or_buffer):
engine = "odf"
else:
ext = os.path.splitext(str(path_or_buffer))[-1]
if ext == ".ods":
engine = "odf"
elif ext == ".xls":
engine = "xlrd"

elif engine == "xlrd":
warnings.warn(
'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. '
'Specify engine="openpyxl" to suppress this warning.',
FutureWarning,
stacklevel=2,
)
if engine not in self._engines:
raise ValueError(f"Unknown engine: {engine}")

Expand Down
7 changes: 6 additions & 1 deletion pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime
from typing import List

import numpy as np
Expand Down Expand Up @@ -517,7 +518,11 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

# TODO: replace with openpyxl constants
if cell.is_date:
return cell.value
try:
# workaround for inaccurate timestamp notation in excel
return datetime.fromtimestamp(round(cell.value.timestamp()))
except (AttributeError, OSError):
return cell.value
elif cell.data_type == "e":
return np.nan
elif cell.data_type == "b":
Expand Down
37 changes: 25 additions & 12 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
marks=[
td.skip_if_no("xlrd"),
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
pytest.mark.filterwarnings(
'ignore:The Excel reader engine "xlrd" is deprecated,'
),
],
),
pytest.param(
Expand All @@ -34,8 +37,8 @@
pytest.param(
None,
marks=[
td.skip_if_no("xlrd"),
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
td.skip_if_no("openpyxl"),
pytest.mark.filterwarnings("ignore:.*html argument"),
],
),
pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
Expand All @@ -51,6 +54,8 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
engine = engine.values[0]
if engine == "openpyxl" and read_ext == ".xls":
return False
if engine is None and read_ext == ".xls":
return False
if engine == "odf" and read_ext != ".ods":
return False
if read_ext == ".ods" and engine != "odf":
Expand Down Expand Up @@ -559,7 +564,7 @@ def test_date_conversion_overflow(self, read_ext):
columns=["DateColWithBigInt", "StringCol"],
)

if pd.read_excel.keywords["engine"] == "openpyxl":
if pd.read_excel.keywords["engine"] in ["openpyxl", None]:
pytest.xfail("Maybe not supported by openpyxl")

result = pd.read_excel("testdateoverflow" + read_ext)
Expand Down Expand Up @@ -942,7 +947,10 @@ def test_read_excel_squeeze(self, read_ext):
expected = pd.Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)

def test_deprecated_kwargs(self, read_ext):
def test_deprecated_kwargs(self, engine, read_ext):
if engine == "xlrd":
pytest.skip("Use of xlrd engine produces a FutureWarning as well")

with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
pd.read_excel("test1" + read_ext, "Sheet1", 0)

Expand All @@ -961,6 +969,19 @@ def test_no_header_with_list_index_col(self, read_ext):
)
tm.assert_frame_equal(expected, result)

def test_excel_high_surrogate(self, engine, read_ext):
    # GH 23809
    # The workbook read below is an .xlsx file, so other extensions are skipped.
    if read_ext != ".xlsx":
        pytest.skip("Test is only applicable to .xlsx file")
    # An unspecified engine is skipped together with openpyxl.
    if engine is None or engine == "openpyxl":
        pytest.skip("Test does not work for openpyxl")

    # should not produce a segmentation violation
    result = pd.read_excel("high_surrogate.xlsx")

    # The sheet holds a single lone-surrogate character under "Column1".
    wanted = pd.DataFrame(["\udc88"], columns=["Column1"])
    tm.assert_frame_equal(wanted, result)


class TestExcelFileRead:
@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -1116,14 +1137,6 @@ def test_excel_read_binary(self, engine, read_ext):
actual = pd.read_excel(data, engine=engine)
tm.assert_frame_equal(expected, actual)

def test_excel_high_surrogate(self, engine):
# GH 23809
expected = pd.DataFrame(["\udc88"], columns=["Column1"])

# should not produce a segmentation violation
actual = pd.read_excel("high_surrogate.xlsx")
tm.assert_frame_equal(expected, actual)

@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
def test_header_with_index_col(self, engine, filename):
# GH 33476
Expand Down
18 changes: 16 additions & 2 deletions pandas/tests/io/excel/test_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,12 +351,16 @@ def test_excel_sheet_by_name_raise(self, path, engine):
msg = "sheet 0 not found"
with pytest.raises(ValueError, match=msg):
pd.read_excel(xl, "0")
else:
elif engine == "xlwt":
import xlrd

msg = "No sheet named <'0'>"
with pytest.raises(xlrd.XLRDError, match=msg):
pd.read_excel(xl, sheet_name="0")
else: # openpyxl
msg = "Worksheet 0 does not exist."
with pytest.raises(KeyError, match=msg):
pd.read_excel(xl, sheet_name="0")

def test_excel_writer_context_manager(self, frame, path):
with ExcelWriter(path) as writer:
Expand Down Expand Up @@ -1199,6 +1203,9 @@ def test_datetimes(self, path):

tm.assert_series_equal(write_frame["A"], read_frame["A"])

@pytest.mark.filterwarnings(
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
)
def test_bytes_io(self, engine):
# see gh-7074
bio = BytesIO()
Expand All @@ -1209,8 +1216,15 @@ def test_bytes_io(self, engine):
df.to_excel(writer)
writer.save()

if engine == "xlwt":
read_engine = "xlrd"
elif engine == "xlsxwriter":
read_engine = "openpyxl"
else:
read_engine = engine

bio.seek(0)
reread_df = pd.read_excel(bio, index_col=0)
reread_df = pd.read_excel(bio, index_col=0, engine=read_engine)
tm.assert_frame_equal(df, reread_df)

def test_write_lists_dict(self, path):
Expand Down
28 changes: 27 additions & 1 deletion pandas/tests/io/excel/test_xlrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ def skip_ods_and_xlsb_files(read_ext):
pytest.skip("Not valid for xlrd")


@pytest.mark.filterwarnings(
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
)
def test_read_xlrd_book(read_ext, frame):
df = frame

Expand All @@ -36,8 +39,31 @@ def test_read_xlrd_book(read_ext, frame):


# TODO: test for openpyxl as well
@pytest.mark.filterwarnings(
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
)
def test_excel_table_sheet_by_index(datapath, read_ext):
path = datapath("io", "data", "excel", f"test1{read_ext}")
with pd.ExcelFile(path) as excel:
with pd.ExcelFile(path, engine="xlrd") as excel:
with pytest.raises(xlrd.XLRDError):
pd.read_excel(excel, sheet_name="asdf")


def test_excel_file_warning_with_xlsx_file(datapath):
    # GH 29375
    # Explicitly requesting the "xlrd" engine through ExcelFile must emit a
    # FutureWarning that points users to "openpyxl".
    path = datapath("io", "data", "excel", "test1.xlsx")
    with tm.assert_produces_warning(
        FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False
    ) as w:
        # Open through the context manager so the ExcelFile is closed again
        # instead of being left open for the rest of the test session.
        with pd.ExcelFile(path, engine="xlrd"):
            pass

    expected = '"xlrd" is deprecated, use "openpyxl" instead.'
    # Extra warnings are tolerated above, so do not rely on the deprecation
    # warning being the first one recorded.
    assert any(
        issubclass(item.category, FutureWarning) and expected in str(item.message)
        for item in w
    )


def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
    # GH 29375
    # Explicitly requesting the "xlrd" engine through read_excel must emit a
    # FutureWarning that points users to "openpyxl".
    # NOTE(review): the ``tmpdir`` fixture is requested but never used in this
    # test; it is kept only so the signature stays unchanged.
    path = datapath("io", "data", "excel", "test1.xlsx")
    with tm.assert_produces_warning(
        FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
    ) as w:
        pd.read_excel(path, "Sheet1", engine="xlrd")

    expected = '"xlrd" is deprecated, use "openpyxl" instead.'
    # Extra warnings are tolerated above, so do not rely on the deprecation
    # warning being the first one recorded.
    assert any(
        issubclass(item.category, FutureWarning) and expected in str(item.message)
        for item in w
    )

0 comments on commit 45e8193

Please sign in to comment.