Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Deprecate using xlrd engine for read_excel #35029

Merged
merged 16 commits into from
Dec 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ including other versions of pandas.

{{ header }}

.. warning::

Previously, the default argument ``engine=None`` to ``pd.read_excel``
would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in
many cases. The ``xlrd`` engine is no longer maintained and is not supported with
Python >= 3.9. If `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
many of these cases will now default to using the ``openpyxl`` engine. See the
:func:`read_excel` documentation for more details.

.. ---------------------------------------------------------------------------

Enhancements
Expand Down
95 changes: 91 additions & 4 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import abc
import datetime
import inspect
from io import BufferedIOBase, BytesIO, RawIOBase
import os
from textwrap import fill
from typing import Any, Dict, Mapping, Union, cast
import warnings

from pandas._config import config

from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions
from pandas.compat._optional import import_optional_dependency
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments

Expand Down Expand Up @@ -99,12 +102,32 @@
of dtype conversion.
engine : str, default None
If io is not a buffer or path, this must be set to identify io.
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd".
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
Engine compatibility :

- "xlrd" supports most old/new Excel file formats.
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
- "openpyxl" supports newer Excel file formats.
- "odf" supports OpenDocument file formats (.odf, .ods, .odt).
- "pyxlsb" supports Binary Excel files.

.. versionchanged:: 1.2.0
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
is no longer maintained, and is not supported with
python >= 3.9. When ``engine=None``, the following logic will be
used to determine the engine.

- If ``io`` is an OpenDocument format (.odf, .ods, .odt),
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``io`` is a bytes stream or an ``xlrd`` Book instance, or the
file has the extension ``.xls``, then ``xlrd`` will be used.
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
then ``openpyxl`` will be used.
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be issued.

Specifying ``engine="xlrd"`` will continue to be allowed for the
indefinite future.

converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
Expand Down Expand Up @@ -877,13 +900,32 @@ class ExcelFile:
.xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
engine : str, default None
If ``path_or_buffer`` is not a buffer or path, this must be set to identify it.
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
default ``xlrd``.
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``.
Engine compatibility :

- ``xlrd`` supports most old/new Excel file formats.
- ``openpyxl`` supports newer Excel file formats.
- ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
- ``pyxlsb`` supports Binary Excel files.

.. versionchanged:: 1.2.0

The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
is no longer maintained, and is not supported with
python >= 3.9. When ``engine=None``, the following logic will be
used to determine the engine.

- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

obviously as much of the formatting you can do here as well

then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is a bytes stream or an ``xlrd`` Book
instance, or the file has the extension ``.xls``, then ``xlrd`` will be used.
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
then ``openpyxl`` will be used.
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be issued.

Specifying ``engine="xlrd"`` will continue to be allowed for the
indefinite future.
"""

from pandas.io.excel._odfreader import ODFReader
Expand All @@ -902,14 +944,59 @@ def __init__(
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
if engine is None:
engine = "xlrd"
# Determine ext and use odf for ods stream/file
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
ext = None
if _is_ods_stream(path_or_buffer):
engine = "odf"
else:
ext = os.path.splitext(str(path_or_buffer))[-1]
if ext == ".ods":
engine = "odf"

WillAyd marked this conversation as resolved.
Show resolved Hide resolved
if (
import_optional_dependency(
"xlrd", raise_on_missing=False, on_version="ignore"
)
is not None
):
from xlrd import Book

if isinstance(path_or_buffer, Book):
engine = "xlrd"

# GH 35029 - Prefer openpyxl except for xls files
if engine is None:
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
engine = "xlrd"
elif (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
is not None
):
engine = "openpyxl"
else:
caller = inspect.stack()[1]
if (
caller.filename.endswith("pandas/io/excel/_base.py")
and caller.function == "read_excel"
):
stacklevel = 4
else:
stacklevel = 2
warnings.warn(
"The xlrd engine is no longer maintained and is not "
"supported when using pandas with python >= 3.9. However, "
"the engine xlrd will continue to be allowed for the "
"indefinite future. Beginning with pandas 1.2.0, the "
"openpyxl engine will be used if it is installed and the "
"engine argument is not specified. Either install openpyxl "
"or specify engine='xlrd' to silence this warning.",
FutureWarning,
stacklevel=stacklevel,
)
engine = "xlrd"
if engine not in self._engines:
raise ValueError(f"Unknown engine: {engine}")

Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,10 @@ def test_date_conversion_overflow(self, read_ext):
if pd.read_excel.keywords["engine"] == "openpyxl":
pytest.xfail("Maybe not supported by openpyxl")

if pd.read_excel.keywords["engine"] is None:
# GH 35029
pytest.xfail("Defaults to openpyxl, maybe not supported")

result = pd.read_excel("testdateoverflow" + read_ext)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1159,7 +1163,7 @@ def test_excel_high_surrogate(self, engine):
expected = DataFrame(["\udc88"], columns=["Column1"])

# should not produce a segmentation violation
actual = pd.read_excel("high_surrogate.xlsx")
actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
tm.assert_frame_equal(expected, actual)

@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/io/excel/test_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,12 +351,15 @@ def test_excel_sheet_by_name_raise(self, path, engine):
msg = "sheet 0 not found"
with pytest.raises(ValueError, match=msg):
pd.read_excel(xl, "0")
else:
elif engine == "xlwt":
import xlrd

msg = "No sheet named <'0'>"
with pytest.raises(xlrd.XLRDError, match=msg):
pd.read_excel(xl, sheet_name="0")
else:
with pytest.raises(KeyError, match="Worksheet 0 does not exist."):
pd.read_excel(xl, sheet_name="0")

def test_excel_writer_context_manager(self, frame, path):
with ExcelWriter(path) as writer:
Expand Down Expand Up @@ -1193,7 +1196,9 @@ def test_datetimes(self, path):

write_frame = DataFrame({"A": datetimes})
write_frame.to_excel(path, "Sheet1")
read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
# GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
engine = "odf" if path.endswith("ods") else "xlrd"
read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)

tm.assert_series_equal(write_frame["A"], read_frame["A"])

Expand Down
46 changes: 45 additions & 1 deletion pandas/tests/io/excel/test_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest

from pandas.compat._optional import import_optional_dependency

import pandas as pd
import pandas._testing as tm

Expand Down Expand Up @@ -38,6 +40,48 @@ def test_read_xlrd_book(read_ext, frame):
# TODO: test for openpyxl as well
def test_excel_table_sheet_by_index(datapath, read_ext):
path = datapath("io", "data", "excel", f"test1{read_ext}")
with ExcelFile(path) as excel:
with ExcelFile(path, engine="xlrd") as excel:
with pytest.raises(xlrd.XLRDError):
pd.read_excel(excel, sheet_name="asdf")


def test_excel_file_warning_with_xlsx_file(datapath):
    """
    Check the warning behaviour of ``ExcelFile`` for an .xlsx path with ``engine=None``.

    When openpyxl cannot be imported, constructing the ``ExcelFile`` must emit
    the xlrd ``FutureWarning``; when openpyxl is importable, constructing it
    must emit no warning at all.

    Parameters
    ----------
    datapath : fixture
        Called with path components to locate ``test1.xlsx``
        (presumably the shared pandas test-data fixture -- confirm in conftest).
    """
    # GH 29375
    path = datapath("io", "data", "excel", "test1.xlsx")
    has_openpyxl = (
        import_optional_dependency(
            "openpyxl", raise_on_missing=False, on_version="ignore"
        )
        is not None
    )
    if not has_openpyxl:
        with tm.assert_produces_warning(
            FutureWarning,
            raise_on_extra_warnings=False,
            match="The xlrd engine is no longer maintained",
        ):
            # Context manager so the file handle is released once the
            # constructor has emitted (or failed to emit) the warning.
            with ExcelFile(path, engine=None):
                pass
    else:
        with tm.assert_produces_warning(None):
            # Exercise ExcelFile itself here; the read_excel entry point is
            # covered by test_read_excel_warning_with_xlsx_file.
            with ExcelFile(path, engine=None):
                pass


def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
    """
    Check the warning behaviour of ``pd.read_excel`` for an .xlsx path with ``engine=None``.

    With openpyxl importable the read must be warning-free; otherwise the
    read must emit the xlrd ``FutureWarning``.

    Parameters
    ----------
    tmpdir : fixture
        Requested but not used by this test.
    datapath : fixture
        Called with path components to locate ``test1.xlsx``.
    """
    # GH 29375
    xlsx_path = datapath("io", "data", "excel", "test1.xlsx")
    openpyxl_module = import_optional_dependency(
        "openpyxl", raise_on_missing=False, on_version="ignore"
    )

    if openpyxl_module is not None:
        # openpyxl is available, so no deprecation warning is expected.
        with tm.assert_produces_warning(None):
            pd.read_excel(xlsx_path, "Sheet1", engine=None)
        return

    # No openpyxl: the xlrd deprecation warning is expected.
    with tm.assert_produces_warning(
        FutureWarning,
        raise_on_extra_warnings=False,
        match="The xlrd engine is no longer maintained",
    ):
        pd.read_excel(xlsx_path, "Sheet1", engine=None)