Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Excel Reader Refactor - Base Class Introduction #24829

Merged
merged 9 commits into from
Jan 26, 2019
230 changes: 128 additions & 102 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,60 +375,20 @@ def read_excel(io,
**kwds)


class _XlrdReader(object):
class _BaseExcelReader(object):

def __init__(self, filepath_or_buffer):
"""Reader using xlrd engine.

Parameters
----------
filepath_or_buffer : string, path object or Workbook
Object to be parsed.
"""
err_msg = "Install xlrd >= 1.0.0 for Excel support"

try:
import xlrd
except ImportError:
raise ImportError(err_msg)
else:
if xlrd.__VERSION__ < LooseVersion("1.0.0"):
raise ImportError(err_msg +
". Current version " + xlrd.__VERSION__)

# If filepath_or_buffer is a url, want to keep the data as bytes so
# can't pass to get_filepath_or_buffer()
if _is_url(filepath_or_buffer):
filepath_or_buffer = _urlopen(filepath_or_buffer)
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
filepath_or_buffer)
@property
def sheet_names(self):
    """
    Names of the sheets in the workbook.

    Not implemented on the base reader; engine-specific subclasses
    (e.g. ``_XlrdReader``) are expected to override it. ``parse`` reads
    this property when ``sheet_name`` is None.
    """
    raise NotImplementedError

if isinstance(filepath_or_buffer, xlrd.Book):
self.book = filepath_or_buffer
elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr(
filepath_or_buffer, "read"):
# N.B. xlrd.Book has a read attribute too
if hasattr(filepath_or_buffer, 'seek'):
try:
# GH 19779
filepath_or_buffer.seek(0)
except UnsupportedOperation:
# HTTPResponse does not support seek()
# GH 20434
pass
def get_sheet_by_name(self, name):
    """
    Return the sheet identified by ``name``.

    Not implemented on the base reader; engine-specific subclasses
    (e.g. ``_XlrdReader``) are expected to override it. ``parse`` calls
    this for sheet identifiers that are strings.
    """
    raise NotImplementedError
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use AbstractMethodError

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for not using abc for this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC @TomAugspurger mentioned performance issues with `isinstance` checks against ABC subclasses (admittedly not applicable to this class), which would be why we haven't used `abc` elsewhere in the code.


data = filepath_or_buffer.read()
self.book = xlrd.open_workbook(file_contents=data)
elif isinstance(filepath_or_buffer, compat.string_types):
self.book = xlrd.open_workbook(filepath_or_buffer)
else:
raise ValueError('Must explicitly set engine if not passing in'
' buffer or path for io.')
def get_sheet_by_index(self, index):
    """
    Return the sheet at position ``index``.

    Not implemented on the base reader; engine-specific subclasses
    (e.g. ``_XlrdReader``) are expected to override it. ``parse`` calls
    this for sheet identifiers that are not strings.
    """
    raise NotImplementedError

@property
def sheet_names(self):
return self.book.sheet_names()
def get_sheet_data(self, sheet, convert_float):
    """
    Return the contents of ``sheet`` for ``parse`` to consume.

    Not implemented on the base reader; engine-specific subclasses
    (e.g. ``_XlrdReader``) are expected to override it. ``sheet`` is the
    object returned by ``get_sheet_by_name`` / ``get_sheet_by_index`` and
    ``convert_float`` is forwarded unchanged from ``parse``.
    """
    raise NotImplementedError

def parse(self,
sheet_name=0,
Expand All @@ -455,56 +415,14 @@ def parse(self,

_validate_header_arg(header)

from xlrd import (xldate, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN,
XL_CELL_NUMBER)

epoch1904 = self.book.datemode

def _parse_cell(cell_contents, cell_typ):
"""converts the contents of the cell into a pandas
appropriate object"""

if cell_typ == XL_CELL_DATE:

# Use the newer xlrd datetime handling.
try:
cell_contents = xldate.xldate_as_datetime(
cell_contents, epoch1904)
except OverflowError:
return cell_contents

# Excel doesn't distinguish between dates and time,
# so we treat dates on the epoch as times only.
# Also, Excel supports 1900 and 1904 epochs.
year = (cell_contents.timetuple())[0:3]
if ((not epoch1904 and year == (1899, 12, 31)) or
(epoch1904 and year == (1904, 1, 1))):
cell_contents = time(cell_contents.hour,
cell_contents.minute,
cell_contents.second,
cell_contents.microsecond)

elif cell_typ == XL_CELL_ERROR:
cell_contents = np.nan
elif cell_typ == XL_CELL_BOOLEAN:
cell_contents = bool(cell_contents)
elif convert_float and cell_typ == XL_CELL_NUMBER:
# GH5394 - Excel 'numbers' are always floats
# it's a minimal perf hit and less surprising
val = int(cell_contents)
if val == cell_contents:
cell_contents = val
return cell_contents

ret_dict = False

# Keep sheetname to maintain backwards compatibility.
if isinstance(sheet_name, list):
sheets = sheet_name
ret_dict = True
elif sheet_name is None:
sheets = self.book.sheet_names()
sheets = self.sheet_names
ret_dict = True
else:
sheets = [sheet_name]
Expand All @@ -519,19 +437,13 @@ def _parse_cell(cell_contents, cell_typ):
print("Reading sheet {sheet}".format(sheet=asheetname))

if isinstance(asheetname, compat.string_types):
sheet = self.book.sheet_by_name(asheetname)
sheet = self.get_sheet_by_name(asheetname)
else: # assume an integer if not a string
sheet = self.book.sheet_by_index(asheetname)
sheet = self.get_sheet_by_index(asheetname)

data = []
data = self.get_sheet_data(sheet, convert_float)
usecols = _maybe_convert_usecols(usecols)

for i in range(sheet.nrows):
row = [_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i),
sheet.row_types(i))]
data.append(row)

if sheet.nrows == 0:
output[asheetname] = DataFrame()
continue
Expand Down Expand Up @@ -620,6 +532,120 @@ def _parse_cell(cell_contents, cell_typ):
return output[asheetname]


class _XlrdReader(_BaseExcelReader):

def __init__(self, filepath_or_buffer):
"""Reader using xlrd engine.

Parameters
----------
filepath_or_buffer : string, path object or Workbook
Object to be parsed.
"""
err_msg = "Install xlrd >= 1.0.0 for Excel support"

try:
import xlrd
except ImportError:
raise ImportError(err_msg)
else:
if xlrd.__VERSION__ < LooseVersion("1.0.0"):
raise ImportError(err_msg +
". Current version " + xlrd.__VERSION__)

# If filepath_or_buffer is a url, want to keep the data as bytes so
# can't pass to get_filepath_or_buffer()
if _is_url(filepath_or_buffer):
filepath_or_buffer = _urlopen(filepath_or_buffer)
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
filepath_or_buffer)

if isinstance(filepath_or_buffer, xlrd.Book):
self.book = filepath_or_buffer
elif hasattr(filepath_or_buffer, "read"):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that I changed this condition slightly, removing the `not isinstance(filepath_or_buffer, xlrd.Book)` check, as it was unnecessary given the preceding statement.

# N.B. xlrd.Book has a read attribute too
if hasattr(filepath_or_buffer, 'seek'):
try:
# GH 19779
filepath_or_buffer.seek(0)
except UnsupportedOperation:
# HTTPResponse does not support seek()
# GH 20434
pass

data = filepath_or_buffer.read()
self.book = xlrd.open_workbook(file_contents=data)
elif isinstance(filepath_or_buffer, compat.string_types):
self.book = xlrd.open_workbook(filepath_or_buffer)
else:
raise ValueError('Must explicitly set engine if not passing in'
' buffer or path for io.')

@property
def sheet_names(self):
    """Names of the sheets, as reported by ``self.book.sheet_names()``."""
    workbook = self.book
    return workbook.sheet_names()

def get_sheet_by_name(self, name):
    """Look up a sheet by ``name`` via ``self.book.sheet_by_name``."""
    workbook = self.book
    return workbook.sheet_by_name(name)

def get_sheet_by_index(self, index):
    """Look up a sheet by ``index`` via ``self.book.sheet_by_index``."""
    workbook = self.book
    return workbook.sheet_by_index(index)

def get_sheet_data(self, sheet, convert_float):
    """
    Read every cell of ``sheet`` into a list of rows.

    Parameters
    ----------
    sheet : object
        Sheet exposing ``nrows``, ``row_values(i)`` and ``row_types(i)``.
    convert_float : bool
        If truthy, numeric cells whose value equals its ``int()``
        truncation are returned as ``int``.

    Returns
    -------
    list of list
        One inner list per row, holding the converted cell values.
    """
    from xlrd import (xldate, XL_CELL_DATE,
                      XL_CELL_ERROR, XL_CELL_BOOLEAN,
                      XL_CELL_NUMBER)

    uses_1904_epoch = self.book.datemode

    def _convert(value, kind):
        """Map one raw cell value / cell type pair to its parsed value."""
        if kind == XL_CELL_DATE:
            # Use the newer xlrd datetime handling; values it cannot
            # represent are passed through untouched.
            try:
                stamp = xldate.xldate_as_datetime(value, uses_1904_epoch)
            except OverflowError:
                return value

            # Excel doesn't distinguish between dates and times, so a
            # datetime that falls on the epoch day is treated as a
            # time only. Excel supports both 1900 and 1904 epochs.
            if uses_1904_epoch:
                epoch_day = (1904, 1, 1)
            else:
                epoch_day = (1899, 12, 31)
            if stamp.timetuple()[0:3] == epoch_day:
                return time(stamp.hour,
                            stamp.minute,
                            stamp.second,
                            stamp.microsecond)
            return stamp

        if kind == XL_CELL_ERROR:
            return np.nan

        if kind == XL_CELL_BOOLEAN:
            return bool(value)

        if convert_float and kind == XL_CELL_NUMBER:
            # GH5394 - Excel 'numbers' are always floats
            # it's a minimal perf hit and less surprising
            as_int = int(value)
            if as_int == value:
                return as_int

        return value

    return [
        [_convert(value, kind)
         for value, kind in zip(sheet.row_values(row_idx),
                                sheet.row_types(row_idx))]
        for row_idx in range(sheet.nrows)
    ]


class ExcelFile(object):
"""
Class for parsing tabular excel sheets into DataFrame objects.
Expand Down