Skip to content

Commit

Permalink
Excel Reader Refactor - Base Class Introduction (pandas-dev#24829)
Browse files: browse the repository at this point in the history
  • Loading branch information
WillAyd authored and Pingviinituutti committed Feb 28, 2019
1 parent 6772d95 commit 8e33f64
Showing 1 changed file with 133 additions and 102 deletions.
235 changes: 133 additions & 102 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,60 +375,25 @@ def read_excel(io,
**kwds)


class _XlrdReader(object):

def __init__(self, filepath_or_buffer):
"""Reader using xlrd engine.
Parameters
----------
filepath_or_buffer : string, path object or Workbook
Object to be parsed.
"""
err_msg = "Install xlrd >= 1.0.0 for Excel support"

try:
import xlrd
except ImportError:
raise ImportError(err_msg)
else:
if xlrd.__VERSION__ < LooseVersion("1.0.0"):
raise ImportError(err_msg +
". Current version " + xlrd.__VERSION__)
@add_metaclass(abc.ABCMeta)
class _BaseExcelReader(object):

# If filepath_or_buffer is a url, want to keep the data as bytes so
# can't pass to get_filepath_or_buffer()
if _is_url(filepath_or_buffer):
filepath_or_buffer = _urlopen(filepath_or_buffer)
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
filepath_or_buffer)
@property
@abc.abstractmethod
def sheet_names(self):
pass

if isinstance(filepath_or_buffer, xlrd.Book):
self.book = filepath_or_buffer
elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr(
filepath_or_buffer, "read"):
# N.B. xlrd.Book has a read attribute too
if hasattr(filepath_or_buffer, 'seek'):
try:
# GH 19779
filepath_or_buffer.seek(0)
except UnsupportedOperation:
# HTTPResponse does not support seek()
# GH 20434
pass
@abc.abstractmethod
def get_sheet_by_name(self, name):
pass

data = filepath_or_buffer.read()
self.book = xlrd.open_workbook(file_contents=data)
elif isinstance(filepath_or_buffer, compat.string_types):
self.book = xlrd.open_workbook(filepath_or_buffer)
else:
raise ValueError('Must explicitly set engine if not passing in'
' buffer or path for io.')
@abc.abstractmethod
def get_sheet_by_index(self, index):
pass

@property
def sheet_names(self):
return self.book.sheet_names()
@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float):
pass

def parse(self,
sheet_name=0,
Expand All @@ -455,56 +420,14 @@ def parse(self,

_validate_header_arg(header)

from xlrd import (xldate, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN,
XL_CELL_NUMBER)

epoch1904 = self.book.datemode

def _parse_cell(cell_contents, cell_typ):
"""converts the contents of the cell into a pandas
appropriate object"""

if cell_typ == XL_CELL_DATE:

# Use the newer xlrd datetime handling.
try:
cell_contents = xldate.xldate_as_datetime(
cell_contents, epoch1904)
except OverflowError:
return cell_contents

# Excel doesn't distinguish between dates and time,
# so we treat dates on the epoch as times only.
# Also, Excel supports 1900 and 1904 epochs.
year = (cell_contents.timetuple())[0:3]
if ((not epoch1904 and year == (1899, 12, 31)) or
(epoch1904 and year == (1904, 1, 1))):
cell_contents = time(cell_contents.hour,
cell_contents.minute,
cell_contents.second,
cell_contents.microsecond)

elif cell_typ == XL_CELL_ERROR:
cell_contents = np.nan
elif cell_typ == XL_CELL_BOOLEAN:
cell_contents = bool(cell_contents)
elif convert_float and cell_typ == XL_CELL_NUMBER:
# GH5394 - Excel 'numbers' are always floats
# it's a minimal perf hit and less surprising
val = int(cell_contents)
if val == cell_contents:
cell_contents = val
return cell_contents

ret_dict = False

# Keep sheetname to maintain backwards compatibility.
if isinstance(sheet_name, list):
sheets = sheet_name
ret_dict = True
elif sheet_name is None:
sheets = self.book.sheet_names()
sheets = self.sheet_names
ret_dict = True
else:
sheets = [sheet_name]
Expand All @@ -519,19 +442,13 @@ def _parse_cell(cell_contents, cell_typ):
print("Reading sheet {sheet}".format(sheet=asheetname))

if isinstance(asheetname, compat.string_types):
sheet = self.book.sheet_by_name(asheetname)
sheet = self.get_sheet_by_name(asheetname)
else: # assume an integer if not a string
sheet = self.book.sheet_by_index(asheetname)
sheet = self.get_sheet_by_index(asheetname)

data = []
data = self.get_sheet_data(sheet, convert_float)
usecols = _maybe_convert_usecols(usecols)

for i in range(sheet.nrows):
row = [_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i),
sheet.row_types(i))]
data.append(row)

if sheet.nrows == 0:
output[asheetname] = DataFrame()
continue
Expand Down Expand Up @@ -620,6 +537,120 @@ def _parse_cell(cell_contents, cell_typ):
return output[asheetname]


class _XlrdReader(_BaseExcelReader):
    """Excel reader implemented on top of the xlrd engine."""

    def __init__(self, filepath_or_buffer):
        """Open a workbook with xlrd and keep it on ``self.book``.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.

        Raises
        ------
        ImportError
            If xlrd cannot be imported, or the imported version compares
            lower than 1.0.0.
        ValueError
            If the resolved input is not an xlrd Book, has no ``read``
            attribute and is not a string.
        """
        install_hint = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(install_hint)

        if xlrd.__VERSION__ < LooseVersion("1.0.0"):
            raise ImportError(
                install_hint + ". Current version " + xlrd.__VERSION__)

        # A URL has to stay a stream of bytes, so it is opened directly
        # rather than being routed through get_filepath_or_buffer().
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = _urlopen(filepath_or_buffer)
        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if isinstance(filepath_or_buffer, xlrd.Book):
            # Already an opened workbook: use it as-is.
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # xlrd.Book has a ``read`` attribute too, which is why the
            # Book check above must come before this branch.
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # Rewind to the start before reading (GH 19779).
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek() (GH 20434).
                    pass

            contents = filepath_or_buffer.read()
            self.book = xlrd.open_workbook(file_contents=contents)
        elif isinstance(filepath_or_buffer, compat.string_types):
            self.book = xlrd.open_workbook(filepath_or_buffer)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')

    @property
    def sheet_names(self):
        """Sheet names as reported by the underlying xlrd workbook."""
        workbook = self.book
        return workbook.sheet_names()

    def get_sheet_by_name(self, name):
        """Look up a sheet of the workbook by its name."""
        workbook = self.book
        return workbook.sheet_by_name(name)

    def get_sheet_by_index(self, index):
        """Look up a sheet of the workbook by its index."""
        workbook = self.book
        return workbook.sheet_by_index(index)

    def get_sheet_data(self, sheet, convert_float):
        """Convert every cell of ``sheet`` and return the rows as lists.

        Parameters
        ----------
        sheet : xlrd sheet
            Sheet whose ``nrows``, ``row_values`` and ``row_types`` are read.
        convert_float : bool
            If true, a number cell whose value equals its ``int()``
            conversion is returned as that int.

        Returns
        -------
        list of list
            One inner list per row, holding the converted cell values.
        """
        from xlrd import (xldate, XL_CELL_DATE,
                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        def _parse_cell(raw_value, kind):
            """Convert one raw xlrd cell value according to its cell type."""
            if kind == XL_CELL_DATE:
                # Use the newer xlrd datetime handling.
                try:
                    stamp = xldate.xldate_as_datetime(raw_value, epoch1904)
                except OverflowError:
                    # Hand back the untouched value if it cannot be converted.
                    return raw_value

                # Excel doesn't distinguish between dates and time, so a
                # date falling on the epoch day is treated as a time only.
                # Excel supports both the 1900 and the 1904 epoch.
                ymd = (stamp.timetuple())[0:3]
                epoch_day = (1904, 1, 1) if epoch1904 else (1899, 12, 31)
                if ymd == epoch_day:
                    return time(stamp.hour,
                                stamp.minute,
                                stamp.second,
                                stamp.microsecond)
                return stamp

            if kind == XL_CELL_ERROR:
                return np.nan

            if kind == XL_CELL_BOOLEAN:
                return bool(raw_value)

            if convert_float and kind == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats; converting
                # is a minimal perf hit and less surprising.
                whole = int(raw_value)
                if whole == raw_value:
                    return whole

            return raw_value

        return [
            [_parse_cell(value, typ)
             for value, typ in zip(sheet.row_values(row_idx),
                                   sheet.row_types(row_idx))]
            for row_idx in range(sheet.nrows)
        ]


class ExcelFile(object):
"""
Class for parsing tabular excel sheets into DataFrame objects.
Expand Down

0 comments on commit 8e33f64

Please sign in to comment.