-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Excel Reader Refactor - Base Class Introduction #24829
Merged
+133
−102
Merged
Changes from 5 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
bdadcf0
Code decoupling
WillAyd 52bd430
Merge branch 'master' into excel-cleanup
WillAyd e58c219
Added base reader class
WillAyd 1bd93f6
LINT fixup
WillAyd ae1d8d2
Merge remote-tracking branch 'upstream/master' into excel-cleanup
WillAyd badae15
Merge remote-tracking branch 'upstream/master' into excel-cleanup
WillAyd 801ddc5
Used AbstractMethodError
WillAyd a154cf3
Used ABC metaclass
WillAyd 18dd6ea
Merge remote-tracking branch 'upstream/master' into excel-cleanup
WillAyd File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -375,60 +375,20 @@ def read_excel(io, | |
**kwds) | ||
|
||
|
||
class _XlrdReader(object): | ||
class _BaseExcelReader(object): | ||
|
||
def __init__(self, filepath_or_buffer): | ||
"""Reader using xlrd engine. | ||
|
||
Parameters | ||
---------- | ||
filepath_or_buffer : string, path object or Workbook | ||
Object to be parsed. | ||
""" | ||
err_msg = "Install xlrd >= 1.0.0 for Excel support" | ||
|
||
try: | ||
import xlrd | ||
except ImportError: | ||
raise ImportError(err_msg) | ||
else: | ||
if xlrd.__VERSION__ < LooseVersion("1.0.0"): | ||
raise ImportError(err_msg + | ||
". Current version " + xlrd.__VERSION__) | ||
|
||
# If filepath_or_buffer is a url, want to keep the data as bytes so | ||
# can't pass to get_filepath_or_buffer() | ||
if _is_url(filepath_or_buffer): | ||
filepath_or_buffer = _urlopen(filepath_or_buffer) | ||
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): | ||
filepath_or_buffer, _, _, _ = get_filepath_or_buffer( | ||
filepath_or_buffer) | ||
@property | ||
def sheet_names(self): | ||
raise NotImplementedError | ||
|
||
if isinstance(filepath_or_buffer, xlrd.Book): | ||
self.book = filepath_or_buffer | ||
elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr( | ||
filepath_or_buffer, "read"): | ||
# N.B. xlrd.Book has a read attribute too | ||
if hasattr(filepath_or_buffer, 'seek'): | ||
try: | ||
# GH 19779 | ||
filepath_or_buffer.seek(0) | ||
except UnsupportedOperation: | ||
# HTTPResponse does not support seek() | ||
# GH 20434 | ||
pass | ||
def get_sheet_by_name(self, name): | ||
raise NotImplementedError | ||
|
||
data = filepath_or_buffer.read() | ||
self.book = xlrd.open_workbook(file_contents=data) | ||
elif isinstance(filepath_or_buffer, compat.string_types): | ||
self.book = xlrd.open_workbook(filepath_or_buffer) | ||
else: | ||
raise ValueError('Must explicitly set engine if not passing in' | ||
' buffer or path for io.') | ||
def get_sheet_by_index(self, index): | ||
raise NotImplementedError | ||
|
||
@property | ||
def sheet_names(self): | ||
return self.book.sheet_names() | ||
def get_sheet_data(self, sheet, convert_float): | ||
raise NotImplementedError | ||
|
||
def parse(self, | ||
sheet_name=0, | ||
|
@@ -455,56 +415,14 @@ def parse(self, | |
|
||
_validate_header_arg(header) | ||
|
||
from xlrd import (xldate, XL_CELL_DATE, | ||
XL_CELL_ERROR, XL_CELL_BOOLEAN, | ||
XL_CELL_NUMBER) | ||
|
||
epoch1904 = self.book.datemode | ||
|
||
def _parse_cell(cell_contents, cell_typ): | ||
"""converts the contents of the cell into a pandas | ||
appropriate object""" | ||
|
||
if cell_typ == XL_CELL_DATE: | ||
|
||
# Use the newer xlrd datetime handling. | ||
try: | ||
cell_contents = xldate.xldate_as_datetime( | ||
cell_contents, epoch1904) | ||
except OverflowError: | ||
return cell_contents | ||
|
||
# Excel doesn't distinguish between dates and time, | ||
# so we treat dates on the epoch as times only. | ||
# Also, Excel supports 1900 and 1904 epochs. | ||
year = (cell_contents.timetuple())[0:3] | ||
if ((not epoch1904 and year == (1899, 12, 31)) or | ||
(epoch1904 and year == (1904, 1, 1))): | ||
cell_contents = time(cell_contents.hour, | ||
cell_contents.minute, | ||
cell_contents.second, | ||
cell_contents.microsecond) | ||
|
||
elif cell_typ == XL_CELL_ERROR: | ||
cell_contents = np.nan | ||
elif cell_typ == XL_CELL_BOOLEAN: | ||
cell_contents = bool(cell_contents) | ||
elif convert_float and cell_typ == XL_CELL_NUMBER: | ||
# GH5394 - Excel 'numbers' are always floats | ||
# it's a minimal perf hit and less surprising | ||
val = int(cell_contents) | ||
if val == cell_contents: | ||
cell_contents = val | ||
return cell_contents | ||
|
||
ret_dict = False | ||
|
||
# Keep sheetname to maintain backwards compatibility. | ||
if isinstance(sheet_name, list): | ||
sheets = sheet_name | ||
ret_dict = True | ||
elif sheet_name is None: | ||
sheets = self.book.sheet_names() | ||
sheets = self.sheet_names | ||
ret_dict = True | ||
else: | ||
sheets = [sheet_name] | ||
|
@@ -519,19 +437,13 @@ def _parse_cell(cell_contents, cell_typ): | |
print("Reading sheet {sheet}".format(sheet=asheetname)) | ||
|
||
if isinstance(asheetname, compat.string_types): | ||
sheet = self.book.sheet_by_name(asheetname) | ||
sheet = self.get_sheet_by_name(asheetname) | ||
else: # assume an integer if not a string | ||
sheet = self.book.sheet_by_index(asheetname) | ||
sheet = self.get_sheet_by_index(asheetname) | ||
|
||
data = [] | ||
data = self.get_sheet_data(sheet, convert_float) | ||
usecols = _maybe_convert_usecols(usecols) | ||
|
||
for i in range(sheet.nrows): | ||
row = [_parse_cell(value, typ) | ||
for value, typ in zip(sheet.row_values(i), | ||
sheet.row_types(i))] | ||
data.append(row) | ||
|
||
if sheet.nrows == 0: | ||
output[asheetname] = DataFrame() | ||
continue | ||
|
@@ -620,6 +532,120 @@ def _parse_cell(cell_contents, cell_typ): | |
return output[asheetname] | ||
|
||
|
||
class _XlrdReader(_BaseExcelReader): | ||
|
||
def __init__(self, filepath_or_buffer): | ||
"""Reader using xlrd engine. | ||
|
||
Parameters | ||
---------- | ||
filepath_or_buffer : string, path object or Workbook | ||
Object to be parsed. | ||
""" | ||
err_msg = "Install xlrd >= 1.0.0 for Excel support" | ||
|
||
try: | ||
import xlrd | ||
except ImportError: | ||
raise ImportError(err_msg) | ||
else: | ||
if xlrd.__VERSION__ < LooseVersion("1.0.0"): | ||
raise ImportError(err_msg + | ||
". Current version " + xlrd.__VERSION__) | ||
|
||
# If filepath_or_buffer is a url, want to keep the data as bytes so | ||
# can't pass to get_filepath_or_buffer() | ||
if _is_url(filepath_or_buffer): | ||
filepath_or_buffer = _urlopen(filepath_or_buffer) | ||
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): | ||
filepath_or_buffer, _, _, _ = get_filepath_or_buffer( | ||
filepath_or_buffer) | ||
|
||
if isinstance(filepath_or_buffer, xlrd.Book): | ||
self.book = filepath_or_buffer | ||
elif hasattr(filepath_or_buffer, "read"): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note that I changed this condition slightly to remove the redundant `not isinstance(filepath_or_buffer, xlrd.Book)` check |
||
# N.B. xlrd.Book has a read attribute too | ||
if hasattr(filepath_or_buffer, 'seek'): | ||
try: | ||
# GH 19779 | ||
filepath_or_buffer.seek(0) | ||
except UnsupportedOperation: | ||
# HTTPResponse does not support seek() | ||
# GH 20434 | ||
pass | ||
|
||
data = filepath_or_buffer.read() | ||
self.book = xlrd.open_workbook(file_contents=data) | ||
elif isinstance(filepath_or_buffer, compat.string_types): | ||
self.book = xlrd.open_workbook(filepath_or_buffer) | ||
else: | ||
raise ValueError('Must explicitly set engine if not passing in' | ||
' buffer or path for io.') | ||
|
||
@property | ||
def sheet_names(self): | ||
return self.book.sheet_names() | ||
|
||
def get_sheet_by_name(self, name): | ||
return self.book.sheet_by_name(name) | ||
|
||
def get_sheet_by_index(self, index): | ||
return self.book.sheet_by_index(index) | ||
|
||
def get_sheet_data(self, sheet, convert_float): | ||
from xlrd import (xldate, XL_CELL_DATE, | ||
XL_CELL_ERROR, XL_CELL_BOOLEAN, | ||
XL_CELL_NUMBER) | ||
|
||
epoch1904 = self.book.datemode | ||
|
||
def _parse_cell(cell_contents, cell_typ): | ||
"""converts the contents of the cell into a pandas | ||
appropriate object""" | ||
|
||
if cell_typ == XL_CELL_DATE: | ||
|
||
# Use the newer xlrd datetime handling. | ||
try: | ||
cell_contents = xldate.xldate_as_datetime( | ||
cell_contents, epoch1904) | ||
except OverflowError: | ||
return cell_contents | ||
|
||
# Excel doesn't distinguish between dates and time, | ||
# so we treat dates on the epoch as times only. | ||
# Also, Excel supports 1900 and 1904 epochs. | ||
year = (cell_contents.timetuple())[0:3] | ||
if ((not epoch1904 and year == (1899, 12, 31)) or | ||
(epoch1904 and year == (1904, 1, 1))): | ||
cell_contents = time(cell_contents.hour, | ||
cell_contents.minute, | ||
cell_contents.second, | ||
cell_contents.microsecond) | ||
|
||
elif cell_typ == XL_CELL_ERROR: | ||
cell_contents = np.nan | ||
elif cell_typ == XL_CELL_BOOLEAN: | ||
cell_contents = bool(cell_contents) | ||
elif convert_float and cell_typ == XL_CELL_NUMBER: | ||
# GH5394 - Excel 'numbers' are always floats | ||
# it's a minimal perf hit and less surprising | ||
val = int(cell_contents) | ||
if val == cell_contents: | ||
cell_contents = val | ||
return cell_contents | ||
|
||
data = [] | ||
|
||
for i in range(sheet.nrows): | ||
row = [_parse_cell(value, typ) | ||
for value, typ in zip(sheet.row_values(i), | ||
sheet.row_types(i))] | ||
data.append(row) | ||
|
||
return data | ||
|
||
|
||
class ExcelFile(object): | ||
""" | ||
Class for parsing tabular excel sheets into DataFrame objects. | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use AbstractMethodError
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason for not using `abc` for this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC @TomAugspurger mentioned performance issues with `isinstance` checks against ABC subclasses (admittedly not applicable to this class), which would be why we haven't used it elsewhere in the code.