From 8e33f64faa0098ba5c115a01f3b78a5a17f3af65 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 26 Jan 2019 07:34:46 -0800 Subject: [PATCH] Excel Reader Refactor - Base Class Introduction (#24829) --- pandas/io/excel.py | 235 +++++++++++++++++++++++++-------------------- 1 file changed, 133 insertions(+), 102 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 3a7c39ec653098..3d85ae7fd1f468 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -375,60 +375,25 @@ def read_excel(io, **kwds) -class _XlrdReader(object): - - def __init__(self, filepath_or_buffer): - """Reader using xlrd engine. - - Parameters - ---------- - filepath_or_buffer : string, path object or Workbook - Object to be parsed. - """ - err_msg = "Install xlrd >= 1.0.0 for Excel support" - - try: - import xlrd - except ImportError: - raise ImportError(err_msg) - else: - if xlrd.__VERSION__ < LooseVersion("1.0.0"): - raise ImportError(err_msg + - ". Current version " + xlrd.__VERSION__) +@add_metaclass(abc.ABCMeta) +class _BaseExcelReader(object): - # If filepath_or_buffer is a url, want to keep the data as bytes so - # can't pass to get_filepath_or_buffer() - if _is_url(filepath_or_buffer): - filepath_or_buffer = _urlopen(filepath_or_buffer) - elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer) + @property + @abc.abstractmethod + def sheet_names(self): + pass - if isinstance(filepath_or_buffer, xlrd.Book): - self.book = filepath_or_buffer - elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr( - filepath_or_buffer, "read"): - # N.B. xlrd.Book has a read attribute too - if hasattr(filepath_or_buffer, 'seek'): - try: - # GH 19779 - filepath_or_buffer.seek(0) - except UnsupportedOperation: - # HTTPResponse does not support seek() - # GH 20434 - pass + @abc.abstractmethod + def get_sheet_by_name(self, name): + pass - data = filepath_or_buffer.read() - self.book = xlrd.open_workbook(file_contents=data) - elif isinstance(filepath_or_buffer, compat.string_types): - self.book = xlrd.open_workbook(filepath_or_buffer) - else: - raise ValueError('Must explicitly set engine if not passing in' - ' buffer or path for io.') + @abc.abstractmethod + def get_sheet_by_index(self, index): + pass - @property - def sheet_names(self): - return self.book.sheet_names() + @abc.abstractmethod + def get_sheet_data(self, sheet, convert_float): + pass def parse(self, sheet_name=0, @@ -455,48 +420,6 @@ def parse(self, _validate_header_arg(header) - from xlrd import (xldate, XL_CELL_DATE, - XL_CELL_ERROR, XL_CELL_BOOLEAN, - XL_CELL_NUMBER) - - epoch1904 = self.book.datemode - - def _parse_cell(cell_contents, cell_typ): - """converts the contents of the cell into a pandas - appropriate object""" - - if cell_typ == XL_CELL_DATE: - - # Use the newer xlrd datetime handling. - try: - cell_contents = xldate.xldate_as_datetime( - cell_contents, epoch1904) - except OverflowError: - return cell_contents - - # Excel doesn't distinguish between dates and time, - # so we treat dates on the epoch as times only. - # Also, Excel supports 1900 and 1904 epochs. - year = (cell_contents.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) or - (epoch1904 and year == (1904, 1, 1))): - cell_contents = time(cell_contents.hour, - cell_contents.minute, - cell_contents.second, - cell_contents.microsecond) - - elif cell_typ == XL_CELL_ERROR: - cell_contents = np.nan - elif cell_typ == XL_CELL_BOOLEAN: - cell_contents = bool(cell_contents) - elif convert_float and cell_typ == XL_CELL_NUMBER: - # GH5394 - Excel 'numbers' are always floats - # it's a minimal perf hit and less surprising - val = int(cell_contents) - if val == cell_contents: - cell_contents = val - return cell_contents - ret_dict = False # Keep sheetname to maintain backwards compatibility. @@ -504,7 +427,7 @@ def _parse_cell(cell_contents, cell_typ): sheets = sheet_name ret_dict = True elif sheet_name is None: - sheets = self.book.sheet_names() + sheets = self.sheet_names ret_dict = True else: sheets = [sheet_name] @@ -519,19 +442,13 @@ def _parse_cell(cell_contents, cell_typ): print("Reading sheet {sheet}".format(sheet=asheetname)) if isinstance(asheetname, compat.string_types): - sheet = self.book.sheet_by_name(asheetname) + sheet = self.get_sheet_by_name(asheetname) else: # assume an integer if not a string - sheet = self.book.sheet_by_index(asheetname) + sheet = self.get_sheet_by_index(asheetname) - data = [] + data = self.get_sheet_data(sheet, convert_float) usecols = _maybe_convert_usecols(usecols) - for i in range(sheet.nrows): - row = [_parse_cell(value, typ) - for value, typ in zip(sheet.row_values(i), - sheet.row_types(i))] - data.append(row) - if sheet.nrows == 0: output[asheetname] = DataFrame() continue @@ -620,6 +537,120 @@ def _parse_cell(cell_contents, cell_typ): return output[asheetname] +class _XlrdReader(_BaseExcelReader): + + def __init__(self, filepath_or_buffer): + """Reader using xlrd engine. + + Parameters + ---------- + filepath_or_buffer : string, path object or Workbook + Object to be parsed. + """ + err_msg = "Install xlrd >= 1.0.0 for Excel support" + + try: + import xlrd + except ImportError: + raise ImportError(err_msg) + else: + if xlrd.__VERSION__ < LooseVersion("1.0.0"): + raise ImportError(err_msg + + ". Current version " + xlrd.__VERSION__) + + # If filepath_or_buffer is a url, want to keep the data as bytes so + # can't pass to get_filepath_or_buffer() + if _is_url(filepath_or_buffer): + filepath_or_buffer = _urlopen(filepath_or_buffer) + elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer) + + if isinstance(filepath_or_buffer, xlrd.Book): + self.book = filepath_or_buffer + elif hasattr(filepath_or_buffer, "read"): + # N.B. xlrd.Book has a read attribute too + if hasattr(filepath_or_buffer, 'seek'): + try: + # GH 19779 + filepath_or_buffer.seek(0) + except UnsupportedOperation: + # HTTPResponse does not support seek() + # GH 20434 + pass + + data = filepath_or_buffer.read() + self.book = xlrd.open_workbook(file_contents=data) + elif isinstance(filepath_or_buffer, compat.string_types): + self.book = xlrd.open_workbook(filepath_or_buffer) + else: + raise ValueError('Must explicitly set engine if not passing in' + ' buffer or path for io.') + + @property + def sheet_names(self): + return self.book.sheet_names() + + def get_sheet_by_name(self, name): + return self.book.sheet_by_name(name) + + def get_sheet_by_index(self, index): + return self.book.sheet_by_index(index) + + def get_sheet_data(self, sheet, convert_float): + from xlrd import (xldate, XL_CELL_DATE, + XL_CELL_ERROR, XL_CELL_BOOLEAN, + XL_CELL_NUMBER) + + epoch1904 = self.book.datemode + + def _parse_cell(cell_contents, cell_typ): + """converts the contents of the cell into a pandas + appropriate object""" + + if cell_typ == XL_CELL_DATE: + + # Use the newer xlrd datetime handling. + try: + cell_contents = xldate.xldate_as_datetime( + cell_contents, epoch1904) + except OverflowError: + return cell_contents + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. + year = (cell_contents.timetuple())[0:3] + if ((not epoch1904 and year == (1899, 12, 31)) or + (epoch1904 and year == (1904, 1, 1))): + cell_contents = time(cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond) + + elif cell_typ == XL_CELL_ERROR: + cell_contents = np.nan + elif cell_typ == XL_CELL_BOOLEAN: + cell_contents = bool(cell_contents) + elif convert_float and cell_typ == XL_CELL_NUMBER: + # GH5394 - Excel 'numbers' are always floats + # it's a minimal perf hit and less surprising + val = int(cell_contents) + if val == cell_contents: + cell_contents = val + return cell_contents + + data = [] + + for i in range(sheet.nrows): + row = [_parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), + sheet.row_types(i))] + data.append(row) + + return data + + class ExcelFile(object): """ Class for parsing tabular excel sheets into DataFrame objects.