From bdadcf015c4e6fc6c0dc78d7ab6752a7cabee5c9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 17 Jan 2019 02:39:09 -0800 Subject: [PATCH 1/5] Code decoupling --- pandas/io/excel.py | 80 ++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 3a7c39ec65309..d9959934782a0 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -406,8 +406,7 @@ def __init__(self, filepath_or_buffer): if isinstance(filepath_or_buffer, xlrd.Book): self.book = filepath_or_buffer - elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr( - filepath_or_buffer, "read"): + elif hasattr(filepath_or_buffer, "read"): # N.B. xlrd.Book has a read attribute too if hasattr(filepath_or_buffer, 'seek'): try: @@ -430,31 +429,13 @@ def __init__(self, filepath_or_buffer): def sheet_names(self): return self.book.sheet_names() - def parse(self, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - dtype=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - verbose=False, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): + def get_sheet_by_name(self, name): + return self.book.sheet_by_name(name) - _validate_header_arg(header) + def get_sheet_by_index(self, index): + return self.book.sheet_by_index(index) + def get_sheet_data(self, sheet, convert_float): from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) @@ -497,6 +478,41 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents + data = [] + + for i in range(sheet.nrows): + row = [_parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), + sheet.row_types(i))] + data.append(row) + + return data + + def parse(self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds): + + _validate_header_arg(header) + ret_dict = False # Keep sheetname to maintain backwards compatibility. @@ -504,7 +520,7 @@ def _parse_cell(cell_contents, cell_typ): sheets = sheet_name ret_dict = True elif sheet_name is None: - sheets = self.book.sheet_names() + sheets = self.sheet_names ret_dict = True else: sheets = [sheet_name] @@ -519,19 +535,13 @@ def _parse_cell(cell_contents, cell_typ): print("Reading sheet {sheet}".format(sheet=asheetname)) if isinstance(asheetname, compat.string_types): - sheet = self.book.sheet_by_name(asheetname) + sheet = self.get_sheet_by_name(asheetname) else: # assume an integer if not a string - sheet = self.book.sheet_by_index(asheetname) + sheet = self.get_sheet_by_index(asheetname) - data = [] + data = self.get_sheet_data(sheet, convert_float) usecols = _maybe_convert_usecols(usecols) - for i in range(sheet.nrows): - row = [_parse_cell(value, typ) - for value, typ in zip(sheet.row_values(i), - sheet.row_types(i))] - data.append(row) - if sheet.nrows == 0: output[asheetname] = DataFrame() continue From e58c21946785cb62b36394122e34d05bc6d9e4dd Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Jan 2019 12:57:39 -0500 Subject: [PATCH 2/5] Added base reader class --- pandas/io/excel.py | 223 ++++++++++++++++++++++++--------------------- 1 file changed, 120 insertions(+), 103 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index d9959934782a0..949af438e8b4b 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -375,118 +375,20 @@ def read_excel(io, **kwds) -class _XlrdReader(object): - - def __init__(self, filepath_or_buffer): - """Reader using xlrd engine. - - Parameters - ---------- - filepath_or_buffer : string, path object or Workbook - Object to be parsed. - """ - err_msg = "Install xlrd >= 1.0.0 for Excel support" - - try: - import xlrd - except ImportError: - raise ImportError(err_msg) - else: - if xlrd.__VERSION__ < LooseVersion("1.0.0"): - raise ImportError(err_msg + - ". Current version " + xlrd.__VERSION__) - - # If filepath_or_buffer is a url, want to keep the data as bytes so - # can't pass to get_filepath_or_buffer() - if _is_url(filepath_or_buffer): - filepath_or_buffer = _urlopen(filepath_or_buffer) - elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer) - - if isinstance(filepath_or_buffer, xlrd.Book): - self.book = filepath_or_buffer - elif hasattr(filepath_or_buffer, "read"): - # N.B. xlrd.Book has a read attribute too - if hasattr(filepath_or_buffer, 'seek'): - try: - # GH 19779 - filepath_or_buffer.seek(0) - except UnsupportedOperation: - # HTTPResponse does not support seek() - # GH 20434 - pass - - data = filepath_or_buffer.read() - self.book = xlrd.open_workbook(file_contents=data) - elif isinstance(filepath_or_buffer, compat.string_types): - self.book = xlrd.open_workbook(filepath_or_buffer) - else: - raise ValueError('Must explicitly set engine if not passing in' - ' buffer or path for io.') +class _BaseExcelReader(object): @property def sheet_names(self): - return self.book.sheet_names() + raise NotImplementedError def get_sheet_by_name(self, name): - return self.book.sheet_by_name(name) + raise NotImplementedError def get_sheet_by_index(self, index): - return self.book.sheet_by_index(index) + raise NotImplementedError def get_sheet_data(self, sheet, convert_float): - from xlrd import (xldate, XL_CELL_DATE, - XL_CELL_ERROR, XL_CELL_BOOLEAN, - XL_CELL_NUMBER) - - epoch1904 = self.book.datemode - - def _parse_cell(cell_contents, cell_typ): - """converts the contents of the cell into a pandas - appropriate object""" - - if cell_typ == XL_CELL_DATE: - - # Use the newer xlrd datetime handling. - try: - cell_contents = xldate.xldate_as_datetime( - cell_contents, epoch1904) - except OverflowError: - return cell_contents - - # Excel doesn't distinguish between dates and time, - # so we treat dates on the epoch as times only. - # Also, Excel supports 1900 and 1904 epochs. - year = (cell_contents.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) or - (epoch1904 and year == (1904, 1, 1))): - cell_contents = time(cell_contents.hour, - cell_contents.minute, - cell_contents.second, - cell_contents.microsecond) - - elif cell_typ == XL_CELL_ERROR: - cell_contents = np.nan - elif cell_typ == XL_CELL_BOOLEAN: - cell_contents = bool(cell_contents) - elif convert_float and cell_typ == XL_CELL_NUMBER: - # GH5394 - Excel 'numbers' are always floats - # it's a minimal perf hit and less surprising - val = int(cell_contents) - if val == cell_contents: - cell_contents = val - return cell_contents - - data = [] - - for i in range(sheet.nrows): - row = [_parse_cell(value, typ) - for value, typ in zip(sheet.row_values(i), - sheet.row_types(i))] - data.append(row) - - return data + raise NotImplementedError def parse(self, sheet_name=0, @@ -628,6 +530,121 @@ def parse(self, return output else: return output[asheetname] + + + +class _XlrdReader(_BaseExcelReader): + + def __init__(self, filepath_or_buffer): + """Reader using xlrd engine. + + Parameters + ---------- + filepath_or_buffer : string, path object or Workbook + Object to be parsed. + """ + err_msg = "Install xlrd >= 1.0.0 for Excel support" + + try: + import xlrd + except ImportError: + raise ImportError(err_msg) + else: + if xlrd.__VERSION__ < LooseVersion("1.0.0"): + raise ImportError(err_msg + + ". Current version " + xlrd.__VERSION__) + + # If filepath_or_buffer is a url, want to keep the data as bytes so + # can't pass to get_filepath_or_buffer() + if _is_url(filepath_or_buffer): + filepath_or_buffer = _urlopen(filepath_or_buffer) + elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer) + + if isinstance(filepath_or_buffer, xlrd.Book): + self.book = filepath_or_buffer + elif hasattr(filepath_or_buffer, "read"): + # N.B. xlrd.Book has a read attribute too + if hasattr(filepath_or_buffer, 'seek'): + try: + # GH 19779 + filepath_or_buffer.seek(0) + except UnsupportedOperation: + # HTTPResponse does not support seek() + # GH 20434 + pass + + data = filepath_or_buffer.read() + self.book = xlrd.open_workbook(file_contents=data) + elif isinstance(filepath_or_buffer, compat.string_types): + self.book = xlrd.open_workbook(filepath_or_buffer) + else: + raise ValueError('Must explicitly set engine if not passing in' + ' buffer or path for io.') + + @property + def sheet_names(self): + return self.book.sheet_names() + + def get_sheet_by_name(self, name): + return self.book.sheet_by_name(name) + + def get_sheet_by_index(self, index): + return self.book.sheet_by_index(index) + + def get_sheet_data(self, sheet, convert_float): + from xlrd import (xldate, XL_CELL_DATE, + XL_CELL_ERROR, XL_CELL_BOOLEAN, + XL_CELL_NUMBER) + + epoch1904 = self.book.datemode + + def _parse_cell(cell_contents, cell_typ): + """converts the contents of the cell into a pandas + appropriate object""" + + if cell_typ == XL_CELL_DATE: + + # Use the newer xlrd datetime handling. + try: + cell_contents = xldate.xldate_as_datetime( + cell_contents, epoch1904) + except OverflowError: + return cell_contents + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. + year = (cell_contents.timetuple())[0:3] + if ((not epoch1904 and year == (1899, 12, 31)) or + (epoch1904 and year == (1904, 1, 1))): + cell_contents = time(cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond) + + elif cell_typ == XL_CELL_ERROR: + cell_contents = np.nan + elif cell_typ == XL_CELL_BOOLEAN: + cell_contents = bool(cell_contents) + elif convert_float and cell_typ == XL_CELL_NUMBER: + # GH5394 - Excel 'numbers' are always floats + # it's a minimal perf hit and less surprising + val = int(cell_contents) + if val == cell_contents: + cell_contents = val + return cell_contents + + data = [] + + for i in range(sheet.nrows): + row = [_parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), + sheet.row_types(i))] + data.append(row) + + return data class ExcelFile(object): From 1bd93f63c7c96f1872d53557d08acb8c55c0d777 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 18 Jan 2019 12:58:22 -0500 Subject: [PATCH 3/5] LINT fixup --- pandas/io/excel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 949af438e8b4b..57917c8bdb550 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -382,10 +382,10 @@ def sheet_names(self): raise NotImplementedError def get_sheet_by_name(self, name): - raise NotImplementedError + raise NotImplementedError def get_sheet_by_index(self, index): - raise NotImplementedError + raise NotImplementedError def get_sheet_data(self, sheet, convert_float): raise NotImplementedError @@ -530,7 +530,6 @@ def parse(self, return output else: return output[asheetname] - class _XlrdReader(_BaseExcelReader): From 801ddc5022809b534f718e285395e7215c8def1b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 20 Jan 2019 13:53:37 -0500 Subject: [PATCH 4/5] Used AbstractMethodError --- pandas/io/excel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 57917c8bdb550..f86e54a285134 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -18,7 +18,7 @@ import pandas.compat as compat from pandas.compat import ( OrderedDict, add_metaclass, lrange, map, range, string_types, u, zip) -from pandas.errors import EmptyDataError +from pandas.errors import AbstractMethodError, EmptyDataError from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( @@ -379,16 +379,16 @@ class _BaseExcelReader(object): @property def sheet_names(self): - raise NotImplementedError + raise AbstractMethodError def get_sheet_by_name(self, name): - raise NotImplementedError + raise AbstractMethodError def get_sheet_by_index(self, index): - raise NotImplementedError + raise AbstractMethodError def get_sheet_data(self, sheet, convert_float): - raise NotImplementedError + raise AbstractMethodError def parse(self, sheet_name=0, From a154cf3b85c90fc6935bd333520611d2d1d062bc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 24 Jan 2019 08:26:17 -0500 Subject: [PATCH 5/5] Used ABC metaclass --- pandas/io/excel.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index f86e54a285134..3d85ae7fd1f46 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -18,7 +18,7 @@ import pandas.compat as compat from pandas.compat import ( OrderedDict, add_metaclass, lrange, map, range, string_types, u, zip) -from pandas.errors import AbstractMethodError, EmptyDataError +from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( @@ -375,20 +375,25 @@ def read_excel(io, **kwds) +@add_metaclass(abc.ABCMeta) class _BaseExcelReader(object): @property + @abc.abstractmethod def sheet_names(self): - raise AbstractMethodError + pass + @abc.abstractmethod def get_sheet_by_name(self, name): - raise AbstractMethodError + pass + @abc.abstractmethod def get_sheet_by_index(self, index): - raise AbstractMethodError + pass + @abc.abstractmethod def get_sheet_data(self, sheet, convert_float): - raise AbstractMethodError + pass def parse(self, sheet_name=0,