Refactor init for Excel readers to _BaseExcelReader (#26233)

pandas-dev · Apr 30, 2019 · 7eff627 · MichaelTiemannOSC · Aug 13, 2022 · 7eff627
1 parent b6324be
commit 7eff627
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 32 deletions.
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -1,13 +1,14 @@
 import abc
 from collections import OrderedDict
 from datetime import date, datetime, timedelta
+from io import BytesIO
 import os
 from textwrap import fill
+from urllib.request import urlopen
 import warnings
 
 from pandas._config import config
 
-import pandas.compat as compat
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import Appender, deprecate_kwarg
 
@@ -16,7 +17,9 @@
 
 from pandas.core.frame import DataFrame
 
-from pandas.io.common import _NA_VALUES, _stringify_path, _validate_header_arg
+from pandas.io.common import (
+    _NA_VALUES, _is_url, _stringify_path, _validate_header_arg,
+    get_filepath_or_buffer)
 from pandas.io.excel._util import (
     _fill_mi_header, _get_default_writer, _maybe_convert_usecols,
     _pop_header_name, get_writer)
@@ -329,6 +332,36 @@ def read_excel(io,
 
 class _BaseExcelReader(metaclass=abc.ABCMeta):
 
+    def __init__(self, filepath_or_buffer):
+        """
+        Open the workbook behind ``filepath_or_buffer`` as ``self.book``.
+
+        Parameters
+        ----------
+        filepath_or_buffer : str, file-like object or workbook instance
+            A URL, a string path, an object exposing ``read`` and ``seek``,
+            or an instance of the engine's ``_workbook_class``.
+
+        Raises
+        ------
+        ValueError
+            If the input is none of the supported kinds.
+        """
+        # If filepath_or_buffer is a url, load the data into a BytesIO.
+        # The response is closed as soon as its payload has been copied,
+        # so the HTTP connection is not left open until garbage collection.
+        if _is_url(filepath_or_buffer):
+            with urlopen(filepath_or_buffer) as response:
+                filepath_or_buffer = BytesIO(response.read())
+        elif not isinstance(filepath_or_buffer,
+                            (ExcelFile, self._workbook_class)):
+            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
+                filepath_or_buffer)
+
+        if isinstance(filepath_or_buffer, self._workbook_class):
+            # Already an opened workbook: use it as is.
+            self.book = filepath_or_buffer
+        elif hasattr(filepath_or_buffer, "read"):
+            # N.B. xlrd.Book has a read attribute too
+            # Rewind so the engine reads the buffer from its start.
+            filepath_or_buffer.seek(0)
+            self.book = self.load_workbook(filepath_or_buffer)
+        elif isinstance(filepath_or_buffer, str):
+            self.book = self.load_workbook(filepath_or_buffer)
+        else:
+            raise ValueError('Must explicitly set engine if not passing in'
+                             ' buffer or path for io.')
+
+    @property
+    @abc.abstractmethod
+    def _workbook_class(self):
+        """Workbook type of the engine; concrete readers must override."""
+
+    @abc.abstractmethod
+    def load_workbook(self, filepath_or_buffer):
+        """
+        Open ``filepath_or_buffer`` with the engine and return the workbook.
+
+        Concrete readers must override this method.
+        """
+
     @property
     @abc.abstractmethod
     def sheet_names(self):
@@ -701,7 +734,7 @@ def _value_with_fmt(self, val):
             val = val.total_seconds() / float(86400)
             fmt = '0'
         else:
-            val = compat.to_str(val)
+            val = str(val)
 
         return val, fmt
 

diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -1,11 +1,8 @@
 from datetime import time
 from distutils.version import LooseVersion
-from io import UnsupportedOperation
-from urllib.request import urlopen
 
 import numpy as np
 
-from pandas.io.common import _is_url, get_filepath_or_buffer
 from pandas.io.excel._base import _BaseExcelReader
 
 
@@ -30,35 +27,20 @@ def __init__(self, filepath_or_buffer):
                 raise ImportError(err_msg +
                                   ". Current version " + xlrd.__VERSION__)
 
-        from pandas.io.excel._base import ExcelFile
-        # If filepath_or_buffer is a url, want to keep the data as bytes so
-        # can't pass to get_filepath_or_buffer()
-        if _is_url(filepath_or_buffer):
-            filepath_or_buffer = urlopen(filepath_or_buffer)
-        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
-            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
-                filepath_or_buffer)
-
-        if isinstance(filepath_or_buffer, xlrd.Book):
-            self.book = filepath_or_buffer
-        elif hasattr(filepath_or_buffer, "read"):
-            # N.B. xlrd.Book has a read attribute too
-            if hasattr(filepath_or_buffer, 'seek'):
-                try:
-                    # GH 19779
-                    filepath_or_buffer.seek(0)
-                except UnsupportedOperation:
-                    # HTTPResponse does not support seek()
-                    # GH 20434
-                    pass
+        super().__init__(filepath_or_buffer)
+
+    @property
+    def _workbook_class(self):
+        """Return the xlrd ``Book`` class (xlrd is imported lazily)."""
+        import xlrd
+        return xlrd.Book
 
+    def load_workbook(self, filepath_or_buffer):
+        """
+        Open a workbook with ``xlrd.open_workbook`` and return it.
+
+        Objects exposing ``read`` are read in full and handed over as
+        ``file_contents``; anything else is passed to ``open_workbook``
+        unchanged.
+        """
+        from xlrd import open_workbook
+        if hasattr(filepath_or_buffer, "read"):
             data = filepath_or_buffer.read()
-            self.book = xlrd.open_workbook(file_contents=data)
-        elif isinstance(filepath_or_buffer, str):
-            self.book = xlrd.open_workbook(filepath_or_buffer)
+            return open_workbook(file_contents=data)
         else:
-            raise ValueError('Must explicitly set engine if not passing in'
-                             ' buffer or path for io.')
+            return open_workbook(filepath_or_buffer)
 
     @property
     def sheet_names(self):