Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor init for Excel readers to _BaseExcelReader #26233

Merged
39 changes: 36 additions & 3 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import abc
from collections import OrderedDict
from datetime import date, datetime, timedelta
from io import BytesIO
import os
from textwrap import fill
from urllib.request import urlopen
import warnings

from pandas._config import config

import pandas.compat as compat
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_kwarg

Expand All @@ -16,7 +17,9 @@

from pandas.core.frame import DataFrame

from pandas.io.common import _NA_VALUES, _stringify_path, _validate_header_arg
from pandas.io.common import (
_NA_VALUES, _is_url, _stringify_path, _validate_header_arg,
get_filepath_or_buffer)
from pandas.io.excel._util import (
_fill_mi_header, _get_default_writer, _maybe_convert_usecols,
_pop_header_name, get_writer)
Expand Down Expand Up @@ -329,6 +332,36 @@ def read_excel(io,

class _BaseExcelReader(metaclass=abc.ABCMeta):

def __init__(self, filepath_or_buffer):
    """
    Resolve ``filepath_or_buffer`` into a workbook stored on ``self.book``.

    Parameters
    ----------
    filepath_or_buffer : str, object with ``read``, ExcelFile or workbook
        A URL (downloaded into an in-memory ``BytesIO``), an instance of
        the engine's ``_workbook_class`` (used as-is), or any other value,
        which is first passed through ``get_filepath_or_buffer``.

    Raises
    ------
    ValueError
        If, after resolution, the input is neither a workbook instance,
        an object exposing ``read``, nor a string.
    """
    # If filepath_or_buffer is a url, load the data into a BytesIO.
    # The HTTP response is closed as soon as its payload is buffered so
    # the underlying connection is not leaked.
    if _is_url(filepath_or_buffer):
        with urlopen(filepath_or_buffer) as response:
            filepath_or_buffer = BytesIO(response.read())
    elif not isinstance(filepath_or_buffer,
                        (ExcelFile, self._workbook_class)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)

    if isinstance(filepath_or_buffer, self._workbook_class):
        # Already an engine workbook: nothing to load.
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too, which is why the
        # workbook check above must come first.
        filepath_or_buffer.seek(0)
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, str):
        self.book = self.load_workbook(filepath_or_buffer)
    else:
        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')

@property
@abc.abstractmethod
def _workbook_class(self):
    """
    Workbook type of the concrete engine.

    ``__init__`` uses it in ``isinstance`` checks to recognise input that
    is already a loaded workbook. Subclasses must override this property.
    """

@abc.abstractmethod
def load_workbook(self, filepath_or_buffer):
    """
    Build the engine's workbook from a path or a readable buffer.

    ``__init__`` calls this with either a string or an object exposing
    ``read`` and stores the result on ``self.book``. Subclasses must
    override this method.
    """

@property
@abc.abstractmethod
def sheet_names(self):
Expand Down Expand Up @@ -701,7 +734,7 @@ def _value_with_fmt(self, val):
val = val.total_seconds() / float(86400)
fmt = '0'
else:
val = compat.to_str(val)
val = str(val)

return val, fmt

Expand Down
40 changes: 11 additions & 29 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
from datetime import time
from distutils.version import LooseVersion
from io import UnsupportedOperation
from urllib.request import urlopen

import numpy as np

from pandas.io.common import _is_url, get_filepath_or_buffer
from pandas.io.excel._base import _BaseExcelReader


Expand All @@ -30,35 +27,20 @@ def __init__(self, filepath_or_buffer):
raise ImportError(err_msg +
". Current version " + xlrd.__VERSION__)

from pandas.io.excel._base import ExcelFile
# If filepath_or_buffer is a url, want to keep the data as bytes so
# can't pass to get_filepath_or_buffer()
if _is_url(filepath_or_buffer):
filepath_or_buffer = urlopen(filepath_or_buffer)
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
filepath_or_buffer)

if isinstance(filepath_or_buffer, xlrd.Book):
self.book = filepath_or_buffer
elif hasattr(filepath_or_buffer, "read"):
# N.B. xlrd.Book has a read attribute too
if hasattr(filepath_or_buffer, 'seek'):
try:
# GH 19779
filepath_or_buffer.seek(0)
except UnsupportedOperation:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this wasn't carried over - is it no longer valid?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now handled by wrapping URL responses in `BytesIO(urlopen(filepath_or_buffer).read())`, so every object with a `read` attribute also supports `seek(0)`.

# HTTPResponse does not support seek()
# GH 20434
pass
super().__init__(filepath_or_buffer)

@property
def _workbook_class(self):
    """Return xlrd's ``Book`` class, the workbook type for this engine."""
    # Imported lazily so xlrd is only required when this reader is used.
    import xlrd

    return xlrd.Book

def load_workbook(self, filepath_or_buffer):
    """
    Open ``filepath_or_buffer`` with xlrd and return the workbook.

    Parameters
    ----------
    filepath_or_buffer : str or object with a ``read`` method
        An object exposing ``read`` is read in full and its contents are
        handed to xlrd; any other value is passed to xlrd unchanged.

    Returns
    -------
    The object returned by ``xlrd.open_workbook``.
    """
    # Imported lazily so xlrd is only required when this reader is used.
    from xlrd import open_workbook

    if hasattr(filepath_or_buffer, "read"):
        data = filepath_or_buffer.read()
        return open_workbook(file_contents=data)
    else:
        return open_workbook(filepath_or_buffer)

@property
def sheet_names(self):
Expand Down