Skip to content

Commit

Permalink
Refactor init for Excel readers to _BaseExcelReader (#26233)
Browse files Browse the repository at this point in the history
  • Loading branch information
tdamsma authored and WillAyd committed Apr 30, 2019
1 parent b6324be commit 7eff627
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 32 deletions.
39 changes: 36 additions & 3 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import abc
from collections import OrderedDict
from datetime import date, datetime, timedelta
from io import BytesIO
import os
from textwrap import fill
from urllib.request import urlopen
import warnings

from pandas._config import config

import pandas.compat as compat
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_kwarg

Expand All @@ -16,7 +17,9 @@

from pandas.core.frame import DataFrame

from pandas.io.common import _NA_VALUES, _stringify_path, _validate_header_arg
from pandas.io.common import (
_NA_VALUES, _is_url, _stringify_path, _validate_header_arg,
get_filepath_or_buffer)
from pandas.io.excel._util import (
_fill_mi_header, _get_default_writer, _maybe_convert_usecols,
_pop_header_name, get_writer)
Expand Down Expand Up @@ -329,6 +332,36 @@ def read_excel(io,

class _BaseExcelReader(metaclass=abc.ABCMeta):

def __init__(self, filepath_or_buffer):
    """
    Resolve ``filepath_or_buffer`` into a workbook stored on ``self.book``.

    Parameters
    ----------
    filepath_or_buffer : str, file-like object or workbook
        A URL, a path, an object exposing ``read()``, or an instance of
        the engine's ``_workbook_class`` (which is used as-is).

    Raises
    ------
    ValueError
        If the resolved input is neither an open workbook, an object
        with a ``read`` attribute, nor a string.
    """
    from io import UnsupportedOperation

    # If filepath_or_buffer is a url, load the data into a BytesIO.
    # The response is closed as soon as its payload has been read.
    if _is_url(filepath_or_buffer):
        with urlopen(filepath_or_buffer) as response:
            filepath_or_buffer = BytesIO(response.read())
    elif not isinstance(filepath_or_buffer,
                        (ExcelFile, self._workbook_class)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)

    if isinstance(filepath_or_buffer, self._workbook_class):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        #
        # Rewind only when the object supports it. Review feedback on the
        # refactor (Aug 2022) pointed out that an unconditional seek(0)
        # widens what is expected of a "file-like object": some
        # implementations (e.g. Pachyderm's PFSFile) provide only read()
        # and close(), so they could no longer be passed to read_excel.
        # Such objects are read from their current position instead.
        if hasattr(filepath_or_buffer, "seek"):
            try:
                # GH 19779
                filepath_or_buffer.seek(0)
            except UnsupportedOperation:
                # Stream advertises seek() but cannot perform it
                # (GH 20434); deliberately best-effort.
                pass
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, str):
        self.book = self.load_workbook(filepath_or_buffer)
    else:
        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')

@property
@abc.abstractmethod
def _workbook_class(self):
    """
    Workbook type of the concrete engine.

    Abstract: every reader subclass must override this property. The
    initializer uses it in ``isinstance`` checks to recognise an input
    that is already an opened workbook.
    """

@abc.abstractmethod
def load_workbook(self, filepath_or_buffer):
    """
    Open a workbook from ``filepath_or_buffer``.

    Abstract: every reader subclass must override this method. The
    initializer stores the returned object on ``self.book``.
    """

@property
@abc.abstractmethod
def sheet_names(self):
Expand Down Expand Up @@ -701,7 +734,7 @@ def _value_with_fmt(self, val):
val = val.total_seconds() / float(86400)
fmt = '0'
else:
val = compat.to_str(val)
val = str(val)

return val, fmt

Expand Down
40 changes: 11 additions & 29 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
from datetime import time
from distutils.version import LooseVersion
from io import UnsupportedOperation
from urllib.request import urlopen

import numpy as np

from pandas.io.common import _is_url, get_filepath_or_buffer
from pandas.io.excel._base import _BaseExcelReader


Expand All @@ -30,35 +27,20 @@ def __init__(self, filepath_or_buffer):
raise ImportError(err_msg +
". Current version " + xlrd.__VERSION__)

from pandas.io.excel._base import ExcelFile
# If filepath_or_buffer is a url, want to keep the data as bytes so
# can't pass to get_filepath_or_buffer()
if _is_url(filepath_or_buffer):
filepath_or_buffer = urlopen(filepath_or_buffer)
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
filepath_or_buffer)

if isinstance(filepath_or_buffer, xlrd.Book):
self.book = filepath_or_buffer
elif hasattr(filepath_or_buffer, "read"):
# N.B. xlrd.Book has a read attribute too
if hasattr(filepath_or_buffer, 'seek'):
try:
# GH 19779
filepath_or_buffer.seek(0)
except UnsupportedOperation:
# HTTPResponse does not support seek()
# GH 20434
pass
super().__init__(filepath_or_buffer)

@property
def _workbook_class(self):
    """Return the ``xlrd.Book`` class, importing xlrd only on access."""
    import xlrd

    return xlrd.Book

def load_workbook(self, filepath_or_buffer):
    """
    Open an xlrd workbook from a readable buffer or a path.

    Parameters
    ----------
    filepath_or_buffer : object with ``read()`` or path
        Objects exposing ``read`` are consumed and their contents passed
        to xlrd as ``file_contents``; anything else is handed to
        ``open_workbook`` directly as the file to open.

    Returns
    -------
    The object returned by ``xlrd.open_workbook``. The caller is
    responsible for storing it; this method does not set ``self.book``.
    """
    # Imported lazily so xlrd is only required when this engine is used.
    from xlrd import open_workbook

    if hasattr(filepath_or_buffer, "read"):
        data = filepath_or_buffer.read()
        return open_workbook(file_contents=data)
    return open_workbook(filepath_or_buffer)

@property
def sheet_names(self):
Expand Down

0 comments on commit 7eff627

Please sign in to comment.