Skip to content

Commit

Permalink
BUG: Don't extract header names if none specified (pandas-dev#23703)
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung authored and Pingviinituutti committed Feb 28, 2019
1 parent 24cd841 commit 40cdd45
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 16 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`)
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
- Bug in :func:`read_excel` in which extraneous header names were extracted from the header rows even when no ``index_col`` was specified (:issue:`11733`)
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)

Expand Down
48 changes: 32 additions & 16 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,11 +630,12 @@ def _parse_cell(cell_contents, cell_typ):
if is_integer(skiprows):
row += skiprows

data[row], control_row = _fill_mi_header(
data[row], control_row)
header_name, _ = _pop_header_name(
data[row], index_col)
header_names.append(header_name)
data[row], control_row = _fill_mi_header(data[row],
control_row)

if index_col is not None:
header_name, _ = _pop_header_name(data[row], index_col)
header_names.append(header_name)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
Expand Down Expand Up @@ -682,7 +683,8 @@ def _parse_cell(cell_contents, cell_typ):

output[asheetname] = parser.read(nrows=nrows)

if not squeeze or isinstance(output[asheetname], DataFrame):
if ((not squeeze or isinstance(output[asheetname], DataFrame))
and header_names):
output[asheetname].columns = output[
asheetname].columns.set_names(header_names)
except EmptyDataError:
Expand Down Expand Up @@ -863,16 +865,30 @@ def _fill_mi_header(row, control_row):


def _pop_header_name(row, index_col):
""" (header, new_data) for header rows in MultiIndex parsing"""
none_fill = lambda x: None if x == '' else x

if index_col is None:
# no index col specified, trim data for inference path
return none_fill(row[0]), row[1:]
else:
# pop out header name and fill w/ blank
i = index_col if not is_list_like(index_col) else max(index_col)
return none_fill(row[i]), row[:i] + [''] + row[i + 1:]
"""
Pop the header name for MultiIndex parsing.
Parameters
----------
row : list
The data row to parse for the header name.
index_col : int, list
The index columns for our data. Assumed to be non-null.
Returns
-------
header_name : str
The extracted header name.
trimmed_row : list
The original data row with the header name removed.
"""
# Pop out header name and fill w/blank.
i = index_col if not is_list_like(index_col) else max(index_col)

header_name = row[i]
header_name = None if header_name == "" else header_name

return header_name, row[:i] + [''] + row[i + 1:]


@add_metaclass(abc.ABCMeta)
Expand Down
Binary file modified pandas/tests/io/data/testmultiindex.xls
Binary file not shown.
Binary file modified pandas/tests/io/data/testmultiindex.xlsm
Binary file not shown.
Binary file modified pandas/tests/io/data/testmultiindex.xlsx
Binary file not shown.
11 changes: 11 additions & 0 deletions pandas/tests/io/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,6 +896,17 @@ def test_read_excel_multiindex(self, ext):
header=[0, 1], skiprows=2)
tm.assert_frame_equal(actual, expected)

def test_read_excel_multiindex_header_only(self, ext):
    """Read a sheet with a two-row header and no index column."""
    # see gh-11733.
    #
    # Don't try to parse a header name if there isn't one.
    path = os.path.join(self.dirpath, "testmultiindex" + ext)

    # Expected frame: a 2x2 product of column labels over two identical rows.
    columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
    expected = DataFrame([[1, 2, 3, 4]] * 2, columns=columns)

    actual = read_excel(path, "index_col_none", header=[0, 1])
    tm.assert_frame_equal(actual, expected)

@td.skip_if_no("xlsxwriter")
def test_read_excel_multiindex_empty_level(self, ext):
# see gh-12453
Expand Down

0 comments on commit 40cdd45

Please sign in to comment.