Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Don't extract header names if none specified #23703

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1324,6 +1324,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected, and index columns were being parsed anyway (:issue:`20480`)
- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)

Expand Down
48 changes: 32 additions & 16 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,11 +630,12 @@ def _parse_cell(cell_contents, cell_typ):
if is_integer(skiprows):
row += skiprows

data[row], control_row = _fill_mi_header(
data[row], control_row)
header_name, _ = _pop_header_name(
data[row], index_col)
header_names.append(header_name)
data[row], control_row = _fill_mi_header(data[row],
control_row)

if index_col is not None:
header_name, _ = _pop_header_name(data[row], index_col)
header_names.append(header_name)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
Expand Down Expand Up @@ -682,7 +683,8 @@ def _parse_cell(cell_contents, cell_typ):

output[asheetname] = parser.read(nrows=nrows)

if not squeeze or isinstance(output[asheetname], DataFrame):
if ((not squeeze or isinstance(output[asheetname], DataFrame))
and header_names):
output[asheetname].columns = output[
asheetname].columns.set_names(header_names)
except EmptyDataError:
Expand Down Expand Up @@ -863,16 +865,30 @@ def _fill_mi_header(row, control_row):


def _pop_header_name(row, index_col):
""" (header, new_data) for header rows in MultiIndex parsing"""
none_fill = lambda x: None if x == '' else x

if index_col is None:
# no index col specified, trim data for inference path
return none_fill(row[0]), row[1:]
else:
# pop out header name and fill w/ blank
i = index_col if not is_list_like(index_col) else max(index_col)
return none_fill(row[i]), row[:i] + [''] + row[i + 1:]
"""
Pop the header name for MultiIndex parsing.

Parameters
----------
row : list
The data row to parse for the header name.
index_col : int, list
The index columns for our data. Assumed to be non-null.

Returns
-------
header_name : str
The extracted header name.
trimmed_row : list
The original data row with the header name removed.
"""
# Pop out header name and fill w/blank.
i = index_col if not is_list_like(index_col) else max(index_col)

header_name = row[i]
header_name = None if header_name == "" else header_name

return header_name, row[:i] + [''] + row[i + 1:]


@add_metaclass(abc.ABCMeta)
Expand Down
Binary file modified pandas/tests/io/data/testmultiindex.xls
Binary file not shown.
Binary file modified pandas/tests/io/data/testmultiindex.xlsm
Binary file not shown.
Binary file modified pandas/tests/io/data/testmultiindex.xlsx
Binary file not shown.
11 changes: 11 additions & 0 deletions pandas/tests/io/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,6 +896,17 @@ def test_read_excel_multiindex(self, ext):
header=[0, 1], skiprows=2)
tm.assert_frame_equal(actual, expected)

def test_read_excel_multiindex_header_only(self, ext):
    # see gh-11733.
    #
    # With only header rows given (no index_col), no header name
    # should be extracted from the sheet.
    path = os.path.join(self.dirpath, "testmultiindex" + ext)

    columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
    rows = [[1, 2, 3, 4], [1, 2, 3, 4]]
    expected = DataFrame(rows, columns=columns)

    actual = read_excel(path, "index_col_none", header=[0, 1])
    tm.assert_frame_equal(actual, expected)

@td.skip_if_no("xlsxwriter")
def test_read_excel_multiindex_empty_level(self, ext):
# see gh-12453
Expand Down