diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 44c467795d1ed..71c05981782f6 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1324,6 +1324,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) +- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) - Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`) - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 141d2c79a1927..0c97587b9458f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -630,11 +630,12 @@ def _parse_cell(cell_contents, cell_typ): if is_integer(skiprows): row += skiprows - data[row], control_row = _fill_mi_header( - data[row], control_row) - header_name, _ = _pop_header_name( - data[row], index_col) - header_names.append(header_name) + data[row], control_row = _fill_mi_header(data[row], + control_row) + + if index_col is not None: + header_name, _ = _pop_header_name(data[row], index_col) + header_names.append(header_name) if is_list_like(index_col): # Forward fill values for MultiIndex index. @@ -682,7 +683,8 @@ def _parse_cell(cell_contents, cell_typ): output[asheetname] = parser.read(nrows=nrows) - if not squeeze or isinstance(output[asheetname], DataFrame): + if ((not squeeze or isinstance(output[asheetname], DataFrame)) + and header_names): output[asheetname].columns = output[ asheetname].columns.set_names(header_names) except EmptyDataError: @@ -863,16 +865,30 @@ def _fill_mi_header(row, control_row): def _pop_header_name(row, index_col): - """ (header, new_data) for header rows in MultiIndex parsing""" - none_fill = lambda x: None if x == '' else x - - if index_col is None: - # no index col specified, trim data for inference path - return none_fill(row[0]), row[1:] - else: - # pop out header name and fill w/ blank - i = index_col if not is_list_like(index_col) else max(index_col) - return none_fill(row[i]), row[:i] + [''] + row[i + 1:] + """ + Pop the header name for MultiIndex parsing. + + Parameters + ---------- + row : list + The data row to parse for the header name. + index_col : int, list + The index columns for our data. Assumed to be non-null. + + Returns + ------- + header_name : str + The extracted header name. + trimmed_row : list + The original data row with the header name removed. + """ + # Pop out header name and fill w/blank. + i = index_col if not is_list_like(index_col) else max(index_col) + + header_name = row[i] + header_name = None if header_name == "" else header_name + + return header_name, row[:i] + [''] + row[i + 1:] @add_metaclass(abc.ABCMeta) diff --git a/pandas/tests/io/data/testmultiindex.xls b/pandas/tests/io/data/testmultiindex.xls index 51ef0f6c04cba..4329992642c8c 100644 Binary files a/pandas/tests/io/data/testmultiindex.xls and b/pandas/tests/io/data/testmultiindex.xls differ diff --git a/pandas/tests/io/data/testmultiindex.xlsm b/pandas/tests/io/data/testmultiindex.xlsm index 28c92a5f0be38..ebbca4856562f 100644 Binary files a/pandas/tests/io/data/testmultiindex.xlsm and b/pandas/tests/io/data/testmultiindex.xlsm differ diff --git a/pandas/tests/io/data/testmultiindex.xlsx b/pandas/tests/io/data/testmultiindex.xlsx index 815f3b07342ca..afe1758a7a132 100644 Binary files a/pandas/tests/io/data/testmultiindex.xlsx and b/pandas/tests/io/data/testmultiindex.xlsx differ diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index a097e0adbeb7a..fbcd0f279a9ff 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -896,6 +896,17 @@ def test_read_excel_multiindex(self, ext): header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) + def test_read_excel_multiindex_header_only(self, ext): + # see gh-11733. + # + # Don't try to parse a header name if there isn't one. + mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + result = read_excel(mi_file, "index_col_none", header=[0, 1]) + + exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) + expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) + tm.assert_frame_equal(result, expected) + @td.skip_if_no("xlsxwriter") def test_read_excel_multiindex_empty_level(self, ext): # see gh-12453