Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: parsing multi-column headers in read_csv (GH6051) #6170

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2568,6 +2568,52 @@ def read_table(self, *args, **kwds):
kwds['buffer_lines'] = 2
return read_table(*args, **kwds)

def test_list_of_one_header(self):
data = """A,B,C
1,2,3
4,5,6
7,8,9
"""
df = self.read_csv(StringIO(data), header=[0])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use `assert_produces_warning` around the `read_csv` call here to assert that the warning is actually being raised.


values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
expected = DataFrame(values, columns=['A', 'B', 'C'])

tm.assert_frame_equal(df, expected)

def test_list_of_multiple_headers(self):
    # header=[0, 1]: the first two rows of the input together supply the
    # column labels, so the parsed frame is expected to carry a two-level
    # MultiIndex on its columns.
    data = """A,B,C
a,b,c
1,2,3
4,5,6
7,8,9
"""
    result = self.read_csv(StringIO(data), header=[0, 1])

    upper = ['A', 'B', 'C']
    lower = ['a', 'b', 'c']
    columns = pd.MultiIndex.from_arrays([upper, lower])
    rows = [[1, 2, 3],
            [4, 5, 6],
            [7, 8, 9]]
    expected = DataFrame(rows, columns=columns)

    tm.assert_frame_equal(result, expected)

def test_list_of_multiple_headers_with_duplicated_column_pairs(self):
    # Several (row 0, row 1) label pairs repeat in this input: ('A', 'b')
    # appears three times and ('B', 'c') twice.  The expectation below is
    # that each repeat gets '.1', '.2', ... appended to its second-row
    # label, while ('A', 'c') and the first ('B', 'c') stay untouched
    # because those pairs differ from each other.
    data = """A,A,A,A,A,B,B
a,b,b,b,c,c,c
1,2,3,4,5,6,7
1,2,3,4,5,6,7
1,2,3,4,5,6,7
"""
    result = self.read_csv(StringIO(data), header=[0, 1])

    upper = ['A', 'A', 'A', 'A', 'A', 'B', 'B']
    lower = ['a', 'b', 'b.1', 'b.2', 'c', 'c', 'c.1']
    columns = pd.MultiIndex.from_arrays([upper, lower])

    # three identical data rows
    rows = [[1, 2, 3, 4, 5, 6, 7] for _ in range(3)]
    expected = DataFrame(rows, columns=columns)

    tm.assert_frame_equal(result, expected)


def test_compact_ints(self):
data = ('0,1,0,0\n'
'1,1,0,0\n'
Expand Down
53 changes: 52 additions & 1 deletion pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import numpy as np
cimport util

import pandas.lib as lib
from pandas.compat import lzip

import time
import os
Expand Down Expand Up @@ -460,7 +461,8 @@ cdef class TextReader:
self.parser_start = 0
self.header = []
else:
if isinstance(header, list) and len(header):
if isinstance(header, list) and len(header) >= 2:
# FIXME
# need to artificially skip the final line
# which is still a header line
header = list(header)
Expand All @@ -473,6 +475,11 @@ cdef class TextReader:
self.has_mi_columns = 1
self.header = header
else:
# if the header is a list with length 1
# set the header as the only element in the list
if isinstance(header, list) and len(header) == 1:
header = header[0]

self.parser.header_start = header
self.parser.header_end = header
self.parser.header = header
Expand Down Expand Up @@ -586,6 +593,7 @@ cdef class TextReader:
char *errors = "strict"

header = []
is_duplicated = False

if self.parser.header_start >= 0:

Expand Down Expand Up @@ -633,6 +641,9 @@ cdef class TextReader:
count = counts.get(name, 0)
if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns:
this_header.append('%s.%d' % (name, count))

# for warning later
is_duplicated = True
else:
this_header.append(name)
counts[name] = count + 1
Expand All @@ -653,6 +664,43 @@ cdef class TextReader:
data_line = hr + 1
header.append(this_header)

#
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Put a `GH ????` reference (the number of the issue) here instead of these FIXMEs.

# Append a sequence number to duplicated column pairs
#
# e.g. [['a', 'a', 'a', 'b'],
# ['A', 'A', 'B', 'C']]
# ==>
# [['a', 'a', 'a', 'b'],
# ['A', 'A.1', 'B', 'C']]
#
if self.has_mi_columns:

# zip the header, so that we can easily find the duplicated pair
header = lzip(*header)

counts = {}
for i, column in enumerate(header):

# Check whether the column is duplicated
count = counts.get(column, 0)
if count > 0:
#
# FIXME
# Since we've added an extra header line (search FIXME in this page)
# Append an incremental seq number to the second-last element
#
tmp_column = list(column)
tmp_column[-2] = '%s.%d' % (tmp_column[-2], count)
header[i] = tuple(tmp_column)

# for warning later
is_duplicated = True

counts[column] = count + 1

# unzip the header
header = lzip(*header)

if self.names is not None:
header = [ self.names ]

Expand Down Expand Up @@ -710,6 +758,9 @@ cdef class TextReader:
elif self.allow_leading_cols and passed_count < field_count:
self.leading_cols = field_count - passed_count

if self.mangle_dupe_cols and is_duplicated:
warnings.warn('Duplicated columns have been mangled', DtypeWarning)

return header, field_count

cdef _implicit_index_count(self):
Expand Down