pandas-dev · waitingkuo · Jan 28, 2014 · Jan 28, 2014 · Jan 29, 2014 · Mar 13, 2014
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2568,6 +2568,52 @@ def read_table(self, *args, **kwds):
         kwds['buffer_lines'] = 2
         return read_table(*args, **kwds)
 
+    def test_list_of_one_header(self):
+        data = """A,B,C
+1,2,3
+4,5,6
+7,8,9
+"""
+        df = self.read_csv(StringIO(data), header=[0])
+        # header=[0] must give flat column labels taken from row 0.
+        values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+        expected = DataFrame(values, columns=['A', 'B', 'C'])
+
+        tm.assert_frame_equal(df, expected)
+
+    def test_list_of_multiple_headers(self):
+        data = """A,B,C
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+"""
+        df = self.read_csv(StringIO(data), header=[0,1])
+        # Rows 0 and 1 become the two levels of a MultiIndex on the columns.
+        values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+        expected_columns = pd.MultiIndex.from_arrays([['A', 'B', 'C'], ['a', 'b', 'c']])
+        expected = DataFrame(values, columns=expected_columns)
+
+        tm.assert_frame_equal(df, expected)
+
+    def test_list_of_multiple_headers_with_duplicated_column_pairs(self):
+        data = """A,A,A,A,A,B,B
+a,b,b,b,c,c,c
+1,2,3,4,5,6,7
+1,2,3,4,5,6,7
+1,2,3,4,5,6,7
+"""
+        df = self.read_csv(StringIO(data), header=[0,1])
+        # Repeated (level 0, level 1) pairs get '.N' appended to the last level.
+        values = [[1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7]]
+        expected_columns = pd.MultiIndex.from_arrays([
+            ['A', 'A', 'A',   'A',  'A', 'B', 'B'], 
+            ['a', 'b', 'b.1', 'b.2', 'c', 'c', 'c.1']])
+        expected = DataFrame(values, columns=expected_columns)
+
+        tm.assert_frame_equal(df, expected)
+
+
     def test_compact_ints(self):
         data = ('0,1,0,0\n'
                 '1,1,0,0\n'

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -30,6 +30,7 @@ import numpy as np
 cimport util
 
 import pandas.lib as lib
+from pandas.compat import lzip
 
 import time
 import os
@@ -460,7 +461,8 @@ cdef class TextReader:
             self.parser_start = 0
             self.header = []
         else:
-            if isinstance(header, list) and len(header):
+            if isinstance(header, list) and len(header) >= 2:
+                # FIXME
                 # need to artifically skip the final line
                 # which is still a header line
                 header = list(header)
@@ -473,6 +475,11 @@ cdef class TextReader:
                 self.has_mi_columns = 1
                 self.header = header
             else:
+                # if the header is a list with length 1
+                #   set the header as the only element in the list
+                if isinstance(header, list) and len(header) == 1:
+                    header = header[0]
+
                 self.parser.header_start = header
                 self.parser.header_end = header
                 self.parser.header = header
@@ -586,6 +593,7 @@ cdef class TextReader:
             char *errors = "strict"
 
         header = []
+        is_duplicated = False
 
         if self.parser.header_start >= 0:
 
@@ -633,6 +641,9 @@ cdef class TextReader:
                     count = counts.get(name, 0)
                     if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns:
                         this_header.append('%s.%d' % (name, count))
+
+                        # for warning later
+                        is_duplicated = True
                     else:
                         this_header.append(name)
                     counts[name] = count + 1
@@ -653,6 +664,43 @@ cdef class TextReader:
                 data_line = hr + 1
                 header.append(this_header)
 
+            #
+            # Append a sequence number to duplicated column pairs
+            #
+            # e.g. [['a', 'a', 'a', 'b'],
+            #       ['A', 'A', 'B', 'C']]
+            #   ==>
+            #      [['a', 'a',   'a', 'b'],
+            #       ['A', 'A.1', 'B', 'C']]
+            #
+            if self.has_mi_columns:
+
+                # zip the header, so that we can easily find the duplicated pair
+                header = lzip(*header)
+
+                counts = {}
+                for i, column in enumerate(header):
+
+                    # Check whether the column is duplicated
+                    count = counts.get(column, 0)
+                    if count > 0:
+                        #
+                        # FIXME
+                        # An extra header line was added earlier (see the other
+                        # FIXME in this file), so the last real level sits at
+                        # index -2; append the sequence number there.
+                        tmp_column = list(column)
+                        tmp_column[-2] = '%s.%d' % (tmp_column[-2], count)
+                        header[i] = tuple(tmp_column)
+
+                        # for warning later
+                        is_duplicated = True
+
+                    counts[column] = count + 1
+
+                # unzip the header
+                header = lzip(*header)
+
             if self.names is not None:
                 header = [ self.names ]
 
@@ -710,6 +758,9 @@ cdef class TextReader:
             elif self.allow_leading_cols and passed_count < field_count:
                 self.leading_cols = field_count - passed_count
 
+        if self.mangle_dupe_cols and is_duplicated:
+            warnings.warn('Duplicated columns have been mangled', DtypeWarning)
+
         return header, field_count
 
     cdef _implicit_index_count(self):