BUG: Fix wrong column selection in drop_duplicates when duplicate column names (pandas-dev#17879)
reef-technologies · Oct 16, 2017 · 0e71cda · 0e71cda
1 parent d4f9453
commit 0e71cda
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -1008,6 +1008,7 @@ Reshaping
 - Bug in :func:`concat` where order of result index was unpredictable if it contained non-comparable elements (:issue:`17344`)
 - Fixes regression when sorting by multiple columns on a ``datetime64`` dtype ``Series`` with ``NaT`` values (:issue:`16836`)
 - Bug in :func:`pivot_table` where the result's columns did not preserve the categorical dtype of ``columns`` when ``dropna`` was ``False`` (:issue:`17842`)
+- Bug in ``DataFrame.drop_duplicates`` where dropping with non-unique column names raised a ``ValueError`` (:issue:`17836`)
 
 Numeric
 ^^^^^^^

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3556,7 +3556,8 @@ def f(vals):
               isinstance(subset, tuple) and subset in self.columns):
             subset = subset,
 
-        vals = (self[col].values for col in subset)
+        vals = (col.values for name, col in self.iteritems()
+                if name in subset)
         labels, shape = map(list, zip(*map(f, vals)))
 
         ids = get_group_index(labels, shape, sort=False, xnull=False)

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -1394,6 +1394,21 @@ def test_drop_duplicates(self):
         for keep in ['first', 'last', False]:
             assert df.duplicated(keep=keep).sum() == 0
 
+    def test_drop_duplicates_with_duplicate_column_names(self):
+        # GH17836: drop_duplicates on a frame with a repeated column label
+        df = DataFrame([
+            [1, 2, 5],
+            [3, 4, 6],
+            [3, 4, 7]
+        ], columns=['a', 'a', 'b'])
+
+        result0 = df.drop_duplicates()  # every row has a distinct 'b'
+        tm.assert_frame_equal(result0, df)
+
+        result1 = df.drop_duplicates('a')  # rows 1, 2 share both 'a' values
+        expected1 = df[:2]
+        tm.assert_frame_equal(result1, expected1)
+
     def test_drop_duplicates_for_take_all(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                                 'foo', 'bar', 'qux', 'foo'],