From 010d8275900de8e9bd02405fcfd5216646f222dc Mon Sep 17 00:00:00 2001 From: Noah Date: Fri, 16 Feb 2018 13:01:25 -0500 Subject: [PATCH 1/5] BUG: drop_duplicates not raising KeyError on missing key Fix #17879 introduced an error by iterating over the columns in the dataframe, not the columns in the subset. This meant that passing in a column name missing from the dataframe would no longer raise a `KeyError` like it had previously. This fix checks the subset first before pulling necessary columns from the dataframe, and raises the necessary `KeyError` when a given column doesn't exist. Fixes #19726 --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a001037b573d4..d42106c785fed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3655,6 +3655,10 @@ def f(vals): isinstance(subset, tuple) and subset in self.columns): subset = subset, + for name in subset: + if name not in self.columns: + raise KeyError(name) + vals = (col.values for name, col in self.iteritems() if name in subset) labels, shape = map(list, zip(*map(f, vals))) From 7f8ee79c01fa200f28e59f5f001b899ffce84538 Mon Sep 17 00:00:00 2001 From: Noah Date: Fri, 16 Feb 2018 13:16:11 -0500 Subject: [PATCH 2/5] Adds test --- pandas/tests/frame/test_analytics.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b9275fc69e7ff..5b670eb0e5009 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1492,6 +1492,15 @@ def test_drop_duplicates(self): for keep in ['first', 'last', False]: assert df.duplicated(keep=keep).sum() == 0 + def test_drop_duplicates_with_misspelled_column_name(self): + # GH 18XXX + df = pd.DataFrame({'A': [0, 1, 2, 3, 4, 5], + 'B': [0, 1, 2, 3, 4, 5], + 'C': [0, 1, 2, 3, 4, 6]}) + + with pytest.raises(KeyError): + df.drop_duplicates(['a', 'B']) + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([ From df31f095f6be258719cb3c80447486d978f4bf35 Mon Sep 17 00:00:00 2001 From: Noah Date: Fri, 16 Feb 2018 13:26:21 -0500 Subject: [PATCH 3/5] Adds changes to whatsnew --- doc/source/whatsnew/v0.23.0.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a2198d9103528..a4924417ff2b7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -767,6 +767,8 @@ Indexing - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`) +- Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (issue:`19726`) + MultiIndex ^^^^^^^^^^ From 69e3fd852cde10ba2e275990487aba3502f27cc4 Mon Sep 17 00:00:00 2001 From: Noah Date: Tue, 20 Feb 2018 14:08:38 -0500 Subject: [PATCH 4/5] Adds tests, fixes diff check. --- pandas/core/frame.py | 9 ++++++--- pandas/tests/frame/test_analytics.py | 16 ++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d42106c785fed..b7ec01f59ee0c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3655,9 +3655,12 @@ def f(vals): isinstance(subset, tuple) and subset in self.columns): subset = subset, - for name in subset: - if name not in self.columns: - raise KeyError(name) + # Verify all columns in subset exist in the queried dataframe + # Otherwise, raise a KeyError, same as if you try to __getitem__ with a + # key that doesn't exist. + diff = Index(subset).difference(self.columns) + if len(diff): + raise KeyError(diff) vals = (col.values for name, col in self.iteritems() if name in subset) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 5b670eb0e5009..f2b8387072c8d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1492,14 +1492,18 @@ def test_drop_duplicates(self): for keep in ['first', 'last', False]: assert df.duplicated(keep=keep).sum() == 0 - def test_drop_duplicates_with_misspelled_column_name(self): - # GH 18XXX - df = pd.DataFrame({'A': [0, 1, 2, 3, 4, 5], - 'B': [0, 1, 2, 3, 4, 5], - 'C': [0, 1, 2, 3, 4, 6]}) + @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) + def test_duplicated_with_misspelled_column_name(self, subset): + # GH 19730 + df = pd.DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) with pytest.raises(KeyError): - df.drop_duplicates(['a', 'B']) + df.duplicated(subset) + + with pytest.raises(KeyError): + df.drop_duplicates(subset) def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 From 61481a44db8d82eae10f6c337245eed3052abea8 Mon Sep 17 00:00:00 2001 From: Noah Date: Tue, 20 Feb 2018 15:03:40 -0500 Subject: [PATCH 5/5] Use Index's built-in empty check --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b7ec01f59ee0c..b4db770e6bb74 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3659,7 +3659,7 @@ def f(vals): # Otherwise, raise a KeyError, same as if you try to __getitem__ with a # key that doesn't exist. diff = Index(subset).difference(self.columns) - if len(diff): + if not diff.empty: raise KeyError(diff) vals = (col.values for name, col in self.iteritems()