diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a2198d9103528..a4924417ff2b7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -767,6 +767,8 @@ Indexing - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`) +- Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (:issue:`19726`) + MultiIndex ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a001037b573d4..b4db770e6bb74 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3655,6 +3655,13 @@ def f(vals): isinstance(subset, tuple) and subset in self.columns): subset = subset, + # Verify all columns in subset exist in the queried dataframe + # Otherwise, raise a KeyError, same as if you try to __getitem__ with a + # key that doesn't exist. 
+ diff = Index(subset).difference(self.columns) + if not diff.empty: + raise KeyError(diff) + vals = (col.values for name, col in self.iteritems() if name in subset) labels, shape = map(list, zip(*map(f, vals))) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b9275fc69e7ff..f2b8387072c8d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1492,6 +1492,19 @@ def test_drop_duplicates(self): for keep in ['first', 'last', False]: assert df.duplicated(keep=keep).sum() == 0 + @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) + def test_duplicated_with_misspelled_column_name(self, subset): + # GH 19730 + df = pd.DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) + + with pytest.raises(KeyError): + df.duplicated(subset) + + with pytest.raises(KeyError): + df.drop_duplicates(subset) + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([