From 010d8275900de8e9bd02405fcfd5216646f222dc Mon Sep 17 00:00:00 2001
From: Noah <nbtheduke@gmail.com>
Date: Fri, 16 Feb 2018 13:01:25 -0500
Subject: [PATCH 1/5] BUG: drop_duplicates not raising KeyError on missing key

Fix #17879 introduced an error by iterating over the columns in the dataframe,
not the columns in the subset. This meant that passing in a column name missing
from the dataframe would no longer raise a `KeyError` like it had previously.

This fix checks the subset first before pulling necessary columns from the
dataframe, and raises the necessary `KeyError` when a given column doesn't
exist.

Fixes #19726
---
 pandas/core/frame.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index a001037b573d4..d42106c785fed 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3655,6 +3655,10 @@ def f(vals):
               isinstance(subset, tuple) and subset in self.columns):
             subset = subset,
 
+        for name in subset:
+            if name not in self.columns:
+                raise KeyError(name)
+
         vals = (col.values for name, col in self.iteritems()
                 if name in subset)
         labels, shape = map(list, zip(*map(f, vals)))

From 7f8ee79c01fa200f28e59f5f001b899ffce84538 Mon Sep 17 00:00:00 2001
From: Noah <nbtheduke@gmail.com>
Date: Fri, 16 Feb 2018 13:16:11 -0500
Subject: [PATCH 2/5] Adds test

---
 pandas/tests/frame/test_analytics.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index b9275fc69e7ff..5b670eb0e5009 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1492,6 +1492,15 @@ def test_drop_duplicates(self):
         for keep in ['first', 'last', False]:
             assert df.duplicated(keep=keep).sum() == 0
 
+    def test_drop_duplicates_with_misspelled_column_name(self):
+        # GH 18XXX
+        df = pd.DataFrame({'A': [0, 1, 2, 3, 4, 5],
+                           'B': [0, 1, 2, 3, 4, 5],
+                           'C': [0, 1, 2, 3, 4, 6]})
+
+        with pytest.raises(KeyError):
+            df.drop_duplicates(['a', 'B'])
+
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([

From df31f095f6be258719cb3c80447486d978f4bf35 Mon Sep 17 00:00:00 2001
From: Noah <nbtheduke@gmail.com>
Date: Fri, 16 Feb 2018 13:26:21 -0500
Subject: [PATCH 3/5] Adds changes to whatsnew

---
 doc/source/whatsnew/v0.23.0.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index a2198d9103528..a4924417ff2b7 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -767,6 +767,8 @@ Indexing
 - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
 - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`)
 - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`)
+- Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (issue:`19726`)
+
 
 MultiIndex
 ^^^^^^^^^^

From 69e3fd852cde10ba2e275990487aba3502f27cc4 Mon Sep 17 00:00:00 2001
From: Noah <nbtheduke@gmail.com>
Date: Tue, 20 Feb 2018 14:08:38 -0500
Subject: [PATCH 4/5] Adds tests, fixes diff check.

---
 pandas/core/frame.py                 |  9 ++++++---
 pandas/tests/frame/test_analytics.py | 16 ++++++++++------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index d42106c785fed..b7ec01f59ee0c 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3655,9 +3655,12 @@ def f(vals):
               isinstance(subset, tuple) and subset in self.columns):
             subset = subset,
 
-        for name in subset:
-            if name not in self.columns:
-                raise KeyError(name)
+        # Verify all columns in subset exist in the queried dataframe
+        # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
+        # key that doesn't exist.
+        diff = Index(subset).difference(self.columns)
+        if len(diff):
+            raise KeyError(diff)
 
         vals = (col.values for name, col in self.iteritems()
                 if name in subset)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 5b670eb0e5009..f2b8387072c8d 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1492,14 +1492,18 @@ def test_drop_duplicates(self):
         for keep in ['first', 'last', False]:
             assert df.duplicated(keep=keep).sum() == 0
 
-    def test_drop_duplicates_with_misspelled_column_name(self):
-        # GH 18XXX
-        df = pd.DataFrame({'A': [0, 1, 2, 3, 4, 5],
-                           'B': [0, 1, 2, 3, 4, 5],
-                           'C': [0, 1, 2, 3, 4, 6]})
+    @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
+    def test_duplicated_with_misspelled_column_name(self, subset):
+        # GH 19730
+        df = pd.DataFrame({'A': [0, 0, 1],
+                           'B': [0, 0, 1],
+                           'C': [0, 0, 1]})
 
         with pytest.raises(KeyError):
-            df.drop_duplicates(['a', 'B'])
+            df.duplicated(subset)
+
+        with pytest.raises(KeyError):
+            df.drop_duplicates(subset)
 
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836

From 61481a44db8d82eae10f6c337245eed3052abea8 Mon Sep 17 00:00:00 2001
From: Noah <nbtheduke@gmail.com>
Date: Tue, 20 Feb 2018 15:03:40 -0500
Subject: [PATCH 5/5] Use Index's built-in empty check

---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b7ec01f59ee0c..b4db770e6bb74 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3659,7 +3659,7 @@ def f(vals):
         # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
         # key that doesn't exist.
         diff = Index(subset).difference(self.columns)
-        if len(diff):
+        if not diff.empty:
             raise KeyError(diff)
 
         vals = (col.values for name, col in self.iteritems()