From b87888692bea1073fba1b827a84f5fbb4b9423be Mon Sep 17 00:00:00 2001
From: thequackdaddy <pquack@gmail.com>
Date: Tue, 20 Jun 2017 15:14:33 -0500
Subject: [PATCH 1/4] BUG: Load data from a CategoricalIndex for dtype
 comparison, closes #16627

---
 doc/source/whatsnew/v0.21.0.txt |  1 +
 pandas/core/reshape/merge.py    |  8 ++++++--
 pandas/tests/test_join.py       | 21 +++++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index de2516d75040b..350cb5b6bcfd9 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -102,6 +102,7 @@ Indexing
 
 - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`).
 - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`).
+- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`13873`).
 
 
 I/O
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index ffe0cac33ec8f..99d9af28ac019 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -11,7 +11,7 @@
 import pandas.compat as compat
 
 from pandas import (Categorical, Series, DataFrame,
-                    Index, MultiIndex, Timedelta)
+                    Index, MultiIndex, Timedelta, CategoricalIndex)
 from pandas.core.frame import _merge_doc
 from pandas.core.dtypes.common import (
     is_datetime64tz_dtype,
@@ -1441,9 +1441,13 @@ def _factorize_keys(lk, rk, sort=True):
         rk = rk.values
 
     # if we exactly match in categories, allow us to use codes
+    if isinstance(lk, CategoricalIndex):
+        ldata = lk._data
+    else:
+        ldata = lk
     if (is_categorical_dtype(lk) and
             is_categorical_dtype(rk) and
-            lk.is_dtype_equal(rk)):
+            ldata.is_dtype_equal(rk)):
         return lk.codes, rk.codes, len(lk.categories)
 
     if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py
index 3fc13d23b53f7..5d29c5355f880 100644
--- a/pandas/tests/test_join.py
+++ b/pandas/tests/test_join.py
@@ -192,3 +192,24 @@ def test_inner_join_indexer2():
 
     exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64)
     assert_almost_equal(ridx, exp_ridx)
+
+
+def test_merge_join_categorical_multiindex():
+    # From issue 16627
+    import pandas as pd
+    a = {'Cat1': pd.Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
+                                ['a', 'b', 'c']),
+         'Int1': [0, 1, 0, 1, 0, 0]}
+    a = pd.DataFrame(a)
+
+    b = {'Cat': pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+                               ['a', 'b', 'c']),
+         'Int': [0, 0, 0, 1, 1, 1],
+         'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
+    b = pd.DataFrame(b).set_index(['Cat', 'Int'])['Factor']
+
+    c = pd.merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
+                 right_on=['Cat', 'Int'], how='left')
+    d = a.join(b, on=['Cat1', 'Int1'])
+    c = c.drop(['Cat', 'Int'], axis=1)
+    assert_almost_equal(c, d)

From 6f19f21c2f0d8edef4f9a475921abe66924d36cc Mon Sep 17 00:00:00 2001
From: thequackdaddy <pquack@gmail.com>
Date: Wed, 21 Jun 2017 10:16:48 -0500
Subject: [PATCH 2/4] Enable is_dtype_equal on CategoricalIndex, fix some doc
 typos, add ordered CategoricalIndex test

---
 doc/source/whatsnew/v0.20.3.txt |  1 +
 doc/source/whatsnew/v0.21.0.txt |  1 -
 pandas/core/indexes/category.py |  3 +++
 pandas/core/reshape/merge.py    |  8 ++-----
 pandas/tests/test_join.py       | 42 +++++++++++++++++++++++----------
 5 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt
index c730142450ea6..cbd86ae4c2722 100644
--- a/doc/source/whatsnew/v0.20.3.txt
+++ b/doc/source/whatsnew/v0.20.3.txt
@@ -51,6 +51,7 @@ Indexing
 ^^^^^^^^
 
 - Bug in ``Float64Index`` causing an empty array instead of ``None`` to be returned from ``.get(np.nan)`` on a Series whose index did not contain any ``NaN`` s (:issue:`8569`)
+- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`16627`).
 
 I/O
 ^^^
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 350cb5b6bcfd9..de2516d75040b 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -102,7 +102,6 @@ Indexing
 
 - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`).
 - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`).
-- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`13873`).
 
 
 I/O
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index d9e0c218bfafc..d13636e8b43e2 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -560,6 +560,9 @@ def take(self, indices, axis=0, allow_fill=True,
                                            na_value=-1)
         return self._create_from_codes(taken)
 
+    def is_dtype_equal(self, other):
+        return self._data.is_dtype_equal(other)
+
     take_nd = take
 
     def map(self, mapper):
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 99d9af28ac019..ffe0cac33ec8f 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -11,7 +11,7 @@
 import pandas.compat as compat
 
 from pandas import (Categorical, Series, DataFrame,
-                    Index, MultiIndex, Timedelta, CategoricalIndex)
+                    Index, MultiIndex, Timedelta)
 from pandas.core.frame import _merge_doc
 from pandas.core.dtypes.common import (
     is_datetime64tz_dtype,
@@ -1441,13 +1441,9 @@ def _factorize_keys(lk, rk, sort=True):
         rk = rk.values
 
     # if we exactly match in categories, allow us to use codes
-    if isinstance(lk, CategoricalIndex):
-        ldata = lk._data
-    else:
-        ldata = lk
     if (is_categorical_dtype(lk) and
             is_categorical_dtype(rk) and
-            ldata.is_dtype_equal(rk)):
+            lk.is_dtype_equal(rk)):
         return lk.codes, rk.codes, len(lk.categories)
 
     if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py
index 5d29c5355f880..215ad21940f4a 100644
--- a/pandas/tests/test_join.py
+++ b/pandas/tests/test_join.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 
 import numpy as np
-from pandas import Index
+from pandas import Index, DataFrame, Categorical, merge
 
 from pandas._libs import join as _join
 import pandas.util.testing as tm
-from pandas.util.testing import assert_almost_equal
+from pandas.util.testing import assert_almost_equal, assert_frame_equal
 
 
 class TestIndexer(object):
@@ -196,20 +196,38 @@ def test_inner_join_indexer2():
 
 def test_merge_join_categorical_multiindex():
     # From issue 16627
-    import pandas as pd
-    a = {'Cat1': pd.Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
-                                ['a', 'b', 'c']),
+    a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
+                             ['a', 'b', 'c']),
          'Int1': [0, 1, 0, 1, 0, 0]}
-    a = pd.DataFrame(a)
+    a = DataFrame(a)
 
-    b = {'Cat': pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
-                               ['a', 'b', 'c']),
+    b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+                            ['a', 'b', 'c']),
          'Int': [0, 0, 0, 1, 1, 1],
          'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
-    b = pd.DataFrame(b).set_index(['Cat', 'Int'])['Factor']
+    b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
 
-    c = pd.merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
-                 right_on=['Cat', 'Int'], how='left')
+    c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
+              right_on=['Cat', 'Int'], how='left')
     d = a.join(b, on=['Cat1', 'Int1'])
     c = c.drop(['Cat', 'Int'], axis=1)
-    assert_almost_equal(c, d)
+    assert_frame_equal(c, d)
+
+    a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
+                             ['b', 'a', 'c'],
+                             ordered=True),
+         'Int1': [0, 1, 0, 1, 0, 0]}
+    a = DataFrame(a)
+
+    b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+                            ['b', 'a', 'c'],
+                            ordered=True),
+         'Int': [0, 0, 0, 1, 1, 1],
+         'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
+    b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
+
+    c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
+              right_on=['Cat', 'Int'], how='left')
+    d = a.join(b, on=['Cat1', 'Int1'])
+    c = c.drop(['Cat', 'Int'], axis=1)
+    assert_frame_equal(c, d)

From 257edec9dd2f3ae15c16ec6bedf5a2ca20add898 Mon Sep 17 00:00:00 2001
From: thequackdaddy <pquack@gmail.com>
Date: Wed, 21 Jun 2017 19:48:58 -0500
Subject: [PATCH 3/4] Flake8 Windows suggestion

---
 .github/PULL_REQUEST_TEMPLATE.md | 2 +-
 doc/source/contributing.rst      | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 9281c51059087..959858fb50f89 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,4 +1,4 @@
  - [ ] closes #xxxx
  - [ ] tests added / passed
- - [ ] passes ``git diff upstream/master --name-only -- '*.py' | flake8 --diff``
+ - [ ] passes ``git diff upstream/master --name-only -- '*.py' | flake8 --diff`` (On Windows, ``git diff upstream/master -u -- "*.py" | flake8 --diff`` might work as an alternative.)
  - [ ] whatsnew entry
diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
index aacfe25b91564..67d3c2c515f6f 100644
--- a/doc/source/contributing.rst
+++ b/doc/source/contributing.rst
@@ -525,6 +525,11 @@ run this slightly modified command::
 
    git diff master --name-only -- '*.py' | grep 'pandas/' | xargs flake8
 
+Note that on Windows, `grep`, `xargs`, and other tools are likely unavailable.
+However, this has been shown to work on smaller commits::
+
+    git diff master -u -- "*.py" | flake8 --diff
+
 Backwards Compatibility
 ~~~~~~~~~~~~~~~~~~~~~~~
 

From 081c3ced906411c78e6b290efd3e9892de1ef76b Mon Sep 17 00:00:00 2001
From: thequackdaddy <pquack@gmail.com>
Date: Wed, 28 Jun 2017 19:54:30 -0500
Subject: [PATCH 4/4] Fixed some documentation/formatting issues, clarified the
 purpose of the test case.

---
 doc/source/contributing.rst     |  7 ++++---
 doc/source/whatsnew/v0.20.3.txt |  2 +-
 pandas/tests/test_join.py       | 21 +++++++++++----------
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
index 67d3c2c515f6f..cd444f796fabb 100644
--- a/doc/source/contributing.rst
+++ b/doc/source/contributing.rst
@@ -525,10 +525,11 @@ run this slightly modified command::
 
    git diff master --name-only -- '*.py' | grep 'pandas/' | xargs flake8
 
-Note that on Windows, `grep`, `xargs`, and other tools are likely unavailable.
-However, this has been shown to work on smaller commits::
+Note that on Windows, ``grep``, ``xargs``, and other tools are likely
+unavailable. However, the following command has been shown to work on smaller
+commits in the standard Windows command line::
 
-    git diff master -u -- "*.py" | flake8 --diff
+   git diff master -u -- "*.py" | flake8 --diff
 
 Backwards Compatibility
 ~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt
index cbd86ae4c2722..30f0fe2c5139a 100644
--- a/doc/source/whatsnew/v0.20.3.txt
+++ b/doc/source/whatsnew/v0.20.3.txt
@@ -51,7 +51,6 @@ Indexing
 ^^^^^^^^
 
 - Bug in ``Float64Index`` causing an empty array instead of ``None`` to be returned from ``.get(np.nan)`` on a Series whose index did not contain any ``NaN`` s (:issue:`8569`)
-- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`16627`).
 
 I/O
 ^^^
@@ -79,6 +78,7 @@ Sparse
 Reshaping
 ^^^^^^^^^
 
+- Bug in joining on a ``MultiIndex`` with a ``category`` dtype for a level (:issue:`16627`).
 
 
 Numeric
diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py
index 215ad21940f4a..cde1cab37d09c 100644
--- a/pandas/tests/test_join.py
+++ b/pandas/tests/test_join.py
@@ -207,12 +207,13 @@ def test_merge_join_categorical_multiindex():
          'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
     b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
 
-    c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
-              right_on=['Cat', 'Int'], how='left')
-    d = a.join(b, on=['Cat1', 'Int1'])
-    c = c.drop(['Cat', 'Int'], axis=1)
-    assert_frame_equal(c, d)
+    expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
+                     right_on=['Cat', 'Int'], how='left')
+    result = a.join(b, on=['Cat1', 'Int1'])
+    expected = expected.drop(['Cat', 'Int'], axis=1)
+    assert_frame_equal(expected, result)
 
+    # Same test, but with ordered categorical
     a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'],
                              ['b', 'a', 'c'],
                              ordered=True),
@@ -226,8 +227,8 @@ def test_merge_join_categorical_multiindex():
          'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]}
     b = DataFrame(b).set_index(['Cat', 'Int'])['Factor']
 
-    c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
-              right_on=['Cat', 'Int'], how='left')
-    d = a.join(b, on=['Cat1', 'Int1'])
-    c = c.drop(['Cat', 'Int'], axis=1)
-    assert_frame_equal(c, d)
+    expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'],
+                     right_on=['Cat', 'Int'], how='left')
+    result = a.join(b, on=['Cat1', 'Int1'])
+    expected = expected.drop(['Cat', 'Int'], axis=1)
+    assert_frame_equal(expected, result)