From b87888692bea1073fba1b827a84f5fbb4b9423be Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Tue, 20 Jun 2017 15:14:33 -0500 Subject: [PATCH 1/4] BUG: Load data from a CategoricalIndex for dtype comparison, closes #16627 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/reshape/merge.py | 8 ++++++-- pandas/tests/test_join.py | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index de2516d75040b..350cb5b6bcfd9 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -102,6 +102,7 @@ Indexing - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). +- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`13873`). I/O diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ffe0cac33ec8f..99d9af28ac019 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -11,7 +11,7 @@ import pandas.compat as compat from pandas import (Categorical, Series, DataFrame, - Index, MultiIndex, Timedelta) + Index, MultiIndex, Timedelta, CategoricalIndex) from pandas.core.frame import _merge_doc from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -1441,9 +1441,13 @@ def _factorize_keys(lk, rk, sort=True): rk = rk.values # if we exactly match in categories, allow us to use codes + if isinstance(lk, CategoricalIndex): + ldata = lk._data + else: + ldata = lk if (is_categorical_dtype(lk) and is_categorical_dtype(rk) and - lk.is_dtype_equal(rk)): + ldata.is_dtype_equal(rk)): return lk.codes, rk.codes, len(lk.categories) if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 3fc13d23b53f7..5d29c5355f880 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -192,3 +192,24 @@ def test_inner_join_indexer2(): exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) assert_almost_equal(ridx, exp_ridx) + + +def test_merge_join_categorical_multiindex(): + # From issue 16627 + import pandas as pd + a = {'Cat1': pd.Categorical(['a', 'b', 'a', 'c', 'a', 'b'], + ['a', 'b', 'c']), + 'Int1': [0, 1, 0, 1, 0, 0]} + a = pd.DataFrame(a) + + b = {'Cat': pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], + ['a', 'b', 'c']), + 'Int': [0, 0, 0, 1, 1, 1], + 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} + b = pd.DataFrame(b).set_index(['Cat', 'Int'])['Factor'] + + c = pd.merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], + right_on=['Cat', 'Int'], how='left') + d = a.join(b, on=['Cat1', 'Int1']) + c = c.drop(['Cat', 'Int'], axis=1) + assert_almost_equal(c, d) From 6f19f21c2f0d8edef4f9a475921abe66924d36cc Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Wed, 21 Jun 2017 10:16:48 -0500 Subject: [PATCH 2/4] Enable is_dtype_equal on CategoricalIndex, fixed some doc typos, added ordered CategoricalIndex test --- doc/source/whatsnew/v0.20.3.txt | 1 + doc/source/whatsnew/v0.21.0.txt | 1 - pandas/core/indexes/category.py | 3 +++ pandas/core/reshape/merge.py | 8 ++----- pandas/tests/test_join.py | 42 +++++++++++++++++++++++---------- 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index c730142450ea6..cbd86ae4c2722 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -51,6 +51,7 @@ Indexing ^^^^^^^^ - Bug in ``Float64Index`` causing an empty array instead of ``None`` to be returned from ``.get(np.nan)`` on a Series whose index did not contain any ``NaN`` s (:issue:`8569`) +- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`16627`). I/O ^^^ diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 350cb5b6bcfd9..de2516d75040b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -102,7 +102,6 @@ Indexing - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). -- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`13873`). I/O diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d9e0c218bfafc..d13636e8b43e2 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -560,6 +560,9 @@ def take(self, indices, axis=0, allow_fill=True, na_value=-1) return self._create_from_codes(taken) + def is_dtype_equal(self, other): + return self._data.is_dtype_equal(other) + take_nd = take def map(self, mapper): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 99d9af28ac019..ffe0cac33ec8f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -11,7 +11,7 @@ import pandas.compat as compat from pandas import (Categorical, Series, DataFrame, - Index, MultiIndex, Timedelta, CategoricalIndex) + Index, MultiIndex, Timedelta) from pandas.core.frame import _merge_doc from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -1441,13 +1441,9 @@ def _factorize_keys(lk, rk, sort=True): rk = rk.values # if we exactly match in categories, allow us to use codes - if isinstance(lk, CategoricalIndex): - ldata = lk._data - else: - ldata = lk if (is_categorical_dtype(lk) and is_categorical_dtype(rk) and - ldata.is_dtype_equal(rk)): + lk.is_dtype_equal(rk)): return lk.codes, rk.codes, len(lk.categories) if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 5d29c5355f880..215ad21940f4a 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- import numpy as np -from pandas import Index +from pandas import Index, DataFrame, Categorical, merge from pandas._libs import join as _join import pandas.util.testing as tm -from pandas.util.testing import assert_almost_equal +from pandas.util.testing import assert_almost_equal, assert_frame_equal class TestIndexer(object): @@ -196,20 +196,38 @@ def test_inner_join_indexer2(): def test_merge_join_categorical_multiindex(): # From issue 16627 - import pandas as pd - a = {'Cat1': pd.Categorical(['a', 'b', 'a', 'c', 'a', 'b'], - ['a', 'b', 'c']), + a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'], + ['a', 'b', 'c']), 'Int1': [0, 1, 0, 1, 0, 0]} - a = pd.DataFrame(a) + a = DataFrame(a) - b = {'Cat': pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - ['a', 'b', 'c']), + b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'], + ['a', 'b', 'c']), 'Int': [0, 0, 0, 1, 1, 1], 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} - b = pd.DataFrame(b).set_index(['Cat', 'Int'])['Factor'] + b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] - c = pd.merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], - right_on=['Cat', 'Int'], how='left') + c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], + right_on=['Cat', 'Int'], how='left') d = a.join(b, on=['Cat1', 'Int1']) c = c.drop(['Cat', 'Int'], axis=1) - assert_almost_equal(c, d) + assert_frame_equal(c, d) + + a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'], + ['b', 'a', 'c'], + ordered=True), + 'Int1': [0, 1, 0, 1, 0, 0]} + a = DataFrame(a) + + b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'], + ['b', 'a', 'c'], + ordered=True), + 'Int': [0, 0, 0, 1, 1, 1], + 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} + b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] + + c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], + right_on=['Cat', 'Int'], how='left') + d = a.join(b, on=['Cat1', 'Int1']) + c = c.drop(['Cat', 'Int'], axis=1) + assert_frame_equal(c, d) From 257edec9dd2f3ae15c16ec6bedf5a2ca20add898 Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Wed, 21 Jun 2017 19:48:58 -0500 Subject: [PATCH 3/4] Flake8 windows suggestion --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- doc/source/contributing.rst | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 9281c51059087..959858fb50f89 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ - [ ] closes #xxxx - [ ] tests added / passed - - [ ] passes ``git diff upstream/master --name-only -- '*.py' | flake8 --diff`` + - [ ] passes ``git diff upstream/master --name-only -- '*.py' | flake8 --diff`` (On Windows, ``git diff upstream/master -u -- "*.py" | flake8 --diff`` might work as an alternative.) - [ ] whatsnew entry diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index aacfe25b91564..67d3c2c515f6f 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -525,6 +525,11 @@ run this slightly modified command:: git diff master --name-only -- '*.py' | grep 'pandas/' | xargs flake8 +Note that on Windows, `grep`, `xargs`, and other tools are likely unavailable. +However, this has been shown to work on smaller commits:: + + git diff master -u -- "*.py" | flake8 --diff + Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ From 081c3ced906411c78e6b290efd3e9892de1ef76b Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Wed, 28 Jun 2017 19:54:30 -0500 Subject: [PATCH 4/4] Fixed some documentation/formatting issues, clarified the purpose of the test case. --- doc/source/contributing.rst | 7 ++++--- doc/source/whatsnew/v0.20.3.txt | 2 +- pandas/tests/test_join.py | 21 +++++++++++---------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 67d3c2c515f6f..cd444f796fabb 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -525,10 +525,11 @@ run this slightly modified command:: git diff master --name-only -- '*.py' | grep 'pandas/' | xargs flake8 -Note that on Windows, `grep`, `xargs`, and other tools are likely unavailable. -However, this has been shown to work on smaller commits:: +Note that on Windows, ``grep``, ``xargs``, and other tools are likely +unavailable. However, this has been shown to work on smaller commits in the +standard Windows command line:: - git diff master -u -- "*.py" | flake8 --diff + git diff master -u -- "*.py" | flake8 --diff Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index cbd86ae4c2722..30f0fe2c5139a 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -51,7 +51,6 @@ Indexing ^^^^^^^^ - Bug in ``Float64Index`` causing an empty array instead of ``None`` to be returned from ``.get(np.nan)`` on a Series whose index did not contain any ``NaN`` s (:issue:`8569`) -- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`16627`). I/O ^^^ @@ -79,6 +78,7 @@ Sparse Reshaping ^^^^^^^^^ +- Bug in joining on a ``MultiIndex`` with a ``category`` dtype for a level (:issue:`16627`). Numeric diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 215ad21940f4a..cde1cab37d09c 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -207,12 +207,13 @@ def test_merge_join_categorical_multiindex(): 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] - c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], - right_on=['Cat', 'Int'], how='left') - d = a.join(b, on=['Cat1', 'Int1']) - c = c.drop(['Cat', 'Int'], axis=1) - assert_frame_equal(c, d) + expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], + right_on=['Cat', 'Int'], how='left') + result = a.join(b, on=['Cat1', 'Int1']) + expected = expected.drop(['Cat', 'Int'], axis=1) + assert_frame_equal(expected, result) + # Same test, but with ordered categorical a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'], ['b', 'a', 'c'], ordered=True), @@ -226,8 +227,8 @@ def test_merge_join_categorical_multiindex(): 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] - c = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], - right_on=['Cat', 'Int'], how='left') - d = a.join(b, on=['Cat1', 'Int1']) - c = c.drop(['Cat', 'Int'], axis=1) - assert_frame_equal(c, d) + expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], + right_on=['Cat', 'Int'], how='left') + result = a.join(b, on=['Cat1', 'Int1']) + expected = expected.drop(['Cat', 'Int'], axis=1) + assert_frame_equal(expected, result)