BUG: Fix index order for Index.intersection()

closes #15582 Author: Albert Villanova del Moral <albert.villanova@gmail.com> Author: Jeff Reback <jeff@reback.net> Closes #15583 from albertvillanova/fix-15582 and squashes the following commits: 2d4e143 [Albert Villanova del Moral] Fix pytest fixture name collision 64e86a4 [Albert Villanova del Moral] Fix test on right join 73df69e [Albert Villanova del Moral] Address requested changes 8d2e9cc [Albert Villanova del Moral] Address requested changes 968c7f1 [Jeff Reback] DOC/TST: change to use parameterization 9e39794 [Albert Villanova del Moral] Address requested changes 5bf1508 [Albert Villanova del Moral] Address requested changes 654288b [Albert Villanova del Moral] Fix Travis errors 33eb740 [Albert Villanova del Moral] Address requested changes 3c200fe [Albert Villanova del Moral] Add new tests ef2581e [Albert Villanova del Moral] Fix Travis error f0d9d03 [Albert Villanova del Moral] Add whatsnew c96306d [Albert Villanova del Moral] Add sort argument to Index.join 047b513 [Albert Villanova del Moral] Address requested changes ec836bd [Albert Villanova del Moral] Fix Travis errors b977278 [Albert Villanova del Moral] Address requested changes 784fe75 [Albert Villanova del Moral] Fix error: line too long 1197b99 [Albert Villanova del Moral] Fix DataFrame column order when read from HDF file d9e29f8 [Albert Villanova del Moral] Create new DatetimeIndex from the Index.intersection result e7bcd28 [Albert Villanova del Moral] Fix typo in documentation a4ead99 [Albert Villanova del Moral] Fix typo c2a8dc3 [Albert Villanova del Moral] Implement tests c12bb3f [Albert Villanova del Moral] BUG: Fix index order for Index.intersection()
pandas-dev · Mar 29, 2017 · bd169dc · bd169dc
1 parent 2e64614
commit bd169dc
Show file tree

Hide file tree

Showing 11 changed files with 309 additions and 137 deletions.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -750,6 +750,62 @@ New Behavior:
    TypeError: Cannot compare 2014-01-01 00:00:00 of
    type <class 'pandas.tslib.Timestamp'> to string column
 
+.. _whatsnew_0200.api_breaking.index_order:
+
+Index.intersection and inner join now preserve the order of the left Index
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`Index.intersection` now preserves the order of the calling ``Index`` (left)
+instead of the other ``Index`` (right) (:issue:`15582`). This affects the inner
+joins (:meth:`DataFrame.join` and :func:`merge`) and the ``.align`` methods.
+
+- ``Index.intersection``
+
+  .. ipython:: python
+
+     left = pd.Index([2, 1, 0])
+     left
+     right = pd.Index([1, 2, 3])
+     right
+
+  Previous Behavior:
+
+  .. code-block:: ipython
+
+     In [4]: left.intersection(right)
+     Out[4]: Int64Index([1, 2], dtype='int64')
+
+  New Behavior:
+
+  .. ipython:: python
+
+     left.intersection(right)
+
+- ``DataFrame.join`` and ``pd.merge``
+
+  .. ipython:: python
+
+     left = pd.DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
+     left
+     right = pd.DataFrame({'b': [100, 200, 300]}, index=[1, 2, 3])
+     right
+
+  Previous Behavior:
+
+  .. code-block:: ipython
+
+     In [4]: left.join(right, how='inner')
+     Out[4]:
+         a    b
+     1  10  100
+     2  20  200
+
+  New Behavior:
+
+  .. ipython:: python
+
+     left.join(right, how='inner')
+
 
 .. _whatsnew_0200.api:
 
@@ -984,6 +1040,7 @@ Bug Fixes
 
 - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`)
 - Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`)
+- Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`)
 
 - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`)
 - Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -124,10 +124,14 @@
 ----------%s
 right : DataFrame
 how : {'left', 'right', 'outer', 'inner'}, default 'inner'
-    * left: use only keys from left frame (SQL: left outer join)
-    * right: use only keys from right frame (SQL: right outer join)
-    * outer: use union of keys from both frames (SQL: full outer join)
-    * inner: use intersection of keys from both frames (SQL: inner join)
+    * left: use only keys from left frame, similar to a SQL left outer join;
+      preserve key order
+    * right: use only keys from right frame, similar to a SQL right outer join;
+      preserve key order
+    * outer: use union of keys from both frames, similar to a SQL full outer
+      join; sort keys lexicographically
+    * inner: use intersection of keys from both frames, similar to a SQL inner
+      join; preserve the order of the left keys
 on : label or list
     Field names to join on. Must be found in both DataFrames. If on is
     None and not merging on indexes, then it merges on the intersection of
@@ -147,7 +151,8 @@
     Use the index from the right DataFrame as the join key. Same caveats as
     left_index
 sort : boolean, default False
-    Sort the join keys lexicographically in the result DataFrame
+    Sort the join keys lexicographically in the result DataFrame. If False,
+    the order of the join keys depends on the join type (how keyword)
 suffixes : 2-length sequence (tuple, list, ...)
     Suffix to apply to overlapping column names in the left and right
     side, respectively
@@ -4472,16 +4477,18 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
             * left: use calling frame's index (or column if on is specified)
             * right: use other frame's index
             * outer: form union of calling frame's index (or column if on is
-                specified) with other frame's index
+              specified) with other frame's index, and sort it
+              lexicographically
             * inner: form intersection of calling frame's index (or column if
-                on is specified) with other frame's index
+              on is specified) with other frame's index, preserving the order
+              of the calling frame's index
         lsuffix : string
             Suffix to use from left frame's overlapping columns
         rsuffix : string
             Suffix to use from right frame's overlapping columns
         sort : boolean, default False
             Order result DataFrame lexicographically by the join key. If False,
-            preserves the index order of the calling (left) DataFrame
+            the order of the join key depends on the join type (how keyword)
 
         Notes
         -----

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -2089,8 +2089,8 @@ def intersection(self, other):
         """
         Form the intersection of two Index objects.
 
-        This returns a new Index with elements common to the index and `other`.
-        Sortedness of the result is not guaranteed.
+        This returns a new Index with elements common to the index and `other`,
+        preserving the order of the calling index.
 
         Parameters
         ----------
@@ -2128,15 +2128,15 @@ def intersection(self, other):
                 pass
 
         try:
-            indexer = Index(self._values).get_indexer(other._values)
+            indexer = Index(other._values).get_indexer(self._values)
             indexer = indexer.take((indexer != -1).nonzero()[0])
         except:
             # duplicates
-            indexer = Index(self._values).get_indexer_non_unique(
-                other._values)[0].unique()
+            indexer = Index(other._values).get_indexer_non_unique(
+                self._values)[0].unique()
             indexer = indexer[indexer != -1]
 
-        taken = self.take(indexer)
+        taken = other.take(indexer)
         if self.name != other.name:
             taken.name = None
         return taken
@@ -2831,8 +2831,7 @@ def _reindex_non_unique(self, target):
         new_index = self._shallow_copy_with_infer(new_labels, freq=None)
         return new_index, indexer, new_indexer
 
-    def join(self, other, how='left', level=None, return_indexers=False):
-        """
+    _index_shared_docs['join'] = """
         *this is an internal non-public method*
 
         Compute join_index and indexers to conform data
@@ -2844,11 +2843,20 @@ def join(self, other, how='left', level=None, return_indexers=False):
         how : {'left', 'right', 'inner', 'outer'}
         level : int or level name, default None
         return_indexers : boolean, default False
+        sort : boolean, default False
+            Sort the join keys lexicographically in the result Index. If False,
+            the order of the join keys depends on the join type (how keyword)
+
+            .. versionadded:: 0.20.0
 
         Returns
         -------
         join_index, (left_indexer, right_indexer)
         """
+
+    @Appender(_index_shared_docs['join'])
+    def join(self, other, how='left', level=None, return_indexers=False,
+             sort=False):
         from .multi import MultiIndex
         self_is_mi = isinstance(self, MultiIndex)
         other_is_mi = isinstance(other, MultiIndex)
@@ -2929,6 +2937,9 @@ def join(self, other, how='left', level=None, return_indexers=False):
         elif how == 'outer':
             join_index = self.union(other)
 
+        if sort:
+            join_index = join_index.sort_values()
+
         if return_indexers:
             if join_index is self:
                 lindexer = None

diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py
@@ -431,29 +431,16 @@ def union(self, other):
 
         return self._int64index.union(other)
 
-    def join(self, other, how='left', level=None, return_indexers=False):
-        """
-        *this is an internal non-public method*
-
-        Compute join_index and indexers to conform data
-        structures to the new index.
-
-        Parameters
-        ----------
-        other : Index
-        how : {'left', 'right', 'inner', 'outer'}
-        level : int or level name, default None
-        return_indexers : boolean, default False
-
-        Returns
-        -------
-        join_index, (left_indexer, right_indexer)
-        """
+    @Appender(_index_shared_docs['join'])
+    def join(self, other, how='left', level=None, return_indexers=False,
+             sort=False):
         if how == 'outer' and self is not other:
             # note: could return RangeIndex in more circumstances
-            return self._int64index.join(other, how, level, return_indexers)
+            return self._int64index.join(other, how, level, return_indexers,
+                                         sort)
 
-        return super(RangeIndex, self).join(other, how, level, return_indexers)
+        return super(RangeIndex, self).join(other, how, level, return_indexers,
+                                            sort)
 
     def __len__(self):
         """

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -4321,7 +4321,7 @@ def _reindex_axis(obj, axis, labels, other=None):
 
     labels = _ensure_index(labels.unique())
     if other is not None:
-        labels = labels & _ensure_index(other.unique())
+        labels = _ensure_index(other.unique()) & labels
     if not labels.equals(ax):
         slicer = [slice(None, None)] * obj.ndim
         slicer[axis] = labels

diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+import numpy as np
+
+from pandas import DataFrame, Index
+from pandas.tests.frame.common import TestData
+import pandas.util.testing as tm
+
+
+@pytest.fixture
+def frame():
+    return TestData().frame
+
+
+@pytest.fixture
+def left():
+    return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0])
+
+
+@pytest.fixture
+def right():
+    return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2])
+
+
+@pytest.mark.parametrize(
+    "how, sort, expected",
+    [('inner', False, DataFrame({'a': [20, 10],
+                                 'b': [200, 100]},
+                                index=[2, 1])),
+     ('inner', True, DataFrame({'a': [10, 20],
+                                'b': [100, 200]},
+                               index=[1, 2])),
+     ('left', False, DataFrame({'a': [20, 10, 0],
+                                'b': [200, 100, np.nan]},
+                               index=[2, 1, 0])),
+     ('left', True, DataFrame({'a': [0, 10, 20],
+                               'b': [np.nan, 100, 200]},
+                              index=[0, 1, 2])),
+     ('right', False, DataFrame({'a': [np.nan, 10, 20],
+                                 'b': [300, 100, 200]},
+                                index=[3, 1, 2])),
+     ('right', True, DataFrame({'a': [10, 20, np.nan],
+                                'b': [100, 200, 300]},
+                               index=[1, 2, 3])),
+     ('outer', False, DataFrame({'a': [0, 10, 20, np.nan],
+                                 'b': [np.nan, 100, 200, 300]},
+                                index=[0, 1, 2, 3])),
+     ('outer', True, DataFrame({'a': [0, 10, 20, np.nan],
+                                'b': [np.nan, 100, 200, 300]},
+                               index=[0, 1, 2, 3]))])
+def test_join(left, right, how, sort, expected):
+
+    result = left.join(right, how=how, sort=sort)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_join_index(frame):
+    # left / right
+
+    f = frame.loc[frame.index[:10], ['A', 'B']]
+    f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1]
+
+    joined = f.join(f2)
+    tm.assert_index_equal(f.index, joined.index)
+    expected_columns = Index(['A', 'B', 'C', 'D'])
+    tm.assert_index_equal(joined.columns, expected_columns)
+
+    joined = f.join(f2, how='left')
+    tm.assert_index_equal(joined.index, f.index)
+    tm.assert_index_equal(joined.columns, expected_columns)
+
+    joined = f.join(f2, how='right')
+    tm.assert_index_equal(joined.index, f2.index)
+    tm.assert_index_equal(joined.columns, expected_columns)
+
+    # inner
+
+    joined = f.join(f2, how='inner')
+    tm.assert_index_equal(joined.index, f.index[5:10])
+    tm.assert_index_equal(joined.columns, expected_columns)
+
+    # outer
+
+    joined = f.join(f2, how='outer')
+    tm.assert_index_equal(joined.index, frame.index.sort_values())
+    tm.assert_index_equal(joined.columns, expected_columns)
+
+    tm.assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo')
+
+    # corner case - overlapping columns
+    for how in ('outer', 'left', 'inner'):
+        with tm.assertRaisesRegexp(ValueError, 'columns overlap but '
+                                   'no suffix'):
+            frame.join(frame, how=how)
+
+
+def test_join_index_more(frame):
+    af = frame.loc[:, ['A', 'B']]
+    bf = frame.loc[::2, ['C', 'D']]
+
+    expected = af.copy()
+    expected['C'] = frame['C'][::2]
+    expected['D'] = frame['D'][::2]
+
+    result = af.join(bf)
+    tm.assert_frame_equal(result, expected)
+
+    result = af.join(bf, how='right')
+    tm.assert_frame_equal(result, expected[::2])
+
+    result = bf.join(af, how='right')
+    tm.assert_frame_equal(result, expected.loc[:, result.columns])
+
+
+def test_join_index_series(frame):
+    df = frame.copy()
+    s = df.pop(frame.columns[-1])
+    joined = df.join(s)
+
+    # TODO should this check_names ?
+    tm.assert_frame_equal(joined, frame, check_names=False)
+
+    s.name = None
+    tm.assertRaisesRegexp(ValueError, 'must have a name', df.join, s)
+
+
+def test_join_overlap(frame):
+    df1 = frame.loc[:, ['A', 'B', 'C']]
+    df2 = frame.loc[:, ['B', 'C', 'D']]
+
+    joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
+    df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1')
+    df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2')
+
+    no_overlap = frame.loc[:, ['A', 'D']]
+    expected = df1_suf.join(df2_suf).join(no_overlap)
+
+    # column order not necessarily sorted
+    tm.assert_frame_equal(joined, expected.loc[:, joined.columns])