From 0a4665a5fe716c28ec2e756611b64efd32d63148 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 24 Jan 2019 10:45:05 -0600 Subject: [PATCH] Revert BUG-24212 fix usage of Index.take in pd.merge (#24904) * Revert BUG-24212 fix usage of Index.take in pd.merge xref https://github.com/pandas-dev/pandas/pull/24733/ xref https://github.com/pandas-dev/pandas/issues/24897 * test 0.23.4 output * added note about buggy test --- doc/source/whatsnew/v0.24.0.rst | 1 - pandas/core/reshape/merge.py | 41 ++---------------------- pandas/tests/reshape/merge/test_merge.py | 17 ++++++++++ 3 files changed, 19 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 1c44d35aae4d1..4efe24789af28 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1827,7 +1827,6 @@ Reshaping - Bug in :func:`DataFrame.unstack` where a ``ValueError`` was raised when unstacking timezone aware values (:issue:`18338`) - Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`) - Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`) -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - Bug showing an incorrect shape when throwing error during ``DataFrame`` construction. (:issue:`20742`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0a51f2ee0dce7..e11847d2b8ce2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,19 +757,13 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self._create_join_index(self.left.index, - self.right.index, - left_indexer, - how='right') + join_index = self.left.index.take(left_indexer) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self._create_join_index(self.right.index, - self.left.index, - right_indexer, - how='left') + join_index = self.right.index.take(right_indexer) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -780,37 +774,6 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _create_join_index(self, index, other_index, indexer, how='left'): - """ - Create a join index by rearranging one index to match another - - Parameters - ---------- - index: Index being rearranged - other_index: Index used to supply values not found in index - indexer: how to rearrange index - how: replacement is only necessary if indexer based on other_index - - Returns - ------- - join_index - """ - join_index = index.take(indexer) - if (self.how in (how, 'outer') and - not isinstance(other_index, MultiIndex)): - # if final index requires values in other_index but not target - # index, indexer may hold missing (-1) values, causing Index.take - # to take the final value in target index - mask = indexer == -1 - if np.any(mask): - # if values missing (-1) from target index, - # take from other_index instead - join_list = join_index.to_numpy() - join_list[mask] = other_index.to_numpy()[mask] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) - return join_index - def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e123a5171769d..f0a3ddc8ce8a4 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -940,6 +940,7 @@ def test_merge_two_empty_df_no_division_error(self): merge(a, a, on=('a', 'b')) @pytest.mark.parametrize('how', ['left', 'outer']) + @pytest.mark.xfail(reason="GH-24897") def test_merge_on_index_with_more_values(self, how): # GH 24212 # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is @@ -959,6 +960,22 @@ def test_merge_on_index_with_more_values(self, how): expected.set_index('a', drop=False, inplace=True) assert_frame_equal(result, expected) + def test_merge_right_index_right(self): + # Note: the expected output here is probably incorrect. + # See https://github.com/pandas-dev/pandas/issues/17257 for more. + # We include this as a regression test for GH-24897. + left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]}) + right = pd.DataFrame({'b': [1, 2, 3]}) + + expected = pd.DataFrame({'a': [1, 2, 3, None], + 'key': [0, 1, 1, 2], + 'b': [1, 2, 2, 3]}, + columns=['a', 'key', 'b'], + index=[0, 1, 2, 2]) + result = left.merge(right, left_on='key', right_index=True, + how='right') + tm.assert_frame_equal(result, expected) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: