pandas-dev · jreback · Jan 26, 2019 · Jan 11, 2019 · Jan 11, 2019 · Jan 12, 2019
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1836,6 +1836,7 @@ Reshaping
 - Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`)
 - Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`)
 - Bug showing an incorrect shape when throwing error during ``DataFrame`` construction. (:issue:`20742`)
+- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
 
 .. _whatsnew_0240.bug_fixes.sparse:
 

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -757,13 +757,21 @@ def _get_join_info(self):
 
             if self.right_index:
                 if len(self.left) > 0:
-                    join_index = self.left.index.take(left_indexer)
+                    join_index = self._create_join_index(self.left.index,
+                                                         self.right.index,
+                                                         left_indexer,
+                                                         right_indexer,
+                                                         how='right')
                 else:
                     join_index = self.right.index.take(right_indexer)
                     left_indexer = np.array([-1] * len(join_index))
             elif self.left_index:
                 if len(self.right) > 0:
-                    join_index = self.right.index.take(right_indexer)
+                    join_index = self._create_join_index(self.right.index,
+                                                         self.left.index,
+                                                         right_indexer,
+                                                         left_indexer,
+                                                         how='left')
                 else:
                     join_index = self.left.index.take(left_indexer)
                     right_indexer = np.array([-1] * len(join_index))
@@ -774,6 +782,46 @@ def _get_join_info(self):
             join_index = join_index.astype(object)
         return join_index, left_indexer, right_indexer
 
+    def _create_join_index(self, index, other_index, indexer,
+                           other_indexer, how='left'):
+        """
+        Create a join index by rearranging one index to match another
+
+        Parameters
+        ----------
+        index : Index
+            Index being rearranged
+        other_index : Index
+            Index used to supply values not found in index
+        indexer : ndarray
+            Positions to take from index; -1 marks a value missing from index
+        other_indexer : ndarray
+            Positions to take from other_index to fill the missing values
+        how : str, default 'left'
+            Replacement is only necessary if indexer is based on other_index,
+            i.e. when ``self.how`` equals ``how`` or 'outer'
+
+        Returns
+        -------
+        join_index : Index
+        """
+        join_index = index.take(indexer)
+        if (self.how in (how, 'outer') and
+                not isinstance(other_index, MultiIndex)):
+            # if final index requires values in other_index but not target
+            # index, indexer may hold missing (-1) values, causing Index.take
+            # to take the final value in target index
+            mask = indexer == -1
+            if np.any(mask):
+                # if values missing (-1) from target index,
+                # take from other_index instead
+                join_list = join_index.to_numpy()
+                other_list = other_index.take(other_indexer).to_numpy()
+                join_list[mask] = other_list[mask]
+                join_index = Index(join_list, dtype=join_index.dtype,
+                                   name=join_index.name)
+        return join_index
+
     def _get_merge_keys(self):
         """
         Note: has side effects (copy/delete key columns)

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -939,25 +939,22 @@ def test_merge_two_empty_df_no_division_error(self):
         with np.errstate(divide='raise'):
             merge(a, a, on=('a', 'b'))
 
-    @pytest.mark.parametrize('how', ['left', 'outer'])
-    @pytest.mark.xfail(reason="GH-24897")
+    @pytest.mark.parametrize('how', ['right', 'outer'])
     def test_merge_on_index_with_more_values(self, how):
         # GH 24212
-        # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is
-        # interpreted as a missing value instead of the last element
-        df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]],
-                           columns=['a', 'b'])
-        df2 = pd.DataFrame([[3, 30], [4, 40]],
-                           columns=['a', 'c'])
-        df1.set_index('a', drop=False, inplace=True)
-        df2.set_index('a', inplace=True)
-        result = pd.merge(df1, df2, left_index=True, right_on='a', how=how)
-        expected = pd.DataFrame([[1, 2, np.nan],
-                                 [2, 4, np.nan],
-                                 [3, 6, 30.0],
-                                 [4, 8, 40.0]],
-                                columns=['a', 'b', 'c'])
-        expected.set_index('a', drop=False, inplace=True)
+        # left_indexer is [0, 1, 2, -1, -1, -1]; each -1 means "missing", so
+        # those rows take their label from df2's index, not df1's last label
+        df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]})
+        df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]})
+        result = df1.merge(df2, left_on='key', right_index=True, how=how)
+        expected = pd.DataFrame([[1.0, 0, 1],
+                                 [2.0, 2, 3],
+                                 [3.0, 2, 3],
+                                 [np.nan, 1, 2],
+                                 [np.nan, 3, 4],
+                                 [np.nan, 4, 5]],
+                                columns=['a', 'key', 'b'])
+        expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True)
         assert_frame_equal(result, expected)
 
     def test_merge_right_index_right(self):