From b04cee7520195ef75ce29d57df5dfeaeebf5a8ec Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 11 Jan 2019 12:04:34 -0800 Subject: [PATCH 01/22] BUG-24212 fix usage of Index.take in pd.merge --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/reshape/merge.py | 14 ++++++++++++++ pandas/tests/reshape/merge/test_merge.py | 10 ++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3b3fad22ce949..c0b1e4500a6b3 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1836,6 +1836,7 @@ Reshaping - Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`) - Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`) - Bug showing an incorrect shape when throwing error during ``DataFrame`` construction. (:issue:`20742`) +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e11847d2b8ce2..d39f6a98130d0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -758,12 +758,26 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: join_index = self.left.index.take(left_indexer) + if (self.how == 'right' and -1 in left_indexer + and not isinstance(self.right.index, MultiIndex)): + join_list = join_index.to_numpy() + absent = left_indexer == -1 + join_list[absent] = self.right.index.to_numpy()[absent] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: join_index = self.right.index.take(right_indexer) + if (self.how == 'left' and -1 in right_indexer + and not isinstance(self.left.index, MultiIndex)): + join_list = join_index.to_numpy() + absent = right_indexer == -1 + join_list[absent] = self.left.index.to_numpy()[absent] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0a3ddc8ce8a4..fd858fc9c6cf9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1170,6 +1170,16 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): with pytest.raises(ValueError, match=msg): pd.merge(df2, df1, on=['A']) + def test_merge_on_index_with_more_values(self): # GH 24212 + df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], + columns=['a', 'b']) + df2 = pd.DataFrame([[3, 30], [4, 40]], + columns=['a', 'c']) + df1.set_index('a', drop=False, inplace=True) + df2.set_index('a', inplace=True) + result = pd.merge(df1, df2, left_index=True, right_on='a', how='left') + assert 1 in result.index + @pytest.fixture def left(): From a64b8fefcb6faeead50a95a6fe1e9d80a6841253 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 11 Jan 2019 13:57:05 -0800 Subject: [PATCH 02/22] BUG-24212 add comment --- pandas/core/reshape/merge.py | 4 ++++ pandas/tests/reshape/merge/test_merge.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d39f6a98130d0..cbbff43f84dfd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -760,6 +760,8 @@ def _get_join_info(self): join_index = self.left.index.take(left_indexer) if (self.how == 'right' and -1 in left_indexer and not isinstance(self.right.index, MultiIndex)): + # if values missing (-1) from left index, + # take from right index instead join_list = join_index.to_numpy() absent = left_indexer == -1 join_list[absent] = self.right.index.to_numpy()[absent] @@ -773,6 +775,8 @@ def _get_join_info(self): join_index = self.right.index.take(right_indexer) if (self.how == 'left' and -1 in right_indexer and not isinstance(self.left.index, MultiIndex)): + # if values missing (-1) from right index, + # take from left index instead join_list = join_index.to_numpy() absent = right_indexer == -1 join_list[absent] = self.left.index.to_numpy()[absent] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index fd858fc9c6cf9..7c3d2c324f8ac 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1170,7 +1170,8 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): with pytest.raises(ValueError, match=msg): pd.merge(df2, df1, on=['A']) - def test_merge_on_index_with_more_values(self): # GH 24212 + def test_merge_on_index_with_more_values(self): + # GH 24212 df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], columns=['a', 'b']) df2 = pd.DataFrame([[3, 30], [4, 40]], From 022643db74a275c59d4d709b2be11fb660c4f33b Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 11 Jan 2019 18:45:26 -0800 Subject: [PATCH 03/22] BUG-24212 clarify test --- pandas/core/reshape/merge.py | 1 + pandas/tests/reshape/merge/test_merge.py | 12 ------------ 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index cbbff43f84dfd..43d1d1b4d1fcd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -777,6 +777,7 @@ def _get_join_info(self): and not isinstance(self.left.index, MultiIndex)): # if values missing (-1) from right index, # take from left index instead + print(right_indexer) join_list = join_index.to_numpy() absent = right_indexer == -1 join_list[absent] = self.left.index.to_numpy()[absent] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7c3d2c324f8ac..bfc185257954b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -940,7 +940,6 @@ def test_merge_two_empty_df_no_division_error(self): merge(a, a, on=('a', 'b')) @pytest.mark.parametrize('how', ['left', 'outer']) - @pytest.mark.xfail(reason="GH-24897") def test_merge_on_index_with_more_values(self, how): # GH 24212 # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is @@ -1170,17 +1169,6 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): with pytest.raises(ValueError, match=msg): pd.merge(df2, df1, on=['A']) - def test_merge_on_index_with_more_values(self): - # GH 24212 - df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], - columns=['a', 'b']) - df2 = pd.DataFrame([[3, 30], [4, 40]], - columns=['a', 'c']) - df1.set_index('a', drop=False, inplace=True) - df2.set_index('a', inplace=True) - result = pd.merge(df1, df2, left_index=True, right_on='a', how='left') - assert 1 in result.index - @pytest.fixture def left(): From e99dece04119c989f6f5327300062a14afc4dc35 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 14 Jan 2019 15:37:45 -0800 Subject: [PATCH 04/22] BUG-24212 make _create_join_index function --- pandas/core/reshape/merge.py | 47 ++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 43d1d1b4d1fcd..ab236b86ce921 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,32 +757,15 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self.left.index.take(left_indexer) - if (self.how == 'right' and -1 in left_indexer - and not isinstance(self.right.index, MultiIndex)): - # if values missing (-1) from left index, - # take from right index instead - join_list = join_index.to_numpy() - absent = left_indexer == -1 - join_list[absent] = self.right.index.to_numpy()[absent] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) + join_index = self._create_join_index(left_indexer, + using_left=True) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self.right.index.take(right_indexer) - if (self.how == 'left' and -1 in right_indexer - and not isinstance(self.left.index, MultiIndex)): - # if values missing (-1) from right index, - # take from left index instead - print(right_indexer) - join_list = join_index.to_numpy() - absent = right_indexer == -1 - join_list[absent] = self.left.index.to_numpy()[absent] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) + join_index = self._create_join_index(right_indexer, + using_left=False) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -793,6 +776,28 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer + def _create_join_index(self, indexer, using_left=True): + if using_left: + index = self.left.index + other_index = self.right.index + how_check = 'right' + else: + index = self.right.index + other_index = self.left.index + how_check = 'left' + + join_index = index.take(indexer) + if self.how == how_check and not isinstance(other_index, MultiIndex): + absent = indexer == -1 + if any(absent): + # if values missing (-1) from target index, + # take from other index instead + join_list = join_index.to_numpy() + join_list[absent] = other_index.to_numpy()[absent] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) + return join_index + def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) From b95e1feffec25152d31b7419e4c6d4a825dd1fbb Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 16 Jan 2019 16:16:16 -0800 Subject: [PATCH 05/22] BUG-24212 add docstring and comments --- pandas/core/reshape/merge.py | 49 +++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ab236b86ce921..0a51f2ee0dce7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,15 +757,19 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self._create_join_index(left_indexer, - using_left=True) + join_index = self._create_join_index(self.left.index, + self.right.index, + left_indexer, + how='right') else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self._create_join_index(right_indexer, - using_left=False) + join_index = self._create_join_index(self.right.index, + self.left.index, + right_indexer, + how='left') else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -776,24 +780,33 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _create_join_index(self, indexer, using_left=True): - if using_left: - index = self.left.index - other_index = self.right.index - how_check = 'right' - else: - index = self.right.index - other_index = self.left.index - how_check = 'left' + def _create_join_index(self, index, other_index, indexer, how='left'): + """ + Create a join index by rearranging one index to match another + + Parameters + ---------- + index: Index being rearranged + other_index: Index used to supply values not found in index + indexer: how to rearrange index + how: replacement is only necessary if indexer based on other_index + Returns + ------- + join_index + """ join_index = index.take(indexer) - if self.how == how_check and not isinstance(other_index, MultiIndex): - absent = indexer == -1 - if any(absent): + if (self.how in (how, 'outer') and + not isinstance(other_index, MultiIndex)): + # if final index requires values in other_index but not target + # index, indexer may hold missing (-1) values, causing Index.take + # to take the final value in target index + mask = indexer == -1 + if np.any(mask): # if values missing (-1) from target index, - # take from other index instead + # take from other_index instead join_list = join_index.to_numpy() - join_list[absent] = other_index.to_numpy()[absent] + join_list[mask] = other_index.to_numpy()[mask] join_index = Index(join_list, dtype=join_index.dtype, name=join_index.name) return join_index From 73be0d0d84089310894acf89b591beb61fe5df24 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 24 Jan 2019 12:32:44 -0800 Subject: [PATCH 06/22] BUG-24212 fix regression --- pandas/core/reshape/merge.py | 8 ++++++-- pandas/tests/reshape/merge/test_merge.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0a51f2ee0dce7..1dd19a7c1514e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -760,6 +760,7 @@ def _get_join_info(self): join_index = self._create_join_index(self.left.index, self.right.index, left_indexer, + right_indexer, how='right') else: join_index = self.right.index.take(right_indexer) @@ -769,6 +770,7 @@ def _get_join_info(self): join_index = self._create_join_index(self.right.index, self.left.index, right_indexer, + left_indexer, how='left') else: join_index = self.left.index.take(left_indexer) @@ -780,7 +782,8 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _create_join_index(self, index, other_index, indexer, how='left'): + def _create_join_index(self, index, other_index, indexer, + other_indexer, how='left'): """ Create a join index by rearranging one index to match another @@ -806,7 +809,8 @@ def _create_join_index(self, index, other_index, indexer, how='left'): # if values missing (-1) from target index, # take from other_index instead join_list = join_index.to_numpy() - join_list[mask] = other_index.to_numpy()[mask] + other_list = other_index.take(other_indexer).to_numpy() + join_list[mask] = other_list[mask] join_index = Index(join_list, dtype=join_index.dtype, name=join_index.name) return join_index diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index bfc185257954b..05c482ab3ff28 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -959,6 +959,24 @@ def test_merge_on_index_with_more_values(self, how): expected.set_index('a', drop=False, inplace=True) assert_frame_equal(result, expected) + @pytest.mark.parametrize('how', ['right', 'outer']) + def test_merge_on_index_with_more_values(self, how): + # GH 24212 + # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is + # interpreted as a missing value instead of the last element + df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}) + df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) + result = df1.merge(df2, left_on='key', right_index=True, how=how) + expected = pd.DataFrame([[1.0, 0, 1], + [2.0, 2, 3], + [3.0, 2, 3], + [np.nan, 1, 2], + [np.nan, 3, 4], + [np.nan, 4, 5]], + columns=['a', 'key', 'b']) + expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True) + assert_frame_equal(result, expected) + def test_merge_right_index_right(self): # Note: the expected output here is probably incorrect. # See https://github.com/pandas-dev/pandas/issues/17257 for more. From de3e2c71208a6fb52a9315cf4240ed12a67520f0 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 24 Jan 2019 13:24:31 -0800 Subject: [PATCH 07/22] BUG-24212 alter old test --- pandas/core/reshape/merge.py | 1 + pandas/tests/reshape/merge/test_merge.py | 26 +++--------------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1dd19a7c1514e..1a2f707b17e48 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,6 +757,7 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: + print(left_indexer) join_index = self._create_join_index(self.left.index, self.right.index, left_indexer, diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 05c482ab3ff28..82480eef5b9fe 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -939,31 +939,11 @@ def test_merge_two_empty_df_no_division_error(self): with np.errstate(divide='raise'): merge(a, a, on=('a', 'b')) - @pytest.mark.parametrize('how', ['left', 'outer']) - def test_merge_on_index_with_more_values(self, how): - # GH 24212 - # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is - # interpreted as a missing value instead of the last element - df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], - columns=['a', 'b']) - df2 = pd.DataFrame([[3, 30], [4, 40]], - columns=['a', 'c']) - df1.set_index('a', drop=False, inplace=True) - df2.set_index('a', inplace=True) - result = pd.merge(df1, df2, left_index=True, right_on='a', how=how) - expected = pd.DataFrame([[1, 2, np.nan], - [2, 4, np.nan], - [3, 6, 30.0], - [4, 8, 40.0]], - columns=['a', 'b', 'c']) - expected.set_index('a', drop=False, inplace=True) - assert_frame_equal(result, expected) - @pytest.mark.parametrize('how', ['right', 'outer']) def test_merge_on_index_with_more_values(self, how): # GH 24212 - # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is - # interpreted as a missing value instead of the last element + # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that + # -1 is interpreted as a missing value instead of the last element df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}) df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on='key', right_index=True, how=how) @@ -984,7 +964,7 @@ def test_merge_right_index_right(self): left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]}) right = pd.DataFrame({'b': [1, 2, 3]}) - expected = pd.DataFrame({'a': [1, 2, 3, None], + expected = pd.DataFrame({'a': [1, 2, 3, None, None, None], 'key': [0, 1, 1, 2], 'b': [1, 2, 2, 3]}, columns=['a', 'key', 'b'], From 12877586c97c10dce33060472ddca350844747af Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 24 Jan 2019 13:25:38 -0800 Subject: [PATCH 08/22] fix typo --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 82480eef5b9fe..c17c301968269 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -964,7 +964,7 @@ def test_merge_right_index_right(self): left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]}) right = pd.DataFrame({'b': [1, 2, 3]}) - expected = pd.DataFrame({'a': [1, 2, 3, None, None, None], + expected = pd.DataFrame({'a': [1, 2, 3, None], 'key': [0, 1, 1, 2], 'b': [1, 2, 2, 3]}, columns=['a', 'key', 'b'], From bdce7ac60a9ed495228cc786fe610a4aab77de7f Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 24 Jan 2019 14:07:17 -0800 Subject: [PATCH 09/22] BUG-24212 remove print and move whatsnew note --- doc/source/whatsnew/v0.24.0.rst | 1 - doc/source/whatsnew/v0.24.1.rst | 3 +++ pandas/core/reshape/merge.py | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c0b1e4500a6b3..3b3fad22ce949 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1836,7 +1836,6 @@ Reshaping - Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`) - Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`) - Bug showing an incorrect shape when throwing error during ``DataFrame`` construction. (:issue:`20742`) -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index ee4b7ab62b31a..3ac2ed73ea53f 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -63,6 +63,9 @@ Bug Fixes - - +**Reshaping** + +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) **Other** diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1a2f707b17e48..1dd19a7c1514e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,7 +757,6 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - print(left_indexer) join_index = self._create_join_index(self.left.index, self.right.index, left_indexer, From 83ae393805adeb336a062e7cd2af853d846b3c9c Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 29 Jan 2019 10:50:21 -0800 Subject: [PATCH 10/22] BUG-24212 fix when other_index has incompatible dtype --- pandas/core/reshape/merge.py | 16 ++++++++++++---- pandas/tests/reshape/merge/test_merge.py | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1dd19a7c1514e..eb9e67854408b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -809,10 +809,18 @@ def _create_join_index(self, index, other_index, indexer, # if values missing (-1) from target index, # take from other_index instead join_list = join_index.to_numpy() - other_list = other_index.take(other_indexer).to_numpy() - join_list[mask] = other_list[mask] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) + try: + other_list = other_index.take(other_indexer).to_numpy() + join_list[mask] = other_list[mask] + join_index = Index(join_list, dtype=other_index.dtype, + name=other_index.name) + except ValueError: + # if other_index does not have a compatible dtype, naively + # replace missing values by their column position in other + naive_index = np.arange(len(other_index)) + other_list = naive_index.take(other_indexer) + join_list[mask] = other_list[mask] + join_index = Index(join_list, name=other_index.name) return join_index def _get_merge_keys(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c17c301968269..7e8985b2487b2 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -973,6 +973,22 @@ def test_merge_right_index_right(self): how='right') tm.assert_frame_equal(result, expected) + def test_merge_take_missing_values_from_index_of_other_dtype(self): + left = pd.DataFrame({'a': [1, 2, 3], + 'key': pd.Categorical(['a', 'a', 'b'], + categories=list('abc'))}) + right = pd.DataFrame({'b': [1, 2, 3]}, + index=pd.CategoricalIndex(['a', 'b', 'c'])) + result = left.merge(right, left_on='key', + right_index=True, how='right') + expected = pd.DataFrame({'a': [1, 2, 3, None], + 'key': pd.Categorical(['a', 'a', 'b', 'c']), + 'b': [1, 1, 2, 3]}, + index=[0, 1, 2, 2]) + print(result.dtypes) + print(expected.dtypes) + tm.assert_frame_equal(result, expected) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: From 66f6fe4ef5577334274e7a0b90acd30be2097b90 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 29 Jan 2019 11:20:13 -0800 Subject: [PATCH 11/22] merge issue --- pandas/core/reshape/merge.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d2aea97e9505f..eb9e67854408b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -809,7 +809,6 @@ def _create_join_index(self, index, other_index, indexer, # if values missing (-1) from target index, # take from other_index instead join_list = join_index.to_numpy() -<<<<<<< HEAD try: other_list = other_index.take(other_indexer).to_numpy() join_list[mask] = other_list[mask] @@ -822,12 +821,6 @@ def _create_join_index(self, index, other_index, indexer, other_list = naive_index.take(other_indexer) join_list[mask] = other_list[mask] join_index = Index(join_list, name=other_index.name) -======= - other_list = other_index.take(other_indexer).to_numpy() - join_list[mask] = other_list[mask] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) ->>>>>>> master return join_index def _get_merge_keys(self): From cf6fa1474e95c6c914937ddbc3d6d455583be50b Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 29 Jan 2019 11:22:27 -0800 Subject: [PATCH 12/22] fix whatsnew --- doc/source/whatsnew/v0.24.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 86605230877b2..8f4c3982c745f 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -72,13 +72,13 @@ Bug Fixes **Reshaping** -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - Bug in :meth:`DataFrame.groupby` with :class:`Grouper` when there is a time change (DST) and grouping frequency is ``'1d'`` (:issue:`24972`) **Visualization** - Fixed the warning for implicitly registered matplotlib converters not showing. See :ref:`whatsnew_0211.converters` for more (:issue:`24963`). + **Other** - From 0e6de81ecc8e53ed6a741c0240cc9a24ab13f938 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 29 Jan 2019 12:17:14 -0800 Subject: [PATCH 13/22] BUG-24212 fix test --- pandas/tests/reshape/merge/test_merge.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7e8985b2487b2..8aa4d2abf0ae3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -985,8 +985,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): 'key': pd.Categorical(['a', 'a', 'b', 'c']), 'b': [1, 1, 2, 3]}, index=[0, 1, 2, 2]) - print(result.dtypes) - print(expected.dtypes) + expected = expected.reindex(columns=['a', 'key', 'b']) tm.assert_frame_equal(result, expected) From 1da789a52dc6aa1af5e6571213bc27bf9fab0f6f Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 29 Jan 2019 12:17:14 -0800 Subject: [PATCH 14/22] BUG-24212 fix test --- pandas/tests/reshape/merge/test_merge.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7e8985b2487b2..8aa4d2abf0ae3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -985,8 +985,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): 'key': pd.Categorical(['a', 'a', 'b', 'c']), 'b': [1, 1, 2, 3]}, index=[0, 1, 2, 2]) - print(result.dtypes) - print(expected.dtypes) + expected = expected.reindex(columns=['a', 'key', 'b']) tm.assert_frame_equal(result, expected) From 27cdbc8daa8206f0aaade9306507ad1b79351c56 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 31 Jan 2019 11:47:20 -0800 Subject: [PATCH 15/22] BUG-24212 simplify take logic --- pandas/core/reshape/merge.py | 30 +++++++------ pandas/tests/reshape/merge/test_merge.py | 54 ++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 17 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index eb9e67854408b..f697bffb5a77c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,6 +28,7 @@ from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com from pandas.core.frame import _merge_doc +from pandas.core.indexes.category import CategoricalIndex from pandas.core.internals import ( concatenate_block_managers, items_overlap_with_suffix) import pandas.core.sorting as sorting @@ -798,7 +799,6 @@ def _create_join_index(self, index, other_index, indexer, ------- join_index """ - join_index = index.take(indexer) if (self.how in (how, 'outer') and not isinstance(other_index, MultiIndex)): # if final index requires values in other_index but not target @@ -806,22 +806,26 @@ def _create_join_index(self, index, other_index, indexer, # to take the final value in target index mask = indexer == -1 if np.any(mask): - # if values missing (-1) from target index, - # take from other_index instead - join_list = join_index.to_numpy() - try: - other_list = other_index.take(other_indexer).to_numpy() - join_list[mask] = other_list[mask] - join_index = Index(join_list, dtype=other_index.dtype, - name=other_index.name) - except ValueError: - # if other_index does not have a compatible dtype, naively - # replace missing values by their column position in other + # if values missing (-1) from target index, replace missing + # values by their column position or NA if not applicable + if is_numeric_dtype(index.dtype): + join_index = index.take(indexer) + join_list = join_index.to_numpy() naive_index = np.arange(len(other_index)) other_list = naive_index.take(other_indexer) join_list[mask] = other_list[mask] join_index = Index(join_list, name=other_index.name) - return join_index + elif is_categorical_dtype(index.dtype): + join_index = index.take(indexer) + codes = np.array(join_index.codes) + 1 + codes[mask] = -1 + join_index = CategoricalIndex(codes, index.categories) + else: + fill_value = na_value_for_dtype(index.dtype) + join_index = index.take(indexer, allow_fill=True, + fill_value=fill_value) + return join_index + return index.take(indexer) def _get_merge_keys(self): """ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8aa4d2abf0ae3..423fbc4919db7 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -17,7 +17,8 @@ import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Int64Index, MultiIndex, RangeIndex, Series, UInt64Index) + Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, + Series, UInt64Index, TimedeltaIndex) from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge @@ -940,11 +941,56 @@ def test_merge_two_empty_df_no_division_error(self): merge(a, a, on=('a', 'b')) @pytest.mark.parametrize('how', ['right', 'outer']) - def test_merge_on_index_with_more_values(self, how): + @pytest.mark.parametrize('index,expected_index', + [(CategoricalIndex([1, 2, 3]), + CategoricalIndex([1, 2, 3, None, None, None])), + (DatetimeIndex(['2001-01-01', + '2002-02-02', + '2003-03-03']), + DatetimeIndex(['2001-01-01', + '2002-02-02', + '2003-03-03', + pd.NaT, + pd.NaT, + pd.NaT])), + (Float64Index([1, 2, 3]), + Float64Index([1, 2, 3, 1, 3, 4])), + (Int64Index([1, 2, 3]), + Int64Index([1, 2, 3, 1, 3, 4])), + (IntervalIndex.from_tuples([(1, 2), + (2, 3), + (3, 4)]), + IntervalIndex.from_tuples([(1, 2), + (2, 3), + (3, 4), + np.nan, + np.nan, + np.nan])), + (PeriodIndex(['2001-01-01', + '2001-01-02', + '2001-01-03'], + freq='D'), + PeriodIndex(['2001-01-01', + '2001-01-02', + '2001-01-03', + pd.NaT, + pd.NaT, + pd.NaT], + freq='D')), + (TimedeltaIndex(['1d', + '2d', + '3d']), + TimedeltaIndex(['1d', + '2d', + '3d', + pd.NaT, + pd.NaT, + pd.NaT]))]) + def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}) + df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}, index=index) df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on='key', right_index=True, how=how) expected = pd.DataFrame([[1.0, 0, 1], @@ -954,7 +1000,7 @@ def test_merge_on_index_with_more_values(self, how): [np.nan, 3, 4], [np.nan, 4, 5]], columns=['a', 'key', 'b']) - expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True) + expected.set_index(expected_index, inplace=True) assert_frame_equal(result, expected) def test_merge_right_index_right(self): From cd326b277d85de734a209212e4e869471a214f53 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 31 Jan 2019 12:15:07 -0800 Subject: [PATCH 16/22] fix import order --- pandas/tests/reshape/merge/test_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 423fbc4919db7..7e44976f995e9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -17,8 +17,8 @@ import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, - Series, UInt64Index, TimedeltaIndex) + Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series, + TimedeltaIndex, UInt64Index) from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge From d8d3cdfd01f6f8640c703df774a38c9a6ea886e9 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 26 Mar 2019 11:27:25 -0700 Subject: [PATCH 17/22] make logic more generic --- pandas/core/reshape/merge.py | 25 ++----- pandas/tests/reshape/merge/test_merge.py | 95 ++++++++++++------------ 2 files changed, 54 insertions(+), 66 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8ecd55f29fa3e..66115c6b06610 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,7 +28,6 @@ from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com from pandas.core.frame import _merge_doc -from pandas.core.indexes.category import CategoricalIndex from pandas.core.internals import ( concatenate_block_managers, items_overlap_with_suffix) import pandas.core.sorting as sorting @@ -809,28 +808,16 @@ def _create_join_index(self, index, other_index, indexer, not isinstance(other_index, MultiIndex)): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take - # to take the final value in target index + # to take the final value in target index. So, we set the last + # element to be the desired fill value. We do not use allow_fill + # and fill_value because it throws a ValueError on integer indices mask = indexer == -1 if np.any(mask): - # if values missing (-1) from target index, replace missing - # values by their column position or NA if not applicable - if is_numeric_dtype(index.dtype): - join_index = index.take(indexer) - join_list = join_index.to_numpy() - naive_index = np.arange(len(other_index)) - other_list = naive_index.take(other_indexer) - join_list[mask] = other_list[mask] - join_index = Index(join_list, name=other_index.name) - elif is_categorical_dtype(index.dtype): - join_index = index.take(indexer) - codes = np.array(join_index.codes) + 1 - codes[mask] = -1 - join_index = CategoricalIndex(codes, index.categories) + if is_integer_dtype(index.dtype): + fill_value = np.nan else: fill_value = na_value_for_dtype(index.dtype) - join_index = index.take(indexer, allow_fill=True, - fill_value=fill_value) - return join_index + index = index.append(Index([fill_value])) return index.take(indexer) def _get_merge_keys(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f5c48db7a3ca3..d4543e60c3ba5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1037,51 +1037,52 @@ def test_merge_two_empty_df_no_division_error(self): merge(a, a, on=('a', 'b')) @pytest.mark.parametrize('how', ['right', 'outer']) - @pytest.mark.parametrize('index,expected_index', - [(CategoricalIndex([1, 2, 3]), - CategoricalIndex([1, 2, 3, None, None, None])), - (DatetimeIndex(['2001-01-01', - '2002-02-02', - '2003-03-03']), - DatetimeIndex(['2001-01-01', - '2002-02-02', - '2003-03-03', - pd.NaT, - pd.NaT, - pd.NaT])), - (Float64Index([1, 2, 3]), - Float64Index([1, 2, 3, 1, 3, 4])), - (Int64Index([1, 2, 3]), - Int64Index([1, 2, 3, 1, 3, 4])), - (IntervalIndex.from_tuples([(1, 2), - (2, 3), - (3, 4)]), - IntervalIndex.from_tuples([(1, 2), - (2, 3), - (3, 4), - np.nan, - np.nan, - np.nan])), - (PeriodIndex(['2001-01-01', - '2001-01-02', - '2001-01-03'], - freq='D'), - PeriodIndex(['2001-01-01', - '2001-01-02', - '2001-01-03', - pd.NaT, - pd.NaT, - pd.NaT], - freq='D')), - (TimedeltaIndex(['1d', - '2d', - '3d']), - TimedeltaIndex(['1d', - '2d', - '3d', - pd.NaT, - pd.NaT, - pd.NaT]))]) + @pytest.mark.parametrize( + 'index,expected_index', + [(CategoricalIndex([1, 2, 4]), + CategoricalIndex([1, 2, 4, None, None, None])), + (DatetimeIndex(['2001-01-01', + '2002-02-02', + '2003-03-03']), + DatetimeIndex(['2001-01-01', + '2002-02-02', + '2003-03-03', + pd.NaT, + pd.NaT, + pd.NaT])), + (Float64Index([1, 2, 3]), + Float64Index([1, 2, 3, None, None, None])), + (Int64Index([1, 2, 3]), + Float64Index([1, 2, 3, None, None, None])), + (IntervalIndex.from_tuples([(1, 2), + (2, 3), + (3, 4)]), + IntervalIndex.from_tuples([(1, 2), + (2, 3), + (3, 4), + np.nan, + np.nan, + np.nan])), + (PeriodIndex(['2001-01-01', + '2001-01-02', + '2001-01-03'], + freq='D'), + PeriodIndex(['2001-01-01', + '2001-01-02', + '2001-01-03', + pd.NaT, + pd.NaT, + pd.NaT], + freq='D')), + (TimedeltaIndex(['1d', + '2d', + '3d']), + TimedeltaIndex(['1d', + '2d', + '3d', + pd.NaT, + pd.NaT, + pd.NaT]))]) def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that @@ -1110,7 +1111,7 @@ def test_merge_right_index_right(self): 'key': [0, 1, 1, 2], 'b': [1, 2, 2, 3]}, columns=['a', 'key', 'b'], - index=[0, 1, 2, 2]) + index=[0, 1, 2, np.nan]) result = left.merge(right, left_on='key', right_index=True, how='right') tm.assert_frame_equal(result, expected) @@ -1126,7 +1127,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = pd.DataFrame({'a': [1, 2, 3, None], 'key': pd.Categorical(['a', 'a', 'b', 'c']), 'b': [1, 1, 2, 3]}, - index=[0, 1, 2, 2]) + index=[0, 1, 2, np.nan]) expected = expected.reindex(columns=['a', 'key', 'b']) tm.assert_frame_equal(result, expected) From f9e7386dde7ccda38bc5b9e5877afadd00899e2a Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 26 Mar 2019 11:27:25 -0700 Subject: [PATCH 18/22] make logic more generic --- pandas/core/reshape/merge.py | 25 ++----- pandas/tests/reshape/merge/test_merge.py | 95 ++++++++++++------------ 2 files changed, 54 insertions(+), 66 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8ecd55f29fa3e..66115c6b06610 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,7 +28,6 @@ from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com from pandas.core.frame import _merge_doc -from pandas.core.indexes.category import CategoricalIndex from pandas.core.internals import ( concatenate_block_managers, items_overlap_with_suffix) import pandas.core.sorting as sorting @@ -809,28 +808,16 @@ def _create_join_index(self, index, other_index, indexer, not isinstance(other_index, MultiIndex)): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take - # to take the final value in target index + # to take the final value in target index. So, we set the last + # element to be the desired fill value. We do not use allow_fill + # and fill_value because it throws a ValueError on integer indices mask = indexer == -1 if np.any(mask): - # if values missing (-1) from target index, replace missing - # values by their column position or NA if not applicable - if is_numeric_dtype(index.dtype): - join_index = index.take(indexer) - join_list = join_index.to_numpy() - naive_index = np.arange(len(other_index)) - other_list = naive_index.take(other_indexer) - join_list[mask] = other_list[mask] - join_index = Index(join_list, name=other_index.name) - elif is_categorical_dtype(index.dtype): - join_index = index.take(indexer) - codes = np.array(join_index.codes) + 1 - codes[mask] = -1 - join_index = CategoricalIndex(codes, index.categories) + if is_integer_dtype(index.dtype): + fill_value = np.nan else: fill_value = na_value_for_dtype(index.dtype) - join_index = index.take(indexer, allow_fill=True, - fill_value=fill_value) - return join_index + index = index.append(Index([fill_value])) return index.take(indexer) def _get_merge_keys(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f5c48db7a3ca3..d4543e60c3ba5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1037,51 +1037,52 @@ def test_merge_two_empty_df_no_division_error(self): merge(a, a, on=('a', 'b')) @pytest.mark.parametrize('how', ['right', 'outer']) - @pytest.mark.parametrize('index,expected_index', - [(CategoricalIndex([1, 2, 3]), - CategoricalIndex([1, 2, 3, None, None, None])), - (DatetimeIndex(['2001-01-01', - '2002-02-02', - '2003-03-03']), - DatetimeIndex(['2001-01-01', - '2002-02-02', - '2003-03-03', - pd.NaT, - pd.NaT, - pd.NaT])), - (Float64Index([1, 2, 3]), - Float64Index([1, 2, 3, 1, 3, 4])), - (Int64Index([1, 2, 3]), - Int64Index([1, 2, 3, 1, 3, 4])), - (IntervalIndex.from_tuples([(1, 2), - (2, 3), - (3, 4)]), - IntervalIndex.from_tuples([(1, 2), - (2, 3), - (3, 4), - np.nan, - np.nan, - np.nan])), - (PeriodIndex(['2001-01-01', - '2001-01-02', - '2001-01-03'], - freq='D'), - PeriodIndex(['2001-01-01', - '2001-01-02', - '2001-01-03', - pd.NaT, - pd.NaT, - pd.NaT], - freq='D')), - (TimedeltaIndex(['1d', - '2d', - '3d']), - TimedeltaIndex(['1d', - '2d', - '3d', - pd.NaT, - pd.NaT, - pd.NaT]))]) + @pytest.mark.parametrize( + 'index,expected_index', + [(CategoricalIndex([1, 2, 4]), + CategoricalIndex([1, 2, 4, None, None, None])), + (DatetimeIndex(['2001-01-01', + '2002-02-02', + '2003-03-03']), + DatetimeIndex(['2001-01-01', + '2002-02-02', + '2003-03-03', + pd.NaT, + pd.NaT, + pd.NaT])), + (Float64Index([1, 2, 3]), + Float64Index([1, 2, 3, None, None, None])), + (Int64Index([1, 2, 3]), + Float64Index([1, 2, 3, None, None, None])), + (IntervalIndex.from_tuples([(1, 2), + (2, 3), + (3, 4)]), + IntervalIndex.from_tuples([(1, 2), + (2, 3), + (3, 4), + np.nan, + np.nan, + np.nan])), + (PeriodIndex(['2001-01-01', + '2001-01-02', + '2001-01-03'], + freq='D'), + PeriodIndex(['2001-01-01', + '2001-01-02', + '2001-01-03', + pd.NaT, + pd.NaT, + pd.NaT], + freq='D')), + (TimedeltaIndex(['1d', + '2d', + '3d']), + TimedeltaIndex(['1d', + '2d', + '3d', + pd.NaT, + pd.NaT, + pd.NaT]))]) def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that @@ -1110,7 +1111,7 @@ def test_merge_right_index_right(self): 'key': [0, 1, 1, 2], 'b': [1, 2, 2, 3]}, columns=['a', 'key', 'b'], - index=[0, 1, 2, 2]) + index=[0, 1, 2, np.nan]) result = left.merge(right, left_on='key', right_index=True, how='right') tm.assert_frame_equal(result, expected) @@ -1126,7 +1127,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = pd.DataFrame({'a': [1, 2, 3, None], 'key': pd.Categorical(['a', 'a', 'b', 'c']), 'b': [1, 1, 2, 3]}, - index=[0, 1, 2, 2]) + index=[0, 1, 2, np.nan]) expected = expected.reindex(columns=['a', 'key', 'b']) tm.assert_frame_equal(result, expected) From 7da365551b653ac3d6797f87d00583340779067c Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 28 Mar 2019 18:56:45 -0700 Subject: [PATCH 19/22] clean up test --- pandas/tests/reshape/merge/test_merge.py | 50 ++++++------------------ 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d4543e60c3ba5..619abcd70cbf2 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1041,48 +1041,21 @@ def test_merge_two_empty_df_no_division_error(self): 'index,expected_index', [(CategoricalIndex([1, 2, 4]), CategoricalIndex([1, 2, 4, None, None, None])), - (DatetimeIndex(['2001-01-01', - '2002-02-02', - '2003-03-03']), - DatetimeIndex(['2001-01-01', - '2002-02-02', - '2003-03-03', - pd.NaT, - pd.NaT, - pd.NaT])), + (DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03']), + DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03', + pd.NaT, pd.NaT, pd.NaT])), (Float64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), (Int64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), - (IntervalIndex.from_tuples([(1, 2), - (2, 3), - (3, 4)]), - IntervalIndex.from_tuples([(1, 2), - (2, 3), - (3, 4), - np.nan, - np.nan, - np.nan])), - (PeriodIndex(['2001-01-01', - '2001-01-02', - '2001-01-03'], - freq='D'), - PeriodIndex(['2001-01-01', - '2001-01-02', - '2001-01-03', - pd.NaT, - pd.NaT, - pd.NaT], - freq='D')), - (TimedeltaIndex(['1d', - '2d', - '3d']), - TimedeltaIndex(['1d', - '2d', - '3d', - pd.NaT, - pd.NaT, - pd.NaT]))]) + (IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), + IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4), + np.nan, np.nan, np.nan])), + (PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03'], freq='D'), + PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03', + pd.NaT, pd.NaT, pd.NaT], freq='D')), + (TimedeltaIndex(['1d', '2d', '3d']), + TimedeltaIndex(['1d', '2d', '3d', pd.NaT, pd.NaT, pd.NaT]))]) def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that @@ -1117,6 +1090,7 @@ def test_merge_right_index_right(self): tm.assert_frame_equal(result, expected) def test_merge_take_missing_values_from_index_of_other_dtype(self): + # GH 24212 left = pd.DataFrame({'a': [1, 2, 3], 'key': pd.Categorical(['a', 'a', 'b'], categories=list('abc'))}) From 17c54979b64d6b33ffc73e3c783b063980b0d7d5 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 29 Mar 2019 09:22:17 -0700 Subject: [PATCH 20/22] use compat=False for na_value_for_dtype --- pandas/core/reshape/merge.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 66115c6b06610..04db7575326a6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -813,10 +813,7 @@ def _create_join_index(self, index, other_index, indexer, # and fill_value because it throws a ValueError on integer indices mask = indexer == -1 if np.any(mask): - if is_integer_dtype(index.dtype): - fill_value = np.nan - else: - fill_value = na_value_for_dtype(index.dtype) + fill_value = na_value_for_dtype(index.dtype, compat=False) index = index.append(Index([fill_value])) return index.take(indexer) From 6772618301a302cbdbf84a0dbb4aa9749c8cab10 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 21 Apr 2019 17:05:08 -0700 Subject: [PATCH 21/22] clarify whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2a0bc0afc095c..f583c8bd13b45 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -397,7 +397,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`pandas.merge` adds a string of ``None`` if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`). -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (missing index values are now assigned NA) (:issue:`24212`) - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`) - Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`) - Bug in :func:`pivot_table` where columns with ``NaN`` values are dropped even if ``dropna`` argument is ``False``, when the ``aggfunc`` argument contains a ``list`` (:issue:`22159`) From cad43986183e0f1fdb13baabae3de3c25c28f713 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 21 Apr 2019 19:53:22 -0700 Subject: [PATCH 22/22] add PR number to whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f583c8bd13b45..ae84c94d894c4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -397,7 +397,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`pandas.merge` adds a string of ``None`` if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`). -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (missing index values are now assigned NA) (:issue:`24212`) +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (missing index values are now assigned NA) (:issue:`24212`, :issue:`25009`) - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`) - Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`) - Bug in :func:`pivot_table` where columns with ``NaN`` values are dropped even if ``dropna`` argument is ``False``, when the ``aggfunc`` argument contains a ``list`` (:issue:`22159`)