Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Index with right_index=True or left_index=True merge #34468

Closed
wants to merge 10 commits into from
15 changes: 11 additions & 4 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,16 +938,23 @@ def _create_join_index(
-------
join_index
"""
if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
if self.how in (how, "outer") and not isinstance(index, MultiIndex):
# if final index requires values in other_index but not target
# index, indexer may hold missing (-1) values, causing Index.take
# to take the final value in target index. So, we set the last
# element to be the desired fill value. We do not use allow_fill
# and fill_value because it throws a ValueError on integer indices
mask = indexer == -1
mask = other_indexer == -1
phofl marked this conversation as resolved.
Show resolved Hide resolved
if np.any(mask):
fill_value = na_value_for_dtype(index.dtype, compat=False)
index = index.append(Index([fill_value]))
fill_value = na_value_for_dtype(other_index.dtype, compat=False)
if isinstance(other_index, MultiIndex):
fill_index = MultiIndex.from_tuples(
[[fill_value] * other_index.nlevels]
)
else:
fill_index = Index([fill_value])
other_index = other_index.append(fill_index)
return other_index.take(other_indexer)
return index.take(indexer)

def _get_merge_keys(self):
Expand Down
30 changes: 26 additions & 4 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
DataFrame,
DatetimeIndex,
Float64Index,
Index,
Int64Index,
IntervalIndex,
MultiIndex,
Expand Down Expand Up @@ -361,7 +362,9 @@ def test_handle_join_key_pass_array(self):

key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
merged = merge(left, right, left_index=True, right_on=key, how="outer")
tm.assert_series_equal(merged["key_0"], Series(key, name="key_0"))
tm.assert_series_equal(
merged["key_0"], Series(key, name="key_0", index=[0, 1, 1, 2, 2, np.nan])
)

def test_no_overlap_more_informative_error(self):
dt = datetime.now()
Expand Down Expand Up @@ -472,7 +475,10 @@ def check1(exp, kwarg):
def check2(exp, kwarg):
result = pd.merge(left, right, how="right", **kwarg)
tm.assert_frame_equal(result, exp)

def check3(exp, kwarg, index):
result = pd.merge(left, right, how="outer", **kwarg)
exp.index = index
tm.assert_frame_equal(result, exp)

for kwarg in [
Expand All @@ -482,6 +488,13 @@ def check2(exp, kwarg):
check1(exp_in, kwarg)
check2(exp_out, kwarg)

check3(exp_out, dict(left_index=True, right_index=True), exp_out.index)
check3(
exp_out.copy(),
dict(left_index=True, right_on="x"),
Index([np.nan, np.nan, np.nan]),
)

kwarg = dict(left_on="a", right_index=True)
check1(exp_in, kwarg)
exp_out["a"] = [0, 1, 2]
Expand Down Expand Up @@ -1300,7 +1313,7 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
],
columns=["a", "key", "b"],
)
expected.set_index(expected_index, inplace=True)
expected.set_index(df2.index, inplace=True)
tm.assert_frame_equal(result, expected)

def test_merge_right_index_right(self):
Expand All @@ -1313,7 +1326,7 @@ def test_merge_right_index_right(self):
expected = pd.DataFrame(
{"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]},
columns=["a", "key", "b"],
index=[0, 1, 2, np.nan],
index=[0, 1, 1, 2],
)
result = left.merge(right, left_on="key", right_index=True, how="right")
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1350,7 +1363,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self):
"key": pd.Categorical(["a", "a", "b", "c"]),
"b": [1, 1, 2, 3],
},
index=[0, 1, 2, np.nan],
index=pd.Categorical(["a", "a", "b", "c"]),
)
expected = expected.reindex(columns=["a", "key", "b"])
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -2227,3 +2240,12 @@ def test_categorical_non_unique_monotonic(n_categories):
index=left_index,
)
tm.assert_frame_equal(expected, result)


def test_right_index_true_right_join_target_index():
df_left = pd.DataFrame(index=["a", "b"])
df_right = pd.DataFrame({"x": ["a", "c"]})

result = pd.merge(df_left, df_right, left_index=True, right_on="x", how="left")
expected = pd.DataFrame({"x": ["a", "b"]}, index=Index(["a", "b"]))
tm.assert_frame_equal(result, expected)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean "left" instead of "right" in the test name? I'd also include a link to the issue as a comment, and rename df_left -> left, df_right -> right to be consistent with other tests.

How does this behave when we actually are doing a right join? It feels like it should exactly equal the right DataFrame (index and values), but someone who knows more than me would have to chime in.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah you are right of course. It was late yesterday evening :)

I added a test to show the behavior in case of a right join. I am not quite sure what to expect. The current behavior is the result you described. But the documentation says, that indexes are passed along if used in join. I am not sure how to interpret this in case of right join when left index is used.

5 changes: 4 additions & 1 deletion pandas/tests/reshape/merge/test_merge_index_as_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,5 +185,8 @@ def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
result = left_df.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)

if join_type == "right" and left_index == "inner":
result.index = result.index.droplevel("outer")
if join_type == "outer" and left_index == "inner":
result.index = result.index.droplevel(0)
tm.assert_frame_equal(result, expected, check_like=True)