Skip to content

Commit

Permalink
BUG-24212 fix when other_index has incompatible dtype (#25009)
Browse files Browse the repository at this point in the history
  • Loading branch information
JustinZhengBC authored and jreback committed May 5, 2019
1 parent ec2846a commit cc3b2f0
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 16 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ Reshaping
^^^^^^^^^

- Bug in :func:`pandas.merge` adds a string of ``None`` if ``None`` is assigned in suffixes, instead of leaving the column name as-is (:issue:`24782`).
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (missing index values are now assigned NA) (:issue:`24212`, :issue:`25009`)
- :func:`to_records` now accepts dtypes to its ``column_dtypes`` parameter (:issue:`24895`)
- Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`)
- Bug in :func:`pivot_table` where columns with ``NaN`` values are dropped even if ``dropna`` argument is ``False``, when the ``aggfunc`` argument contains a ``list`` (:issue:`22159`)
Expand Down
16 changes: 6 additions & 10 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,22 +803,18 @@ def _create_join_index(self, index, other_index, indexer,
-------
join_index
"""
join_index = index.take(indexer)
if (self.how in (how, 'outer') and
not isinstance(other_index, MultiIndex)):
# if final index requires values in other_index but not target
# index, indexer may hold missing (-1) values, causing Index.take
# to take the final value in target index
# to take the final value in target index. So, we set the last
# element to be the desired fill value. We do not use allow_fill
# and fill_value because it throws a ValueError on integer indices
mask = indexer == -1
if np.any(mask):
# if values missing (-1) from target index,
# take from other_index instead
join_list = join_index.to_numpy()
other_list = other_index.take(other_indexer).to_numpy()
join_list[mask] = other_list[mask]
join_index = Index(join_list, dtype=join_index.dtype,
name=join_index.name)
return join_index
fill_value = na_value_for_dtype(index.dtype, compat=False)
index = index.append(Index([fill_value]))
return index.take(indexer)

def _get_merge_keys(self):
"""
Expand Down
46 changes: 41 additions & 5 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
import pandas as pd
from pandas import (
Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
Int64Index, MultiIndex, RangeIndex, Series, UInt64Index)
Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series,
TimedeltaIndex, UInt64Index)
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import MergeError, merge
Expand Down Expand Up @@ -1034,11 +1035,30 @@ def test_merge_two_empty_df_no_division_error(self):
merge(a, a, on=('a', 'b'))

@pytest.mark.parametrize('how', ['right', 'outer'])
def test_merge_on_index_with_more_values(self, how):
@pytest.mark.parametrize(
'index,expected_index',
[(CategoricalIndex([1, 2, 4]),
CategoricalIndex([1, 2, 4, None, None, None])),
(DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03']),
DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03',
pd.NaT, pd.NaT, pd.NaT])),
(Float64Index([1, 2, 3]),
Float64Index([1, 2, 3, None, None, None])),
(Int64Index([1, 2, 3]),
Float64Index([1, 2, 3, None, None, None])),
(IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4),
np.nan, np.nan, np.nan])),
(PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03'], freq='D'),
PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03',
pd.NaT, pd.NaT, pd.NaT], freq='D')),
(TimedeltaIndex(['1d', '2d', '3d']),
TimedeltaIndex(['1d', '2d', '3d', pd.NaT, pd.NaT, pd.NaT]))])
def test_merge_on_index_with_more_values(self, how, index, expected_index):
# GH 24212
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
# -1 is interpreted as a missing value instead of the last element
df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]})
df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}, index=index)
df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]})
result = df1.merge(df2, left_on='key', right_index=True, how=how)
expected = pd.DataFrame([[1.0, 0, 1],
Expand All @@ -1048,7 +1068,7 @@ def test_merge_on_index_with_more_values(self, how):
[np.nan, 3, 4],
[np.nan, 4, 5]],
columns=['a', 'key', 'b'])
expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True)
expected.set_index(expected_index, inplace=True)
assert_frame_equal(result, expected)

def test_merge_right_index_right(self):
Expand All @@ -1062,11 +1082,27 @@ def test_merge_right_index_right(self):
'key': [0, 1, 1, 2],
'b': [1, 2, 2, 3]},
columns=['a', 'key', 'b'],
index=[0, 1, 2, 2])
index=[0, 1, 2, np.nan])
result = left.merge(right, left_on='key', right_index=True,
how='right')
tm.assert_frame_equal(result, expected)

def test_merge_take_missing_values_from_index_of_other_dtype(self):
# GH 24212
# Right-merge a frame keyed by a categorical column ('key') against a
# frame indexed by a CategoricalIndex. The category 'c' occurs only in
# the right index, so the output row for 'c' has no matching left row.
left = pd.DataFrame({'a': [1, 2, 3],
'key': pd.Categorical(['a', 'a', 'b'],
categories=list('abc'))})
right = pd.DataFrame({'b': [1, 2, 3]},
index=pd.CategoricalIndex(['a', 'b', 'c']))
result = left.merge(right, left_on='key',
right_index=True, how='right')
# For the unmatched 'c' row the expected frame has a missing value in
# column 'a' and NaN as the index label; the first three rows keep the
# left frame's labels 0, 1, 2.
expected = pd.DataFrame({'a': [1, 2, 3, None],
'key': pd.Categorical(['a', 'a', 'b', 'c']),
'b': [1, 1, 2, 3]},
index=[0, 1, 2, np.nan])
# Pin the column order to a, key, b before comparing.
expected = expected.reindex(columns=['a', 'key', 'b'])
tm.assert_frame_equal(result, expected)


def _check_merge(x, y):
for how in ['inner', 'left', 'outer']:
Expand Down

0 comments on commit cc3b2f0

Please sign in to comment.