Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG-24212 fix when other_index has incompatible dtype #25009

Merged
merged 30 commits into from
May 5, 2019
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b04cee7
BUG-24212 fix usage of Index.take in pd.merge
JustinZhengBC Jan 11, 2019
a64b8fe
BUG-24212 add comment
JustinZhengBC Jan 11, 2019
022643d
BUG-24212 clarify test
JustinZhengBC Jan 12, 2019
e99dece
BUG-24212 make _create_join_index function
JustinZhengBC Jan 14, 2019
b95e1fe
BUG-24212 add docstring and comments
JustinZhengBC Jan 17, 2019
73be0d0
BUG-24212 fix regression
JustinZhengBC Jan 24, 2019
de3e2c7
BUG-24212 alter old test
JustinZhengBC Jan 24, 2019
1287758
fix typo
JustinZhengBC Jan 24, 2019
bdce7ac
BUG-24212 remove print and move whatsnew note
JustinZhengBC Jan 24, 2019
83ae393
BUG-24212 fix when other_index has incompatible dtype
JustinZhengBC Jan 29, 2019
4cb3ab0
Merge branch 'master' into BUG-24212
JustinZhengBC Jan 29, 2019
66f6fe4
merge issue
JustinZhengBC Jan 29, 2019
cf6fa14
fix whatsnew
JustinZhengBC Jan 29, 2019
0e6de81
BUG-24212 fix test
JustinZhengBC Jan 29, 2019
1da789a
BUG-24212 fix test
JustinZhengBC Jan 29, 2019
a0e5ffc
Merge branch 'BUG-24212' of https://github.com/justinzhengbc/pandas i…
JustinZhengBC Jan 29, 2019
27cdbc8
BUG-24212 simplify take logic
JustinZhengBC Jan 31, 2019
cd326b2
fix import order
JustinZhengBC Jan 31, 2019
2c65ebf
Merge branch 'master' into BUG-24212
JustinZhengBC Mar 26, 2019
d8d3cdf
make logic more generic
JustinZhengBC Mar 26, 2019
f9e7386
make logic more generic
JustinZhengBC Mar 26, 2019
8a36130
Merge branch 'BUG-24212' of https://github.com/justinzhengbc/pandas i…
JustinZhengBC Mar 27, 2019
7da3655
clean up test
JustinZhengBC Mar 29, 2019
17c5497
use compat=False for na_value_for_dtype
JustinZhengBC Mar 29, 2019
720dfbb
Merge branch 'master' into BUG-24212
JustinZhengBC Apr 21, 2019
6772618
clarify whatsnew
JustinZhengBC Apr 22, 2019
dacb4bc
Merge branch 'master' into BUG-24212
JustinZhengBC Apr 22, 2019
cad4398
add PR number to whatsnew
JustinZhengBC Apr 22, 2019
5e2eb0f
Merge branch 'BUG-24212' of https://github.com/justinzhengbc/pandas i…
JustinZhengBC Apr 22, 2019
88cdf8b
Merge branch 'master' into BUG-24212
JustinZhengBC Apr 29, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 21 additions & 9 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from pandas.core.arrays.categorical import _recode_for_categories
import pandas.core.common as com
from pandas.core.frame import _merge_doc
from pandas.core.indexes.category import CategoricalIndex
from pandas.core.internals import (
concatenate_block_managers, items_overlap_with_suffix)
import pandas.core.sorting as sorting
Expand Down Expand Up @@ -798,22 +799,33 @@ def _create_join_index(self, index, other_index, indexer,
-------
join_index
"""
join_index = index.take(indexer)
if (self.how in (how, 'outer') and
not isinstance(other_index, MultiIndex)):
# if final index requires values in other_index but not target
# index, indexer may hold missing (-1) values, causing Index.take
# to take the final value in target index
mask = indexer == -1
if np.any(mask):
# if values missing (-1) from target index,
# take from other_index instead
join_list = join_index.to_numpy()
other_list = other_index.take(other_indexer).to_numpy()
join_list[mask] = other_list[mask]
join_index = Index(join_list, dtype=join_index.dtype,
name=join_index.name)
return join_index
# if values missing (-1) from target index, replace missing
# values by their column position or NA if not applicable
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to dispatch on the index type here at all, other than by calling a method on the index. Branching on the dtype like this is ripe for errors; the logic needs to be much more generic.

if is_numeric_dtype(index.dtype):
join_index = index.take(indexer)
join_list = join_index.to_numpy()
naive_index = np.arange(len(other_index))
other_list = naive_index.take(other_indexer)
join_list[mask] = other_list[mask]
join_index = Index(join_list, name=other_index.name)
elif is_categorical_dtype(index.dtype):
join_index = index.take(indexer)
codes = np.array(join_index.codes) + 1
codes[mask] = -1
join_index = CategoricalIndex(codes, index.categories)
else:
fill_value = na_value_for_dtype(index.dtype)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use this approach (the fill value from na_value_for_dtype) for all index types, passing compat=False.

join_index = index.take(indexer, allow_fill=True,
fill_value=fill_value)
return join_index
return index.take(indexer)

def _get_merge_keys(self):
"""
Expand Down
69 changes: 65 additions & 4 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
import pandas as pd
from pandas import (
Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
Int64Index, MultiIndex, RangeIndex, Series, UInt64Index)
Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series,
TimedeltaIndex, UInt64Index)
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import MergeError, merge
Expand Down Expand Up @@ -940,11 +941,56 @@ def test_merge_two_empty_df_no_division_error(self):
merge(a, a, on=('a', 'b'))

@pytest.mark.parametrize('how', ['right', 'outer'])
def test_merge_on_index_with_more_values(self, how):
@pytest.mark.parametrize('index,expected_index',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you format this a bit better? Start like this:

@pytest.mark.parametrize(
   'index,expected_index',
    [(......),
.....

[(CategoricalIndex([1, 2, 3]),
CategoricalIndex([1, 2, 3, None, None, None])),
(DatetimeIndex(['2001-01-01',
'2002-02-02',
'2003-03-03']),
DatetimeIndex(['2001-01-01',
'2002-02-02',
'2003-03-03',
pd.NaT,
pd.NaT,
pd.NaT])),
(Float64Index([1, 2, 3]),
Float64Index([1, 2, 3, 1, 3, 4])),
(Int64Index([1, 2, 3]),
Int64Index([1, 2, 3, 1, 3, 4])),
(IntervalIndex.from_tuples([(1, 2),
(2, 3),
(3, 4)]),
IntervalIndex.from_tuples([(1, 2),
(2, 3),
(3, 4),
np.nan,
np.nan,
np.nan])),
(PeriodIndex(['2001-01-01',
'2001-01-02',
'2001-01-03'],
freq='D'),
PeriodIndex(['2001-01-01',
'2001-01-02',
'2001-01-03',
pd.NaT,
pd.NaT,
pd.NaT],
freq='D')),
(TimedeltaIndex(['1d',
'2d',
'3d']),
TimedeltaIndex(['1d',
'2d',
'3d',
pd.NaT,
pd.NaT,
pd.NaT]))])
def test_merge_on_index_with_more_values(self, how, index, expected_index):
# GH 24212
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
# -1 is interpreted as a missing value instead of the last element
df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]})
df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}, index=index)
df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]})
result = df1.merge(df2, left_on='key', right_index=True, how=how)
expected = pd.DataFrame([[1.0, 0, 1],
Expand All @@ -954,7 +1000,7 @@ def test_merge_on_index_with_more_values(self, how):
[np.nan, 3, 4],
[np.nan, 4, 5]],
columns=['a', 'key', 'b'])
expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True)
expected.set_index(expected_index, inplace=True)
assert_frame_equal(result, expected)

def test_merge_right_index_right(self):
Expand All @@ -973,6 +1019,21 @@ def test_merge_right_index_right(self):
how='right')
tm.assert_frame_equal(result, expected)

def test_merge_take_missing_values_from_index_of_other_dtype(self):
left = pd.DataFrame({'a': [1, 2, 3],
jreback marked this conversation as resolved.
Show resolved Hide resolved
'key': pd.Categorical(['a', 'a', 'b'],
categories=list('abc'))})
right = pd.DataFrame({'b': [1, 2, 3]},
index=pd.CategoricalIndex(['a', 'b', 'c']))
result = left.merge(right, left_on='key',
right_index=True, how='right')
expected = pd.DataFrame({'a': [1, 2, 3, None],
'key': pd.Categorical(['a', 'a', 'b', 'c']),
'b': [1, 1, 2, 3]},
index=[0, 1, 2, 2])
expected = expected.reindex(columns=['a', 'key', 'b'])
tm.assert_frame_equal(result, expected)


def _check_merge(x, y):
for how in ['inner', 'left', 'outer']:
Expand Down