From 54cbc21a58441bc169d360a9c8c85147063aaf4b Mon Sep 17 00:00:00 2001 From: Josh Mayer Date: Tue, 12 Feb 2019 12:03:14 -0500 Subject: [PATCH 1/6] BUG: fixed merging with empty frame containing an Int64 column (#25183) --- doc/source/whatsnew/v0.24.2.rst | 2 +- pandas/core/internals/concat.py | 2 + pandas/tests/reshape/merge/test_merge.py | 59 ++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index f17c4974cd450..aba661bd1fd18 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -91,7 +91,7 @@ Bug Fixes **Other** - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) -- +- Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`) - .. _whatsnew_0.242.contributors: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 640587b7f9f31..cb98274962656 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -190,6 +190,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, 'is_sparse', False): pass + elif getattr(self.block, 'is_extension', False): + pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 25487ccc76e62..1216730f1862f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2,6 +2,7 @@ from collections import OrderedDict from datetime import date, datetime +import itertools import random import re @@ -428,6 +429,64 @@ def check2(exp, kwarg): check1(exp_in, kwarg) check2(exp_out, kwarg) + @pytest.mark.parametrize( + 'join_col, val_col', list(itertools.product([ + pd.Series([1], dtype='int64'), + pd.Series([1], dtype='Int64'), + pd.Series([1.23]), + pd.Series(['foo']), + pd.Series([True]), + pd.Series([pd.Timestamp('2018-01-01')]), + pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), + ], repeat=2)), + ids=lambda x: x.dtype.name + ) + def test_merge_empty_frame(self, join_col, val_col): + # GH 25183 + df = pd.DataFrame({'a': join_col, 'b': val_col}, columns=['a', 'b']) + df_empty = df[:0] + exp = pd.DataFrame({ + 'b_x': pd.Series(dtype=df.dtypes['b']), + 'a': pd.Series(dtype=df.dtypes['a']), + 'b_y': pd.Series(dtype=df.dtypes['b']), + }, columns=['b_x', 'a', 'b_y']) + act = df_empty.merge(df, on='a') + assert_frame_equal(act, exp) + + @pytest.mark.parametrize( + 'join_col, val_col', list(itertools.product( + [ + pd.Series([1], dtype='int64'), + pd.Series([1], dtype='Int64'), + pd.Series([1.23]), + pd.Series(['foo']), + pd.Series([True]), + pd.Series([pd.Timestamp('2018-01-01')]), + pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), + ], + [ + pd.Series([np.nan], dtype='Int64'), + pd.Series([np.nan], dtype='float'), + pd.Series([np.nan], dtype='object'), + pd.Series([pd.NaT]), + ] + )), + ids=lambda x: x.dtype.name + ) + def test_merge_all_na_column(self, join_col, val_col): + # GH 25183 + df_left = pd.DataFrame( + {'a': join_col, 'b': val_col}, columns=['a', 'b']) + df_right = pd.DataFrame( + {'a': join_col, 'b': val_col}, columns=['a', 'b']) + exp = pd.DataFrame({ + 'a': join_col, + 'b_x': val_col, + 'b_y': val_col, + }, columns=['a', 'b_x', 'b_y']) + act = df_left.merge(df_right, on='a') + assert_frame_equal(act, exp) + def test_merge_nosort(self): # #2098, anything to do? From 274b95ac3e36d4eaef057b41611b6c3b7b4f37ee Mon Sep 17 00:00:00 2001 From: Josh Mayer Date: Sat, 16 Feb 2019 17:41:52 -0500 Subject: [PATCH 2/6] updates for PR comments --- pandas/tests/reshape/merge/test_merge.py | 84 +++++++++++++----------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 1216730f1862f..545085f61acf7 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -40,6 +40,42 @@ def get_test_data(ngroups=NGROUPS, n=N): return arr +def get_series(): + return [ + pd.Series([1], dtype='int64'), + pd.Series([1], dtype='Int64'), + pd.Series([1.23]), + pd.Series(['foo']), + pd.Series([True]), + pd.Series([pd.Timestamp('2018-01-01')]), + pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), + ] + + +def get_series_nan(): + return [ + pd.Series([np.nan], dtype='Int64'), + pd.Series([np.nan], dtype='float'), + pd.Series([np.nan], dtype='object'), + pd.Series([pd.NaT]), + ] + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def value_col(request): + return request.param + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def value_col2(request): + return request.param + + +@pytest.fixture(params=get_series_nan(), ids=lambda x: x.dtype.name) +def value_col_nan(request): + return request.param + + class TestMerge(object): def setup_method(self, method): @@ -429,21 +465,9 @@ def check2(exp, kwarg): check1(exp_in, kwarg) check2(exp_out, kwarg) - @pytest.mark.parametrize( - 'join_col, val_col', list(itertools.product([ - pd.Series([1], dtype='int64'), - pd.Series([1], dtype='Int64'), - pd.Series([1.23]), - pd.Series(['foo']), - pd.Series([True]), - pd.Series([pd.Timestamp('2018-01-01')]), - pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), - ], repeat=2)), - ids=lambda x: x.dtype.name - ) - def test_merge_empty_frame(self, join_col, val_col): + def test_merge_empty_frame(self, value_col, value_col2): # GH 25183 - df = pd.DataFrame({'a': join_col, 'b': val_col}, columns=['a', 'b']) + df = pd.DataFrame({'a': value_col, 'b': value_col2}, columns=['a', 'b']) df_empty = df[:0] exp = pd.DataFrame({ 'b_x': pd.Series(dtype=df.dtypes['b']), @@ -453,36 +477,16 @@ def test_merge_empty_frame(self, join_col, val_col): act = df_empty.merge(df, on='a') assert_frame_equal(act, exp) - @pytest.mark.parametrize( - 'join_col, val_col', list(itertools.product( - [ - pd.Series([1], dtype='int64'), - pd.Series([1], dtype='Int64'), - pd.Series([1.23]), - pd.Series(['foo']), - pd.Series([True]), - pd.Series([pd.Timestamp('2018-01-01')]), - pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), - ], - [ - pd.Series([np.nan], dtype='Int64'), - pd.Series([np.nan], dtype='float'), - pd.Series([np.nan], dtype='object'), - pd.Series([pd.NaT]), - ] - )), - ids=lambda x: x.dtype.name - ) - def test_merge_all_na_column(self, join_col, val_col): + def test_merge_all_na_column(self, value_col, value_col_nan): # GH 25183 df_left = pd.DataFrame( - {'a': join_col, 'b': val_col}, columns=['a', 'b']) + {'a': value_col, 'b': value_col_nan}, columns=['a', 'b']) df_right = pd.DataFrame( - {'a': join_col, 'b': val_col}, columns=['a', 'b']) + {'a': value_col, 'b': value_col_nan}, columns=['a', 'b']) exp = pd.DataFrame({ - 'a': join_col, - 'b_x': val_col, - 'b_y': val_col, + 'a': value_col, + 'b_x': value_col_nan, + 'b_y': value_col_nan, }, columns=['a', 'b_x', 'b_y']) act = df_left.merge(df_right, on='a') assert_frame_equal(act, exp) From f32767ca4a853bdbc1b31ea90b436ffa8c8c28d1 Mon Sep 17 00:00:00 2001 From: Josh Mayer Date: Sat, 16 Feb 2019 17:47:03 -0500 Subject: [PATCH 3/6] pep8 line length fix --- pandas/tests/reshape/merge/test_merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 545085f61acf7..b27fe83dd8ac7 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -467,7 +467,8 @@ def check2(exp, kwarg): def test_merge_empty_frame(self, value_col, value_col2): # GH 25183 - df = pd.DataFrame({'a': value_col, 'b': value_col2}, columns=['a', 'b']) + df = pd.DataFrame({'a': value_col, 'b': value_col2}, + columns=['a', 'b']) df_empty = df[:0] exp = pd.DataFrame({ 'b_x': pd.Series(dtype=df.dtypes['b']), From 61b4b5f5a503ef84375dbd68a05c83c29dd88615 Mon Sep 17 00:00:00 2001 From: Josh Mayer Date: Sat, 16 Feb 2019 18:08:38 -0500 Subject: [PATCH 4/6] remove unused import --- pandas/tests/reshape/merge/test_merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b27fe83dd8ac7..8865bcb4e723d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2,7 +2,6 @@ from collections import OrderedDict from datetime import date, datetime -import itertools import random import re From 7942c8ac39f7885236424d30a1b6a13c2bb52dde Mon Sep 17 00:00:00 2001 From: Josh Mayer Date: Tue, 19 Feb 2019 09:54:02 -0500 Subject: [PATCH 5/6] updates for PR comments --- pandas/tests/reshape/merge/test_merge.py | 59 +++++++++++++----------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8865bcb4e723d..e86125b2e4825 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -51,7 +51,7 @@ def get_series(): ] -def get_series_nan(): +def get_series_na(): return [ pd.Series([np.nan], dtype='Int64'), pd.Series([np.nan], dtype='float'), @@ -61,17 +61,21 @@ def get_series_nan(): @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) -def value_col(request): +def series_of_dtype(request): + # A parametrized fixture returning a variety of Series of different dtypes return request.param @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) -def value_col2(request): +def series_of_dtype2(request): + # A duplicate of the series_of_dtype fixture, so that it + # can be used twice by a single function return request.param -@pytest.fixture(params=get_series_nan(), ids=lambda x: x.dtype.name) -def value_col_nan(request): +@pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name) +def series_of_dtype_all_na(request): + # A parametrized fixture returning a variety of Series with all NA values return request.param @@ -464,32 +468,35 @@ def check2(exp, kwarg): check1(exp_in, kwarg) check2(exp_out, kwarg) - def test_merge_empty_frame(self, value_col, value_col2): + def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): # GH 25183 - df = pd.DataFrame({'a': value_col, 'b': value_col2}, - columns=['a', 'b']) + df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2}, + columns=['key', 'value']) df_empty = df[:0] - exp = pd.DataFrame({ - 'b_x': pd.Series(dtype=df.dtypes['b']), - 'a': pd.Series(dtype=df.dtypes['a']), - 'b_y': pd.Series(dtype=df.dtypes['b']), - }, columns=['b_x', 'a', 'b_y']) - act = df_empty.merge(df, on='a') - assert_frame_equal(act, exp) - - def test_merge_all_na_column(self, value_col, value_col_nan): + expected = pd.DataFrame({ + 'value_x': pd.Series(dtype=df.dtypes['value']), + 'key': pd.Series(dtype=df.dtypes['key']), + 'value_y': pd.Series(dtype=df.dtypes['value']), + }, columns=['value_x', 'key', 'value_y']) + actual = df_empty.merge(df, on='key') + assert_frame_equal(actual, expected) + + def test_merge_all_na_column(self, series_of_dtype, + series_of_dtype_all_na): # GH 25183 df_left = pd.DataFrame( - {'a': value_col, 'b': value_col_nan}, columns=['a', 'b']) + {'key': series_of_dtype, 'value': series_of_dtype_all_na}, + columns=['key', 'value']) df_right = pd.DataFrame( - {'a': value_col, 'b': value_col_nan}, columns=['a', 'b']) - exp = pd.DataFrame({ - 'a': value_col, - 'b_x': value_col_nan, - 'b_y': value_col_nan, - }, columns=['a', 'b_x', 'b_y']) - act = df_left.merge(df_right, on='a') - assert_frame_equal(act, exp) + {'key': series_of_dtype, 'value': series_of_dtype_all_na}, + columns=['key', 'value']) + expected = pd.DataFrame({ + 'key': series_of_dtype, + 'value_x': series_of_dtype_all_na, + 'value_y': series_of_dtype_all_na, + }, columns=['key', 'value_x', 'value_y']) + actual = df_left.merge(df_right, on='key') + assert_frame_equal(actual, expected) def test_merge_nosort(self): # #2098, anything to do? From 5d3e6912a15a83a39d5e854fa2a67da14e7ea8af Mon Sep 17 00:00:00 2001 From: Josh Mayer Date: Wed, 20 Feb 2019 09:53:06 -0500 Subject: [PATCH 6/6] updates for PR comments --- pandas/tests/reshape/merge/test_merge.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e86125b2e4825..7a97368504fd6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -62,20 +62,28 @@ def get_series_na(): @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) def series_of_dtype(request): - # A parametrized fixture returning a variety of Series of different dtypes + """ + A parametrized fixture returning a variety of Series of different + dtypes + """ return request.param @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) def series_of_dtype2(request): - # A duplicate of the series_of_dtype fixture, so that it - # can be used twice by a single function + """ + A duplicate of the series_of_dtype fixture, so that it can be used + twice by a single function + """ return request.param @pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name) def series_of_dtype_all_na(request): - # A parametrized fixture returning a variety of Series with all NA values + """ + A parametrized fixture returning a variety of Series with all NA + values + """ return request.param