From 2a264e8c06fccb6c64b4743ec08f84d1d50346b2 Mon Sep 17 00:00:00 2001 From: Sterling Paramore Date: Fri, 8 Feb 2019 17:32:18 -0800 Subject: [PATCH 1/2] BUG: Duplicated returns boolean dataframe --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/core/frame.py | 2 +- pandas/tests/frame/test_duplicates.py | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4032dc20b2e19..af0fca34cac6d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -68,6 +68,8 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in using duplicated with empty dataframes (:issue:`25184`) + Categorical ^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 19da8ba5c547d..bc521e931e5ae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4636,7 +4636,7 @@ def duplicated(self, subset=None, keep='first'): from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT if self.empty: - return Series() + return Series(dtype=bool) def f(vals): labels, shape = algorithms.factorize( diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index f61dbbdb989e4..b2a8cb738614c 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -182,6 +182,12 @@ def test_drop_duplicates(): assert df.duplicated(keep=keep).sum() == 0 +def test_duplicated_on_empty_frame_gives_back_frame(): + df = DataFrame(columns=['a', 'b']) + dupes = df.duplicated('a') + tm.assert_frame_equal(df[dupes], df) + + def test_drop_duplicates_with_duplicate_column_names(): # GH17836 df = DataFrame([ From 97307dd37ba2a7a48e586c3f0a16b92d815c3254 Mon Sep 17 00:00:00 2001 From: Sterling Paramore Date: Sat, 9 Feb 2019 13:46:33 -0800 Subject: [PATCH 2/2] Resolved issues brought up in code review --- doc/source/whatsnew/v0.24.2.rst | 2 ++ doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/frame/test_duplicates.py | 
9 +++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 73df504c89d5b..abe899b8af5a6 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -24,6 +24,8 @@ Fixed Regressions - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) +- Fixed regression in :meth:`DataFrame.duplicated()`, where an empty DataFrame was not returning a boolean-dtyped Series. (:issue:`25184`) + .. _whatsnew_0242.enhancements: Enhancements diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index af0fca34cac6d..1055514cd0e09 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -68,7 +68,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in using duplicated with empty dataframes (:issue:`25184`) +- Categorical ^^^^^^^^^^^ diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index b2a8cb738614c..3396670fb5879 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -182,10 +182,15 @@ def test_drop_duplicates(): assert df.duplicated(keep=keep).sum() == 0 -def test_duplicated_on_empty_frame_gives_back_frame(): +def test_duplicated_on_empty_frame(): + # GH 25184 + df = DataFrame(columns=['a', 'b']) dupes = df.duplicated('a') - tm.assert_frame_equal(df[dupes], df) + + result = df[dupes] + expected = df.copy() + tm.assert_frame_equal(result, expected) def test_drop_duplicates_with_duplicate_column_names():