From 74f01a8ce56600cc7f081d18b77c48bbbb35f54b Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 19 Feb 2019 06:20:45 +0000 Subject: [PATCH] BUG: Fix potential segfault after pd.Categorical(pd.Series(...), categories=...) --- doc/source/whatsnew/v0.24.2.rst | 2 ++ pandas/core/arrays/categorical.py | 11 +++-------- pandas/tests/arrays/categorical/test_constructors.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 8e59c2300e7cab..3ceb0a7eaef297 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -27,6 +27,8 @@ Fixed Regressions - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`) +- Fixed regression in ``Categorical``, where supplying a ``Series`` in the constructor could create a broken object which could cause segfaults (:issue:`25318`) + .. _whatsnew_0242.enhancements: Enhancements diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 79e565df94eaea..d8274632ef3a8d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -323,14 +323,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # we may have dtype.categories be None, and we need to # infer categories in a factorization step futher below - if is_categorical(values): - # GH23814, for perf, if values._values already an instance of - # Categorical, set values to codes, and run fastpath - if (isinstance(values, (ABCSeries, ABCIndexClass)) and - isinstance(values._values, type(self))): - values = values._values.codes.copy() - fastpath = True - if fastpath: self._codes = coerce_indexer_dtype(values, dtype.categories) self._dtype = self._dtype.update_dtype(dtype) @@ -2625,6 +2617,9 @@ def _recode_for_categories(codes, old_categories, new_categories): if len(old_categories) == 0: # All null anyway, so just retain the nulls return codes.copy() + elif new_categories.equals(old_categories): + # Same categories, so no need to actually recode + return codes.copy() indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), new_categories) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 25c299692ceca3..bd233c85767c69 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -149,6 +149,16 @@ def test_constructor(self): categories=["a", "b", "c", "d"]) tm.assert_categorical_equal(c1, c2) + # GH25318 + c0 = Categorical(["a", "b", "c", "a"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"]) + + c2 = Categorical(c0, categories=c1.categories) + tm.assert_categorical_equal(c1, c2) + + c3 = Categorical(Series(c0), categories=c1.categories) + tm.assert_categorical_equal(c1, c3) + # This should result in integer categories, not float! cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) assert is_integer_dtype(cat.categories)