Skip to content

Commit

Permalink
BUG: Fix potential segfault after pd.Categorical(pd.Series(...), cate…
Browse files Browse the repository at this point in the history
…gories=...)
  • Loading branch information
batterseapower committed Feb 21, 2019
1 parent 5449279 commit 9d22ea7
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 9 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ Fixed Regressions
- Fixed regression in :class:`TimedeltaIndex` where `np.sum(index)` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`)
- Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`)

- Fixed regression in ``Categorical``, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)

.. _whatsnew_0242.enhancements:

Enhancements
Expand Down
13 changes: 4 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,14 +323,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
# we may have dtype.categories be None, and we need to
# infer categories in a factorization step futher below

if is_categorical(values):
# GH23814, for perf, if values._values already an instance of
# Categorical, set values to codes, and run fastpath
if (isinstance(values, (ABCSeries, ABCIndexClass)) and
isinstance(values._values, type(self))):
values = values._values.codes.copy()
fastpath = True

if fastpath:
self._codes = coerce_indexer_dtype(values, dtype.categories)
self._dtype = self._dtype.update_dtype(dtype)
Expand Down Expand Up @@ -382,7 +374,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
dtype = CategoricalDtype(categories, dtype.ordered)

elif is_categorical_dtype(values):
old_codes = (values.cat.codes if isinstance(values, ABCSeries)
old_codes = (values.values.codes if isinstance(values, ABCSeries)
else values.codes)
codes = _recode_for_categories(old_codes, values.dtype.categories,
dtype.categories)
Expand Down Expand Up @@ -2625,6 +2617,9 @@ def _recode_for_categories(codes, old_categories, new_categories):
if len(old_categories) == 0:
# All null anyway, so just retain the nulls
return codes.copy()
elif new_categories.equals(old_categories):
# Same categories, so no need to actually recode
return codes.copy()
indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
new_categories)
new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,17 @@ def test_constructor(self):
categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(c1, c2)

# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])

c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)

c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)

# This should result in integer categories, not float!
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
assert is_integer_dtype(cat.categories)
Expand Down

0 comments on commit 9d22ea7

Please sign in to comment.