From 5ed076ca6f8a925317fba32c47bd4acb6d2a2b49 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 18 Oct 2017 02:03:44 +0200 Subject: [PATCH] BUG: Categorical(Index) passed as categories (#17888) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/dtypes/dtypes.py | 17 ++++++++++------- pandas/tests/dtypes/test_dtypes.py | 10 +++++++++- pandas/tests/test_categorical.py | 28 ++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 598e452640781a..dd90396c4009e7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1024,6 +1024,7 @@ Categorical - Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) - Bug in categorical operations with :ref:`Series.cat <categorical.cat>` not preserving the original Series' name (:issue:`17509`) - Bug in :func:`DataFrame.merge` failing for categorical columns with boolean/int data types (:issue:`17187`) +- Bug in constructing a ``Categorical``/``CategoricalDtype`` when the specified ``categories`` are of categorical type (:issue:`17884`). .. 
_whatsnew_0210.pypy: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index b3498abb3b2c06..b4467f0f9733bf 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,7 +3,7 @@ import re import numpy as np from pandas import compat -from pandas.core.dtypes.generic import ABCIndexClass +from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex class ExtensionDtype(object): @@ -170,16 +170,16 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): return cls(categories, ordered) def _finalize(self, categories, ordered, fastpath=False): - from pandas.core.indexes.base import Index if ordered is None: ordered = False + else: + self._validate_ordered(ordered) if categories is not None: - categories = Index(categories, tupleize_cols=False) - # validation - self._validate_categories(categories, fastpath=fastpath) - self._validate_ordered(ordered) + categories = self._validate_categories(categories, + fastpath=fastpath) + self._categories = categories self._ordered = ordered @@ -316,7 +316,7 @@ def _validate_categories(categories, fastpath=False): from pandas import Index if not isinstance(categories, ABCIndexClass): - categories = Index(categories) + categories = Index(categories, tupleize_cols=False) if not fastpath: @@ -326,6 +326,9 @@ def _validate_categories(categories, fastpath=False): if not categories.is_unique: raise ValueError('Categorical categories must be unique') + if isinstance(categories, ABCCategoricalIndex): + categories = categories.categories + return categories @property diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0b9e2c9fe5ffc7..84e6f0d4f5a7a2 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -6,7 +6,8 @@ import numpy as np import pandas as pd -from pandas import Series, Categorical, IntervalIndex, date_range +from pandas import ( + Series, Categorical, CategoricalIndex, 
IntervalIndex, date_range) from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, @@ -657,3 +658,10 @@ def test_str_vs_repr(self): # Py2 will have unicode prefixes pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)" assert re.match(pat, repr(c1)) + + def test_categorical_categories(self): + # GH17884 + c1 = CategoricalDtype(Categorical(['a', 'b'])) + tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) + tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index e1d0b756fed1cd..d88e92a39a6c56 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -519,6 +519,18 @@ def test_contructor_from_categorical_string(self): result = Categorical(values, categories=['a', 'b', 'c'], ordered=True) tm.assert_categorical_equal(result, expected) + def test_constructor_with_categorical_categories(self): + # GH17884 + expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + + result = Categorical( + ['a', 'b'], categories=Categorical(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + + result = Categorical( + ['a', 'b'], categories=CategoricalIndex(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + def test_from_codes(self): # too few categories @@ -560,6 +572,22 @@ def f(): codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) pd.Categorical.from_codes(codes, categories=["train", "test"]) + def test_from_codes_with_categorical_categories(self): + # GH17884 + expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + + result = Categorical.from_codes( + [0, 1], categories=Categorical(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + + result = Categorical.from_codes( + [0, 1], categories=CategoricalIndex(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + + # non-unique Categorical still raises + with 
pytest.raises(ValueError): + Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) + @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): cats = ['a', 'b']