From e757e8a817f738421c1500863c8b022c3d5af397 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 29 Aug 2015 07:51:56 -0500 Subject: [PATCH 1/2] DEPR: No NaNs in categories --- asv_bench/benchmarks/categoricals.py | 20 ++++- doc/source/categorical.rst | 29 +++--- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/categorical.py | 26 +++--- pandas/tests/test_categorical.py | 129 +++++++++++++++++---------- 5 files changed, 129 insertions(+), 76 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a449639f1560e..a0f9383336940 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,5 +1,5 @@ from .pandas_vb_common import * - +import string class concat_categorical(object): goal_time = 0.2 @@ -25,3 +25,21 @@ def time_value_counts(self): def time_value_counts_dropna(self): self.ts.value_counts(dropna=True) + +class categorical_constructor(object): + goal_time = 0.2 + + def setup(self): + n = 5 + N = 1e6 + self.categories = list(string.ascii_letters[:n]) + self.cat_idx = Index(self.categories) + self.values = np.tile(self.categories, N) + self.codes = np.tile(range(n), N) + + def time_regular_constructor(self): + Categorical(self.values, self.categories) + + def time_fastpath(self): + Categorical(self.codes, self.cat_idx, fastpath=True) + diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 3c9b538caa555..534ab0e343398 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -632,41 +632,35 @@ Missing Data pandas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations. See the :ref:`Missing Data section -` +`. -There are two ways a `np.nan` can be represented in categorical data: either the value is not -available ("missing value") or `np.nan` is a valid category. +Missing values should **not** be included in the Categorical's ``categories``, +only in the ``values``. 
+Instead, it is understood that NaN is different, and is always a possibility. +When working with the Categorical's ``codes``, missing values will always have +a code of ``-1``. .. ipython:: python s = pd.Series(["a","b",np.nan,"a"], dtype="category") # only two categories s - s2 = pd.Series(["a","b","c","a"], dtype="category") - s2.cat.categories = [1,2,np.nan] - # three categories, np.nan included - s2 + s.cat.codes -.. note:: - As integer `Series` can't include NaN, the categories were converted to `object`. -.. note:: - Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as - `np.nan` categories into account: +Methods for working with missing data, e.g. :meth:`~Series.isnull`, :meth:`~Series.fillna`, +:meth:`~Series.dropna`, all work normally: .. ipython:: python c = pd.Series(["a","b",np.nan], dtype="category") - c.cat.set_categories(["a","b",np.nan], inplace=True) - # will be inserted as a NA category: - c[0] = np.nan s = pd.Series(c) s pd.isnull(s) s.fillna("a") Differences to R's `factor` -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------- The following differences to R's factor functions can be observed: @@ -677,6 +671,9 @@ The following differences to R's factor functions can be observed: * In contrast to R's `factor` function, using categorical data as the sole input to create a new categorical series will *not* remove unused categories but create a new categorical series which is equal to the passed in one! +* R allows for missing values to be included in its `levels` (pandas' `categories`). Pandas + does not allow `NaN` categories, but missing values can still be in the `values`. 
+ Gotchas ------- diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index eae33bc80be32..424be6d949f13 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -652,6 +652,7 @@ Deprecations ===================== ================================= - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`). +- Setting missing values (NaN) in a ``Categorical``'s ``categories`` will issue a warning (:issue:`10748`). You can still have missing values in the ``values``. - ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`) - ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`) - ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 9951024ffe218..40694bfe85181 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -443,12 +443,18 @@ def _validate_categories(cls, categories): raise ValueError('Categorical categories must be unique') return categories - def _set_categories(self, categories): + def _set_categories(self, categories, validate=True): """ Sets new categories """ - categories = self._validate_categories(categories) - if not self._categories is None and len(categories) != len(self._categories): - raise ValueError("new categories need to have the same number of items than the old " - "categories!") + if validate: + categories = self._validate_categories(categories) + if not self._categories is None and len(categories) != len(self._categories): + raise ValueError("new categories need to have the same number of items than the old " + "categories!") + if np.any(isnull(categories)): + # NaNs in cats deprecated in 0.17, remove in 0.18 or 0.19 GH 10748 + msg = 
('\nSetting NaNs in `categories` is deprecated and ' + 'will be removed in a future version of pandas.') + warn(msg, FutureWarning, stacklevel=9) self._categories = categories def _get_categories(self): @@ -581,11 +587,11 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal if not cat._categories is None and len(new_categories) < len(cat._categories): # remove all _codes which are larger and set to -1/NaN self._codes[self._codes >= len(new_categories)] = -1 - cat._categories = new_categories + cat._set_categories(new_categories, validate=False) else: values = cat.__array__() cat._codes = _get_codes_for_values(values, new_categories) - cat._categories = new_categories + cat._set_categories(new_categories, validate=False) if ordered is None: ordered = self.ordered @@ -708,7 +714,7 @@ def add_categories(self, new_categories, inplace=False): new_categories = list(self._categories) + list(new_categories) new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() - cat._categories = new_categories + cat._set_categories(new_categories, validate=False) cat._codes = _coerce_indexer_dtype(cat._codes, new_categories) if not inplace: return cat @@ -791,7 +797,7 @@ def remove_unused_categories(self, inplace=False): from pandas.core.index import _ensure_index new_categories = _ensure_index(new_categories) cat._codes = _get_codes_for_values(cat.__array__(), new_categories) - cat._categories = new_categories + cat._set_categories(new_categories, validate=False) if not inplace: return cat @@ -1171,7 +1177,7 @@ def order(self, inplace=False, ascending=True, na_position='last'): Category.sort """ warn("order is deprecated, use sort_values(...)", - FutureWarning, stacklevel=2) + FutureWarning, stacklevel=3) return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position) def sort(self, inplace=True, ascending=True, na_position='last'): diff --git a/pandas/tests/test_categorical.py 
b/pandas/tests/test_categorical.py index 05da93a4fca0f..8a71ddaa732e8 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -187,17 +187,21 @@ def f(): cat = pd.Categorical([np.nan, 1., 2., 3. ]) self.assertTrue(com.is_float_dtype(cat.categories)) + # Deprecating NaNs in categories (GH #10748) # preserve int as far as possible by converting to object if NaN is in categories - cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3]) self.assertTrue(com.is_object_dtype(cat.categories)) # This doesn't work -> this would probably need some kind of "remember the original type" # feature to try to cast the array interface result to... #vals = np.asarray(cat[cat.notnull()]) #self.assertTrue(com.is_integer_dtype(vals)) - cat = pd.Categorical([np.nan,"a", "b", "c"], categories=[np.nan,"a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([np.nan,"a", "b", "c"], categories=[np.nan,"a", "b", "c"]) self.assertTrue(com.is_object_dtype(cat.categories)) # but don't do it for floats - cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.]) + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.]) self.assertTrue(com.is_float_dtype(cat.categories)) @@ -465,8 +469,9 @@ def test_describe(self): tm.assert_frame_equal(desc, expected) # NA as a category - cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan]) - result = cat.describe() + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan]) + result = cat.describe() expected = DataFrame([[0,0],[1,0.25],[2,0.5],[1,0.25]], columns=['counts','freqs'], @@ -474,8 +479,9 @@ def test_describe(self): tm.assert_frame_equal(result,expected) # NA as an unused category 
- cat = pd.Categorical(["a","c","c"], categories=["b","a","c",np.nan]) - result = cat.describe() + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a","c","c"], categories=["b","a","c",np.nan]) + result = cat.describe() expected = DataFrame([[0,0],[1,1/3.],[2,2/3.],[0,0]], columns=['counts','freqs'], @@ -827,29 +833,37 @@ def test_nan_handling(self): self.assert_numpy_array_equal(c._codes , np.array([0,-1,-1,0])) # If categories have nan included, the code should point to that instead - c = Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan]) - self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) - self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + with tm.assert_produces_warning(FutureWarning): + c = Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan]) + self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan], + dtype=np.object_)) + self.assert_numpy_array_equal(c._codes, np.array([0,1,2,0])) c[1] = np.nan - self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) - self.assert_numpy_array_equal(c._codes , np.array([0,2,2,0])) + self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan], + dtype=np.object_)) + self.assert_numpy_array_equal(c._codes, np.array([0,2,2,0])) # Changing categories should also make the replaced category np.nan c = Categorical(["a","b","c","a"]) - c.categories = ["a","b",np.nan] - self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) - self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + with tm.assert_produces_warning(FutureWarning): + c.categories = ["a","b",np.nan] + self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan], + dtype=np.object_)) + self.assert_numpy_array_equal(c._codes, np.array([0,1,2,0])) # Adding nan to categories should make assigned nan point to the category! 
c = Categorical(["a","b",np.nan,"a"]) self.assert_numpy_array_equal(c.categories , np.array(["a","b"])) self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) - c.set_categories(["a","b",np.nan], rename=True, inplace=True) - self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) - self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + with tm.assert_produces_warning(FutureWarning): + c.set_categories(["a","b",np.nan], rename=True, inplace=True) + self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan], + dtype=np.object_)) + self.assert_numpy_array_equal(c._codes, np.array([0,1,-1,0])) c[1] = np.nan - self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_)) - self.assert_numpy_array_equal(c._codes , np.array([0,2,-1,0])) + self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan], + dtype=np.object_)) + self.assert_numpy_array_equal(c._codes, np.array([0,2,-1,0])) # Remove null categories (GH 10156) cases = [ @@ -861,11 +875,13 @@ def test_nan_handling(self): null_values = [np.nan, None, pd.NaT] for with_null, without in cases: - base = Categorical([], with_null) + with tm.assert_produces_warning(FutureWarning): + base = Categorical([], with_null) expected = Categorical([], without) - for nullval in null_values: - result = base.remove_categories(nullval) + with tm.assert_produces_warning(FutureWarning): + for nullval in null_values: + result = base.remove_categories(nullval) self.assert_categorical_equal(result, expected) # Different null values are indistinguishable @@ -880,14 +896,16 @@ def test_isnull(self): res = c.isnull() self.assert_numpy_array_equal(res, exp) - c = Categorical(["a","b",np.nan], categories=["a","b",np.nan]) + with tm.assert_produces_warning(FutureWarning): + c = Categorical(["a","b",np.nan], categories=["a","b",np.nan]) res = c.isnull() self.assert_numpy_array_equal(res, exp) # test both nan in categories and as -1 exp = 
np.array([True, False, True]) c = Categorical(["a","b",np.nan]) - c.set_categories(["a","b",np.nan], rename=True, inplace=True) + with tm.assert_produces_warning(FutureWarning): + c.set_categories(["a","b",np.nan], rename=True, inplace=True) c[0] = np.nan res = c.isnull() self.assert_numpy_array_equal(res, exp) @@ -1087,31 +1105,36 @@ def test_set_item_nan(self): # if nan in categories, the proper code should be set! cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) - cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + with tm.assert_produces_warning(FutureWarning): + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1] = np.nan exp = np.array([0,3,2,-1]) self.assert_numpy_array_equal(cat.codes, exp) cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) - cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + with tm.assert_produces_warning(FutureWarning): + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1:3] = np.nan exp = np.array([0,3,3,-1]) self.assert_numpy_array_equal(cat.codes, exp) cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) - cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + with tm.assert_produces_warning(FutureWarning): + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1:3] = [np.nan, 1] exp = np.array([0,3,0,-1]) self.assert_numpy_array_equal(cat.codes, exp) cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3]) - cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + with tm.assert_produces_warning(FutureWarning): + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) cat[1:3] = [np.nan, np.nan] exp = np.array([0,3,3,-1]) self.assert_numpy_array_equal(cat.codes, exp) cat = pd.Categorical([1,2, np.nan, 3], categories=[1,2,3]) - cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) + with tm.assert_produces_warning(FutureWarning): + cat.set_categories([1,2,3, np.nan], rename=True, inplace=True) 
cat[pd.isnull(cat)] = np.nan exp = np.array([0,1,3,2]) self.assert_numpy_array_equal(cat.codes, exp) @@ -1555,14 +1578,16 @@ def test_nan_handling(self): self.assert_numpy_array_equal(s.values.codes, np.array([0,1,-1,0])) # If categories have nan included, the label should point to that instead - s2 = Series(Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan])) + with tm.assert_produces_warning(FutureWarning): + s2 = Series(Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan])) self.assert_numpy_array_equal(s2.cat.categories, np.array(["a","b",np.nan], dtype=np.object_)) self.assert_numpy_array_equal(s2.values.codes, np.array([0,1,2,0])) # Changing categories should also make the replaced category np.nan s3 = Series(Categorical(["a","b","c","a"])) - s3.cat.categories = ["a","b",np.nan] + with tm.assert_produces_warning(FutureWarning): + s3.cat.categories = ["a","b",np.nan] self.assert_numpy_array_equal(s3.cat.categories, np.array(["a","b",np.nan], dtype=np.object_)) self.assert_numpy_array_equal(s3.values.codes, np.array([0,1,2,0])) @@ -2415,28 +2440,32 @@ def test_value_counts_with_nan(self): s.value_counts(dropna=False, sort=False), pd.Series([2, 1, 3], index=["a", "b", np.nan])) - s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", np.nan])) - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=["a", "b"])) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([2, 1, 0], index=["a", "b", np.nan])) + with tm.assert_produces_warning(FutureWarning): + s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", np.nan])) + tm.assert_series_equal( + s.value_counts(dropna=True), + pd.Series([2, 1], index=["a", "b"])) + tm.assert_series_equal( + s.value_counts(dropna=False), + pd.Series([2, 1, 0], index=["a", "b", np.nan])) - s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], categories=["a", "b", np.nan])) - tm.assert_series_equal( - s.value_counts(dropna=True), - 
pd.Series([2, 1], index=["a", "b"])) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([3, 2, 1], index=[np.nan, "a", "b"])) + with tm.assert_produces_warning(FutureWarning): + s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], + categories=["a", "b", np.nan])) + tm.assert_series_equal( + s.value_counts(dropna=True), + pd.Series([2, 1], index=["a", "b"])) + tm.assert_series_equal( + s.value_counts(dropna=False), + pd.Series([3, 2, 1], index=[np.nan, "a", "b"])) def test_groupby(self): cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a","b","c","d"], ordered=True) data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) - expected = DataFrame({ 'a' : Series([1,2,4,np.nan],index=Index(['a','b','c','d'],name='b')) }) + expected = DataFrame({'a': Series([1, 2, 4, np.nan], + index=Index(['a', 'b', 'c', 'd'], name='b'))}) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) @@ -3454,11 +3483,13 @@ def f(): # make sure that fillna takes both missing values and NA categories into account c = Categorical(["a","b",np.nan]) - c.set_categories(["a","b",np.nan], rename=True, inplace=True) + with tm.assert_produces_warning(FutureWarning): + c.set_categories(["a","b",np.nan], rename=True, inplace=True) c[0] = np.nan df = pd.DataFrame({"cats":c, "vals":[1,2,3]}) df_exp = pd.DataFrame({"cats": Categorical(["a","b","a"]), "vals": [1,2,3]}) - res = df.fillna("a") + with tm.assert_produces_warning(FutureWarning): + res = df.fillna("a") tm.assert_frame_equal(res, df_exp) From 8d87f3be5ae67d03ecd3dfb70112a663064ab19b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 30 Aug 2015 09:07:26 -0400 Subject: [PATCH 2/2] move NaN deprecation warning to _validate_categories, cleanup a bit --- pandas/core/base.py | 1 + pandas/core/categorical.py | 69 +++++++++++++++++++++----------- pandas/tests/test_categorical.py | 20 +++++---- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/pandas/core/base.py 
b/pandas/core/base.py index 6d1c89a7a2f89..fe9bac7f4c68e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -392,6 +392,7 @@ def argmin(self, axis=None): """ return nanops.nanargmin(self.values) + @cache_readonly def hasnans(self): """ return if I have any nans; enables various perf speedups """ return com.isnull(self).any() diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 40694bfe85181..4a6a26f21b5bf 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -207,7 +207,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F if fastpath: # fast path self._codes = _coerce_indexer_dtype(values, categories) - self.categories = categories + self._categories = self._validate_categories(categories, fastpath=isinstance(categories, ABCIndexClass)) self._ordered = ordered return @@ -274,6 +274,8 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F ### FIXME #### raise NotImplementedError("> 1 ndim Categorical are not supported at this time") + categories = self._validate_categories(categories) + else: # there were two ways if categories are present # - the old one, where each value is a int pointer to the levels array -> not anymore @@ -282,7 +284,6 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F # make sure that we always have the same type here, no matter what we get passed in categories = self._validate_categories(categories) - codes = _get_codes_for_values(values, categories) # TODO: check for old style usage. 
These warnings should be removes after 0.18/ in 2016 @@ -295,7 +296,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F "'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2) self.set_ordered(ordered or False, inplace=True) - self.categories = categories + self._categories = categories self._codes = _coerce_indexer_dtype(codes, categories) def copy(self): @@ -421,9 +422,15 @@ def _get_labels(self): _categories = None @classmethod - def _validate_categories(cls, categories): + def _validate_categories(cls, categories, fastpath=False): """ Validates that we have good categories + + Parameters + ---------- + fastpath : boolean (default: False) + Don't perform validation of the categories for uniqueness or nulls + """ if not isinstance(categories, ABCIndexClass): dtype = None @@ -439,22 +446,40 @@ def _validate_categories(cls, categories): from pandas import Index categories = Index(categories, dtype=dtype) - if not categories.is_unique: - raise ValueError('Categorical categories must be unique') + + if not fastpath: + + # check properties of the categories + # we don't allow NaNs in the categories themselves + + if categories.hasnans: + # NaNs in cats deprecated in 0.17, remove in 0.18 or 0.19 GH 10748 + msg = ('\nSetting NaNs in `categories` is deprecated and ' + 'will be removed in a future version of pandas.') + warn(msg, FutureWarning, stacklevel=5) + + # categories must be unique + + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + return categories - def _set_categories(self, categories, validate=True): - """ Sets new categories """ - if validate: - categories = self._validate_categories(categories) - if not self._categories is None and len(categories) != len(self._categories): - raise ValueError("new categories need to have the same number of items than the old " - "categories!") - if np.any(isnull(categories)): - # NaNs in cats deprecated in 0.17, remove in 0.18 
or 0.19 GH 10748 - msg = ('\nSetting NaNs in `categories` is deprecated and ' - 'will be removed in a future version of pandas.') - warn(msg, FutureWarning, stacklevel=9) + def _set_categories(self, categories, fastpath=False): + """ Sets new categories + + Parameters + ---------- + fastpath : boolean (default: False) + Don't perform validation of the categories for uniqueness or nulls + + """ + + categories = self._validate_categories(categories, fastpath=fastpath) + if not fastpath and not self._categories is None and len(categories) != len(self._categories): + raise ValueError("new categories need to have the same number of items than the old " + "categories!") + self._categories = categories def _get_categories(self): @@ -587,11 +612,10 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal if not cat._categories is None and len(new_categories) < len(cat._categories): # remove all _codes which are larger and set to -1/NaN self._codes[self._codes >= len(new_categories)] = -1 - cat._set_categories(new_categories, validate=False) else: values = cat.__array__() cat._codes = _get_codes_for_values(values, new_categories) - cat._set_categories(new_categories, validate=False) + cat._categories = new_categories if ordered is None: ordered = self.ordered @@ -712,9 +736,8 @@ def add_categories(self, new_categories, inplace=False): msg = "new categories must not include old categories: %s" % str(already_included) raise ValueError(msg) new_categories = list(self._categories) + list(new_categories) - new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() - cat._set_categories(new_categories, validate=False) + cat._categories = self._validate_categories(new_categories) cat._codes = _coerce_indexer_dtype(cat._codes, new_categories) if not inplace: return cat @@ -797,7 +820,7 @@ def remove_unused_categories(self, inplace=False): from pandas.core.index import _ensure_index new_categories = 
_ensure_index(new_categories) cat._codes = _get_codes_for_values(cat.__array__(), new_categories) - cat._set_categories(new_categories, validate=False) + cat._categories = new_categories if not inplace: return cat diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 8a71ddaa732e8..d847638ff105e 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -129,7 +129,8 @@ def f(): Categorical(["a","b"], ["a","b","b"]) self.assertRaises(ValueError, f) def f(): - Categorical([1,2], [1,2,np.nan, np.nan]) + with tm.assert_produces_warning(FutureWarning): + Categorical([1,2], [1,2,np.nan, np.nan]) self.assertRaises(ValueError, f) # The default should be unordered @@ -879,15 +880,18 @@ def test_nan_handling(self): base = Categorical([], with_null) expected = Categorical([], without) - with tm.assert_produces_warning(FutureWarning): - for nullval in null_values: - result = base.remove_categories(nullval) - self.assert_categorical_equal(result, expected) + for nullval in null_values: + result = base.remove_categories(nullval) + self.assert_categorical_equal(result, expected) # Different null values are indistinguishable for i, j in [(0, 1), (0, 2), (1, 2)]: nulls = [null_values[i], null_values[j]] - self.assertRaises(ValueError, lambda: Categorical([], categories=nulls)) + + def f(): + with tm.assert_produces_warning(FutureWarning): + Categorical([], categories=nulls) + self.assertRaises(ValueError, f) def test_isnull(self): @@ -3488,8 +3492,8 @@ def f(): c[0] = np.nan df = pd.DataFrame({"cats":c, "vals":[1,2,3]}) df_exp = pd.DataFrame({"cats": Categorical(["a","b","a"]), "vals": [1,2,3]}) - with tm.assert_produces_warning(FutureWarning): - res = df.fillna("a") + + res = df.fillna("a") tm.assert_frame_equal(res, df_exp)