From e4639ee14f5562125886bca607e3a9615db0c439 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Sun, 19 Jul 2015 23:28:13 +0200 Subject: [PATCH] Remove Categorical.name to make it more numpy.ndarray like `name` was initially introduced to save the name of a Series/column during a groupby, when categorical was mostly a helper for that. See here for the discussion: https://github.com/pydata/pandas/issues/10482 Closes: #10482 --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/categorical.py | 76 +++++++++++++------------------- pandas/core/format.py | 9 +--- pandas/core/groupby.py | 2 - pandas/core/index.py | 2 +- pandas/core/series.py | 2 - pandas/tests/test_categorical.py | 37 +++++++++------- pandas/tests/test_groupby.py | 21 ++++----- pandas/tests/test_series.py | 2 - pandas/tools/tile.py | 4 +- pandas/util/testing.py | 3 -- 11 files changed, 65 insertions(+), 94 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index b2a1e10469a0f..75314debe4c63 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -270,6 +270,7 @@ Other API Changes - Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`) - Allow passing `kwargs` to the interpolation methods (:issue:`10378`). - Serialize metadata properties of subclasses of pandas objects (:issue:`10553`). +- ``Categorical.name`` was removed to make `Categorical` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`). 
- ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`) =========================== ============================================================== diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 264e7aea3fa0a..1d1f0d7da80e4 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -147,9 +147,6 @@ class Categorical(PandasObject): ordered : boolean, (default False) Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will not be ordered. - name : str, optional - Name for the Categorical variable. If name is None, will attempt - to infer from values. Attributes ---------- @@ -159,8 +156,6 @@ class Categorical(PandasObject): The codes (integer positions, which point to the categories) of this categorical, read only. ordered : boolean Whether or not this Categorical is ordered. - name : string - The name of this Categorical. Raises ------ @@ -205,7 +200,6 @@ class Categorical(PandasObject): # For comparisons, so that numpy uses our implementation if the compare ops, which raise __array_priority__ = 1000 _typ = 'categorical' - name = None def __init__(self, values, categories=None, ordered=False, name=None, fastpath=False, levels=None): @@ -213,23 +207,24 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F if fastpath: # fast path self._codes = _coerce_indexer_dtype(values, categories) - self.name = name self.categories = categories self._ordered = ordered return - if name is None: - name = getattr(values, 'name', None) + if not name is None: + msg = "the 'name' keyword is removed, use 'name' with consumers of the " \ + "categorical instead (e.g. 
'Series(cat, name=\"something\")'" + warn(msg, UserWarning, stacklevel=2) # TODO: Remove after deprecation period in 2017/ after 0.18 if not levels is None: warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead", - FutureWarning) + FutureWarning, stacklevel=2) if categories is None: categories = levels else: raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', " - "use only 'categories'") + "use only 'categories'", stacklevel=2) # sanitize input if is_categorical_dtype(values): @@ -293,21 +288,20 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F # TODO: check for old style usage. These warnings should be removes after 0.18/ in 2016 if is_integer_dtype(values) and not is_integer_dtype(categories): warn("Values and categories have different dtypes. Did you mean to use\n" - "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) + "'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2) if len(values) and is_integer_dtype(values) and (codes == -1).all(): warn("None of the categories were found in values. Did you mean to use\n" - "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) + "'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2) self.set_ordered(ordered or False, inplace=True) self.categories = categories - self.name = name self._codes = _coerce_indexer_dtype(codes, categories) def copy(self): """ Copy constructor. """ return Categorical(values=self._codes.copy(),categories=self.categories, - name=self.name, ordered=self.ordered, fastpath=True) + ordered=self.ordered, fastpath=True) def astype(self, dtype): """ coerce this type to another dtype """ @@ -373,9 +367,12 @@ def from_codes(cls, codes, categories, ordered=False, name=None): ordered : boolean, (default False) Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will be unordered. 
- name : str, optional - Name for the Categorical variable. """ + if not name is None: + msg = "the 'name' keyword is removed, use 'name' with consumers of the " \ + "categorical instead (e.g. 'Series(cat, name=\"something\")'" + warn(msg, UserWarning, stacklevel=2) + try: codes = np.asarray(codes, np.int64) except: @@ -386,7 +383,7 @@ def from_codes(cls, codes, categories, ordered=False, name=None): if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and len(categories)-1") - return Categorical(codes, categories=categories, ordered=ordered, name=name, fastpath=True) + return Categorical(codes, categories=categories, ordered=ordered, fastpath=True) _codes = None @@ -416,8 +413,7 @@ def _get_labels(self): Deprecated, use .codes! """ - import warnings - warnings.warn("'labels' is deprecated. Use 'codes' instead", FutureWarning) + warn("'labels' is deprecated. Use 'codes' instead", FutureWarning, stacklevel=3) return self.codes labels = property(fget=_get_labels, fset=_set_codes) @@ -464,12 +460,12 @@ def _get_categories(self): def _set_levels(self, levels): """ set new levels (deprecated, use "categories") """ - warn("Assigning to 'levels' is deprecated, use 'categories'", FutureWarning) + warn("Assigning to 'levels' is deprecated, use 'categories'", FutureWarning, stacklevel=3) self.categories = levels def _get_levels(self): """ Gets the levels (deprecated, use "categories") """ - warn("Accessing 'levels' is deprecated, use 'categories'", FutureWarning) + warn("Accessing 'levels' is deprecated, use 'categories'", FutureWarning, stacklevel=3) return self.categories # TODO: Remove after deprecation period in 2017/ after 0.18 @@ -479,7 +475,8 @@ def _get_levels(self): def _set_ordered(self, value): """ Sets the ordered attribute to the boolean value """ - warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", FutureWarning) + warn("Setting 'ordered' directly is deprecated, use 
'set_ordered'", FutureWarning, + stacklevel=3) self.set_ordered(value, inplace=True) def set_ordered(self, value, inplace=False): @@ -1140,7 +1137,7 @@ def order(self, inplace=False, ascending=True, na_position='last'): return else: return Categorical(values=codes,categories=self.categories, ordered=self.ordered, - name=self.name, fastpath=True) + fastpath=True) def sort(self, inplace=True, ascending=True, na_position='last'): @@ -1266,7 +1263,7 @@ def fillna(self, value=None, method=None, limit=None): values[mask] = self.categories.get_loc(value) return Categorical(values, categories=self.categories, ordered=self.ordered, - name=self.name, fastpath=True) + fastpath=True) def take_nd(self, indexer, allow_fill=True, fill_value=None): """ Take the codes by the indexer, fill with the fill_value. @@ -1280,7 +1277,7 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) result = Categorical(codes, categories=self.categories, ordered=self.ordered, - name=self.name, fastpath=True) + fastpath=True) return result take = take_nd @@ -1300,7 +1297,7 @@ def _slice(self, slicer): _codes = self._codes[slicer] return Categorical(values=_codes,categories=self.categories, ordered=self.ordered, - name=self.name, fastpath=True) + fastpath=True) def __len__(self): """The length of this Categorical.""" @@ -1313,9 +1310,8 @@ def __iter__(self): def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default footer) """ num = max_vals // 2 - head = self[:num]._get_repr(length=False, name=False, footer=False) + head = self[:num]._get_repr(length=False, footer=False) tail = self[-(max_vals - num):]._get_repr(length=False, - name=False, footer=False) result = '%s, ..., %s' % (head[:-1], tail[1:]) @@ -1369,14 +1365,11 @@ def _repr_categories_info(self): def _repr_footer(self): - namestr = "Name: %s, " % self.name if self.name is not None else "" - return 
u('%sLength: %d\n%s') % (namestr, - len(self), self._repr_categories_info()) + return u('Length: %d\n%s') % (len(self), self._repr_categories_info()) - def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): + def _get_repr(self, length=True, na_rep='NaN', footer=True): from pandas.core import format as fmt formatter = fmt.CategoricalFormatter(self, - name=name, length=length, na_rep=na_rep, footer=footer) @@ -1389,11 +1382,9 @@ def __unicode__(self): if len(self._codes) > _maxlen: result = self._tidy_repr(_maxlen) elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > _maxlen, - name=True) + result = self._get_repr(length=len(self) > _maxlen) else: - result = '[], %s' % self._get_repr(name=True, - length=False, + result = '[], %s' % self._get_repr(length=False, footer=True, ).replace("\n",", ") @@ -1562,8 +1553,7 @@ def mode(self): import pandas.hashtable as htable good = self._codes != -1 result = Categorical(sorted(htable.mode_int64(_ensure_int64(self._codes[good]))), - categories=self.categories,ordered=self.ordered, name=self.name, - fastpath=True) + categories=self.categories,ordered=self.ordered, fastpath=True) return result def unique(self): @@ -1586,8 +1576,6 @@ def equals(self, other): """ Returns True if categorical arrays are equal. - The name of the `Categorical` is not compared! - Parameters ---------- other : `Categorical` @@ -1596,7 +1584,6 @@ def equals(self, other): ------- are_equal : boolean """ - # TODO: should this also test if name is equal? 
return self.is_dtype_equal(other) and np.array_equal(self._codes, other._codes) def is_dtype_equal(self, other): @@ -1647,7 +1634,7 @@ def repeat(self, repeats): """ codes = self._codes.repeat(repeats) return Categorical(values=codes, categories=self.categories, - ordered=self.ordered, name=self.name, fastpath=True) + ordered=self.ordered, fastpath=True) ##### The Series.cat accessor ##### @@ -1696,7 +1683,6 @@ def _delegate_method(self, name, *args, **kwargs): if not res is None: return Series(res, index=self.index) -# TODO: remove levels after the deprecation period CategoricalAccessor._add_delegate_accessors(delegate=Categorical, accessors=["categories", "ordered"], typ='property') diff --git a/pandas/core/format.py b/pandas/core/format.py index 4f0e57130006b..6a05f819908af 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -68,10 +68,9 @@ class CategoricalFormatter(object): def __init__(self, categorical, buf=None, length=True, - na_rep='NaN', name=False, footer=True): + na_rep='NaN', footer=True): self.categorical = categorical self.buf = buf if buf is not None else StringIO(u("")) - self.name = name self.na_rep = na_rep self.length = length self.footer = footer @@ -79,12 +78,6 @@ def __init__(self, categorical, buf=None, length=True, def _get_footer(self): footer = '' - if self.name: - name = com.pprint_thing(self.categorical.name, - escape_chars=('\t', '\r', '\n')) - footer += ('Name: %s' % name if self.categorical.name is not None - else '') - if self.length: if footer: footer += ', ' diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index df788f806eda6..8e1a18006bbaf 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1960,8 +1960,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self._group_index = CategoricalIndex(Categorical.from_codes(np.arange(len(c)), categories=c, ordered=self.grouper.ordered)) - if self.name is None: - self.name = self.grouper.name # a passed Grouper like elif 
isinstance(self.grouper, Grouper): diff --git a/pandas/core/index.py b/pandas/core/index.py index 98e0214dbf073..442f188267a58 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -4414,7 +4414,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): levels = [c.categories for c in cats] labels = [c.codes for c in cats] if names is None: - names = [c.name for c in cats] + names = [getattr(arr, "name", None) for arr in arrays] return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, names=names, diff --git a/pandas/core/series.py b/pandas/core/series.py index 062a32413286f..506aa1a6eb51e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -189,8 +189,6 @@ def __init__(self, data=None, index=None, dtype=None, name=None, elif isinstance(data, Categorical): if dtype is not None: raise ValueError("cannot specify a dtype with a Categorical") - if name is None: - name = data.name elif (isinstance(data, types.GeneratorType) or (compat.PY3 and isinstance(data, map))): data = list(data) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5f3ff794b4900..fdd20af6ab6ce 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -492,7 +492,7 @@ def test_print(self): def test_big_print(self): factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat', fastpath=True) expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", - "Name: cat, Length: 600", + "Length: 600", "Categories (3, object): [a, b, c]"] expected = "\n".join(expected) @@ -501,15 +501,11 @@ def test_big_print(self): self.assertEqual(actual, expected) def test_empty_print(self): - factor = Categorical([], ["a","b","c"], name="cat") - expected = ("[], Name: cat, Categories (3, object): [a, b, c]") - # hack because array_repr changed in numpy > 1.6.x - actual = repr(factor) - self.assertEqual(actual, expected) - factor = Categorical([], ["a","b","c"]) expected = ("[], Categories (3, object): [a, b, c]") + # 
hack because array_repr changed in numpy > 1.6.x actual = repr(factor) + self.assertEqual(actual, expected) self.assertEqual(expected, actual) factor = Categorical([], ["a","b","c"], ordered=True) @@ -523,9 +519,9 @@ def test_empty_print(self): def test_print_none_width(self): # GH10087 - a = pd.Series(pd.Categorical([1,2,3,4], name="a")) + a = pd.Series(pd.Categorical([1,2,3,4])) exp = u("0 1\n1 2\n2 3\n3 4\n" + - "Name: a, dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]") with option_context("display.width", None): self.assertEqual(exp, repr(a)) @@ -1170,6 +1166,13 @@ def test_deprecated_levels(self): self.assertFalse(LooseVersion(pd.__version__) >= '0.18') + def test_removed_names_produces_warning(self): + with tm.assert_produces_warning(UserWarning): + Categorical([0,1], name="a") + + with tm.assert_produces_warning(UserWarning): + Categorical.from_codes([1,2], ["a","b","c"], name="a") + def test_datetime_categorical_comparison(self): dt_cat = pd.Categorical(pd.date_range('2014-01-01', periods=3), ordered=True) self.assert_numpy_array_equal(dt_cat > dt_cat[0], [False, True, True]) @@ -1673,23 +1676,23 @@ def test_describe(self): self.assert_numpy_array_equal(res["cat"].values, res["s"].values) def test_repr(self): - a = pd.Series(pd.Categorical([1,2,3,4], name="a")) + a = pd.Series(pd.Categorical([1,2,3,4])) exp = u("0 1\n1 2\n2 3\n3 4\n" + - "Name: a, dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]") self.assertEqual(exp, a.__unicode__()) - a = pd.Series(pd.Categorical(["a","b"] *25, name="a")) + a = pd.Series(pd.Categorical(["a","b"] *25)) exp = u("0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + - "Name: a, dtype: category\nCategories (2, object): [a, b]") + "dtype: category\nCategories (2, object): [a, b]") with option_context("display.max_rows", 5): self.assertEqual(exp, repr(a)) levs = list("abcdefghijklmnopqrstuvwxyz") - a = 
pd.Series(pd.Categorical(["a","b"], name="a", categories=levs, ordered=True)) + a = pd.Series(pd.Categorical(["a","b"], categories=levs, ordered=True)) exp = u("0 a\n1 b\n" + - "Name: a, dtype: category\n" + "dtype: category\n" "Categories (26, object): [a < b < c < d ... w < x < y < z]") self.assertEqual(exp,a.__unicode__()) @@ -2202,8 +2205,8 @@ def test_slicing_doc_examples(self): tm.assert_series_equal(result, expected) result = df.loc["h":"j","cats"] - expected = Series(Categorical(['a','b','b'], name='cats', - categories=['a','b','c']), index=['h','i','j']) + expected = Series(Categorical(['a','b','b'], + categories=['a','b','c']), index=['h','i','j'], name='cats') tm.assert_series_equal(result, expected) result = df.ix["h":"j",0:1] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 91902aae3c835..a73f4e2939578 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3453,7 +3453,7 @@ def test_groupby_categorical(self): levels = ['foo', 'bar', 'baz', 'qux'] codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) + cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) @@ -3461,10 +3461,8 @@ def test_groupby_categorical(self): expected = data.groupby(np.asarray(cats)).mean() expected = expected.reindex(levels) - expected.index.name = 'myfactor' assert_frame_equal(result, expected) - self.assertEqual(result.index.name, cats.name) grouped = data.groupby(cats) desc_result = grouped.describe() @@ -3473,12 +3471,12 @@ def test_groupby_categorical(self): ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) expected = ord_data.groupby(ord_labels, sort=False).describe() - expected.index.names = ['myfactor', None] + expected.index.names = [None, None] assert_frame_equal(desc_result, expected) # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), levels, name='myfactor', ordered=True) 
- exp = CategoricalIndex(expc, name='myfactor') + expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) + exp = CategoricalIndex(expc) self.assert_index_equal(desc_result.index.get_level_values(0), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) self.assert_index_equal(desc_result.index.get_level_values(1), exp) @@ -3488,7 +3486,7 @@ def test_groupby_datetime_categorical(self): levels = pd.date_range('2014-01-01', periods=4) codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) + cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats).mean() @@ -3496,10 +3494,9 @@ def test_groupby_datetime_categorical(self): expected = data.groupby(np.asarray(cats)).mean() expected = expected.reindex(levels) expected.index = CategoricalIndex(expected.index, categories=expected.index, - name='myfactor', ordered=True) + ordered=True) assert_frame_equal(result, expected) - self.assertEqual(result.index.name, cats.name) grouped = data.groupby(cats) desc_result = grouped.describe() @@ -3508,14 +3505,14 @@ def test_groupby_datetime_categorical(self): ord_labels = cats.take_nd(idx) ord_data = data.take(idx) expected = ord_data.groupby(ord_labels).describe() - expected.index.names = ['myfactor', None] + expected.index.names = [None, None] assert_frame_equal(desc_result, expected) tm.assert_index_equal(desc_result.index, expected.index) tm.assert_index_equal(desc_result.index.get_level_values(0), expected.index.get_level_values(0)) # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), levels, name='myfactor', ordered=True) - exp = CategoricalIndex(expc, name='myfactor') + expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) + exp = CategoricalIndex(expc) self.assert_index_equal(desc_result.index.get_level_values(0), exp) exp = Index(['count', 'mean', 'std', 'min', 
'25%', '50%', '75%', 'max'] * 4) self.assert_index_equal(desc_result.index.get_level_values(1), exp) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index becea04c5dc98..cb6659af9eca5 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -692,9 +692,7 @@ def test_constructor_map(self): def test_constructor_categorical(self): cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'], fastpath=True) - cat.name = 'foo' res = Series(cat) - self.assertEqual(res.name, cat.name) self.assertTrue(res.values.equals(cat)) def test_constructor_maskedarray(self): diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 6830919d9c09f..416addfcf2ad5 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -217,7 +217,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, levels = np.asarray(levels, dtype=object) np.putmask(ids, na_mask, 0) - fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True) + fac = Categorical(ids - 1, levels, ordered=True, fastpath=True) else: fac = ids - 1 if has_nas: @@ -225,7 +225,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, np.putmask(fac, na_mask, np.nan) if x_is_series: - fac = Series(fac, index=series_index) + fac = Series(fac, index=series_index, name=name) if not retbins: return fac diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 9f75e42a8676a..97bae51b18248 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -630,9 +630,6 @@ def assert_categorical_equal(res, exp): if res.ordered != exp.ordered: raise AssertionError("ordered not the same") - if res.name != exp.name: - raise AssertionError("name not the same") - def assert_numpy_array_equal(np_array, assert_equal, err_msg=None): """Checks that 'np_array' is equal to 'assert_equal'