From 081d5336b4ec4f4bb0ad9ee056319cf253eb6842 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 3 Nov 2017 16:36:37 -0600 Subject: [PATCH] ENH: Implement DataFrame.astype('category') --- doc/source/categorical.rst | 56 +++++++++++++++++++ doc/source/whatsnew/v0.22.0.txt | 22 +++++++- pandas/core/generic.py | 24 +++++++- pandas/tests/test_categorical.py | 95 ++++++++++++++++++++++---------- 4 files changed, 161 insertions(+), 36 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 2acc919d1fbdf1..614ba5565d0bb4 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -45,9 +45,16 @@ The categorical data type is useful in the following cases: See also the :ref:`API docs on categoricals`. +.. _categorical.objectcreation: + Object Creation --------------- +.. _categorical.objectcreation.series: + +Creating categories from a ``Series`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + Categorical `Series` or columns in a `DataFrame` can be created in several ways: By specifying ``dtype="category"`` when constructing a `Series`: @@ -143,6 +150,55 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. _categorical.objectcreation.frame: + +Creating categories from a ``DataFrame`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 0.22.0 + +:meth:`DataFrame.astype` supports simultaneously setting multiple columns as categorical. When setting multiple +columns as categorical, by default each column's dtype will contain categories for all labels present in all columns, even +if a column does not contain all labels: + +.. 
ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}) + df = df.astype('category') + df + df['A'].dtype + df['B'].dtype + +Note that this behavior is different from instantiating a ``DataFrame`` with categorical dtype, which will only assign +categories to each column based on the labels present in each column: + +.. ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category') + df['A'].dtype + df['B'].dtype + +When using ``astype``, you can control the categories that will be present in each column by passing +a ``CategoricalDtype``: + +.. ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}) + dtype = CategoricalDtype(categories=list('abdef'), ordered=True) + df = df.astype(dtype) + df + df['A'].dtype + df['B'].dtype + +Use subselection if you only want to convert certain columns to categorical. The same behaviors previously +discussed hold with subselection. + +.. ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e'], 'C': ['x', 'y', 'z']}) + df[['A', 'B']] = df[['A', 'B']].astype('category') + df.dtypes + .. _categorical.categoricaldtype: CategoricalDtype diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c41da4d67afe5b..a331f076d06827 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -13,9 +13,25 @@ version. New features ~~~~~~~~~~~~ -- -- -- +.. _whatsnew_0220.enhancements.astype_category: + +``DataFrame.astype`` now supports categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`DataFrame.astype` now supports simultaneously setting multiple columns as categorical (:issue:`12860`) + +When setting multiple columns as categorical, by default each column's dtype will contain categories for all +labels present in all columns, even if a column does not contain all labels: + +.. 
ipython:: python + + df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}) + df = df.astype('category') + df + df['A'].dtype + df['B'].dtype + +See the :ref:`categorical.objectcreation.frame` section of the documentation for more details and examples. .. _whatsnew_0220.enhancements.other: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 14bf9710fca6a4..22d8166021266e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -18,6 +18,7 @@ is_number, is_integer, is_bool, is_bool_dtype, + is_categorical_dtype, is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, @@ -25,7 +26,8 @@ is_list_like, is_dict_like, is_re_compilable, - pandas_dtype) + pandas_dtype, + CategoricalDtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame @@ -3973,14 +3975,30 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): if col_name not in self: raise KeyError('Only a column name can be used for the ' 'key in a dtype mappings argument.') - from pandas import concat results = [] for col_name, col in self.iteritems(): if col_name in dtype: results.append(col.astype(dtype[col_name], copy=copy)) else: results.append(results.append(col.copy() if copy else col)) - return concat(results, axis=1, copy=False) + return pd.concat(results, axis=1, copy=False) + + elif is_categorical_dtype(dtype) and self.ndim > 1: + # GH 12860 + dtype_with_cat = (isinstance(dtype, CategoricalDtype) and + dtype.categories is not None) + if not dtype_with_cat: + categories = kwargs.get('categories', None) + ordered = (kwargs.get('ordered', None) or + getattr(dtype, 'ordered', None)) + + if categories is None: + categories = algos.unique(self.values.ravel(order='F')) + + dtype = CategoricalDtype(categories, ordered) + + results = (self[col].astype(dtype, copy=copy) for col in self) + return pd.concat(results, axis=1, 
copy=False) # else, only a single dtype is given new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 6366aae8ccdf6c..7dbb64d343cb69 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2085,51 +2085,86 @@ def test_basic(self): result = x.person_name.loc[0] assert result == expected - def test_creation_astype(self): - l = ["a", "b", "c", "a"] - s = pd.Series(l) - exp = pd.Series(Categorical(l)) - res = s.astype('category') + def test_series_creation_astype(self): + labels = list('abca') + exp = Series(Categorical(labels)) + res = Series(labels).astype('category') tm.assert_series_equal(res, exp) - l = [1, 2, 3, 1] - s = pd.Series(l) - exp = pd.Series(Categorical(l)) - res = s.astype('category') + labels = [1, 2, 3, 1] + exp = Series(Categorical(labels)) + res = Series(labels).astype('category') tm.assert_series_equal(res, exp) - df = pd.DataFrame({"cats": [1, 2, 3, 4, 5, 6], - "vals": [1, 2, 3, 4, 5, 6]}) - cats = Categorical([1, 2, 3, 4, 5, 6]) - exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) - df["cats"] = df["cats"].astype("category") - tm.assert_frame_equal(exp_df, df) + labels_int = [1, 2, 3, 4, 5, 6] + exp = DataFrame({"cats": Categorical(labels_int), "vals": labels_int}) + res = DataFrame({"cats": labels_int, "vals": labels_int}) + res["cats"] = res["cats"].astype("category") + tm.assert_frame_equal(res, exp) - df = pd.DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'], - "vals": [1, 2, 3, 4, 5, 6]}) - cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd']) - exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) - df["cats"] = df["cats"].astype("category") - tm.assert_frame_equal(exp_df, df) + labels_str = list('abbaad') + exp = DataFrame({"cats": Categorical(labels_str), "vals": labels_int}) + res = DataFrame({"cats": labels_str, "vals": labels_int}) + res["cats"] = res["cats"].astype("category") 
+ tm.assert_frame_equal(res, exp) # with keywords - l = ["a", "b", "c", "a"] - s = pd.Series(l) - exp = pd.Series(Categorical(l, ordered=True)) + labels = list('abca') + s = Series(labels) + exp = Series(Categorical(labels, ordered=True)) res = s.astype(CategoricalDtype(None, ordered=True)) tm.assert_series_equal(res, exp) - exp = pd.Series(Categorical( - l, categories=list('abcdef'), ordered=True)) - res = s.astype(CategoricalDtype(list('abcdef'), ordered=True)) + cats = list('abcdef') + exp = Series(Categorical(labels, categories=cats, ordered=True)) + res = s.astype(CategoricalDtype(cats, ordered=True)) tm.assert_series_equal(res, exp) + def test_frame_creation_astype(self): + # GH 12860 + cats = list('abcde') + x = Categorical(list('abcd'), categories=cats) + y = Categorical(list('bcde'), categories=cats) + exp = DataFrame({'x': x, 'y': y}) + + data = {'x': list('abcd'), 'y': list('bcde')} + res = DataFrame(data).astype('category') + tm.assert_frame_equal(res, exp) + + res = DataFrame(data).astype(CategoricalDtype()) + tm.assert_frame_equal(res, exp) + + # categories keyword + cats = list('abdef') + x = Categorical(['a', 'b', np.nan, 'd'], categories=cats) + y = Categorical(['b', np.nan, 'd', 'e'], categories=cats) + exp = DataFrame({'x': x, 'y': y}) + + res = DataFrame(data).astype('category', categories=cats) + tm.assert_frame_equal(res, exp) + + res = DataFrame(data).astype(CategoricalDtype(categories=cats)) + tm.assert_frame_equal(res, exp) + + # ordered keyword + cats = [1, 2, 3, 4, 0] + x = Categorical(range(1, 5), categories=cats, ordered=True) + y = Categorical(range(4), categories=cats, ordered=True) + exp = DataFrame({'x': x, 'y': y}) + + data = {'x': range(1, 5), 'y': range(4)} + res = DataFrame(data).astype('category', ordered=True) + tm.assert_frame_equal(res, exp) + + res = DataFrame(data).astype(CategoricalDtype(ordered=True)) + tm.assert_frame_equal(res, exp) + @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']]) def 
test_empty_astype(self, columns): # GH 18004 - msg = '> 1 ndim Categorical are not supported at this time' - with tm.assert_raises_regex(NotImplementedError, msg): - DataFrame(columns=columns).astype('category') + exp = DataFrame({c: Categorical([]) for c in columns}, index=[]) + res = DataFrame(columns=columns).astype('category') + tm.assert_frame_equal(res, exp) def test_construction_series(self):