Skip to content

Commit

Permalink
ENH: Implement DataFrame.astype('category')
Browse files Browse the repository at this point in the history
  • Loading branch information
jschendel committed Nov 3, 2017
1 parent 27bbea7 commit 081d533
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 36 deletions.
56 changes: 56 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,16 @@ The categorical data type is useful in the following cases:

See also the :ref:`API docs on categoricals<api.categorical>`.

.. _categorical.objectcreation:

Object Creation
---------------

.. _categorical.objectcreation.series:

Creating categories from a ``Series``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Categorical `Series` or columns in a `DataFrame` can be created in several ways:

By specifying ``dtype="category"`` when constructing a `Series`:
Expand Down Expand Up @@ -143,6 +150,55 @@ constructor to save the factorize step during normal constructor mode:
splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
.. _categorical.objectcreation.frame:

Creating categories from a ``DataFrame``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. versionadded:: 0.22.0

:meth:`DataFrame.astype` supports simultaneously setting multiple columns as categorical. When setting multiple
columns as categorical, by default each column's dtype will contain categories for all labels present in all columns, even
if a column does not contain all labels:

.. ipython:: python
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
df = df.astype('category')
df
df['A'].dtype
df['B'].dtype
Note that this behavior is different from instantiating a ``DataFrame`` with categorical dtype, which will only assign
categories to each column based on the labels present in each column:

.. ipython:: python
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
df['A'].dtype
df['B'].dtype
When using ``astype``, you can control the categories that will be present in each column by passing
a ``CategoricalDtype``:

.. ipython:: python
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
dtype = CategoricalDtype(categories=list('abdef'), ordered=True)
df = df.astype(dtype)
df
df['A'].dtype
df['B'].dtype
Use subselection if you only want to convert certain columns to categorical. The same behaviors previously
discussed hold with subselection.

.. ipython:: python
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e'], 'C': ['x', 'y', 'z']})
df[['A', 'B']] = df[['A', 'B']].astype('category')
df.dtypes
.. _categorical.categoricaldtype:

CategoricalDtype
Expand Down
22 changes: 19 additions & 3 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,25 @@ version.
New features
~~~~~~~~~~~~

-
-
-
.. _whatsnew_0220.enhancements.astype_category:

``DataFrame.astype`` now supports categoricals
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`DataFrame.astype` now supports simultaneously setting multiple columns as categorical (:issue:`12860`).

When setting multiple columns as categorical, by default each column's dtype will contain categories for all
labels present in all columns, even if a column does not contain all labels:

.. ipython:: python

df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
df = df.astype('category')
df
df['A'].dtype
df['B'].dtype

See the :ref:`categorical.objectcreation.frame` section of the documentation for more details and examples.

.. _whatsnew_0220.enhancements.other:

Expand Down
24 changes: 21 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@
is_number,
is_integer, is_bool,
is_bool_dtype,
is_categorical_dtype,
is_numeric_dtype,
is_datetime64_dtype,
is_timedelta64_dtype,
is_datetime64tz_dtype,
is_list_like,
is_dict_like,
is_re_compilable,
pandas_dtype)
pandas_dtype,
CategoricalDtype)
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
from pandas.core.dtypes.missing import isna, notna
from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
Expand Down Expand Up @@ -3973,14 +3975,30 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
if col_name not in self:
raise KeyError('Only a column name can be used for the '
'key in a dtype mappings argument.')
from pandas import concat
results = []
for col_name, col in self.iteritems():
if col_name in dtype:
results.append(col.astype(dtype[col_name], copy=copy))
else:
results.append(results.append(col.copy() if copy else col))
return concat(results, axis=1, copy=False)
return pd.concat(results, axis=1, copy=False)

elif is_categorical_dtype(dtype) and self.ndim > 1:
# GH 12860
dtype_with_cat = (isinstance(dtype, CategoricalDtype) and
dtype.categories is not None)
if not dtype_with_cat:
categories = kwargs.get('categories', None)
ordered = (kwargs.get('ordered', None) or
getattr(dtype, 'ordered', None))

if categories is None:
categories = algos.unique(self.values.ravel(order='F'))

dtype = CategoricalDtype(categories, ordered)

results = (self[col].astype(dtype, copy=copy) for col in self)
return pd.concat(results, axis=1, copy=False)

# else, only a single dtype is given
new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
Expand Down
95 changes: 65 additions & 30 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2085,51 +2085,86 @@ def test_basic(self):
result = x.person_name.loc[0]
assert result == expected

def test_creation_astype(self):
l = ["a", "b", "c", "a"]
s = pd.Series(l)
exp = pd.Series(Categorical(l))
res = s.astype('category')
def test_series_creation_astype(self):
labels = list('abca')
exp = Series(Categorical(labels))
res = Series(labels).astype('category')
tm.assert_series_equal(res, exp)

l = [1, 2, 3, 1]
s = pd.Series(l)
exp = pd.Series(Categorical(l))
res = s.astype('category')
labels = [1, 2, 3, 1]
exp = Series(Categorical(labels))
res = Series(labels).astype('category')
tm.assert_series_equal(res, exp)

df = pd.DataFrame({"cats": [1, 2, 3, 4, 5, 6],
"vals": [1, 2, 3, 4, 5, 6]})
cats = Categorical([1, 2, 3, 4, 5, 6])
exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)
labels_int = [1, 2, 3, 4, 5, 6]
exp = DataFrame({"cats": Categorical(labels_int), "vals": labels_int})
res = DataFrame({"cats": labels_int, "vals": labels_int})
res["cats"] = res["cats"].astype("category")
tm.assert_frame_equal(res, exp)

df = pd.DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
"vals": [1, 2, 3, 4, 5, 6]})
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
df["cats"] = df["cats"].astype("category")
tm.assert_frame_equal(exp_df, df)
labels_str = list('abbaad')
exp = DataFrame({"cats": Categorical(labels_str), "vals": labels_int})
res = DataFrame({"cats": labels_str, "vals": labels_int})
res["cats"] = res["cats"].astype("category")
tm.assert_frame_equal(res, exp)

# with keywords
l = ["a", "b", "c", "a"]
s = pd.Series(l)
exp = pd.Series(Categorical(l, ordered=True))
labels = list('abca')
s = Series(labels)
exp = Series(Categorical(labels, ordered=True))
res = s.astype(CategoricalDtype(None, ordered=True))
tm.assert_series_equal(res, exp)

exp = pd.Series(Categorical(
l, categories=list('abcdef'), ordered=True))
res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
cats = list('abcdef')
exp = Series(Categorical(labels, categories=cats, ordered=True))
res = s.astype(CategoricalDtype(cats, ordered=True))
tm.assert_series_equal(res, exp)

def test_frame_creation_astype(self):
# Checks frame-wide astype to a categorical dtype against expected frames
# whose columns are Categoricals built with one shared category list.
# GH 12860
# Expected: both columns carry 'abcde', the combined labels of x and y.
cats = list('abcde')
x = Categorical(list('abcd'), categories=cats)
y = Categorical(list('bcde'), categories=cats)
exp = DataFrame({'x': x, 'y': y})

data = {'x': list('abcd'), 'y': list('bcde')}
res = DataFrame(data).astype('category')
tm.assert_frame_equal(res, exp)

# A CategoricalDtype without categories is compared to the same expected
# frame as the 'category' string above.
res = DataFrame(data).astype(CategoricalDtype())
tm.assert_frame_equal(res, exp)

# categories keyword
# 'c' is absent from cats, so the expected columns hold NaN where the
# input data has 'c'.
cats = list('abdef')
x = Categorical(['a', 'b', np.nan, 'd'], categories=cats)
y = Categorical(['b', np.nan, 'd', 'e'], categories=cats)
exp = DataFrame({'x': x, 'y': y})

res = DataFrame(data).astype('category', categories=cats)
tm.assert_frame_equal(res, exp)

res = DataFrame(data).astype(CategoricalDtype(categories=cats))
tm.assert_frame_equal(res, exp)

# ordered keyword
# Expected categories list the labels of column x (1..4) followed by the
# one label that only column y contributes (0); both columns are ordered.
cats = [1, 2, 3, 4, 0]
x = Categorical(range(1, 5), categories=cats, ordered=True)
y = Categorical(range(4), categories=cats, ordered=True)
exp = DataFrame({'x': x, 'y': y})

data = {'x': range(1, 5), 'y': range(4)}
res = DataFrame(data).astype('category', ordered=True)
tm.assert_frame_equal(res, exp)

res = DataFrame(data).astype(CategoricalDtype(ordered=True))
tm.assert_frame_equal(res, exp)

@pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
def test_empty_astype(self, columns):
# GH 18004
msg = '> 1 ndim Categorical are not supported at this time'
with tm.assert_raises_regex(NotImplementedError, msg):
DataFrame(columns=columns).astype('category')
exp = DataFrame({c: Categorical([]) for c in columns}, index=[])
res = DataFrame(columns=columns).astype('category')
tm.assert_frame_equal(res, exp)

def test_construction_series(self):

Expand Down

0 comments on commit 081d533

Please sign in to comment.