BUG/DEPR: Categorical: keep dtype in MultiIndex (#13743), deprecate .…

…from_array Now, categorical dtype is preserved also in `groupby`, `set_index`, `stack`, `get_dummies`, and `make_axis_dummies`. closes #13743 closes #13854
pandas-dev · Sep 2, 2016 · d26363b · d26363b
1 parent ccec504
commit d26363b
Show file tree

Hide file tree

Showing 14 changed files with 370 additions and 91 deletions.
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -977,6 +977,79 @@ New Behavior:
    pd.Index([1, 2, 3]).unique()
    pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique()
 
+.. _whatsnew_0190.api.multiindex:
+
+``MultiIndex`` constructors preserve categorical dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``MultiIndex.from_arrays`` and ``MultiIndex.from_product`` will now preserve categorical dtype
+in ``MultiIndex`` levels. (:issue:`13743`, :issue:`13854`)
+
+.. ipython:: python
+
+   cat = pd.Categorical(['a', 'b'], categories=list("bac"))
+   lvl1 = ['foo', 'bar']
+   midx = pd.MultiIndex.from_arrays([cat, lvl1])
+   midx
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [4]: midx.levels[0]
+   Out[4]: Index(['b', 'a', 'c'], dtype='object')
+
+   In [5]: midx.get_level_values(0)
+   Out[5]: Index(['a', 'b'], dtype='object')
+
+New Behavior:
+
+.. ipython:: python
+
+   midx.levels[0]
+   midx.get_level_values(0)
+
+An analogous change has been made to ``MultiIndex.from_product``.
+As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes in indexes.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': [0, 1], 'B': [10, 11], 'C': cat})
+   df_grouped = df.groupby(by=['A', 'C']).first()
+   df_set_idx = df.set_index(['A', 'C'])
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [11]: df_grouped.index.levels[1]
+   Out[11]: Index(['b', 'a', 'c'], dtype='object', name='C')
+   In [12]: df_grouped.reset_index().dtypes
+   Out[12]:
+   A      int64
+   C     object
+   B    float64
+   dtype: object
+
+   In [13]: df_set_idx.index.levels[1]
+   Out[13]: Index(['b', 'a', 'c'], dtype='object', name='C')
+   In [14]: df_set_idx.reset_index().dtypes
+   Out[14]:
+   A      int64
+   C     object
+   B      int64
+   dtype: object
+
+New Behavior:
+
+.. ipython:: python
+
+   df_grouped.index.levels[1]
+   df_grouped.reset_index().dtypes
+
+   df_set_idx.index.levels[1]
+   df_set_idx.reset_index().dtypes
+
 .. _whatsnew_0190.api.autogenerated_chunksize_index:
 
 ``read_csv`` will progressively enumerate chunks
@@ -1173,7 +1246,7 @@ Deprecations
 - ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package <http://xarray.pydata.org/en/stable/>`__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. (:issue:`13564`)
 - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use  ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead. (:issue:`13874`)
 - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq``. (:issue:`13874`)
-
+- ``Categorical.from_array`` has been deprecated and will be removed in a future version (:issue:`13854`)
 
 .. _whatsnew_0190.prior_deprecations:
 
@@ -1388,3 +1461,4 @@ Bug Fixes
 - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment.
 
 - Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`)
+- Bug in ``stack``, ``get_dummies`` and ``make_axis_dummies`` where categorical dtypes were not preserved in (multi)indexes (:issue:`13854`)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -5,7 +5,7 @@
 import types
 
 from pandas import compat, lib
-from pandas.compat import u
+from pandas.compat import u, lzip
 
 from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex
 from pandas.types.missing import isnull, notnull
@@ -17,6 +17,7 @@
                                  _ensure_platform_int,
                                  is_dtype_equal,
                                  is_datetimelike,
+                                 is_categorical,
                                  is_categorical_dtype,
                                  is_integer_dtype, is_bool,
                                  is_list_like, is_sequence,
@@ -411,6 +412,8 @@ def base(self):
     @classmethod
     def from_array(cls, data, **kwargs):
         """
+        DEPRECATED: Use ``Categorical`` instead.
+
         Make a Categorical type from a single array-like object.
 
         For internal compatibility with numpy arrays.
@@ -421,6 +424,8 @@ def from_array(cls, data, **kwargs):
             Can be an Index or array-like. The categories are assumed to be
             the unique values of `data`.
         """
+        warn("Categorical.from_array is deprecated, use Categorical instead",
+             FutureWarning, stacklevel=2)
         return cls(data, **kwargs)
 
     @classmethod
@@ -1959,3 +1964,47 @@ def _convert_to_list_like(list_like):
     else:
         # is this reached?
         return [list_like]
+
+
+def _factorize_from_iterable(values):
+    """
+    Factorize an input `values` into `categories` and `codes`. Preserves
+    categorical dtype in `categories`.
+
+    *This is an internal function*
+
+    Parameters
+    ----------
+    values : list-like
+
+    Returns
+    -------
+    codes : np.array
+    categories : Index
+        If `values` has a categorical dtype, then `categories` is
+        a CategoricalIndex keeping the categories and order of `values`.
+    """
+    from pandas.indexes.category import CategoricalIndex
+
+    if is_categorical(values):
+        if isinstance(values, (ABCCategoricalIndex, ABCSeries)):
+            values = values._values
+        categories = CategoricalIndex(values.categories,
+                                      categories=values.categories,
+                                      ordered=values.ordered)
+        codes = values.codes
+    else:
+        cat = Categorical(values, ordered=True)
+        categories = cat.categories
+        codes = cat.codes
+    return codes, categories
+
+
+def _factorize_from_iterables(iterables):
+    """
+    A higher-level wrapper over `_factorize_from_iterable`.
+    See `_factorize_from_iterable` for more info.
+
+    *This is an internal function*
+    """
+    return lzip(*[_factorize_from_iterable(it) for it in iterables])
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
@@ -21,7 +21,6 @@
 from pandas import compat
 from pandas.compat import (map, zip, range, u, OrderedDict, OrderedDefaultdict)
 from pandas.compat.numpy import function as nv
-from pandas.core.categorical import Categorical
 from pandas.core.common import PandasError, _try_sort, _default_index
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame, _shared_docs
@@ -103,13 +102,7 @@ def panel_index(time, panels, names=None):
     if names is None:
         names = ['time', 'panel']
     time, panels = _ensure_like_indices(time, panels)
-    time_factor = Categorical.from_array(time, ordered=True)
-    panel_factor = Categorical.from_array(panels, ordered=True)
-
-    labels = [time_factor.codes, panel_factor.codes]
-    levels = [time_factor.categories, panel_factor.categories]
-    return MultiIndex(levels, labels, sortorder=None, names=names,
-                      verify_integrity=False)
+    return MultiIndex.from_arrays([time, panels], sortorder=None, names=names)
 
 
 class Panel(NDFrame):

diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -18,7 +18,7 @@
 from pandas.sparse.array import SparseArray
 from pandas._sparse import IntIndex
 
-from pandas.core.categorical import Categorical
+from pandas.core.categorical import Categorical, _factorize_from_iterable
 from pandas.core.groupby import get_group_index, _compress_group_index
 
 import pandas.core.algorithms as algos
@@ -166,9 +166,8 @@ def get_result(self):
         if self.is_categorical is not None:
             categories = self.is_categorical.categories
             ordered = self.is_categorical.ordered
-            values = [Categorical.from_array(values[:, i],
-                                             categories=categories,
-                                             ordered=ordered)
+            values = [Categorical(values[:, i], categories=categories,
+                                  ordered=ordered)
                       for i in range(values.shape[-1])]
 
         return DataFrame(values, index=index, columns=columns)
@@ -471,8 +470,8 @@ def stack(frame, level=-1, dropna=True):
     def factorize(index):
         if index.is_unique:
             return index, np.arange(len(index))
-        cat = Categorical(index, ordered=True)
-        return cat.categories, cat.codes
+        codes, categories = _factorize_from_iterable(index)
+        return categories, codes
 
     N, K = frame.shape
     if isinstance(frame.columns, MultiIndex):
@@ -1107,8 +1106,7 @@ def check_len(item, name):
 def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                     sparse=False, drop_first=False):
     # Series avoids inconsistent NaN handling
-    cat = Categorical.from_array(Series(data), ordered=True)
-    levels = cat.categories
+    codes, levels = _factorize_from_iterable(Series(data))
 
     def get_empty_Frame(data, sparse):
         if isinstance(data, Series):
@@ -1124,10 +1122,10 @@ def get_empty_Frame(data, sparse):
     if not dummy_na and len(levels) == 0:
         return get_empty_Frame(data, sparse)
 
-    codes = cat.codes.copy()
+    codes = codes.copy()
     if dummy_na:
-        codes[codes == -1] = len(cat.categories)
-        levels = np.append(cat.categories, np.nan)
+        codes[codes == -1] = len(levels)
+        levels = np.append(levels, np.nan)
 
     # if dummy_na, we just fake a nan level. drop_first will drop it again
     if drop_first and len(levels) == 1:
@@ -1212,9 +1210,7 @@ def make_axis_dummies(frame, axis='minor', transform=None):
     labels = frame.index.labels[num]
     if transform is not None:
         mapped_items = items.map(transform)
-        cat = Categorical.from_array(mapped_items.take(labels), ordered=True)
-        labels = cat.codes
-        items = cat.categories
+        labels, items = _factorize_from_iterable(mapped_items.take(labels))
 
     values = np.eye(len(items), dtype=float)
     values = values.take(labels, axis=0)

diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
@@ -852,8 +852,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
         MultiIndex.from_product : Make a MultiIndex from cartesian product
                                   of iterables
         """
-        from pandas.core.categorical import Categorical
-
         if len(arrays) == 1:
             name = None if names is None else names[0]
             return Index(arrays[0], name=name)
@@ -864,9 +862,9 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
             if len(arrays[i]) != len(arrays[i - 1]):
                 raise ValueError('all arrays must be same length')
 
-        cats = [Categorical.from_array(arr, ordered=True) for arr in arrays]
-        levels = [c.categories for c in cats]
-        labels = [c.codes for c in cats]
+        from pandas.core.categorical import _factorize_from_iterables
+
+        labels, levels = _factorize_from_iterables(arrays)
         if names is None:
             names = [getattr(arr, "name", None) for arr in arrays]
 
@@ -952,15 +950,14 @@ def from_product(cls, iterables, sortorder=None, names=None):
         MultiIndex.from_arrays : Convert list of arrays to MultiIndex
         MultiIndex.from_tuples : Convert list of tuples to MultiIndex
         """
-        from pandas.core.categorical import Categorical
+        from pandas.core.categorical import _factorize_from_iterables
         from pandas.tools.util import cartesian_product
 
-        categoricals = [Categorical.from_array(it, ordered=True)
-                        for it in iterables]
-        labels = cartesian_product([c.codes for c in categoricals])
+        labels, levels = _factorize_from_iterables(iterables)
+        labels = cartesian_product(labels)
 
-        return MultiIndex(levels=[c.categories for c in categoricals],
-                          labels=labels, sortorder=sortorder, names=names)
+        return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
+                          names=names)
 
     @property
     def nlevels(self):

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -37,7 +37,7 @@
 from pandas.formats.printing import adjoin, pprint_thing
 from pandas.core.common import _asarray_tuplesafe, PerformanceWarning
 from pandas.core.algorithms import match, unique
-from pandas.core.categorical import Categorical
+from pandas.core.categorical import Categorical, _factorize_from_iterables
 from pandas.core.internals import (BlockManager, make_block,
                                    _block2d_to_blocknd,
                                    _factor_indexer, _block_shape)
@@ -3736,11 +3736,12 @@ def read(self, where=None, columns=None, **kwargs):
         if not self.read_axes(where=where, **kwargs):
             return None
 
-        factors = [Categorical.from_array(
-            a.values, ordered=True) for a in self.index_axes]
-        levels = [f.categories for f in factors]
-        N = [len(f.categories) for f in factors]
-        labels = [f.codes for f in factors]
+        lst_vals = [a.values for a in self.index_axes]
+        labels, levels = _factorize_from_iterables(lst_vals)
+        # labels and levels are tuples but lists are expected
+        labels = list(labels)
+        levels = list(levels)
+        N = [len(lvl) for lvl in levels]
 
         # compute the key
         key = _factor_indexer(N[1:], labels)

diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -664,3 +664,18 @@ def test_assign_columns(self):
         frame.columns = ['foo', 'bar', 'baz', 'quux', 'foo2']
         assert_series_equal(self.frame['C'], frame['baz'], check_names=False)
         assert_series_equal(self.frame['hi'], frame['foo2'], check_names=False)
+
+    def test_set_index_preserve_categorical_dtype(self):
+        # GH13743, GH13854
+        df = DataFrame({'A': [1, 2, 1, 1, 2],
+                        'B': [10, 16, 22, 28, 34],
+                        'C1': pd.Categorical(list("abaab"),
+                                             categories=list("bac"),
+                                             ordered=False),
+                        'C2': pd.Categorical(list("abaab"),
+                                             categories=list("bac"),
+                                             ordered=True)})
+        for cols in ['C1', 'C2', ['A', 'C1'], ['A', 'C2'], ['C1', 'C2']]:
+            result = df.set_index(cols).reset_index()
+            result = result.reindex(columns=df.columns)
+            tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
@@ -707,3 +707,19 @@ def _test_stack_with_multiindex(multiindex):
                              columns=Index(['B', 'C'], name='Upper'),
                              dtype=df.dtypes[0])
         assert_frame_equal(result, expected)
+
+    def test_stack_preserve_categorical_dtype(self):
+        # GH13854
+        for ordered in [False, True]:
+            for labels in [list("yxz"), list("yxy")]:
+                cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
+                                           ordered=ordered)
+                df = DataFrame([[10, 11, 12]], columns=cidx)
+                result = df.stack()
+
+                # `MultiIndex.from_product` preserves categorical dtype -
+                # it's tested elsewhere.
+                midx = pd.MultiIndex.from_product([df.index, cidx])
+                expected = Series([10, 11, 12], index=midx)
+
+                tm.assert_series_equal(result, expected)