diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 38a90ac371b16..777bc01e71833 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -977,6 +977,79 @@ New Behavior: pd.Index([1, 2, 3]).unique() pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() +.. _whatsnew_0190.api.multiindex: + +``MultiIndex`` constructors preserve categorical dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``MultiIndex.from_arrays`` and ``MultiIndex.from_product`` will now preserve categorical dtype +in ``MultiIndex`` levels. (:issue:`13743`, :issue:`13854`) + +.. ipython:: python + + cat = pd.Categorical(['a', 'b'], categories=list("bac")) + lvl1 = ['foo', 'bar'] + midx = pd.MultiIndex.from_arrays([cat, lvl1]) + midx + +Previous Behavior: + +.. code-block:: ipython + + In [4]: midx.levels[0] + Out[4]: Index(['b', 'a', 'c'], dtype='object') + + In [5]: midx.get_level_values(0) + Out[5]: Index(['a', 'b'], dtype='object') + +New Behavior: + +.. ipython:: python + + midx.levels[0] + midx.get_level_values(0) + +An analogous change has been made to ``MultiIndex.from_product``. +As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes in indexes. + +.. ipython:: python + + df = pd.DataFrame({'A': [0, 1], 'B': [10, 11], 'C': cat}) + df_grouped = df.groupby(by=['A', 'C']).first() + df_set_idx = df.set_index(['A', 'C']) + +Previous Behavior: + +.. code-block:: ipython + + In [11]: df_grouped.index.levels[1] + Out[11]: Index(['b', 'a', 'c'], dtype='object', name='C') + In [12]: df_grouped.reset_index().dtypes + Out[12]: + A int64 + C object + B float64 + dtype: object + + In [13]: df_set_idx.index.levels[1] + Out[13]: Index(['b', 'a', 'c'], dtype='object', name='C') + In [14]: df_set_idx.reset_index().dtypes + Out[14]: + A int64 + C object + B int64 + dtype: object + +New Behavior: + +.. 
ipython:: python + + df_grouped.index.levels[1] + df_grouped.reset_index().dtypes + + df_set_idx.index.levels[1] + df_set_idx.reset_index().dtypes + .. _whatsnew_0190.api.autogenerated_chunksize_index: ``read_csv`` will progressively enumerate chunks @@ -1173,7 +1246,7 @@ Deprecations - ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. (:issue:`13564`) - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead. (:issue:`13874`) - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq``. (:issue:`13874`) - +- ``Categorical.from_array`` has been deprecated and will be removed in a future version (:issue:`13854`) .. _whatsnew_0190.prior_deprecations: @@ -1388,3 +1461,4 @@ Bug Fixes - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. 
- Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue`14095`) +- Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 2c89e4c05c633..48054c5bd34fa 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -5,7 +5,7 @@ import types from pandas import compat, lib -from pandas.compat import u +from pandas.compat import u, lzip from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex from pandas.types.missing import isnull, notnull @@ -17,6 +17,7 @@ _ensure_platform_int, is_dtype_equal, is_datetimelike, + is_categorical, is_categorical_dtype, is_integer_dtype, is_bool, is_list_like, is_sequence, @@ -411,6 +412,8 @@ def base(self): @classmethod def from_array(cls, data, **kwargs): """ + DEPRECATED: Use ``Categorical`` instead. + Make a Categorical type from a single array-like object. For internal compatibility with numpy arrays. @@ -421,6 +424,8 @@ def from_array(cls, data, **kwargs): Can be an Index or array-like. The categories are assumed to be the unique values of `data`. """ + warn("Categorical.from_array is deprecated, use Categorical instead", + FutureWarning, stacklevel=2) return cls(data, **kwargs) @classmethod @@ -1959,3 +1964,47 @@ def _convert_to_list_like(list_like): else: # is this reached? return [list_like] + + +def _factorize_from_iterable(values): + """ + Factorize an input `values` into `categories` and `codes`. Preserves + categorical dtype in `categories`. + + *This is an internal function* + + Parameters + ---------- + values : list-like + + Returns + ------- + codes : np.array + categories : Index + If `values` has a categorical dtype, then `categories` is + a CategoricalIndex keeping the categories and order of `values`. 
+ """ + from pandas.indexes.category import CategoricalIndex + + if is_categorical(values): + if isinstance(values, (ABCCategoricalIndex, ABCSeries)): + values = values._values + categories = CategoricalIndex(values.categories, + categories=values.categories, + ordered=values.ordered) + codes = values.codes + else: + cat = Categorical(values, ordered=True) + categories = cat.categories + codes = cat.codes + return codes, categories + + +def _factorize_from_iterables(iterables): + """ + A higher-level wrapper over `_factorize_from_iterable`. + See `_factorize_from_iterable` for more info. + + *This is an internal function* + """ + return lzip(*[_factorize_from_iterable(it) for it in iterables]) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index b2f318d825db6..f708774dd84ff 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -21,7 +21,6 @@ from pandas import compat from pandas.compat import (map, zip, range, u, OrderedDict, OrderedDefaultdict) from pandas.compat.numpy import function as nv -from pandas.core.categorical import Categorical from pandas.core.common import PandasError, _try_sort, _default_index from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs @@ -103,13 +102,7 @@ def panel_index(time, panels, names=None): if names is None: names = ['time', 'panel'] time, panels = _ensure_like_indices(time, panels) - time_factor = Categorical.from_array(time, ordered=True) - panel_factor = Categorical.from_array(panels, ordered=True) - - labels = [time_factor.codes, panel_factor.codes] - levels = [time_factor.categories, panel_factor.categories] - return MultiIndex(levels, labels, sortorder=None, names=names, - verify_integrity=False) + return MultiIndex.from_arrays([time, panels], sortorder=None, names=names) class Panel(NDFrame): diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index b451f49fce78c..4dec8b4106126 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -18,7 +18,7 
@@ from pandas.sparse.array import SparseArray from pandas._sparse import IntIndex -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, _factorize_from_iterable from pandas.core.groupby import get_group_index, _compress_group_index import pandas.core.algorithms as algos @@ -166,9 +166,8 @@ def get_result(self): if self.is_categorical is not None: categories = self.is_categorical.categories ordered = self.is_categorical.ordered - values = [Categorical.from_array(values[:, i], - categories=categories, - ordered=ordered) + values = [Categorical(values[:, i], categories=categories, + ordered=ordered) for i in range(values.shape[-1])] return DataFrame(values, index=index, columns=columns) @@ -471,8 +470,8 @@ def stack(frame, level=-1, dropna=True): def factorize(index): if index.is_unique: return index, np.arange(len(index)) - cat = Categorical(index, ordered=True) - return cat.categories, cat.codes + codes, categories = _factorize_from_iterable(index) + return categories, codes N, K = frame.shape if isinstance(frame.columns, MultiIndex): @@ -1107,8 +1106,7 @@ def check_len(item, name): def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling - cat = Categorical.from_array(Series(data), ordered=True) - levels = cat.categories + codes, levels = _factorize_from_iterable(Series(data)) def get_empty_Frame(data, sparse): if isinstance(data, Series): @@ -1124,10 +1122,10 @@ def get_empty_Frame(data, sparse): if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) - codes = cat.codes.copy() + codes = codes.copy() if dummy_na: - codes[codes == -1] = len(cat.categories) - levels = np.append(cat.categories, np.nan) + codes[codes == -1] = len(levels) + levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. 
drop_first will drop it again if drop_first and len(levels) == 1: @@ -1212,9 +1210,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): labels = frame.index.labels[num] if transform is not None: mapped_items = items.map(transform) - cat = Categorical.from_array(mapped_items.take(labels), ordered=True) - labels = cat.codes - items = cat.categories + labels, items = _factorize_from_iterable(mapped_items.take(labels)) values = np.eye(len(items), dtype=float) values = values.take(labels, axis=0) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index cc279076f7a5e..618bc319c3f74 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -852,8 +852,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ - from pandas.core.categorical import Categorical - if len(arrays) == 1: name = None if names is None else names[0] return Index(arrays[0], name=name) @@ -864,9 +862,9 @@ def from_arrays(cls, arrays, sortorder=None, names=None): if len(arrays[i]) != len(arrays[i - 1]): raise ValueError('all arrays must be same length') - cats = [Categorical.from_array(arr, ordered=True) for arr in arrays] - levels = [c.categories for c in cats] - labels = [c.codes for c in cats] + from pandas.core.categorical import _factorize_from_iterables + + labels, levels = _factorize_from_iterables(arrays) if names is None: names = [getattr(arr, "name", None) for arr in arrays] @@ -952,15 +950,14 @@ def from_product(cls, iterables, sortorder=None, names=None): MultiIndex.from_arrays : Convert list of arrays to MultiIndex MultiIndex.from_tuples : Convert list of tuples to MultiIndex """ - from pandas.core.categorical import Categorical + from pandas.core.categorical import _factorize_from_iterables from pandas.tools.util import cartesian_product - categoricals = [Categorical.from_array(it, ordered=True) - for it in iterables] - labels = cartesian_product([c.codes for c in 
categoricals]) + labels, levels = _factorize_from_iterables(iterables) + labels = cartesian_product(labels) - return MultiIndex(levels=[c.categories for c in categoricals], - labels=labels, sortorder=sortorder, names=names) + return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, + names=names) @property def nlevels(self): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ccc3fe081acde..b8c2b146b6259 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -37,7 +37,7 @@ from pandas.formats.printing import adjoin, pprint_thing from pandas.core.common import _asarray_tuplesafe, PerformanceWarning from pandas.core.algorithms import match, unique -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, _factorize_from_iterables from pandas.core.internals import (BlockManager, make_block, _block2d_to_blocknd, _factor_indexer, _block_shape) @@ -3736,11 +3736,12 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - factors = [Categorical.from_array( - a.values, ordered=True) for a in self.index_axes] - levels = [f.categories for f in factors] - N = [len(f.categories) for f in factors] - labels = [f.codes for f in factors] + lst_vals = [a.values for a in self.index_axes] + labels, levels = _factorize_from_iterables(lst_vals) + # labels and levels are tuples but lists are expected + labels = list(labels) + levels = list(levels) + N = [len(lvl) for lvl in levels] # compute the key key = _factor_indexer(N[1:], labels) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 66b14995e6d3c..46f0fff7bb4b8 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -664,3 +664,18 @@ def test_assign_columns(self): frame.columns = ['foo', 'bar', 'baz', 'quux', 'foo2'] assert_series_equal(self.frame['C'], frame['baz'], check_names=False) 
assert_series_equal(self.frame['hi'], frame['foo2'], check_names=False) + + def test_set_index_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': pd.Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': pd.Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + for cols in ['C1', 'C2', ['A', 'C1'], ['A', 'C2'], ['C1', 'C2']]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 066485e966a42..8b1b1130dc2fc 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -707,3 +707,19 @@ def _test_stack_with_multiindex(multiindex): columns=Index(['B', 'C'], name='Upper'), dtype=df.dtypes[0]) assert_frame_equal(result, expected) + + def test_stack_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + for labels in [list("yxz"), list("yxy")]: + cidx = pd.CategoricalIndex(labels, categories=list("xyz"), + ordered=ordered) + df = DataFrame([[10, 11, 12]], columns=cidx) + result = df.stack() + + # `MultiIndex.from_product` preserves categorical dtype - + # it's tested elsewhere. 
+ midx = pd.MultiIndex.from_product([df.index, cidx]) + expected = Series([10, 11, 12], index=midx) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index c72cab32d198b..675193e1538b2 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -314,6 +314,22 @@ def test_set_levels_labels_names_bad_input(self): with tm.assertRaisesRegexp(TypeError, 'string'): self.index.set_names(names, level=0) + def test_set_levels_categorical(self): + # GH13854 + index = MultiIndex.from_arrays([list("xyzx"), [0, 1, 2, 3]]) + for ordered in [False, True]: + cidx = CategoricalIndex(list("bac"), ordered=ordered) + result = index.set_levels(cidx, 0) + expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], + labels=index.labels) + tm.assert_index_equal(result, expected) + + result_lvl = result.get_level_values(0) + expected_lvl = CategoricalIndex(list("bacb"), + categories=cidx.categories, + ordered=cidx.ordered) + tm.assert_index_equal(result_lvl, expected_lvl) + def test_metadata_immutable(self): levels, labels = self.index.levels, self.index.labels # shouldn't be able to set at either the top level or base level @@ -656,6 +672,25 @@ def test_from_arrays_index_datetimelike_mixed(self): tm.assert_index_equal(result, result2) + def test_from_arrays_index_series_categorical(self): + # GH13743 + idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=False) + idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=True) + + result = pd.MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + result3 = pd.MultiIndex.from_arrays([idx1.values, 
idx2.values]) + tm.assert_index_equal(result3.get_level_values(0), idx1) + tm.assert_index_equal(result3.get_level_values(1), idx2) + def test_from_arrays_different_lengths(self): # GH13599 idx1 = [1, 2, 3] @@ -696,6 +731,20 @@ def test_from_product_datetimeindex(self): '2000-01-01')), (2, pd.Timestamp('2000-01-02'))]) tm.assert_numpy_array_equal(mi.values, etalon) + def test_from_product_index_series_categorical(self): + # GH13743 + first = ['foo', 'bar'] + for ordered in [False, True]: + idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=ordered) + expected = pd.CategoricalIndex(list("abcaab") + list("abcaab"), + categories=list("bac"), + ordered=ordered) + + for arr in [idx, pd.Series(idx), idx.values]: + result = pd.MultiIndex.from_product([first, arr]) + tm.assert_index_equal(result.get_level_values(1), expected) + def test_values_boxed(self): tuples = [(1, pd.Timestamp('2000-01-01')), (2, pd.NaT), (3, pd.Timestamp('2000-01-03')), diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 781c9b786328d..eabd118de671d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -29,9 +29,8 @@ class TestCategorical(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.factor = Categorical.from_array(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], - ordered=True) + self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], + ordered=True) def test_getitem(self): self.assertEqual(self.factor[0], 'a') @@ -70,8 +69,8 @@ def test_setitem(self): indexer[0] = True indexer[-1] = True c[indexer] = 'c' - expected = Categorical.from_array(['c', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], ordered=True) + expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], + ordered=True) self.assert_categorical_equal(c, expected) @@ -94,12 +93,12 @@ def test_constructor_unsortable(self): # it works! 
arr = np.array([1, 2, 3, datetime.now()], dtype='O') - factor = Categorical.from_array(arr, ordered=False) + factor = Categorical(arr, ordered=False) self.assertFalse(factor.ordered) # this however will raise as cannot be sorted self.assertRaises( - TypeError, lambda: Categorical.from_array(arr, ordered=True)) + TypeError, lambda: Categorical(arr, ordered=True)) def test_is_equal_dtype(self): @@ -341,26 +340,26 @@ def test_constructor_with_datetimelike(self): def test_constructor_from_index_series_datetimetz(self): idx = pd.date_range('2015-01-01 10:00', freq='D', periods=3, tz='US/Eastern') - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_timedelta(self): idx = pd.timedelta_range('1 days', freq='D', periods=3) - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_period(self): idx = pd.period_range('2015-01-01', freq='D', periods=3) - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_from_codes(self): @@ -409,9 +408,6 @@ def test_validate_ordered(self): with tm.assertRaisesRegexp(exp_err, exp_msg): Categorical([1, 2, 3], ordered=ordered) - with tm.assertRaisesRegexp(exp_err, exp_msg): - Categorical.from_array([1, 2, 3], ordered=ordered) - with tm.assertRaisesRegexp(exp_err, exp_msg): Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], ordered=ordered) @@ 
-724,7 +720,7 @@ def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') - cat1 = Categorical.from_array(idx1) + cat1 = Categorical(idx1) str(cat1) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') @@ -733,7 +729,7 @@ def test_periodindex(self): idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') - cat2 = Categorical.from_array(idx2, ordered=True) + cat2 = Categorical(idx2, ordered=True) str(cat2) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') @@ -742,7 +738,7 @@ def test_periodindex(self): idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') - cat3 = Categorical.from_array(idx3, ordered=True) + cat3 = Categorical(idx3, ordered=True) exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8) exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') @@ -1590,6 +1586,11 @@ def test_deprecated_labels(self): res = cat.labels self.assert_numpy_array_equal(res, exp) + def test_deprecated_from_array(self): + # GH13854, `.from_array` is deprecated + with tm.assert_produces_warning(FutureWarning): + Categorical.from_array([0, 1]) + def test_removed_names_produces_warning(self): # 10482 @@ -1654,8 +1655,7 @@ class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', - 'c', 'c']) + self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) df = DataFrame({'value': np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] @@ -3001,9 +3001,10 @@ def test_groupby(self): # multiple groupers gb = df.groupby(['A', 'B']) - exp_index = 
pd.MultiIndex.from_product([['a', 'b', 'z'], - ['c', 'd', 'y']], - names=['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]}, index=exp_index) @@ -3014,10 +3015,13 @@ def test_groupby(self): df = df.copy() df['C'] = ['foo', 'bar'] * 2 gb = df.groupby(['A', 'B', 'C']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True), + ['foo', 'bar']], + names=['A', 'B', 'C']) expected = DataFrame({'values': Series( - np.nan, index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y'], ['foo', 'bar'] - ], names=['A', 'B', 'C']))}).sortlevel() + np.nan, index=exp_index)}).sortlevel() expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] result = gb.sum() tm.assert_frame_equal(result, expected) @@ -3096,11 +3100,12 @@ def test_pivot_table(self): df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) result = pd.pivot_table(df, values='values', index=['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) expected = Series([1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y']], - names=['A', 'B']), - name='values') + index=exp_index, name='values') tm.assert_series_equal(result, expected) def test_count(self): @@ -4184,7 +4189,7 @@ def test_astype_to_other(self): cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) tm.assert_series_equal(cat.astype('str'), exp) - s2 = Series(Categorical.from_array(['1', '2', '3', '4'])) + s2 = Series(Categorical(['1', '2', '3', '4'])) exp2 = Series([1, 2, 3, 4]).astype(int) 
tm.assert_series_equal(s2.astype('int'), exp2) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6b33fa747d8ba..9d8873d843642 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3108,8 +3108,10 @@ def test_apply_categorical_data(self): grouped = df.groupby(['missing', 'dense']) # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']], - names=['missing', 'dense']) + idx = MultiIndex.from_product( + [Categorical(['a', 'b'], ordered=ordered), + Categorical(['a', 'b', 'c'], ordered=ordered)], + names=['missing', 'dense']) expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], index=idx, columns=['values']) @@ -6389,7 +6391,8 @@ def test_groupby_categorical_two_columns(self): groups_double_key = test.groupby(["cat", "ints"]) res = groups_double_key.agg('mean') exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": ["a", "a", "b", "b", "c", "c"], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" ]) tm.assert_frame_equal(res, exp) @@ -6409,9 +6412,10 @@ def test_groupby_categorical_two_columns(self): res = groups_double_key.agg('mean') nan = np.nan - idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"], - [1, 2, 3, 4]], - names=["cat", "C2"]) + idx = MultiIndex.from_product( + [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, nan, nan, nan, nan, 4, 5], "C3": [nan, nan, nan, nan, 10, 100, @@ -6424,7 +6428,7 @@ def test_groupby_multi_categorical_as_index(self): 'A': [10, 11, 11], 'B': [101, 102, 103]}) result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10, 11, 10, 11, 10, 11], 'B': [101.0, nan, nan, 205.0, nan, nan]}, 
columns=['cat', 'A', 'B']) @@ -6433,7 +6437,7 @@ def test_groupby_multi_categorical_as_index(self): # function grouper f = lambda r: df.loc[r, 'A'] result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10.0, nan, nan, 22.0, nan, nan], 'B': [101.0, nan, nan, 205.0, nan, nan]}, columns=['cat', 'A', 'B']) @@ -6442,14 +6446,14 @@ def test_groupby_multi_categorical_as_index(self): # another not in-axis grouper (conflicting names in index) s = Series(['a', 'b', 'b'], name='cat') result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10.0, nan, nan, 22.0, nan, nan], 'B': [101.0, nan, nan, 205.0, nan, nan]}, columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # is original index dropped? - expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10, 11, 10, 11, 10, 11], 'B': [101.0, nan, nan, 205.0, nan, nan]}, columns=['cat', 'A', 'B']) @@ -6459,6 +6463,49 @@ def test_groupby_multi_categorical_as_index(self): result = df.groupby(['cat', 'A'], as_index=False).sum() tm.assert_frame_equal(result, expected, check_index_type=True) + def test_groupby_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + # single grouper + exp_full = DataFrame({'A': [2.0, 1.0, np.nan], + 'B': [25.0, 20.0, np.nan], + 'C1': Categorical(list("bac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bac"), + categories=list("bac"), + ordered=True)}) + for col in ['C1', 'C2']: + result1 = df.groupby(by=col, as_index=False).mean() + result2 = 
df.groupby(by=col, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # multiple grouper + exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], + 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, + np.nan], + 'C1': Categorical(list("bacbac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bacbac"), + categories=list("bac"), + ordered=True)}) + for cols in [['A', 'C1'], ['A', 'C2']]: + result1 = df.groupby(by=cols, as_index=False).mean() + result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 8bfd6350adc06..413724d1a6177 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -539,7 +539,8 @@ def test_int_int(self): data = Series(pd.Categorical(['a', 'b', 'a'])) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=['a', 'b'], + expected = DataFrame([[1, 0], [0, 1], [1, 0]], + columns=pd.Categorical(['a', 'b']), dtype=np.uint8) tm.assert_frame_equal(result, expected) @@ -561,11 +562,47 @@ def test_int_df(self): result = pd.get_dummies(data, columns=['A', 'B']) tm.assert_frame_equal(result, expected) + def test_dataframe_dummies_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + cat = pd.Categorical(list("xy"), categories=list("xyz"), + ordered=ordered) + result = get_dummies(cat) + + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) + cols = pd.CategoricalIndex(cat.categories, + categories=cat.categories, + ordered=ordered) + expected = DataFrame(data, columns=cols) + + 
tm.assert_frame_equal(result, expected) + class TestGetDummiesSparse(TestGetDummies): sparse = True +class TestMakeAxisDummies(tm.TestCase): + + def test_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered) + midx = pd.MultiIndex(levels=[['a'], cidx], + labels=[[0, 0], [0, 1]]) + df = DataFrame([[10, 11]], index=midx) + + expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], + index=midx, columns=cidx) + + from pandas.core.reshape import make_axis_dummies + result = make_axis_dummies(df) + tm.assert_frame_equal(result, expected) + + result = make_axis_dummies(df, transform=lambda x: x) + tm.assert_frame_equal(result, expected) + + class TestLreshape(tm.TestCase): def test_pairs(self): diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index dd1a8dbd5c53a..a2b0a9ebfa6cc 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -80,8 +80,7 @@ def test_basic(self): self.assertTrue(is_categorical_dtype(self.dtype)) - factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c' - ]) + factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) s = Series(factor, name='A') diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index ca7288b048427..7a29918c55658 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -11,6 +11,8 @@ from pandas import (Categorical, DataFrame, Series, Index, MultiIndex, Timedelta) +from pandas.core.categorical import (_factorize_from_iterable, + _factorize_from_iterables) from pandas.core.frame import _merge_doc from pandas.types.generic import ABCSeries from pandas.types.common import (is_datetime64tz_dtype, @@ -1632,8 +1634,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - levels = [Categorical.from_array( - zp, ordered=True).categories for zp in zipped] + _, levels = 
_factorize_from_iterables(zipped) else: levels = [_ensure_index(x) for x in levels] else: @@ -1671,9 +1672,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): levels.extend(concat_index.levels) label_list.extend(concat_index.labels) else: - factor = Categorical.from_array(concat_index, ordered=True) - levels.append(factor.categories) - label_list.append(factor.codes) + codes, categories = _factorize_from_iterable(concat_index) + levels.append(categories) + label_list.append(codes) if len(names) == len(levels): names = list(names)