Initial Review Fixes

pandas-dev · Nov 13, 2017 · 7cde4da · 7cde4da
1 parent 0b2969f
commit 7cde4da
Show file tree

Hide file tree

Showing 9 changed files with 280 additions and 39 deletions.
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -150,10 +150,10 @@ constructor to save the factorize step during normal constructor mode:
     splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
     s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
 
-.. _categorical.objectcreation.frame:
+.. _categorical.objectcreation.existingframe:
 
-Creating categories from a ``DataFrame``
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Creating categories from an existing ``DataFrame``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. versionadded:: 0.22.0
 
@@ -169,15 +169,6 @@ if a column does not contain all labels:
    df['A'].dtype
    df['B'].dtype
 
-Note that this behavior is different than instantiating a ``DataFrame`` with categorical dtype, which will only assign
-categories to each column based on the labels present in each column:
-
-.. ipython:: python
-
-   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
-   df['A'].dtype
-   df['B'].dtype
-
 When using ``astype``, you can control the categories that will be present in each column by passing
 a ``CategoricalDtype``:
 
@@ -199,6 +190,72 @@ discussed hold with subselection.
    df[['A', 'B']] = df[['A', 'B']].astype('category')
    df.dtypes
 
+Note that you can use ``apply`` to set categories on a per-column basis:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
+   df = df.apply(lambda x: x.astype('category'))
+   df['A'].dtype
+   df['B'].dtype
+
+
+.. _categorical.objectcreation.frameconstructor:
+
+Creating categories from the ``DataFrame`` constructor
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionchanged:: 0.22.0
+
+.. warning::
+
+   Prior to version 0.22.0, the default behavior of the ``DataFrame`` constructor when a categorical dtype was
+   passed was to operate on a per-column basis, meaning that only labels present in a given column would be categories
+   for that column.
+
+   To promote consistency of behavior, from version 0.22.0 onwards instantiating a ``DataFrame`` with categorical
+   dtype will by default use all labels present in all columns when setting categories, even if a column does not
+   contain all labels.  This is consistent with the new ``astype`` behavior described above.
+
+Behavior prior to version 0.22.0:
+
+.. code-block:: ipython
+
+  In [2]: df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
+
+  In [3]: df
+  Out[3]:
+     A  B
+  0  a  c
+  1  b  d
+  2  c  e
+
+  In [4]: df['A'].dtype
+  Out[4]: CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)
+
+  In [5]: df['B'].dtype
+  Out[5]: CategoricalDtype(categories=['c', 'd', 'e'], ordered=False)
+
+Behavior from version 0.22.0 onwards:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
+   df
+   df['A'].dtype
+   df['B'].dtype
+
+As with ``astype``, you can control the categories that will be present in each column by passing
+a ``CategoricalDtype``:
+
+.. ipython:: python
+
+   dtype = CategoricalDtype(categories=list('abdef'), ordered=True)
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype=dtype)
+   df
+   df['A'].dtype
+   df['B'].dtype
+
 .. _categorical.categoricaldtype:
 
 CategoricalDtype

diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -31,7 +31,8 @@ labels present in all columns, even if a column does not contain all labels:
    df['A'].dtype
    df['B'].dtype
 
-See the :ref:`categorical.objectcreation.frame` section of the documentation for more details and examples.
+See the :ref:`categorical.objectcreation.existingframe` section of the documentation for more details and examples.
+
 
 .. _whatsnew_0220.enhancements.other:
 
@@ -47,9 +48,45 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
--
--
--
+.. _whatsnew_0220.api_breaking.frame_constructor_category:
+
+Creating categories from the ``DataFrame`` constructor
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To promote consistency with the :ref:`new behavior of DataFrame.astype with categorical dtype <categorical.objectcreation.existingframe>`,
+using the ``DataFrame`` constructor with categorical dtype will now by default use all labels present in all columns to set each column's
+categories, even if a column does not contain all labels (:issue:`12860`).
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+  In [2]: df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
+
+  In [3]: df
+  Out[3]:
+     A  B
+  0  a  c
+  1  b  d
+  2  c  e
+
+  In [4]: df['A'].dtype
+  Out[4]: CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)
+
+  In [5]: df['B'].dtype
+  Out[5]: CategoricalDtype(categories=['c', 'd', 'e'], ordered=False)
+
+New Behavior:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
+   df
+   df['A'].dtype
+   df['B'].dtype
+
+See the :ref:`categorical.objectcreation.frameconstructor` section of the documentation for more details and examples.
+
 
 .. _whatsnew_0220.api:
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -8,8 +8,7 @@
 
 from pandas.core.dtypes.cast import maybe_promote
 from pandas.core.dtypes.generic import (
-    ABCSeries, ABCIndex,
-    ABCIndexClass, ABCCategorical)
+    ABCSeries, ABCDataFrame, ABCIndex, ABCIndexClass, ABCCategorical)
 from pandas.core.dtypes.common import (
     is_unsigned_integer_dtype, is_signed_integer_dtype,
     is_integer_dtype, is_complex_dtype,
@@ -177,6 +176,18 @@ def _ensure_arraylike(values):
     return values
 
 
+def _ensure_arraylike2d(values):
+    """
+    Coerce a 2d structure to an ndarray if it is not one already.
+
+    A DataFrame is replaced by its ``.values`` and an ndarray is returned
+    unchanged. Anything else is iterated: each element is passed through
+    ``_ensure_arraylike`` and the results are handed to ``np.asarray``.
+    """
+    if isinstance(values, np.ndarray):
+        return values
+    if isinstance(values, ABCDataFrame):
+        return values.values
+    # assuming list-of-list-like structure; each element of ``values``
+    # becomes one entry along the first axis of the result
+    converted = [_ensure_arraylike(item) for item in values]
+    return np.asarray(converted)
+
+
 _hashtables = {
     'float64': (htable.Float64HashTable, htable.Float64Vector),
     'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
@@ -377,6 +388,32 @@ def unique(values):
 unique1d = unique
 
 
+def unique2d(values, order='F'):
+    """
+    Hash table-based unique for 2d input.
+
+    The input is flattened with ``ravel`` and passed to ``unique``, so
+    uniques are returned in order of appearance in the flattened data.
+    This does NOT sort, but the order of appearance can be altered via
+    the order parameter.
+
+    Significantly faster than numpy.unique. Includes NA values.
+
+    Parameters
+    ----------
+    values : 2d array-like
+    order : string, default 'F'
+        The order in which to ravel the 2d array, which corresponds to
+        the order of appearance for unique values in the output. Valid
+        values are {'C', 'F', 'A', 'K'}, see ``numpy.ravel`` for more
+        details regarding the order parameter.
+
+    Returns
+    -------
+    ndarray of unique values.
+    """
+    flattened = _ensure_arraylike2d(values).ravel(order=order)
+    return unique(flattened)
+
+
 def isin(comps, values):
     """
     Compute the isin boolean array

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -30,7 +30,7 @@
     is_dict_like)
 from pandas.core.common import is_null_slice, _maybe_box_datetimelike
 
-from pandas.core.algorithms import factorize, take_1d, unique1d
+from pandas.core.algorithms import factorize, take_1d, unique1d, unique2d
 from pandas.core.accessor import PandasDelegate
 from pandas.core.base import (PandasObject,
                               NoNewAttributesMixin, _shared_docs)
@@ -2332,3 +2332,20 @@ def _factorize_from_iterables(iterables):
         # For consistency, it should return a list of 2 lists.
         return [[], []]
     return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
+
+
+def _get_categorical_dtype_2d(values, dtype, **kwargs):
+    """
+    Build the single CategoricalDtype to use for a 2d structure.
+
+    Parameters
+    ----------
+    values : 2d array-like
+        Data whose unique non-NA values become the categories when no
+        categories are given explicitly.
+    dtype : 'category' or CategoricalDtype
+        Returned unchanged if it is a CategoricalDtype whose categories
+        are already set.
+    **kwargs
+        Optional ``categories`` and ``ordered`` values. An explicitly
+        passed ``ordered`` takes precedence over ``dtype.ordered``.
+
+    Returns
+    -------
+    CategoricalDtype
+    """
+    # GH 12860
+    # have a CategoricalDtype with set categories already
+    if isinstance(dtype, CategoricalDtype) and dtype.categories is not None:
+        return dtype
+
+    # construct a CategoricalDtype with appropriate categories
+    categories = kwargs.get('categories', None)
+
+    # fall back to the dtype's ``ordered`` only when the keyword was not
+    # given; ``a or b`` would discard an explicitly passed ``ordered=False``
+    ordered = kwargs.get('ordered', None)
+    if ordered is None:
+        ordered = getattr(dtype, 'ordered', None)
+
+    if categories is None:
+        categories = unique2d(values)
+        categories = categories[notna(categories)]
+
+    return CategoricalDtype(categories, ordered)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -75,7 +75,7 @@
                                    create_block_manager_from_arrays,
                                    create_block_manager_from_blocks)
 from pandas.core.series import Series
-from pandas.core.categorical import Categorical
+from pandas.core.categorical import Categorical, _get_categorical_dtype_2d
 import pandas.core.algorithms as algorithms
 from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
                            OrderedDict, raise_with_traceback)
@@ -6123,6 +6123,9 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
     else:
         index = _ensure_index(index)
 
+    if is_categorical_dtype(dtype):
+        dtype = _get_categorical_dtype_2d(arrays, dtype)
+
     # don't force copy because getting jammed in an ndarray anyway
     arrays = _homogenize(arrays, index, dtype)
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -26,8 +26,7 @@
     is_list_like,
     is_dict_like,
     is_re_compilable,
-    pandas_dtype,
-    CategoricalDtype)
+    pandas_dtype)
 from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
 from pandas.core.dtypes.missing import isna, notna
 from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
@@ -37,6 +36,7 @@
                                 SettingWithCopyWarning)
 
 from pandas.core.base import PandasObject, SelectionMixin
+from pandas.core.categorical import _get_categorical_dtype_2d
 from pandas.core.index import (Index, MultiIndex, _ensure_index,
                                InvalidIndexError)
 import pandas.core.indexing as indexing
@@ -3985,18 +3985,7 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
 
         elif is_categorical_dtype(dtype) and self.ndim > 1:
             # GH 12860
-            dtype_with_cat = (isinstance(dtype, CategoricalDtype) and
-                              dtype.categories is not None)
-            if not dtype_with_cat:
-                categories = kwargs.get('categories', None)
-                ordered = (kwargs.get('ordered', None) or
-                           getattr(dtype, 'ordered', None))
-
-                if categories is None:
-                    categories = algos.unique(self.values.ravel(order='F'))
-
-                dtype = CategoricalDtype(categories, ordered)
-
+            dtype = _get_categorical_dtype_2d(self.values, dtype, **kwargs)
             results = (self[col].astype(dtype, copy=copy) for col in self)
             return pd.concat(results, axis=1, copy=False)
 

diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
@@ -7,8 +7,10 @@
 import numpy as np
 import pandas as pd
 from pandas import (
-    Series, Categorical, CategoricalIndex, IntervalIndex, date_range)
+    Series, DataFrame, Categorical,
+    CategoricalIndex, IntervalIndex, date_range)
 
+from pandas.core.categorical import _get_categorical_dtype_2d
 from pandas.core.dtypes.dtypes import (
     DatetimeTZDtype, PeriodDtype,
     IntervalDtype, CategoricalDtype)
@@ -123,6 +125,44 @@ def test_tuple_categories(self):
         result = CategoricalDtype(categories)
         assert all(result.categories == categories)
 
+    def test_get_categorical_dtype_2d(self):
+        # GH 12860
+        values = DataFrame({0: ['a', 'b', 'c', 'a'],
+                            1: ['b', np.nan, 'd', 'd']})
+        inferred = ['a', 'b', 'c', 'd']
+        custom = ['a', 'b', 'c', 'e']
+
+        # each case: (expected dtype, dtype argument, keyword arguments)
+        cases = [
+            # no keywords
+            (CategoricalDtype(categories=inferred), 'category', {}),
+            (CategoricalDtype(categories=inferred), CategoricalDtype(), {}),
+            # ordered, passed as a keyword or carried by the dtype
+            (CategoricalDtype(categories=inferred, ordered=True),
+             'category', {'ordered': True}),
+            (CategoricalDtype(categories=inferred, ordered=True),
+             CategoricalDtype(ordered=True), {}),
+            # custom categories, passed as a keyword or carried by the dtype
+            (CategoricalDtype(categories=custom),
+             'category', {'categories': custom}),
+            (CategoricalDtype(categories=custom),
+             CategoricalDtype(categories=custom), {}),
+        ]
+        for expected, dtype, kwargs in cases:
+            result = _get_categorical_dtype_2d(values, dtype, **kwargs)
+            assert is_dtype_equal(result, expected)
+
+        # CategoricalDtype with set categories
+        expected = CategoricalDtype(categories=custom)
+        result = _get_categorical_dtype_2d(values, expected)
+        assert is_dtype_equal(result, expected)
+
 
 class TestDatetimeTZDtype(Base):