BUG: groupby with categorical and other columns
jreback committed Apr 30, 2018
1 parent c8fcfcb commit d34d365
Showing 6 changed files with 549 additions and 348 deletions.
35 changes: 35 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -527,6 +527,41 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use
'Taxes': -200,
'Net result': 300}).sort_index()

.. _whatsnew_0230.api_breaking.categorical_grouping:

Categorical Groupers will now require passing the observed keyword
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions, grouping by one or more categorical columns would result in an index that was the cartesian product of all of the categories for
each grouper, not just the observed values. ``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward
compatible (generate a cartesian product). Pandas will show a ``FutureWarning`` if the ``observed`` keyword is not passed; the default will
change to ``observed=True`` in the future. (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`)


.. ipython:: python

cat1 = pd.Categorical(["a", "a", "b", "b"],
categories=["a", "b", "z"], ordered=True)
cat2 = pd.Categorical(["c", "d", "c", "d"],
categories=["c", "d", "y"], ordered=True)
df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
df['C'] = ['foo', 'bar'] * 2
df

Previous Behavior (show all values):

.. ipython:: python

df.groupby(['A', 'B', 'C'], observed=False).count()


New Behavior (show only observed values):

.. ipython:: python

df.groupby(['A', 'B', 'C'], observed=True).count()
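
To make the size difference concrete, the row counts below are worked out by hand for the frame above (the non-categorical column ``C`` only ever contributes its observed values):

.. code-block:: python

   # 3 categories in A x 3 in B x 2 observed values in C = 18 rows
   len(df.groupby(['A', 'B', 'C'], observed=False).count())  # 18
   # only the 4 combinations that actually occur in df
   len(df.groupby(['A', 'B', 'C'], observed=True).count())   # 4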

.. _whatsnew_0230.api_breaking.deprecate_panel:

Deprecate Panel
11 changes: 9 additions & 2 deletions pandas/core/generic.py
@@ -6599,7 +6599,7 @@ def clip_lower(self, threshold, axis=None, inplace=False):
axis=axis, inplace=inplace)

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True, squeeze=False, **kwargs):
group_keys=True, squeeze=False, observed=None, **kwargs):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns.
@@ -6632,6 +6632,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
squeeze : boolean, default False
reduce the dimensionality of the return type if possible,
otherwise return a consistent type
observed : boolean, default None
if True: only show observed values for categorical groupers;
if False: show all values for categorical groupers;
if None: if any categorical groupers are present, show a FutureWarning
and default to False

.. versionadded:: 0.23.0
Returns
-------
@@ -6665,7 +6672,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
axis = self._get_axis_number(axis)
return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys, squeeze=squeeze,
**kwargs)
observed=observed, **kwargs)

def asfreq(self, freq, method=None, how=None, normalize=False,
fill_value=None):
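A short usage sketch of the three ``observed`` settings documented above, reusing the ``df`` from the whatsnew entry (illustrative only, not part of the diff):

    df.groupby('A', observed=True).size()   # index holds only the observed categories 'a' and 'b'
    df.groupby('A', observed=False).size()  # index holds 'a', 'b' and 'z', with a count of 0 for 'z'
    df.groupby('A').size()                  # observed=None: emits a FutureWarning, behaves like False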
73 changes: 56 additions & 17 deletions pandas/core/groupby/groupby.py
@@ -556,7 +556,8 @@ class _GroupBy(PandasObject, SelectionMixin):

def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True, squeeze=False, **kwargs):
sort=True, group_keys=True, squeeze=False,
observed=None, **kwargs):

self._selection = selection

@@ -576,13 +577,15 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.sort = sort
self.group_keys = group_keys
self.squeeze = squeeze
self.observed = observed
self.mutated = kwargs.pop('mutated', False)

if grouper is None:
grouper, exclusions, obj = _get_grouper(obj, keys,
axis=axis,
level=level,
sort=sort,
observed=observed,
mutated=self.mutated)

self.obj = obj
@@ -2331,18 +2334,21 @@ def ngroups(self):
def recons_labels(self):
comp_ids, obs_ids, _ = self.group_info
labels = (ping.labels for ping in self.groupings)
return decons_obs_group_ids(comp_ids,
obs_ids, self.shape, labels, xnull=True)
return decons_obs_group_ids(
comp_ids, obs_ids, self.shape, labels, xnull=True)

@cache_readonly
def result_index(self):
if not self.compressed and len(self.groupings) == 1:
return self.groupings[0].group_index.rename(self.names[0])

return MultiIndex(levels=[ping.group_index for ping in self.groupings],
labels=self.recons_labels,
verify_integrity=False,
names=self.names)
labels = self.recons_labels
levels = [ping.group_index for ping in self.groupings]
result = MultiIndex(levels=levels,
labels=labels,
verify_integrity=False,
names=self.names)
return result
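For readers unfamiliar with the ``levels``/``labels`` constructor used in ``result_index`` above, a minimal standalone sketch (``labels`` is the pandas-0.23-era spelling of what was later renamed ``codes``):

    import pandas as pd

    mi = pd.MultiIndex(levels=[['a', 'b'], ['c', 'd']],   # unique values per level
                       labels=[[0, 0, 1], [0, 1, 1]],     # integer positions into levels
                       names=['A', 'B'],
                       verify_integrity=False)            # skip consistency checks, as above
    # -> entries ('a', 'c'), ('a', 'd'), ('b', 'd')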

def get_group_levels(self):
if not self.compressed and len(self.groupings) == 1:
@@ -2883,6 +2889,7 @@ class Grouping(object):
obj :
name :
level :
observed : if True and we have a categorical grouper, only use the observed values
in_axis : if the Grouping is a column in self.obj and hence among
Groupby.exclusions list
@@ -2898,14 +2905,15 @@
"""

def __init__(self, index, grouper=None, obj=None, name=None, level=None,
sort=True, in_axis=False):
sort=True, observed=None, in_axis=False):

self.name = name
self.level = level
self.grouper = _convert_grouper(index, grouper)
self.index = index
self.sort = sort
self.obj = obj
self.observed = observed
self.in_axis = in_axis

# right place for this?
@@ -2954,16 +2962,34 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
elif is_categorical_dtype(self.grouper):

self.grouper = self.grouper._codes_for_groupby(self.sort)
codes = self.grouper.codes
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
self._labels = self.grouper.codes
self._labels = codes

# Use the observed values of the grouper if indicated
observed = self.observed
if observed is None:
msg = ("pass observed=True to ensure that a "
"categorical grouper only returns the "
"observed groupers, or\n"
"observed=False to return NA for non-observed"
"values\n")
warnings.warn(msg, FutureWarning, stacklevel=5)
observed = False

if observed:
codes = algorithms.unique1d(codes)
else:
codes = np.arange(len(categories))

c = self.grouper.categories
self._group_index = CategoricalIndex(
Categorical.from_codes(np.arange(len(c)),
categories=c,
ordered=self.grouper.ordered))
Categorical.from_codes(
codes=codes,
categories=categories,
ordered=self.grouper.ordered))

# we are done
if isinstance(self.grouper, Grouping):
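A minimal sketch of the codes selection above, using the public ``pd.unique`` in place of the internal ``algorithms.unique1d`` (assumed equivalent for this purpose):

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(['a', 'a', 'b'], categories=['a', 'b', 'z'])
    pd.unique(cat.codes)            # array([0, 1])    -- observed: only the codes that occur
    np.arange(len(cat.categories))  # array([0, 1, 2]) -- not observed: one code per category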
@@ -3048,7 +3074,7 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
mutated=False, validate=True):
observed=None, mutated=False, validate=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
@@ -3065,6 +3091,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
are and then creates a Grouping for each one, combined into
a BaseGrouper.
If observed & we have a categorical grouper, only show the observed
values
If validate, then check for key/level overlaps
"""
@@ -3243,6 +3272,7 @@ def is_in_obj(gpr):
name=name,
level=level,
sort=sort,
observed=observed,
in_axis=in_axis) \
if not isinstance(gpr, Grouping) else gpr

@@ -4154,7 +4184,7 @@ def first_not_none(values):
not_indexed_same=not_indexed_same)
elif self.grouper.groupings is not None:
if len(self.grouper.groupings) > 1:
key_index = MultiIndex.from_tuples(keys, names=key_names)
key_index = self.grouper.result_index

else:
ping = self.grouper.groupings[0]
@@ -4244,8 +4274,9 @@ def first_not_none(values):

# normally use vstack as its faster than concat
# and if we have mi-columns
if isinstance(v.index,
MultiIndex) or key_index is None:
if (isinstance(v.index, MultiIndex) or
key_index is None or
isinstance(key_index, MultiIndex)):
stacked_values = np.vstack(map(np.asarray, values))
result = DataFrame(stacked_values, index=key_index,
columns=index)
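A toy illustration of the ``np.vstack`` path taken by the widened condition above (hypothetical values):

    import numpy as np
    import pandas as pd

    values = [pd.Series([1.0, 2.0]), pd.Series([3.0, 4.0])]
    np.vstack(list(map(np.asarray, values)))  # -> array([[1., 2.], [3., 4.]])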
@@ -4696,6 +4727,14 @@ def _reindex_output(self, result):
This can re-expand the output space
"""

# TODO(jreback): remove completely
# when observed parameter is defaulted to True
# gh-20583

if self.observed:
return result

groupings = self.grouper.groupings
if groupings is None:
return result
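What the skipped re-expansion amounts to, sketched with public API rather than the actual implementation (``df`` as in the whatsnew entry):

    import pandas as pd

    result = df.groupby(['A', 'B'], observed=True).count()
    full = pd.MultiIndex.from_product([df['A'].cat.categories,
                                       df['B'].cat.categories],
                                      names=['A', 'B'])
    result.reindex(full)  # unobserved category combinations reappear as all-NaN rows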
25 changes: 17 additions & 8 deletions pandas/core/reshape/pivot.py
@@ -79,7 +79,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
pass
values = list(values)

grouped = data.groupby(keys)
grouped = data.groupby(keys, observed=dropna)
agged = grouped.agg(aggfunc)

table = agged
@@ -241,10 +241,13 @@ def _all_key(key):
return (key, margins_name) + ('',) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows + values].groupby(rows).agg(aggfunc)
margin = data[rows + values].groupby(
rows, observed=True).agg(aggfunc)
cat_axis = 1

for key, piece in table.groupby(level=0, axis=cat_axis):
for key, piece in table.groupby(level=0,
axis=cat_axis,
observed=True):
all_key = _all_key(key)

# we are going to mutate this, so need to copy!
@@ -264,7 +267,9 @@ def _all_key(key):
else:
margin = grand_margin
cat_axis = 0
for key, piece in table.groupby(level=0, axis=cat_axis):
for key, piece in table.groupby(level=0,
axis=cat_axis,
observed=True):
all_key = _all_key(key)
table_pieces.append(piece)
table_pieces.append(Series(margin[key], index=[all_key]))
@@ -279,7 +284,8 @@ def _all_key(key):
margin_keys = table.columns

if len(cols) > 0:
row_margin = data[cols + values].groupby(cols).agg(aggfunc)
row_margin = data[cols + values].groupby(
cols, observed=True).agg(aggfunc)
row_margin = row_margin.stack()

# slight hack
@@ -304,14 +310,17 @@ def _all_key():
return (margins_name, ) + ('', ) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows].groupby(rows).apply(aggfunc)
margin = data[rows].groupby(rows,
observed=True).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
margin_keys.append(all_key)

else:
margin = data.groupby(level=0, axis=0).apply(aggfunc)
margin = data.groupby(level=0,
axis=0,
observed=True).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
@@ -322,7 +331,7 @@ def _all_key():
margin_keys = table.columns

if len(cols):
row_margin = data[cols].groupby(cols).apply(aggfunc)
row_margin = data[cols].groupby(cols, observed=True).apply(aggfunc)
else:
row_margin = Series(np.nan, index=result.columns)

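A hedged illustration of the ``pivot_table`` changes above: ``dropna`` is now forwarded as ``observed`` to the main groupby, so all-NaN rows and columns for unobserved categories are never materialized when ``dropna=True`` (``df`` as in the whatsnew entry):

    import pandas as pd

    pd.pivot_table(df, values='values', index='A', columns='B',
                   aggfunc='sum', dropna=True)   # only observed categories appear
    pd.pivot_table(df, values='values', index='A', columns='B',
                   aggfunc='sum', dropna=False)  # unobserved 'z' row and 'y' column kept as NaN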
