clean warnings

pandas-dev · Apr 26, 2018 · 7cd56cd · 7cd56cd
1 parent c61318d
commit 7cd56cd
Show file tree

Hide file tree

Showing 10 changed files with 77 additions and 40 deletions.
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -635,7 +635,7 @@ def _set_categories(self, categories, fastpath=False):
 
         self._dtype = new_dtype
 
-    def _codes_for_groupby(self, sort):
+    def _codes_for_groupby(self, sort, observed):
         """
         If sort=False, return a copy of self, coded with categories as
         returned by .unique(), followed by any categories not appearing in
@@ -649,6 +649,8 @@ def _codes_for_groupby(self, sort):
         ----------
         sort : boolean
             The value of the sort parameter groupby was called with.
+        observed : boolean
+            Account only for the observed values
 
         Returns
         -------
@@ -659,6 +661,22 @@ def _codes_for_groupby(self, sort):
             categories in the original order.
         """
 
+        # we only care about observed values
+        if observed:
+            unique_codes = unique1d(self.codes)
+            cat = self.copy()
+
+            take_codes = unique_codes[unique_codes != -1]
+            if self.ordered:
+                take_codes = np.sort(take_codes)
+
+            # we recode according to the uniques
+            cat._categories = self.categories.take(take_codes)
+            cat._codes = _recode_for_categories(self.codes,
+                                                self.categories,
+                                                cat._categories)
+            return cat
+
         # Already sorted according to self.categories; all is fine
         if sort:
             return self
@@ -2117,7 +2135,7 @@ def unique(self):
         # exclude nan from indexer for categories
         take_codes = unique_codes[unique_codes != -1]
         if self.ordered:
-            take_codes = sorted(take_codes)
+            take_codes = np.sort(take_codes)
         return cat.set_categories(cat.categories.take(take_codes))
 
     def _values_for_factorize(self):

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2961,14 +2961,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
 
-                self.grouper = self.grouper._codes_for_groupby(self.sort)
-                codes = self.grouper.codes
-                categories = self.grouper.categories
-
-                # we make a CategoricalIndex out of the cat grouper
-                # preserving the categories / ordered attributes
-                self._labels = codes
-
                 # Use the observed values of the grouper if indicated
                 observed = self.observed
                 if observed is None:
@@ -2980,8 +2972,16 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                     warnings.warn(msg, FutureWarning, stacklevel=5)
                     observed = False
 
+                grouper = self.grouper
+                self.grouper = self.grouper._codes_for_groupby(
+                    self.sort, observed)
+                categories = self.grouper.categories
+
+                # we make a CategoricalIndex out of the cat grouper
+                # preserving the categories / ordered attributes
+                self._labels = self.grouper.codes
                 if observed:
-                    codes = algorithms.unique1d(codes)
+                    codes = algorithms.unique1d(grouper.codes)
                 else:
                     codes = np.arange(len(categories))
 

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -782,9 +782,9 @@ def _concat_same_dtype(self, to_concat, name):
         result.name = name
         return result
 
-    def _codes_for_groupby(self, sort):
+    def _codes_for_groupby(self, sort, observed):
         """ Return a Categorical adjusted for groupby """
-        return self.values._codes_for_groupby(sort)
+        return self.values._codes_for_groupby(sort, observed)
 
     @classmethod
     def _add_comparison_methods(cls):

diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py
@@ -573,7 +573,7 @@ def test_sort_index_intervalindex(self):
                     bins=[-3, -0.5, 0, 0.5, 3])
         model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
 
-        result = model.groupby(['X1', 'X2']).mean().unstack()
+        result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
         expected = IntervalIndex.from_tuples(
             [(-3.0, -0.5), (-0.5, 0.0),
              (0.0, 0.5), (0.5, 3.0)],

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -158,35 +158,46 @@ def test__cython_agg_general(op, targop):
     ('min', np.min),
     ('max', np.max), ]
 )
-def test_cython_agg_empty_buckets(op, targop):
+def test_cython_agg_empty_buckets(op, targop, observed):
     df = pd.DataFrame([11, 12, 13])
     grps = range(0, 55, 5)
 
     # calling _cython_agg_general directly, instead of via the user API
     # which sets different values for min_count, so do that here.
-    result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
-    expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
+    g = df.groupby(pd.cut(df[0], grps), observed=observed)
+    result = g._cython_agg_general(op)
+
+    g = df.groupby(pd.cut(df[0], grps), observed=observed)
+    expected = g.agg(lambda x: targop(x))
     tm.assert_frame_equal(result, expected)
 
 
-def test_cython_agg_empty_buckets_nanops():
+def test_cython_agg_empty_buckets_nanops(observed):
     # GH-18869 can't call nanops on empty groups, so hardcode expected
     # for these
     df = pd.DataFrame([11, 12, 13], columns=['a'])
     grps = range(0, 25, 5)
     # add / sum
-    result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
+    result = df.groupby(pd.cut(df['a'], grps),
+                        observed=observed)._cython_agg_general('add')
     intervals = pd.interval_range(0, 20, freq=5)
     expected = pd.DataFrame(
         {"a": [0, 0, 36, 0]},
         index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+    if observed:
+        expected = expected[expected.a != 0]
+
     tm.assert_frame_equal(result, expected)
 
     # prod
-    result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
+    result = df.groupby(pd.cut(df['a'], grps),
+                        observed=observed)._cython_agg_general('prod')
     expected = pd.DataFrame(
         {"a": [1, 1, 1716, 1]},
         index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+    if observed:
+        expected = expected[expected.a != 1]
+
     tm.assert_frame_equal(result, expected)
 
 

diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
@@ -488,12 +488,12 @@ def test_agg_structs_series(structure, expected):
 
 
 @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.")
-def test_agg_category_nansum():
+def test_agg_category_nansum(observed):
     categories = ['a', 'b', 'c']
     df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
                                            categories=categories),
                        'B': [1, 2, 3]})
-    result = df.groupby("A").B.agg(np.nansum)
+    result = df.groupby("A", observed=observed).B.agg(np.nansum)
     expected = pd.Series([3, 3, 0],
                          index=pd.CategoricalIndex(['a', 'b', 'c'],
                                                    categories=categories,

diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
@@ -4,6 +4,11 @@
 from pandas.util import testing as tm
 
 
+@pytest.fixture(params=[True, False])
+def observed(request):
+    return request.param
+
+
 @pytest.fixture
 def mframe():
     index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -12,11 +12,6 @@
 import pandas.util.testing as tm
 
 
-@pytest.fixture(params=[True, False])
-def observed(request):
-    return request.param
-
-
 def cartesian_product_for_groupers(result, args, names):
     """ Reindex to a cartesian production for the groupers,
     preserving the nature (Categorical) of each grouper """
@@ -378,8 +373,7 @@ def test_observed(observed):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="failing with observed")
-def test_observed_failing(observed):
+def test_observed_codes_remap(observed):
     d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
     df = pd.DataFrame(d)
     values = pd.cut(df['C1'], [1, 2, 3, 6])

diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -313,14 +313,14 @@ def test_cython_median():
     tm.assert_frame_equal(rs, xp)
 
 
-def test_median_empty_bins():
+def test_median_empty_bins(observed):
     df = pd.DataFrame(np.random.randint(0, 44, 500))
 
     grps = range(0, 55, 5)
     bins = pd.cut(df[0], grps)
 
-    result = df.groupby(bins).median()
-    expected = df.groupby(bins).agg(lambda x: x.median())
+    result = df.groupby(bins, observed=observed).median()
+    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
     tm.assert_frame_equal(result, expected)
 
 

diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
@@ -251,7 +251,7 @@ def test_groupby_levels_and_columns(self):
         by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
         tm.assert_frame_equal(by_levels, by_columns)
 
-    def test_groupby_categorical_index_and_columns(self):
+    def test_groupby_categorical_index_and_columns(self, observed):
         # GH18432
         columns = ['A', 'B', 'A', 'B']
         categories = ['B', 'A']
@@ -260,17 +260,26 @@ def test_groupby_categorical_index_and_columns(self):
                                        categories=categories,
                                        ordered=True)
         df = DataFrame(data=data, columns=cat_columns)
-        result = df.groupby(axis=1, level=0).sum()
+        result = df.groupby(axis=1, level=0, observed=observed).sum()
         expected_data = 2 * np.ones((5, 2), int)
-        expected_columns = CategoricalIndex(categories,
-                                            categories=categories,
-                                            ordered=True)
+
+        if observed:
+            # the non-observed case undergoes a reindex; when observed
+            # we need to adjust the expected output, since the default
+            # expectation is set up for the non-observed case
+            expected_columns = CategoricalIndex(['A', 'B'],
+                                                categories=categories,
+                                                ordered=True)
+        else:
+            expected_columns = CategoricalIndex(categories,
+                                                categories=categories,
+                                                ordered=True)
         expected = DataFrame(data=expected_data, columns=expected_columns)
         assert_frame_equal(result, expected)
 
         # test transposed version
         df = DataFrame(data.T, index=cat_columns)
-        result = df.groupby(axis=0, level=0).sum()
+        result = df.groupby(axis=0, level=0, observed=observed).sum()
         expected = DataFrame(data=expected_data.T, index=expected_columns)
         assert_frame_equal(result, expected)
 
@@ -572,11 +581,11 @@ def test_get_group(self):
         pytest.raises(ValueError,
                       lambda: g.get_group(('foo', 'bar', 'baz')))
 
-    def test_get_group_empty_bins(self):
+    def test_get_group_empty_bins(self, observed):
 
         d = pd.DataFrame([3, 1, 7, 6])
         bins = [0, 5, 10, 15]
-        g = d.groupby(pd.cut(d[0], bins))
+        g = d.groupby(pd.cut(d[0], bins), observed=observed)
 
         # TODO: should probably allow a str of an Interval to work as well
         # IOW '(0, 5]'