Skip to content

Commit

Permalink
clean warnings
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Apr 26, 2018
1 parent c61318d commit 7cd56cd
Show file tree
Hide file tree
Showing 10 changed files with 77 additions and 40 deletions.
22 changes: 20 additions & 2 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ def _set_categories(self, categories, fastpath=False):

self._dtype = new_dtype

def _codes_for_groupby(self, sort):
def _codes_for_groupby(self, sort, observed):
"""
If sort=False, return a copy of self, coded with categories as
returned by .unique(), followed by any categories not appearing in
Expand All @@ -649,6 +649,8 @@ def _codes_for_groupby(self, sort):
----------
sort : boolean
The value of the sort parameter groupby was called with.
observed : boolean
Account only for the observed values
Returns
-------
Expand All @@ -659,6 +661,22 @@ def _codes_for_groupby(self, sort):
categories in the original order.
"""

# we only care about observed values
if observed:
unique_codes = unique1d(self.codes)
cat = self.copy()

take_codes = unique_codes[unique_codes != -1]
if self.ordered:
take_codes = np.sort(take_codes)

# we recode according to the uniques
cat._categories = self.categories.take(take_codes)
cat._codes = _recode_for_categories(self.codes,
self.categories,
cat._categories)
return cat

# Already sorted according to self.categories; all is fine
if sort:
return self
Expand Down Expand Up @@ -2117,7 +2135,7 @@ def unique(self):
# exclude nan from indexer for categories
take_codes = unique_codes[unique_codes != -1]
if self.ordered:
take_codes = sorted(take_codes)
take_codes = np.sort(take_codes)
return cat.set_categories(cat.categories.take(take_codes))

def _values_for_factorize(self):
Expand Down
18 changes: 9 additions & 9 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2961,14 +2961,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
# a passed Categorical
elif is_categorical_dtype(self.grouper):

self.grouper = self.grouper._codes_for_groupby(self.sort)
codes = self.grouper.codes
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
self._labels = codes

# Use the observed values of the grouper if indicated
observed = self.observed
if observed is None:
Expand All @@ -2980,8 +2972,16 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
warnings.warn(msg, FutureWarning, stacklevel=5)
observed = False

grouper = self.grouper
self.grouper = self.grouper._codes_for_groupby(
self.sort, observed)
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
self._labels = self.grouper.codes
if observed:
codes = algorithms.unique1d(codes)
codes = algorithms.unique1d(grouper.codes)
else:
codes = np.arange(len(categories))

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,9 +782,9 @@ def _concat_same_dtype(self, to_concat, name):
result.name = name
return result

def _codes_for_groupby(self, sort):
def _codes_for_groupby(self, sort, observed):
""" Return a Categorical adjusted for groupby """
return self.values._codes_for_groupby(sort)
return self.values._codes_for_groupby(sort, observed)

@classmethod
def _add_comparison_methods(cls):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ def test_sort_index_intervalindex(self):
bins=[-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])

result = model.groupby(['X1', 'X2']).mean().unstack()
result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
expected = IntervalIndex.from_tuples(
[(-3.0, -0.5), (-0.5, 0.0),
(0.0, 0.5), (0.5, 3.0)],
Expand Down
23 changes: 17 additions & 6 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,35 +158,46 @@ def test__cython_agg_general(op, targop):
('min', np.min),
('max', np.max), ]
)
def test_cython_agg_empty_buckets(op, targop):
def test_cython_agg_empty_buckets(op, targop, observed):
df = pd.DataFrame([11, 12, 13])
grps = range(0, 55, 5)

# calling _cython_agg_general directly, instead of via the user API
# which sets different values for min_count, so do that here.
result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
g = df.groupby(pd.cut(df[0], grps), observed=observed)
result = g._cython_agg_general(op)

g = df.groupby(pd.cut(df[0], grps), observed=observed)
expected = g.agg(lambda x: targop(x))
tm.assert_frame_equal(result, expected)


def test_cython_agg_empty_buckets_nanops():
def test_cython_agg_empty_buckets_nanops(observed):
# GH-18869 can't call nanops on empty groups, so hardcode expected
# for these
df = pd.DataFrame([11, 12, 13], columns=['a'])
grps = range(0, 25, 5)
# add / sum
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
result = df.groupby(pd.cut(df['a'], grps),
observed=observed)._cython_agg_general('add')
intervals = pd.interval_range(0, 20, freq=5)
expected = pd.DataFrame(
{"a": [0, 0, 36, 0]},
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
if observed:
expected = expected[expected.a != 0]

tm.assert_frame_equal(result, expected)

# prod
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
result = df.groupby(pd.cut(df['a'], grps),
observed=observed)._cython_agg_general('prod')
expected = pd.DataFrame(
{"a": [1, 1, 1716, 1]},
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
if observed:
expected = expected[expected.a != 1]

tm.assert_frame_equal(result, expected)


Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,12 +488,12 @@ def test_agg_structs_series(structure, expected):


@pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.")
def test_agg_category_nansum():
def test_agg_category_nansum(observed):
categories = ['a', 'b', 'c']
df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
categories=categories),
'B': [1, 2, 3]})
result = df.groupby("A").B.agg(np.nansum)
result = df.groupby("A", observed=observed).B.agg(np.nansum)
expected = pd.Series([3, 3, 0],
index=pd.CategoricalIndex(['a', 'b', 'c'],
categories=categories,
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/groupby/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
from pandas.util import testing as tm


@pytest.fixture(params=[True, False])
def observed(request):
return request.param


@pytest.fixture
def mframe():
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
Expand Down
8 changes: 1 addition & 7 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,6 @@
import pandas.util.testing as tm


@pytest.fixture(params=[True, False])
def observed(request):
return request.param


def cartesian_product_for_groupers(result, args, names):
""" Reindex to a cartesian production for the groupers,
preserving the nature (Categorical) of each grouper """
Expand Down Expand Up @@ -378,8 +373,7 @@ def test_observed(observed):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(reason="failing with observed")
def test_observed_failing(observed):
def test_observed_codes_remap(observed):
d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
df = pd.DataFrame(d)
values = pd.cut(df['C1'], [1, 2, 3, 6])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,14 +313,14 @@ def test_cython_median():
tm.assert_frame_equal(rs, xp)


def test_median_empty_bins():
def test_median_empty_bins(observed):
df = pd.DataFrame(np.random.randint(0, 44, 500))

grps = range(0, 55, 5)
bins = pd.cut(df[0], grps)

result = df.groupby(bins).median()
expected = df.groupby(bins).agg(lambda x: x.median())
result = df.groupby(bins, observed=observed).median()
expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
tm.assert_frame_equal(result, expected)


Expand Down
25 changes: 17 additions & 8 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def test_groupby_levels_and_columns(self):
by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
tm.assert_frame_equal(by_levels, by_columns)

def test_groupby_categorical_index_and_columns(self):
def test_groupby_categorical_index_and_columns(self, observed):
# GH18432
columns = ['A', 'B', 'A', 'B']
categories = ['B', 'A']
Expand All @@ -260,17 +260,26 @@ def test_groupby_categorical_index_and_columns(self):
categories=categories,
ordered=True)
df = DataFrame(data=data, columns=cat_columns)
result = df.groupby(axis=1, level=0).sum()
result = df.groupby(axis=1, level=0, observed=observed).sum()
expected_data = 2 * np.ones((5, 2), int)
expected_columns = CategoricalIndex(categories,
categories=categories,
ordered=True)

if observed:
# the non-observed path undergoes a reindex, and the default
# expected is set up for that case; when observed we need to
# adjust the expected columns instead
expected_columns = CategoricalIndex(['A', 'B'],
categories=categories,
ordered=True)
else:
expected_columns = CategoricalIndex(categories,
categories=categories,
ordered=True)
expected = DataFrame(data=expected_data, columns=expected_columns)
assert_frame_equal(result, expected)

# test transposed version
df = DataFrame(data.T, index=cat_columns)
result = df.groupby(axis=0, level=0).sum()
result = df.groupby(axis=0, level=0, observed=observed).sum()
expected = DataFrame(data=expected_data.T, index=expected_columns)
assert_frame_equal(result, expected)

Expand Down Expand Up @@ -572,11 +581,11 @@ def test_get_group(self):
pytest.raises(ValueError,
lambda: g.get_group(('foo', 'bar', 'baz')))

def test_get_group_empty_bins(self):
def test_get_group_empty_bins(self, observed):

d = pd.DataFrame([3, 1, 7, 6])
bins = [0, 5, 10, 15]
g = d.groupby(pd.cut(d[0], bins))
g = d.groupby(pd.cut(d[0], bins), observed=observed)

# TODO: should probably allow the str form of an Interval to work as well,
# i.e. '(0, 5]'
Expand Down

0 comments on commit 7cd56cd

Please sign in to comment.