Skip to content

Commit

Permalink
ENH: add .ngroup() method to groupby objects (pandas-dev#14026)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsm054 committed May 29, 2017
1 parent ef487d9 commit 053935b
Show file tree
Hide file tree
Showing 7 changed files with 322 additions and 60 deletions.
1 change: 1 addition & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1705,6 +1705,7 @@ Computations / Descriptive Stats
GroupBy.mean
GroupBy.median
GroupBy.min
GroupBy.ngroup
GroupBy.nth
GroupBy.ohlc
GroupBy.prod
Expand Down
50 changes: 46 additions & 4 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1122,12 +1122,34 @@ To see the order in which each row appears within its group, use the

.. ipython:: python
df = pd.DataFrame(list('aaabba'), columns=['A'])
df
dfg = pd.DataFrame(list('aaabba'), columns=['A'])
dfg
dfg.groupby('A').cumcount()
dfg.groupby('A').cumcount(ascending=False)
Enumerate groups
~~~~~~~~~~~~~~~~

.. versionadded:: 0.20.2

To see the ordering of the groups (as opposed to the order of rows
within a group given by ``cumcount``) you can use the ``ngroup``
method.

Note that the numbers given to the groups match the order in which the
groups would be seen when iterating over the groupby object, not the
order they are first observed.

.. ipython:: python
df.groupby('A').cumcount()
dfg = pd.DataFrame(list('aaabba'), columns=['A'])
dfg
df.groupby('A').cumcount(ascending=False) # kwarg only
dfg.groupby('A').ngroup()
dfg.groupby('A').ngroup(ascending=False)
Plotting
~~~~~~~~
Expand Down Expand Up @@ -1176,6 +1198,26 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
df
df.groupby(df.sum(), axis=1).sum()
Multi-column factorization
~~~~~~~~~~~~~~~~~~~~~~~~~~

By using ``.ngroup()``, we can extract information about the groups in a
way similar to ``pd.factorize()``, but which applies naturally to multiple
columns of mixed type and different sources. This can be useful as an
intermediate categorical-like step in processing, when the relationships
between the group rows are more important than their content, or as input
to an algorithm which only accepts the integer encoding.

.. ipython:: python
dfg = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})
dfg
dfg.groupby(["A", "B"]).ngroup()
dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
Groupby by Indexer to 'resample' data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Enhancements

- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
- ``Series`` provides a ``to_latex`` method (:issue:`16180`)
- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`, parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`, has been added to return the group order (:issue:`11642`).

.. _whatsnew_0202.performance:

Expand Down
75 changes: 74 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@
'last', 'first',
'head', 'tail', 'median',
'mean', 'sum', 'min', 'max',
'cumcount',
'cumcount', 'ngroup',
'resample',
'rank', 'quantile',
'fillna',
Expand Down Expand Up @@ -1437,6 +1437,75 @@ def nth(self, n, dropna=None):

return result

@Substitution(name='groupby')
@Appender(_doc_template)
def ngroup(self, ascending=True):
    """
    Number each group from 0 to the number of groups - 1.

    This is the enumerative complement of cumcount.  Note that the
    numbers given to the groups match the order in which the groups
    would be seen when iterating over the groupby object, not the
    order they are first observed.

    .. versionadded:: 0.20.2

    Parameters
    ----------
    ascending : bool, default True
        If False, number in reverse, from number of groups - 1 to 0.

    Returns
    -------
    Series
        The group number of every row, carrying the index of the
        selected object.

    Examples
    --------

    >>> df = pd.DataFrame({"A": list("aaabba")})
    >>> df
       A
    0  a
    1  a
    2  a
    3  b
    4  b
    5  a
    >>> df.groupby('A').ngroup()
    0    0
    1    0
    2    0
    3    1
    4    1
    5    0
    dtype: int64
    >>> df.groupby('A').ngroup(ascending=False)
    0    1
    1    1
    2    1
    3    0
    4    0
    5    1
    dtype: int64
    >>> df.groupby(["A", [1, 1, 2, 3, 2, 1]]).ngroup()
    0    0
    1    0
    2    1
    3    3
    4    2
    5    0
    dtype: int64

    See also
    --------
    .cumcount : Number the rows in each group.
    """

    self._set_group_selection()

    index = self._selected_obj.index
    result = Series(self.grouper.group_info[0], index)
    if not ascending:
        # NOTE(review): rows that fall in no group (e.g. a missing key) are
        # expected to carry a negative sentinel label (-1) here -- confirm.
        # Reversing them blindly would turn -1 into ``ngroups``, which looks
        # like a real (but out-of-range) group number, so remember them
        # before flipping and put the sentinel back afterwards.
        unassigned = (result < 0).values
        result = self.ngroups - 1 - result
        if unassigned.any():
            result[unassigned] = -1
    return result

@Substitution(name='groupby')
@Appender(_doc_template)
def cumcount(self, ascending=True):
Expand Down Expand Up @@ -1481,6 +1550,10 @@ def cumcount(self, ascending=True):
4 0
5 0
dtype: int64
See also
--------
.ngroup : Number the groups themselves.
"""

self._set_group_selection()
Expand Down
197 changes: 197 additions & 0 deletions pandas/tests/groupby/test_counting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

import numpy as np

from pandas import (DataFrame, Series, MultiIndex)
from pandas.util.testing import assert_series_equal
from pandas.compat import (range, product as cart_product)


class TestCounting(object):
    """Tests for the groupby enumeration methods ``cumcount`` and ``ngroup``.

    Most tests run the same check on a DataFrame groupby (``g``) and on the
    column selection taken from it (``sg``), since both must agree.
    """

    def test_cumcount(self):
        """cumcount numbers the rows inside each group, starting at 0."""
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 1, 2, 0, 3])

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())

    def test_cumcount_empty(self):
        """cumcount on empty input yields an empty int64 Series."""
        ge = DataFrame().groupby(level=0)
        # spell the dtype out so the input does not depend on whatever the
        # Series constructor defaults to for empty data
        se = Series(dtype='float64').groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype='int64')

        assert_series_equal(e, ge.cumcount())
        assert_series_equal(e, se.cumcount())

    def test_cumcount_dupe_index(self):
        """cumcount keeps a duplicated index on its result."""
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
                       index=[0] * 5)
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())

    def test_cumcount_mi(self):
        """cumcount keeps a MultiIndex on its result."""
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
                       index=mi)
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=mi)

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())

    def test_cumcount_groupby_not_col(self):
        """cumcount works when grouping by a list of keys, not a column."""
        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
                       index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)

        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, sg.cumcount())

    def test_ngroup(self):
        """ngroup labels every row with the number of its group."""
        df = DataFrame({'A': list('aaaba')})
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 0, 0, 1, 0])

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_distinct(self):
        """With all-distinct keys every row gets its own group number."""
        df = DataFrame({'A': list('abcde')})
        g = df.groupby('A')
        sg = g.A

        expected = Series(range(5), dtype='int64')

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_one_group(self):
        """With a single key value every row is in group 0."""
        df = DataFrame({'A': [0] * 5})
        g = df.groupby('A')
        sg = g.A

        expected = Series([0] * 5)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_empty(self):
        """ngroup on empty input yields an empty int64 Series."""
        ge = DataFrame().groupby(level=0)
        # spell the dtype out so the input does not depend on whatever the
        # Series constructor defaults to for empty data
        se = Series(dtype='float64').groupby(level=0)

        # edge case, as this is usually considered float
        e = Series(dtype='int64')

        assert_series_equal(e, ge.ngroup())
        assert_series_equal(e, se.ngroup())

    def test_ngroup_series_matches_frame(self):
        """Grouping a frame or a Series by the same keys numbers alike."""
        df = DataFrame({'A': list('aaaba')})
        s = Series(list('aaaba'))

        assert_series_equal(df.groupby(s).ngroup(),
                            s.groupby(s).ngroup())

    def test_ngroup_dupe_index(self):
        """ngroup keeps a duplicated index on its result."""
        df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
        g = df.groupby('A')
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_mi(self):
        """ngroup keeps a MultiIndex on its result."""
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame({'A': list('aaaba')}, index=mi)
        g = df.groupby('A')
        sg = g.A
        expected = Series([0, 0, 0, 1, 0], index=mi)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_groupby_not_col(self):
        """ngroup works when grouping by a list of keys, not a column."""
        df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        sg = g.A

        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)

        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, sg.ngroup())

    def test_ngroup_descending(self):
        """ascending=False reverses the numbering of the groups."""
        df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A'])
        g = df.groupby(['A'])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        # sanity check of the hand-written fixtures themselves
        assert_series_equal(descending, (g.ngroups - 1) - ascending)
        assert_series_equal(ascending, g.ngroup(ascending=True))
        assert_series_equal(descending, g.ngroup(ascending=False))

    def test_ngroup_matches_cumcount(self):
        """ngroup and cumcount agree with a manually worked two-key case."""
        # verify one manually-worked out case works
        df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'],
                        ['a', 'x'], ['b', 'y']], columns=['A', 'X'])
        g = df.groupby(['A', 'X'])
        g_ngroup = g.ngroup()
        g_cumcount = g.cumcount()
        expected_ngroup = Series([0, 1, 2, 0, 3])
        expected_cumcount = Series([0, 0, 0, 1, 0])

        assert_series_equal(g_ngroup, expected_ngroup)
        assert_series_equal(g_cumcount, expected_cumcount)

    def test_ngroup_cumcount_pair(self):
        """Compare both methods with pure-Python results on small inputs."""
        # brute force comparison for all small series
        for p in cart_product(range(3), repeat=4):
            df = DataFrame({'a': p})
            g = df.groupby(['a'])

            # group number == rank of the value among the distinct values
            order = sorted(set(p))
            ngroupd = [order.index(val) for val in p]
            # row number == how often the value occurred earlier in p
            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]

            assert_series_equal(g.ngroup(), Series(ngroupd))
            assert_series_equal(g.cumcount(), Series(cumcounted))

    def test_ngroup_respects_groupby_order(self):
        """Numbering follows iteration order, for sort=False and sort=True."""
        np.random.seed(0)
        df = DataFrame({'a': np.random.choice(list('abcdef'), 100)})
        for sort_flag in (False, True):
            g = df.groupby(['a'], sort=sort_flag)

            # Collect the expected values in separate arrays rather than in
            # new columns of ``df``, so the frame is never modified while
            # the groupby object built on it is being iterated.
            group_id = np.full(len(df), -1, dtype='int64')
            group_index = np.full(len(df), -1, dtype='int64')

            for i, (_, group) in enumerate(g):
                # df has the default integer index, so the index labels of
                # a group are also the row positions
                rows = group.index.values
                group_id[rows] = i
                group_index[rows] = np.arange(len(rows))

            assert_series_equal(Series(group_id), g.ngroup())
            assert_series_equal(Series(group_index), g.cumcount())
Loading

0 comments on commit 053935b

Please sign in to comment.