From 8383c54327ddae25fd643f5353a44881f3ce6364 Mon Sep 17 00:00:00 2001 From: "D.S. McNeil" Date: Sun, 28 May 2017 20:59:42 -0400 Subject: [PATCH] ENH: add .ngroup() method to groupby objects (#14026) --- doc/source/api.rst | 1 + doc/source/groupby.rst | 63 +++++++- doc/source/reshaping.rst | 2 +- doc/source/whatsnew/v0.20.2.txt | 5 + pandas/core/groupby.py | 75 +++++++++- pandas/tests/groupby/test_counting.py | 197 +++++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 54 ------- pandas/tests/groupby/test_whitelist.py | 4 +- 8 files changed, 338 insertions(+), 63 deletions(-) create mode 100644 pandas/tests/groupby/test_counting.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 888bb6d67e94b8..cfdd305348d709 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1705,6 +1705,7 @@ Computations / Descriptive Stats GroupBy.mean GroupBy.median GroupBy.min + GroupBy.ngroup GroupBy.nth GroupBy.ohlc GroupBy.prod diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index cf4f1059ae17a9..865f1ccae2c044 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1122,12 +1122,36 @@ To see the order in which each row appears within its group, use the .. ipython:: python - df = pd.DataFrame(list('aaabba'), columns=['A']) - df + dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg + + dfg.groupby('A').cumcount() + + dfg.groupby('A').cumcount(ascending=False) + +.. _groupby.ngroup: + +Enumerate groups +~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.20.2 + +To see the ordering of the groups (as opposed to the order of rows +within a group given by ``cumcount``) you can use the ``ngroup`` +method. + +Note that the numbers given to the groups match the order in which the +groups would be seen when iterating over the groupby object, not the +order they are first observed. + +.. 
ipython:: python - df.groupby('A').cumcount() + dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg - df.groupby('A').cumcount(ascending=False) # kwarg only + dfg.groupby('A').ngroup() + + dfg.groupby('A').ngroup(ascending=False) Plotting ~~~~~~~~ @@ -1176,14 +1200,41 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on df df.groupby(df.sum(), axis=1).sum() +.. _groupby.multicolumn_factorization: + +Multi-column factorization +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By using ``.ngroup()``, we can extract information about the groups in +a way similar to :func:`factorize` (as described further in the +:ref:`reshaping API <reshaping.factorize>`) but which applies +naturally to multiple columns of mixed type and different +sources. This can be useful as an intermediate categorical-like step +in processing, when the relationships between the group rows are more +important than their content, or as input to an algorithm which only +accepts the integer encoding. (For more information about support in +pandas for full categorical data, see the :ref:`Categorical +introduction <categorical>` and the +:ref:`API documentation <api.categorical>`.) + +.. ipython:: python + + dfg = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")}) + + dfg + + dfg.groupby(["A", "B"]).ngroup() + + dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup() + Groupby by Indexer to 'resample' data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Resampling produces new hypothetical samples(resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. +Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. In order to resample to work on indices that are non-datetimelike , the following procedure can be utilized. 
-In the following examples, **df.index // 5** returns a binary array which is used to determine what get's selected for the groupby operation. +In the following examples, **df.index // 5** returns a binary array which is used to determine what gets selected for the groupby operation. .. note:: The below example shows how we can downsample by consolidation of samples into fewer samples. Here by using **df.index // 5**, we are aggregating the samples in bins. By applying **std()** function, we aggregate the information contained in many samples into a small subset of values which is their standard deviation thereby reducing the number of samples. diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index b93749922c8eae..5f125e329f6f13 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -636,7 +636,7 @@ When a column contains only one level, it will be omitted in the result. pd.get_dummies(df, drop_first=True) - +.. _reshaping.factorize: Factorizing values ------------------ diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 13365401f1d1cb..e33176529a812e 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -22,6 +22,11 @@ Enhancements - Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) - ``Series`` provides a ``to_latex`` method (:issue:`16180`) +- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`, + parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`, + has been added to return the group order (:issue:`11642`); see + :ref:`here <groupby.ngroup>`. + .. 
_whatsnew_0202.performance: Performance Improvements diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 91b55c414b507d..6708183b5e86eb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -150,7 +150,7 @@ 'last', 'first', 'head', 'tail', 'median', 'mean', 'sum', 'min', 'max', - 'cumcount', + 'cumcount', 'ngroup', 'resample', 'rank', 'quantile', 'fillna', @@ -1437,6 +1437,75 @@ def nth(self, n, dropna=None): return result + @Substitution(name='groupby') + @Appender(_doc_template) + def ngroup(self, ascending=True): + """ + Number each group from 0 to the number of groups - 1. + + This is the enumerative complement of cumcount. Note that the + numbers given to the groups match the order in which the groups + would be seen when iterating over the groupby object, not the + order they are first observed. + + .. versionadded:: 0.20.2 + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from number of group - 1 to 0. + + Examples + -------- + + >>> df = pd.DataFrame({"A": list("aaabba")}) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').ngroup() + 0 0 + 1 0 + 2 0 + 3 1 + 4 1 + 5 0 + dtype: int64 + >>> df.groupby('A').ngroup(ascending=False) + 0 1 + 1 1 + 2 1 + 3 0 + 4 0 + 5 1 + dtype: int64 + >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() + 0 0 + 1 0 + 2 1 + 3 3 + 4 2 + 5 0 + dtype: int64 + + See also + -------- + .cumcount : Number the rows in each group. + + """ + + self._set_group_selection() + + index = self._selected_obj.index + result = Series(self.grouper.group_info[0], index) + if not ascending: + result = self.ngroups - 1 - result + return result + @Substitution(name='groupby') @Appender(_doc_template) def cumcount(self, ascending=True): @@ -1481,6 +1550,10 @@ def cumcount(self, ascending=True): 4 0 5 0 dtype: int64 + + See also + -------- + .ngroup : Number the groups themselves. 
""" self._set_group_selection() diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py new file mode 100644 index 00000000000000..485241d593d4f9 --- /dev/null +++ b/pandas/tests/groupby/test_counting.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import numpy as np + +from pandas import (DataFrame, Series, MultiIndex) +from pandas.util.testing import assert_series_equal +from pandas.compat import (range, product as cart_product) + + +class TestCounting(object): + + def test_cumcount(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3]) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.cumcount()) + assert_series_equal(e, se.cumcount()) + + def test_cumcount_dupe_index(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=mi) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=mi) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_groupby_not_col(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + 
assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_ngroup(self): + df = DataFrame({'A': list('aaaba')}) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0]) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_distinct(self): + df = DataFrame({'A': list('abcde')}) + g = df.groupby('A') + sg = g.A + + expected = Series(range(5), dtype='int64') + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_one_group(self): + df = DataFrame({'A': [0] * 5}) + g = df.groupby('A') + sg = g.A + + expected = Series([0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.ngroup()) + assert_series_equal(e, se.ngroup()) + + def test_ngroup_series_matches_frame(self): + df = DataFrame({'A': list('aaaba')}) + s = Series(list('aaaba')) + + assert_series_equal(df.groupby(s).ngroup(), + s.groupby(s).ngroup()) + + def test_ngroup_dupe_index(self): + df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame({'A': list('aaaba')}, index=mi) + g = df.groupby('A') + sg = g.A + expected = Series([0, 0, 0, 1, 0], index=mi) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_groupby_not_col(self): + df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 
0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_descending(self): + df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A']) + g = df.groupby(['A']) + + ascending = Series([0, 0, 1, 0, 1]) + descending = Series([1, 1, 0, 1, 0]) + + assert_series_equal(descending, (g.ngroups - 1) - ascending) + assert_series_equal(ascending, g.ngroup(ascending=True)) + assert_series_equal(descending, g.ngroup(ascending=False)) + + def test_ngroup_matches_cumcount(self): + # verify one manually-worked out case works + df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'], + ['a', 'x'], ['b', 'y']], columns=['A', 'X']) + g = df.groupby(['A', 'X']) + g_ngroup = g.ngroup() + g_cumcount = g.cumcount() + expected_ngroup = Series([0, 1, 2, 0, 3]) + expected_cumcount = Series([0, 0, 0, 1, 0]) + + assert_series_equal(g_ngroup, expected_ngroup) + assert_series_equal(g_cumcount, expected_cumcount) + + def test_ngroup_cumcount_pair(self): + # brute force comparison for all small series + for p in cart_product(range(3), repeat=4): + df = DataFrame({'a': p}) + g = df.groupby(['a']) + + order = sorted(set(p)) + ngroupd = [order.index(val) for val in p] + cumcounted = [p[:i].count(val) for i, val in enumerate(p)] + + assert_series_equal(g.ngroup(), Series(ngroupd)) + assert_series_equal(g.cumcount(), Series(cumcounted)) + + def test_ngroup_respects_groupby_order(self): + np.random.seed(0) + df = DataFrame({'a': np.random.choice(list('abcdef'), 100)}) + for sort_flag in (False, True): + g = df.groupby(['a'], sort=sort_flag) + df['group_id'] = -1 + df['group_index'] = -1 + + for i, (_, group) in enumerate(g): + df.loc[group.index, 'group_id'] = i + for j, ind in enumerate(group.index): + df.loc[ind, 'group_index'] = j + + assert_series_equal(Series(df['group_id'].values), + g.ngroup()) + assert_series_equal(Series(df['group_index'].values), + g.cumcount()) diff --git a/pandas/tests/groupby/test_groupby.py 
b/pandas/tests/groupby/test_groupby.py index 88afa51e46b6c8..19124a33bdbcb8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3399,60 +3399,6 @@ def test_groupby_with_small_elem(self): res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) tm.assert_frame_equal(res, df.iloc[[2], :]) - def test_cumcount(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3]) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_empty(self): - ge = DataFrame().groupby(level=0) - se = Series().groupby(level=0) - - # edge case, as this is usually considered float - e = Series(dtype='int64') - - assert_series_equal(e, ge.cumcount()) - assert_series_equal(e, se.cumcount()) - - def test_cumcount_dupe_index(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_mi(self): - mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=mi) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=mi) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_groupby_not_col(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby([0, 0, 0, 1, 0]) - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - def test_fill_constistency(self): # GH9221 diff --git a/pandas/tests/groupby/test_whitelist.py 
b/pandas/tests/groupby/test_whitelist.py index 5d131717f8345c..2c8bf57f20faea 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -24,6 +24,7 @@ 'head', 'tail', 'cumcount', + 'ngroup', 'resample', 'rank', 'quantile', @@ -61,6 +62,7 @@ 'head', 'tail', 'cumcount', + 'ngroup', 'resample', 'rank', 'quantile', @@ -237,7 +239,7 @@ def test_tab_completion(mframe): 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'all', 'shift', 'skew', + 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])