# Patch summary (unified git diff covering three files; line breaks inside the hunks are collapsed in this copy):
# 1. asv_bench/benchmarks/groupby.py: setup() additionally builds self.draws (a Series of N = 1000000 values from np.random.randn) and self.cats (a Series of the labels 'foo', 'bar', 'baz', 'qux' repeated N // 4 times, cast to 'category'); a new benchmark method time_groupby_size runs self.draws.groupby(self.cats).size().
# 2. doc/source/whatsnew/v0.20.2.txt: adds an entry under "Performance Improvements" for groupby with categorical groupers (issue 16413).
# 3. pandas/core/indexes/base.py: get_loc() no longer executes `key = _values_from_object(key)` before calling `self._engine.get_loc(key)`.
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c0c3a42cc4464..13b5cd2b06032 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -368,6 +368,11 @@ def setup(self): self.dates = (np.datetime64('now') + self.offsets) self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, }) + N = 1000000 + self.draws = pd.Series(np.random.randn(N)) + labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4)) + self.cats = labels.astype('category') + def time_groupby_multi_size(self): self.df.groupby(['key1', 'key2']).size() @@ -377,6 +382,10 @@ def time_groupby_dt_size(self): def time_groupby_dt_timegrouper_size(self): self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + def time_groupby_size(self): + self.draws.groupby(self.cats).size() + + #---------------------------------------------------------------------- # groupby with a variable value for ngroups diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 4ec9daff4c0fc..e0857019d2fd4 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -29,7 +29,7 @@ Performance Improvements - Performance regression fix when indexing with a list-like (:issue:`16285`) - Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) - +- Improved performance of groupby with categorical groupers (:issue:`16413`) .. 
_whatsnew_0202.bug_fixes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9b29f1b04ff73..2af4f112ca941 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2388,7 +2388,6 @@ def get_loc(self, key, method=None, tolerance=None): if tolerance is not None: raise ValueError('tolerance argument only valid if using pad, ' 'backfill or nearest lookups') - key = _values_from_object(key) try: return self._engine.get_loc(key) except KeyError: