From 0902bb009d420a49f439822eef54969f0ddc32e3 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Wed, 7 Mar 2018 10:58:28 -0800
Subject: [PATCH 1/3] Refactored ASVs for GroupBy

---
 asv_bench/benchmarks/groupby.py | 118 ++++++++++++--------------------
 1 file changed, 43 insertions(+), 75 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 3e7e5c821b14c..c73492547ca07 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -14,7 +14,10 @@
 method_blacklist = {
     'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
                'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
-               'var', 'mad', 'describe', 'std'}
+               'var', 'mad', 'describe', 'std'},
+    'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
+                 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
+                 'std'}
 }


@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
         self.ser.groupby(self.ser).groups


-class FirstLast(object):
-
-    goal_time = 0.2
-
-    param_names = ['dtype']
-    params = ['float32', 'float64', 'datetime', 'object']
-
-    def setup(self, dtype):
-        N = 10**5
-        # with datetimes (GH7555)
-        if dtype == 'datetime':
-            self.df = DataFrame({'values': date_range('1/1/2011',
-                                                      periods=N,
-                                                      freq='s'),
-                                 'key': range(N)})
-        elif dtype == 'object':
-            self.df = DataFrame({'values': ['foo'] * N,
-                                 'key': range(N)})
-        else:
-            labels = np.arange(N / 10).repeat(10)
-            data = Series(np.random.randn(len(labels)), dtype=dtype)
-            data[::3] = np.nan
-            data[1::3] = np.nan
-            labels = labels.take(np.random.permutation(len(labels)))
-            self.df = DataFrame({'values': data, 'key': labels})
-
-    def time_groupby_first(self, dtype):
-        self.df.groupby('key').first()
-
-    def time_groupby_last(self, dtype):
-        self.df.groupby('key').last()
-
-    def time_groupby_nth_all(self, dtype):
-        self.df.groupby('key').nth(0, dropna='all')
-
-    def time_groupby_nth_none(self, dtype):
-        self.df.groupby('key').nth(0)
-
-
 class GroupManyLabels(object):

     goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):

     goal_time = 0.2

-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000, 2)))
-        df.iloc[1, 1] = np.nan
-        return df
-
-    def time_frame_nth_any(self, df):
-        df.groupby(0).nth(0, dropna='any')
-
-    def time_frame_nth(self, df):
-        df.groupby(0).nth(0)
-
+    param_names = ['dtype']
+    params = ['float32', 'float64', 'datetime', 'object']

-    def time_series_nth_any(self, df):
-        df[1].groupby(df[0]).nth(0, dropna='any')
+    def setup(self, dtype):
+        N = 10**5
+        # with datetimes (GH7555)
+        if dtype == 'datetime':
+            values = date_range('1/1/2011', periods=N, freq='s')
+        elif dtype == 'object':
+            values = ['foo'] * N
+        else:
+            values = np.arange(N).astype(dtype)

-    def time_series_nth(self, df):
-        df[1].groupby(df[0]).nth(0)
+        key = np.arange(N)
+        self.df = DataFrame({'key': key, 'values': values})
+        self.df.iloc[1, 1] = np.nan  # insert missing data

+    def time_frame_nth_any(self, dtype):
+        self.df.groupby('key').nth(0, dropna='any')

-class NthObject(object):
+    def time_groupby_nth_all(self, dtype):
+        self.df.groupby('key').nth(0, dropna='all')

-    goal_time = 0.2
+    def time_frame_nth(self, dtype):
+        self.df.groupby('key').nth(0)

-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
-        df['obj'] = ['a'] * 5000 + ['b'] * 5000
-        return df
+    def time_series_nth_any(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

-    def time_nth(self, df):
-        df.groupby('g').nth(5)
+    def time_series_nth_all(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

-    def time_nth_last(self, df):
-        df.groupby('g').last()
+    def time_series_nth(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0)


 class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
         df.groupby(['key1', 'key2']).count()


-class CountInt(object):
+class CountMultiInt(object):

     goal_time = 0.2

@@ -255,10 +220,10 @@ def setup_cache(self):
                         'ints2': np.random.randint(0, 1000, size=n)})
         return df

-    def time_int_count(self, df):
+    def time_multi_int_count(self, df):
         df.groupby(['key1', 'key2']).count()

-    def time_int_nunique(self, df):
+    def time_multi_int_nunique(self, df):
         df.groupby(['key1', 'key2']).nunique()


@@ -361,9 +326,6 @@ def setup(self):
     def time_multi_size(self):
         self.df.groupby(['key1', 'key2']).size()

-    def time_dt_size(self):
-        self.df.groupby(['dates']).size()
-
     def time_dt_timegrouper_size(self):
         with warnings.catch_warnings(record=True):
             self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -377,7 +339,7 @@ class GroupByMethods(object):
     goal_time = 0.2

     param_names = ['dtype', 'method']
-    params = [['int', 'float', 'object'],
+    params = [['int', 'float', 'object', 'datetime'],
               ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
                'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
                'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
@@ -398,12 +360,18 @@ def setup(self, dtype, method):
                                   np.random.random(ngroups) * 10.0])
         elif dtype == 'object':
             key = ['foo'] * size
+        elif dtype == 'datetime':
+            key = date_range('1/1/2011', periods=size, freq='s')

         df = DataFrame({'values': values, 'key': key})
-        self.df_groupby_method = getattr(df.groupby('key')['values'], method)
+        self.as_group_method = getattr(df.groupby('key')['values'], method)
+        self.as_field_method = getattr(df.groupby('values')['key'], method)
+
+    def time_dtype_as_group(self, dtype, method):
+        self.as_group_method()

-    def time_method(self, dtype, method):
-        self.df_groupby_method()
+    def time_dtype_as_field(self, dtype, method):
+        self.as_field_method()


 class Float32(object):

From d8d2732e3cccda66aba5e6105dcd5a18a44891d9 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Wed, 7 Mar 2018 12:46:00 -0800
Subject: [PATCH 2/3] PEP fixes

---
 asv_bench/benchmarks/groupby.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index c73492547ca07..a05cc11ce2cc4 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -364,8 +364,8 @@ def setup(self, dtype, method):
             key = date_range('1/1/2011', periods=size, freq='s')

         df = DataFrame({'values': values, 'key': key})
-        self.as_group_method = getattr(df.groupby('key')['values'], method)
-        self.as_field_method = getattr(df.groupby('values')['key'], method)
+        self.as_group_method = getattr(df.groupby('key')['values'], method)
+        self.as_field_method = getattr(df.groupby('values')['key'], method)

     def time_dtype_as_group(self, dtype, method):
         self.as_group_method()

From 1917e60e8a96fccead61157478c1a9e6ce9ff485 Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Wed, 7 Mar 2018 13:54:33 -0800
Subject: [PATCH 3/3] Added ASVs for transformation

---
 asv_bench/benchmarks/groupby.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index a05cc11ce2cc4..7777322071957 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -231,7 +231,7 @@ class AggFunctions(object):

     goal_time = 0.2

-    def setup_cache(self):
+    def setup_cache():
         N = 10**5
         fac1 = np.array(['A', 'B', 'C'], dtype='O')
         fac2 = np.array(['one', 'two'], dtype='O')
@@ -338,15 +338,16 @@ class GroupByMethods(object):

     goal_time = 0.2

-    param_names = ['dtype', 'method']
+    param_names = ['dtype', 'method', 'application']
     params = [['int', 'float', 'object', 'datetime'],
               ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
                'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
                'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
                'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
-               'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
+               'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
+              ['direct', 'transformation']]

-    def setup(self, dtype, method):
+    def setup(self, dtype, method, application):
         if method in method_blacklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark
         ngroups = 1000
@@ -364,13 +365,23 @@ def setup(self, dtype, method):
             key = date_range('1/1/2011', periods=size, freq='s')

         df = DataFrame({'values': values, 'key': key})
-        self.as_group_method = getattr(df.groupby('key')['values'], method)
-        self.as_field_method = getattr(df.groupby('values')['key'], method)

-    def time_dtype_as_group(self, dtype, method):
+        if application == 'transformation':
+            if method == 'describe':
+                raise NotImplementedError
+
+            self.as_group_method = lambda: df.groupby(
+                'key')['values'].transform(method)
+            self.as_field_method = lambda: df.groupby(
+                'values')['key'].transform(method)
+        else:
+            self.as_group_method = getattr(df.groupby('key')['values'], method)
+            self.as_field_method = getattr(df.groupby('values')['key'], method)
+
+    def time_dtype_as_group(self, dtype, method, application):
         self.as_group_method()

-    def time_dtype_as_field(self, dtype, method):
+    def time_dtype_as_field(self, dtype, method, application):
         self.as_field_method()
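
Note on the 'application' axis added in PATCH 3/3: each groupby method is now
timed both when called directly on the grouped column ('direct') and when
routed through .transform ('transformation'), which broadcasts the per-group
result back to the original shape. Below is a minimal sketch of the difference
being exercised, for illustration only; the frame and column names are made up
for this example and are not taken from the benchmark itself.

import numpy as np
from pandas import DataFrame

df = DataFrame({'key': np.repeat(np.arange(100), 2),
                'values': np.random.randn(200)})

# 'direct' application: the method reduces each group to a single row,
# so the result has one entry per group.
direct = df.groupby('key')['values'].mean()

# 'transformation': the same method name passed to .transform returns a
# result aligned with the original frame, one entry per input row.
transformed = df.groupby('key')['values'].transform('mean')

assert len(direct) == df['key'].nunique()
assert len(transformed) == len(df)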