-
-
Notifications
You must be signed in to change notification settings - Fork 17.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactored GroupBy ASVs #20043
Refactored GroupBy ASVs #20043
Changes from 1 commit
0902bb0
d8d2732
6eafae6
1917e60
23f0777
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,10 @@ | |
method_blacklist = { | ||
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', | ||
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', | ||
'var', 'mad', 'describe', 'std'} | ||
'var', 'mad', 'describe', 'std'}, | ||
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', | ||
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', | ||
'std'} | ||
} | ||
|
||
|
||
|
@@ -90,45 +93,6 @@ def time_series_groups(self, data, key): | |
self.ser.groupby(self.ser).groups | ||
|
||
|
||
class FirstLast(object): | ||
|
||
goal_time = 0.2 | ||
|
||
param_names = ['dtype'] | ||
params = ['float32', 'float64', 'datetime', 'object'] | ||
|
||
def setup(self, dtype): | ||
N = 10**5 | ||
# with datetimes (GH7555) | ||
if dtype == 'datetime': | ||
self.df = DataFrame({'values': date_range('1/1/2011', | ||
periods=N, | ||
freq='s'), | ||
'key': range(N)}) | ||
elif dtype == 'object': | ||
self.df = DataFrame({'values': ['foo'] * N, | ||
'key': range(N)}) | ||
else: | ||
labels = np.arange(N / 10).repeat(10) | ||
data = Series(np.random.randn(len(labels)), dtype=dtype) | ||
data[::3] = np.nan | ||
data[1::3] = np.nan | ||
labels = labels.take(np.random.permutation(len(labels))) | ||
self.df = DataFrame({'values': data, 'key': labels}) | ||
|
||
def time_groupby_first(self, dtype): | ||
self.df.groupby('key').first() | ||
|
||
def time_groupby_last(self, dtype): | ||
self.df.groupby('key').last() | ||
|
||
def time_groupby_nth_all(self, dtype): | ||
self.df.groupby('key').nth(0, dropna='all') | ||
|
||
def time_groupby_nth_none(self, dtype): | ||
self.df.groupby('key').nth(0) | ||
|
||
|
||
class GroupManyLabels(object): | ||
|
||
goal_time = 0.2 | ||
|
@@ -149,39 +113,40 @@ class Nth(object): | |
|
||
goal_time = 0.2 | ||
|
||
def setup_cache(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure if there is a reason why we are intermixing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that's fine There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While refactoring the benchmarks, I mainly preferred I think it should be okay to change them to just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @mroeschke that makes sense. Should be easy to revert if it causes an issue |
||
df = DataFrame(np.random.randint(1, 100, (10000, 2))) | ||
df.iloc[1, 1] = np.nan | ||
return df | ||
|
||
def time_frame_nth_any(self, df): | ||
df.groupby(0).nth(0, dropna='any') | ||
|
||
def time_frame_nth(self, df): | ||
df.groupby(0).nth(0) | ||
|
||
param_names = ['dtype'] | ||
params = ['float32', 'float64', 'datetime', 'object'] | ||
|
||
def time_series_nth_any(self, df): | ||
df[1].groupby(df[0]).nth(0, dropna='any') | ||
def setup(self, dtype): | ||
N = 10**5 | ||
# with datetimes (GH7555) | ||
if dtype == 'datetime': | ||
values = date_range('1/1/2011', periods=N, freq='s') | ||
elif dtype == 'object': | ||
values = ['foo'] * N | ||
else: | ||
values = np.arange(N).astype(dtype) | ||
|
||
def time_series_nth(self, df): | ||
df[1].groupby(df[0]).nth(0) | ||
key = np.arange(N) | ||
self.df = DataFrame({'key': key, 'values': values}) | ||
self.df.iloc[1, 1] = np.nan # insert missing data | ||
|
||
def time_frame_nth_any(self, dtype): | ||
self.df.groupby('key').nth(0, dropna='any') | ||
|
||
class NthObject(object): | ||
def time_groupby_nth_all(self, dtype): | ||
self.df.groupby('key').nth(0, dropna='all') | ||
|
||
goal_time = 0.2 | ||
def time_frame_nth(self, dtype): | ||
self.df.groupby('key').nth(0) | ||
|
||
def setup_cache(self): | ||
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g']) | ||
df['obj'] = ['a'] * 5000 + ['b'] * 5000 | ||
return df | ||
def time_series_nth_any(self, dtype): | ||
self.df['values'].groupby(self.df['key']).nth(0, dropna='any') | ||
|
||
def time_nth(self, df): | ||
df.groupby('g').nth(5) | ||
def time_groupby_nth_all(self, dtype): | ||
self.df['values'].groupby(self.df['key']).nth(0, dropna='all') | ||
|
||
def time_nth_last(self, df): | ||
df.groupby('g').last() | ||
def time_series_nth(self, dtype): | ||
self.df['values'].groupby(self.df['key']).nth(0) | ||
|
||
|
||
class DateAttributes(object): | ||
|
@@ -243,7 +208,7 @@ def time_multi_count(self, df): | |
df.groupby(['key1', 'key2']).count() | ||
|
||
|
||
class CountInt(object): | ||
class CountMultiInt(object): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
goal_time = 0.2 | ||
|
||
|
@@ -255,10 +220,10 @@ def setup_cache(self): | |
'ints2': np.random.randint(0, 1000, size=n)}) | ||
return df | ||
|
||
def time_int_count(self, df): | ||
def time_multi_int_count(self, df): | ||
df.groupby(['key1', 'key2']).count() | ||
|
||
def time_int_nunique(self, df): | ||
def time_multi_int_nunique(self, df): | ||
df.groupby(['key1', 'key2']).nunique() | ||
|
||
|
||
|
@@ -361,9 +326,6 @@ def setup(self): | |
def time_multi_size(self): | ||
self.df.groupby(['key1', 'key2']).size() | ||
|
||
def time_dt_size(self): | ||
self.df.groupby(['dates']).size() | ||
|
||
def time_dt_timegrouper_size(self): | ||
with warnings.catch_warnings(record=True): | ||
self.df.groupby(TimeGrouper(key='dates', freq='M')).size() | ||
|
@@ -377,7 +339,7 @@ class GroupByMethods(object): | |
goal_time = 0.2 | ||
|
||
param_names = ['dtype', 'method'] | ||
params = [['int', 'float', 'object'], | ||
params = [['int', 'float', 'object', 'datetime'], | ||
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', | ||
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', | ||
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', | ||
|
@@ -398,12 +360,18 @@ def setup(self, dtype, method): | |
np.random.random(ngroups) * 10.0]) | ||
elif dtype == 'object': | ||
key = ['foo'] * size | ||
elif dtype == 'datetime': | ||
key = date_range('1/1/2011', periods=size, freq='s') | ||
|
||
df = DataFrame({'values': values, 'key': key}) | ||
self.df_groupby_method = getattr(df.groupby('key')['values'], method) | ||
self.as_group_method = getattr(df.groupby('key')['values'], method) | ||
self.as_field_method = getattr(df.groupby('values')['key'], method) | ||
|
||
def time_dtype_as_group(self, dtype, method): | ||
self.as_group_method() | ||
|
||
def time_method(self, dtype, method): | ||
self.df_groupby_method() | ||
def time_dtype_as_field(self, dtype, method): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Before, this was only testing the dtype as part of the grouping, but it makes sense to test it as an aggregated / transformed field as well |
||
self.as_field_method() | ||
|
||
|
||
class Float32(object): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I removed this class because it is duplicative of other tests. `first` and `last` are already covered in the `GroupByMethods` class. For the `nth` tests (although they are arguably duplicates of `first`) I ended up moving them all to one shared `Nth` class that covers the various dtype permutations