Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactored GroupBy ASVs #20043

Merged
merged 5 commits into from
Mar 10, 2018
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 43 additions & 75 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
method_blacklist = {
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
'var', 'mad', 'describe', 'std'}
'var', 'mad', 'describe', 'std'},
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
'std'}
}


Expand Down Expand Up @@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
self.ser.groupby(self.ser).groups


class FirstLast(object):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed this class because it duplicates other tests. first and last are already covered in the GroupByMethods class. For the nth tests (although they arguably duplicate first) I ended up moving them all to one shared Nth class that covers the various dtype permutations.


goal_time = 0.2

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def setup(self, dtype):
    """Build ``self.df`` with 'key'/'values' columns for the given dtype.

    Parameters
    ----------
    dtype : str
        One of the class ``params``: 'float32', 'float64', 'datetime',
        'object'.
    """
    N = 10**5
    # with datetimes (GH7555)
    if dtype == 'datetime':
        # Second-frequency DatetimeIndex as values; every key is unique.
        self.df = DataFrame({'values': date_range('1/1/2011',
                                                  periods=N,
                                                  freq='s'),
                             'key': range(N)})
    elif dtype == 'object':
        # Constant object column; every key is unique.
        self.df = DataFrame({'values': ['foo'] * N,
                             'key': range(N)})
    else:
        # Float dtypes: 10 rows per label, NaN on two of every three
        # rows, and labels shuffled so groups are not contiguous.
        labels = np.arange(N / 10).repeat(10)
        data = Series(np.random.randn(len(labels)), dtype=dtype)
        data[::3] = np.nan
        data[1::3] = np.nan
        labels = labels.take(np.random.permutation(len(labels)))
        self.df = DataFrame({'values': data, 'key': labels})

def time_groupby_first(self, dtype):
    # GroupBy.first over 'values' for the parametrized dtype (self.df is
    # built in setup).
    self.df.groupby('key').first()

def time_groupby_last(self, dtype):
    # GroupBy.last over 'values' for the parametrized dtype.
    self.df.groupby('key').last()

def time_groupby_nth_all(self, dtype):
    # GroupBy.nth(0) with dropna='all'.
    self.df.groupby('key').nth(0, dropna='all')

def time_groupby_nth_none(self, dtype):
    # GroupBy.nth(0) with no dropna filtering.
    self.df.groupby('key').nth(0)


class GroupManyLabels(object):

goal_time = 0.2
Expand All @@ -149,39 +113,40 @@ class Nth(object):

goal_time = 0.2

def setup_cache(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if there is a reason why we are intermixing setup_cache and setup within this module. The latter seems preferable given these setups aren't super resource intensive, and when refactoring this to use setup I didn't encounter any issues locally. May be worth a separate cleanup to consistently use setup throughout the module.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's fine

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While refactoring the benchmarks, I mainly preferred setup_cache over setup to avoid running the setup n times for each method of the benchmarking class.

I think it should be okay to change them to just setup, just need to monitor that it will not significantly increase the time of the ASV CI run on travis. (which I don't think it will)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @mroeschke that makes sense. Should be easy to revert if it causes an issue

df = DataFrame(np.random.randint(1, 100, (10000, 2)))
df.iloc[1, 1] = np.nan
return df

def time_frame_nth_any(self, df):
    # Frame-level GroupBy.nth(0, dropna='any'); `df` is the cached frame
    # returned by setup_cache.
    df.groupby(0).nth(0, dropna='any')

def time_frame_nth(self, df):
    # Frame-level GroupBy.nth(0) without dropna.
    df.groupby(0).nth(0)

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def time_series_nth_any(self, df):
    # Series-level nth: column 1 grouped by column 0, dropna='any'.
    df[1].groupby(df[0]).nth(0, dropna='any')
def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
values = date_range('1/1/2011', periods=N, freq='s')
elif dtype == 'object':
values = ['foo'] * N
else:
values = np.arange(N).astype(dtype)

def time_series_nth(self, df):
df[1].groupby(df[0]).nth(0)
key = np.arange(N)
self.df = DataFrame({'key': key, 'values': values})
self.df.iloc[1, 1] = np.nan # insert missing data

def time_frame_nth_any(self, dtype):
    # Frame-level GroupBy.nth(0, dropna='any') on self.df (built in setup).
    self.df.groupby('key').nth(0, dropna='any')

class NthObject(object):
def time_groupby_nth_all(self, dtype):
    # Frame-level GroupBy.nth(0, dropna='all').
    self.df.groupby('key').nth(0, dropna='all')

goal_time = 0.2
def time_frame_nth(self, dtype):
    # Frame-level GroupBy.nth(0) without dropna.
    self.df.groupby('key').nth(0)

def setup_cache(self):
    """Build and cache the benchmark frame.

    Returns a 10000-row DataFrame with an integer column 'g' (values
    drawn from [1, 100)) and an object column 'obj' holding 5000 'a'
    labels followed by 5000 'b' labels.
    """
    frame = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
    frame['obj'] = ['a'] * 5000 + ['b'] * 5000
    return frame
def time_series_nth_any(self, dtype):
    # Series-level nth(0, dropna='any'): 'values' grouped by 'key'.
    self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

def time_nth(self, df):
df.groupby('g').nth(5)
def time_groupby_nth_all(self, dtype):
    # Series-level nth(0, dropna='all'): 'values' grouped by 'key'.
    self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

def time_nth_last(self, df):
df.groupby('g').last()
def time_series_nth(self, dtype):
    # Series-level nth(0) without dropna.
    self.df['values'].groupby(self.df['key']).nth(0)


class DateAttributes(object):
Expand Down Expand Up @@ -243,7 +208,7 @@ def time_multi_count(self, df):
df.groupby(['key1', 'key2']).count()


class CountInt(object):
class CountMultiInt(object):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int tests for count / nunique are already covered in GroupByMethods, though the differentiator here is that we are testing the use of multiple fields in the grouping. It may make sense to just do that natively in GroupByMethods, but I didn't want to rip up too many things just yet.


goal_time = 0.2

Expand All @@ -255,10 +220,10 @@ def setup_cache(self):
'ints2': np.random.randint(0, 1000, size=n)})
return df

def time_int_count(self, df):
def time_multi_int_count(self, df):
    # count() over a two-column ('key1', 'key2') grouping; `df` is the
    # integer frame returned by setup_cache.
    df.groupby(['key1', 'key2']).count()

def time_int_nunique(self, df):
def time_multi_int_nunique(self, df):
    # nunique() over the same two-column ('key1', 'key2') grouping.
    df.groupby(['key1', 'key2']).nunique()


Expand Down Expand Up @@ -361,9 +326,6 @@ def setup(self):
def time_multi_size(self):
    # size() over a two-column ('key1', 'key2') grouping of self.df.
    self.df.groupby(['key1', 'key2']).size()

def time_dt_size(self):
self.df.groupby(['dates']).size()

def time_dt_timegrouper_size(self):
    # size() with a monthly TimeGrouper on the 'dates' column.
    # catch_warnings silences the warning TimeGrouper emits (presumably
    # a deprecation warning -- confirm against the pandas version under
    # test) so only the grouping itself is measured.
    with warnings.catch_warnings(record=True):
        self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
Expand All @@ -377,7 +339,7 @@ class GroupByMethods(object):
goal_time = 0.2

param_names = ['dtype', 'method']
params = [['int', 'float', 'object'],
params = [['int', 'float', 'object', 'datetime'],
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
Expand All @@ -398,12 +360,18 @@ def setup(self, dtype, method):
np.random.random(ngroups) * 10.0])
elif dtype == 'object':
key = ['foo'] * size
elif dtype == 'datetime':
key = date_range('1/1/2011', periods=size, freq='s')

df = DataFrame({'values': values, 'key': key})
self.df_groupby_method = getattr(df.groupby('key')['values'], method)
self.as_group_method = getattr(df.groupby('key')['values'], method)
self.as_field_method = getattr(df.groupby('values')['key'], method)

def time_dtype_as_group(self, dtype, method):
    # Invoke the groupby method with the dtype column as the grouping
    # key; self.as_group_method is bound in setup.
    self.as_group_method()

def time_method(self, dtype, method):
    # Invoke the groupby method bound in setup (self.df_groupby_method).
    self.df_groupby_method()
def time_dtype_as_field(self, dtype, method):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before this was only testing the dtype as part of the grouping, but it makes sense to test it as an aggregated / transformed field as well

self.as_field_method()


class Float32(object):
Expand Down