Attempt to accelerate mean aggregation #3586
Changes from all commits:
@@ -2491,6 +2491,44 @@ def _callable_func(self, func, axis, *args, **kwargs):

```python
    groupby_prod = GroupByReduce.register("prod")
    groupby_sum = GroupByReduce.register("sum")

    def _mean_agg_map(dfgb, **kwargs):
        # min_count=1 makes "sum" return NaN for an all-NaN group instead of 0,
        # which matches how "mean" treats such groups.
        kwargs["min_count"] = 1
        result = dfgb.sum(**kwargs)
        divisor = dfgb.count()
        divisor.set_axis(
            ["__mean_agg_size_column__" + x for x in divisor.columns],
            axis=1,
            inplace=True,
        )
        result = pandas.concat([result, divisor], axis=1, copy=False)
        return result
```
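A quick aside on the `min_count=1` trick, since it is load-bearing here: by default pandas' `sum` collapses an all-NaN group to `0`, which would silently skew the mean, while `min_count=1` keeps it as NaN. A minimal standalone check (plain pandas, illustrative data):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 3.0, np.nan]})
grouped = df.groupby("key")["val"]

print(grouped.sum())             # group "b" collapses to 0.0
print(grouped.sum(min_count=1))  # group "b" stays NaN, as "mean" expects
```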
```python
    def _mean_agg_reduce(dfgb, **kwargs):
        kwargs["min_count"] = 1
        result = dfgb.sum(**kwargs)
        dividend = result[
            [x for x in result.columns if not x.startswith("__mean_agg_size_column__")]
        ]
        divisor = result[
            [x for x in result.columns if x.startswith("__mean_agg_size_column__")]
        ]
        divisor.set_axis(
            [x[len("__mean_agg_size_column__") :] for x in divisor.columns],
            axis=1,
            inplace=True,
        )
        # The following line makes sure we exclude any "by" columns that could have
        # been carried in via the "__mean_agg_size_column__" prefix; they shouldn't
        # be present anymore, since they were removed once the map phase was done.
        divisor = divisor[dividend.columns]
        result = dividend.divide(divisor)
        return result
```

**Review thread** (on the prefix renaming above):

> this breaks on MultiIndex columns in `divisor` (error traceback elided). I would recommend using `add_prefix` here instead of manual renaming: `divisor.add_prefix(prefix)`. Also, I would like the prefix to be stored in some variable to simplify its usage.
>
> … in the case of MultiIndex.
>
> Thinking again about this, probably the best approach would be to add a new MultiIndex level which would distinguish between original columns and the result of …
>
> Yes, it should be …
>
> I see, I have a prototype of this here: https://github.com/modin-project/modin/pull/1902/files

**Review thread** (on lines +2515 to +2519, the dividend/divisor selection):

> The order of columns among dividend and divisor frames should be the same, so can we just do: …
>
> Except `divisor` and `dividend` have different sets of columns, because `divisor` also has "by" columns prefixed with `__mean_agg_size_column__`. The following line (with the comment above it) makes sure that they are excluded.
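A quick standalone check of the MultiIndex point (plain pandas; `prefix` mirrors the PR's marker): string concatenation assumes flat string labels and raises on a MultiIndex, whereas `add_prefix`, which routes through `rename`, does not (in the pandas versions I checked it prefixes the labels of every level):

```python
import pandas as pd

prefix = "__mean_agg_size_column__"
df = pd.DataFrame(
    [[1, 2]], columns=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
)

# Manual renaming assumes string labels; iterating a MultiIndex yields tuples:
try:
    [prefix + c for c in df.columns]
except TypeError as e:
    print("manual renaming fails:", e)

# add_prefix survives a MultiIndex instead of raising:
print(df.add_prefix(prefix).columns.tolist())
```

Storing the prefix in one variable, as suggested, would also keep the map and reduce sides from drifting apart.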
```python
    groupby_mean_numeric = GroupByReduce.register(
        _mean_agg_map,
        _mean_agg_reduce,
        default_to_pandas_func=lambda dfgb, **kwargs: dfgb.mean(**kwargs),
    )

    def groupby_size(
        self, by, axis, groupby_args, map_args, reduce_args, numeric_only, drop
    ):
```
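For readers unfamiliar with `GroupByReduce`, here is a minimal, self-contained sketch of the same two-phase scheme in plain pandas; the partitioning is simulated by hand, and `PREFIX`, `mean_map`, and `mean_reduce` are illustrative names rather than Modin API:

```python
import numpy as np
import pandas as pd

PREFIX = "__mean_agg_size_column__"

def mean_map(part, by):
    # Map phase: per-partition sums plus per-partition counts under a prefix.
    grouped = part.groupby(by)
    sums = grouped.sum(min_count=1)
    counts = grouped.count().add_prefix(PREFIX)
    return pd.concat([sums, counts], axis=1)

def mean_reduce(mapped):
    # Reduce phase: combine partial results per group, then divide.
    total = mapped.groupby(level=0).sum(min_count=1)
    dividend = total[[c for c in total.columns if not c.startswith(PREFIX)]]
    divisor = total[[c for c in total.columns if c.startswith(PREFIX)]]
    divisor.columns = [c[len(PREFIX):] for c in divisor.columns]
    return dividend / divisor[dividend.columns]

df = pd.DataFrame({"k": ["a", "b", "a", "b"], "x": [1.0, 2.0, np.nan, 4.0]})
partitions = [df.iloc[:2], df.iloc[2:]]  # simulate two row partitions

mapped = pd.concat(mean_map(p, "k") for p in partitions)
print(mean_reduce(mapped))
print(df.groupby("k").mean())  # reference result; the two should match
```

Carrying the counts alongside the partial sums under a reserved prefix is what lets the reduce phase compute an exact mean in one extra pass, instead of re-reading the raw data.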
The second diff hunk wires this into the groupby `mean` entry point; first the new import:

@@ -33,6 +33,7 @@

```python
)
from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy
from pandas.core.dtypes.common import is_datetime64_any_dtype
from modin.config import IsExperimental
from .series import Series
from .utils import is_label
```
@@ -128,7 +129,46 @@ def sem(self, ddof=1):

`mean` previously delegated straight to `_apply_agg_function`:

```python
    def mean(self, *args, **kwargs):
        return self._apply_agg_function(lambda df: df.mean(*args, **kwargs))
```

With this PR it dispatches on the dtypes involved:

```python
    def mean(self, *args, **kwargs):
        fallback = False
        converted_columns = {}
        numeric_only = kwargs.get("numeric_only", False)
        if not numeric_only:
            for col, dt in self._query_compiler.dtypes.items():
                if is_datetime64_any_dtype(dt):
                    if self._df[col].hasnans:
                        fallback = True
                        break
                    else:
                        converted_columns[col] = dt
```

**Review thread** (on the dtype check):

> will timedelta64 be affected too?
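On the timedelta64 question: `is_datetime64_any_dtype` matches datetime64 and tz-aware datetimes but not timedelta64, so timedelta columns would bypass this conversion branch entirely; a quick check:

```python
import numpy as np
from pandas.api.types import is_datetime64_any_dtype, is_timedelta64_dtype

print(is_datetime64_any_dtype(np.dtype("datetime64[ns]")))   # True
print(is_datetime64_any_dtype(np.dtype("timedelta64[ns]")))  # False: skipped by the loop above
print(is_timedelta64_dtype(np.dtype("timedelta64[ns]")))     # True
```

So timedelta64 columns are simply never converted by this branch; whether the downstream map-reduce path handles them correctly is exactly what the reviewer is asking.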
**Review thread** (on lines +136 to +142):

> In order to minimize backend-API calls I would replace this for-loop with something like this:
>
> ```python
> datetime_cols = self._df[
>     [col for col, dt in self._df.dtypes.items() if is_datetime64_any_dtype(dt)]
> ]
> has_nans_mask = datetime_cols.isna().any()
> ```
>
> I've compared these two approaches and it appeared that the proposed one is almost always faster. Micro-benchmark and its results:
>
> ```python
> import modin.pandas as pd
> import numpy as np
> import timeit
>
>
> def is_dtype_to_pick(dtype):
>     return dtype == "float"
>
>
> def fn1(df):
>     to_convert = {}
>     fallback = False
>     for col, dt in df.dtypes.items():
>         if is_dtype_to_pick(dt):
>             if df[col].hasnans:
>                 fallback = True
>                 break
>             else:
>                 to_convert[col] = dt
>
>
> def fn2(df):
>     cols_to_select = [col for col, dt in df.dtypes.items() if is_dtype_to_pick(dt)]
>     if len(cols_to_select) == 0:
>         return
>     selected_df = df[cols_to_select]
>     fallback = selected_df.isna().any().any()
>     if not fallback:
>         to_convert = selected_df.dtypes.items()
>
>
> NROWS = 1_000_000
> NCOLS = 10
> df_no_floats = pd.DataFrame({f"col{i}": np.arange(NROWS) for i in range(NCOLS)})
> df_has_half_float_no_nans = df_no_floats.astype(
>     {col: "float" for col in df_no_floats.columns[: len(df_no_floats.columns) // 2]}
> )
> df_has_half_float_nans_first = df_has_half_float_no_nans.copy()
> df_has_half_float_nans_first.loc[
>     0, df_has_half_float_nans_first.select_dtypes("float").columns[0]
> ] = np.nan
> df_has_half_float_nans_last = df_has_half_float_no_nans.copy()
> df_has_half_float_nans_last.loc[
>     0, df_has_half_float_nans_last.select_dtypes("float").columns[-1]
> ] = np.nan
>
> NRUNS = 5
>
> print("df_no_float:")
> print("for-loop:", timeit.timeit(lambda: fn1(df_no_floats), number=NRUNS))
> print("non-for-loop:", timeit.timeit(lambda: fn2(df_no_floats), number=NRUNS))
>
> print("\ndf_has_half_float_no_nans:")
> print("for-loop:", timeit.timeit(lambda: fn1(df_has_half_float_no_nans), number=NRUNS))
> print("non-for-loop:", timeit.timeit(lambda: fn2(df_has_half_float_no_nans), number=NRUNS))
>
> print("\ndf_has_half_float_nans_first:")
> print("for-loop:", timeit.timeit(lambda: fn1(df_has_half_float_nans_first), number=NRUNS))
> print("non-for-loop:", timeit.timeit(lambda: fn2(df_has_half_float_nans_first), number=NRUNS))
>
> print("\ndf_has_half_float_nans_last:")
> print("for-loop:", timeit.timeit(lambda: fn1(df_has_half_float_nans_last), number=NRUNS))
> print("non-for-loop:", timeit.timeit(lambda: fn2(df_has_half_float_nans_last), number=NRUNS))
> ```
>
> Results (a sum of all 5 runs, which is how `timeit` reports): …
```python
        if fallback:
            # We cannot use the map-reduce approach here because non-numeric columns
            # are present and are requested to be processed. This happens because
            # map-reduce uses the "sum" aggregation, which always drops non-numeric
            # columns unconditionally, no matter what arguments are specified; that
            # is different from how "mean" handles non-numeric columns (in particular
            # datetime types). We could use a workaround converting datetime to
            # int64, but it only works when NaNs aren't present in the datetime
            # columns: NaNs converted to int64 produce wrong results and would have
            # to be handled differently, so we have to resort to the less efficient
            # approach of broadcasting the full axis in _apply_agg_function.
            result = self._apply_agg_function(lambda df: df.mean(*args, **kwargs))
```

**Review thread** (on the dispatching):

> I suggest moving this dispatching to the QueryCompiler level, calling groupby functions via … Or, instead of this, to make things clear, we should rename …
>
> I think renaming into …

**Review thread** (on lines +150 to +151):

> why does …
>
> Consider the following example: … Here, if we convert column "b" to int64 as …
>
> what would it take to make this case (or just all datetimelike cases) just use the pandas implementation? https://github.com/pandas-dev/pandas/blob/main/pandas/_libs/groupby.pyx#L721
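To make the int64/NaN caveat concrete, a small standalone sketch (plain pandas/NumPy; my illustration, not PR code):

```python
import numpy as np
import pandas as pd

# Clean column: datetime64 values are int64 nanoseconds since the epoch, so
# exact integer arithmetic reproduces the datetime mean.
s = pd.Series([np.datetime64("2000"), np.datetime64("2010")])
mean_as_int = s.astype("int64").sum() // len(s)
print(pd.Timestamp(mean_as_int))  # midpoint of the two dates
print(s.mean())                   # the same timestamp

# Column with NaT: NaT is stored as the int64 minimum (-2**63), so treated as
# a regular integer it poisons sums and means instead of being skipped.
s2 = pd.Series([np.datetime64("2000"), np.datetime64("NaT")])
print(s2.to_numpy().view("int64"))
```

This is exactly why the PR only takes the int64 shortcut when `hasnans` is false for every datetime column.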
```python
        else:
            if len(converted_columns) > 0:
                # Convert all datetime64 columns to int64 so that pandas "sum" works.
                self._df = self._df.astype(
                    {col: "int64" for col in converted_columns.keys()}
                )
```

**Review thread** (on lines +156 to +159):

> I still think that this is pandas-backend specific, because of its MapReduce implementation, and so should be moved to the QC. Let's wait for other reviewers' opinions though.
>
> in pandas this astype would make a copy, so we'd do …
>
> No, …
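On the copy question: `astype` returns a new frame under its default `copy=True`, so the line above rebinds `self._df` rather than mutating the original in place (a quick standalone check, my illustration):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
converted = df.astype({"a": "float64"})
print(converted is df)   # False: astype produced a new object
print(df.dtypes["a"])    # int64: the original frame is unchanged
```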
```python
            self._query_compiler = self._df._query_compiler
            result = self._wrap_aggregation(
                type(self._query_compiler).groupby_mean_numeric,
                lambda df, **kwargs: df.mean(*args, **kwargs),
                **kwargs,
            )
            if len(converted_columns) > 0:
                # Convert the int64 columns back to datetime64 types in the result.
                result = result.astype(
                    {col: dt for col, dt in converted_columns.items()}
                )
        return result

    def any(self, **kwargs):
        return self._wrap_aggregation(
```
The last diff hunk adds a regression test:

@@ -1466,6 +1466,62 @@ def test_agg_exceptions(operation):

```python
    eval_aggregation(*create_test_dfs(data), operation=operation)
```
@pytest.mark.parametrize("numeric_only", [True, False]) | ||
def test_mean_agg_different_types(numeric_only): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's also add a |
||
N = 200 | ||
fill_data = { | ||
"date": [ | ||
np.datetime64("2000"), | ||
np.datetime64("2010"), | ||
np.datetime64("2011"), | ||
np.datetime64("2011-06-15T00:00"), | ||
np.datetime64("2009-01-01"), | ||
] | ||
* (N // 5), | ||
"int_only": [2000, 2010, 2011, 2012, 2009] * (N // 5), | ||
"int_and_nan": [ | ||
2000, | ||
2010, | ||
2011, | ||
None, | ||
None, | ||
] | ||
* (N // 5), | ||
"float_only": [ | ||
2000.0, | ||
2010.0, | ||
2011.0, | ||
2012.0, | ||
2009.0, | ||
] | ||
* (N // 5), | ||
"float_and_nan": [ | ||
2000.0, | ||
2010.0, | ||
2011.0, | ||
None, | ||
None, | ||
] | ||
* (N // 5), | ||
} | ||
|
||
data1 = { | ||
"column_to_by": ["foo", "bar", "baz", "qux"] * (N // 4), | ||
} | ||
|
||
data2 = { | ||
f"{key}{i}": value | ||
for key, value in fill_data.items() | ||
for i in range(N // len(fill_data)) | ||
} | ||
|
||
data = {**data1, **data2} | ||
|
||
eval_aggregation( | ||
*create_test_dfs(data), operation="mean", numeric_only=numeric_only | ||
) | ||
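For intuition about what these mixed-dtype fixtures exercise, here is a shrunken standalone version in plain pandas (my own illustration; exact `numeric_only` defaults and datetime-mean support vary across pandas versions):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "column_to_by": ["foo", "bar", "baz", "qux"] * 2,
        "date": np.array(["2000", "2010", "2011", "2009"] * 2, dtype="datetime64[ns]"),
        "int_and_nan": [2000, 2010, None, None] * 2,
        "float_only": [2000.0, 2010.0, 2011.0, 2009.0] * 2,
    }
)

# numeric_only=True drops the datetime column from the result;
# numeric_only=False keeps it, which is the path the PR optimizes.
print(df.groupby("column_to_by").mean(numeric_only=True).columns.tolist())
print(df.groupby("column_to_by").mean(numeric_only=False).columns.tolist())
```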
```python
@pytest.mark.skip(
    "Pandas raises a ValueError on empty dictionary aggregation since 1.2.0"
    "It's unclear is that was made on purpose or it is a bug. That question"
```