
BENCH: add some cases for join and merge ops from pandas #5021

Merged: 3 commits, Oct 10, 2022
63 changes: 62 additions & 1 deletion asv_bench/benchmarks/benchmarks.py
@@ -20,6 +20,7 @@
# measurements

import numpy as np
import pandas._testing as tm

from .utils import (
generate_dataframe,
@@ -127,12 +128,56 @@ def time_join(self, shapes, how, sort):
execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))


class TimeJoinStringIndex:
param_names = ["shapes", "sort"]
Collaborator:

We likely want to benchmark the left and inner values for the how parameter, don't we?

Collaborator:

We already have benchmarks for the parameter. I think we don't need another benchmark for this as it seems like a duplication to me.

params = [
get_benchmark_shapes("TimeJoinStringIndex"),
[True, False],
]

def setup(self, shapes, sort):
assert shapes[0] % 100 == 0, "implementation restriction"
level1 = tm.makeStringIndex(10).values
level2 = tm.makeStringIndex(shapes[0] // 100).values
codes1 = np.arange(10).repeat(shapes[0] // 100)
codes2 = np.tile(np.arange(shapes[0] // 100), 10)
index2 = IMPL.MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
self.df_multi = IMPL.DataFrame(
np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"]
)

self.key1 = np.tile(level1.take(codes1), 10)
self.key2 = np.tile(level2.take(codes2), 10)
self.df = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH)
# drop the last two columns so the frame keeps the source shape once the two key columns are added below
self.df = self.df.drop(columns=self.df.columns[-2:])
self.df["key1"] = self.key1
self.df["key2"] = self.key2
execute(self.df)

self.df_key1 = IMPL.DataFrame(
np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"]
)
self.df_key2 = IMPL.DataFrame(
np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"]
)

def time_join_dataframe_index_multi(self, shapes, sort):
execute(self.df.join(self.df_multi, on=["key1", "key2"], sort=sort))

def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
execute(self.df.join(self.df_key2, on="key2", sort=sort))

def time_join_dataframe_index_single_key_small(self, shapes, sort):
execute(self.df.join(self.df_key1, on="key1", sort=sort))


class TimeMerge:
param_names = ["shapes", "how", "sort"]
params = [
get_benchmark_shapes("TimeMerge"),
["left", "inner"],
[False],
[True, False],
]

def setup(self, shapes, how, sort):
@@ -147,6 +192,19 @@ def time_merge(self, shapes, how, sort):
)
)

def time_merge_default(self, shapes, how, sort):
execute(IMPL.merge(self.df1, self.df2, how=how, sort=sort))

def time_merge_dataframe_empty_right(self, shapes, how, sort):
# Getting an empty dataframe using `iloc` should be very fast,
# so the impact on the time of the merge operation should be negligible.
execute(IMPL.merge(self.df1, self.df2.iloc[:0], how=how, sort=sort))

def time_merge_dataframe_empty_left(self, shapes, how, sort):
# Getting an empty dataframe using `iloc` should be very fast,
# so the impact on the time of the merge operation should be negligible.
execute(IMPL.merge(self.df1.iloc[:0], self.df2, how=how, sort=sort))


class TimeMergeCategoricals:
param_names = ["shapes", "data_type"]
@@ -759,3 +817,6 @@ def time_columns(self, shape):

def time_index(self, shape):
return self.df.index


from .utils import setup # noqa: E402, F401
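
As an aside, a small plain-pandas illustration (not part of the PR; the column names are made up) of the empty-frame merge cases added above: .iloc[:0] keeps the columns and dtypes but drops every row, so the slicing itself is cheap and the merge dominates the measured time.

import pandas as pd

left = pd.DataFrame({"key": [1, 2, 3], "a": [4, 5, 6]})
right = pd.DataFrame({"key": [1, 3, 5], "b": [7, 8, 9]})

# Empty right operand: zero rows, same schema as `right`.
empty_right = right.iloc[:0]

# how="left" keeps every left row; the joined `b` column is all NaN.
print(left.merge(empty_right, how="left", sort=False).shape)   # (3, 3)

# how="inner" against an empty operand produces an empty result.
print(left.merge(empty_right, how="inner", sort=False).shape)  # (0, 3)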
2 changes: 2 additions & 0 deletions asv_bench/benchmarks/utils/__init__.py
@@ -32,6 +32,7 @@
random_booleans,
translator_groupby_ngroups,
trigger_import,
setup,
)

__all__ = [
@@ -54,4 +55,5 @@
"random_booleans",
"translator_groupby_ngroups",
"trigger_import",
"setup",
]
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/utils/common.py
@@ -594,3 +594,10 @@ def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: list):
df.to_parquet(test_filenames[shape_id], index=False)

return test_filenames


def setup(*args, **kwargs): # noqa: GL08
# This function just needs to be imported into each benchmark file to
# set up the random seed before each function. ASV runs it automatically.
# https://asv.readthedocs.io/en/latest/writing_benchmarks.html
np.random.seed(42)
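
For context, a minimal sketch of how this hook is assumed to be consumed (the TimeExample class below is hypothetical and not part of this PR): ASV calls a module-level setup callable before every benchmark in the module, so re-exporting the function, as benchmarks.py does above, is enough to reseed NumPy's RNG for each measurement.

import numpy as np

from .utils import setup  # noqa: F401  (module-level hook; ASV calls it before each benchmark)


class TimeExample:
    # Hypothetical benchmark: np.random has just been reseeded to 42 by the
    # module-level hook, so the generated data is identical on every run.
    def setup(self):
        self.data = np.random.randn(1_000)

    def time_sum(self):
        self.data.sum()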
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/utils/data_shapes.py
@@ -169,6 +169,9 @@
DEFAULT_CONFIG["MergeCategoricals"] = (
[[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]]
)
DEFAULT_CONFIG["TimeJoinStringIndex"] = (
[[100_000, 64]] if ASV_DATASET_SIZE == "big" else [[1_000, 4]]
)
for config in (_DEFAULT_CONFIG_T, _DEFAULT_HDK_CONFIG_T):
for _shape, _names in config:
DEFAULT_CONFIG.update({_name: _shape for _name in _names})
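
For reference, a hedged sketch of how the shape configuration above is assumed to reach the benchmark classes; the real get_benchmark_shapes in asv_bench/benchmarks/utils may differ, and the environment lookup here is illustrative only.

import os

# Illustrative stand-ins; the real values and the dataset-size switch live in data_shapes.py.
ASV_DATASET_SIZE = os.environ.get("ASV_DATASET_SIZE", "small")

DEFAULT_CONFIG = {
    "TimeJoinStringIndex": [[100_000, 64]] if ASV_DATASET_SIZE == "big" else [[1_000, 4]],
}


def get_benchmark_shapes(bench_id):
    # Each entry is a [rows, columns] pair; ASV builds one parametrized case per shape,
    # e.g. params = [get_benchmark_shapes("TimeJoinStringIndex"), [True, False]].
    return DEFAULT_CONFIG[bench_id]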