TEST-#5014: Simplify adding new ASV benchmarks (#5015)
Signed-off-by: Myachev <anatoly.myachev@intel.com>
anmyachev authored Sep 24, 2022
1 parent 0a2c0de commit ff3425d
Showing 9 changed files with 90 additions and 132 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -470,7 +470,7 @@ jobs:
           # check pure pandas
           MODIN_ASV_USE_IMPL=pandas asv run --quick --strict --show-stderr --launch-method=spawn \
-            -b ^benchmarks -b ^io -b ^scalability | tee benchmarks.log
+            -b ^benchmarks -b ^io | tee benchmarks.log
 
           # HDK: ERR_OUT_OF_CPU_MEM: Not enough host memory to execute the query (MODIN#4270)
           # just disable test for testing - it works well in a machine with more memory
102 changes: 38 additions & 64 deletions asv_bench/benchmarks/benchmarks.py
@@ -19,7 +19,6 @@
 # define `MODIN_ASV_USE_IMPL` env var to choose library for using in performance
 # measurements
 
-import modin.pandas as pd
 import numpy as np
 
 from .utils import (
@@ -30,7 +29,6 @@
     random_string,
     random_columns,
     random_booleans,
-    ASV_USE_IMPL,
     GROUPBY_NGROUPS,
     IMPL,
     execute,
@@ -44,7 +42,6 @@ class BaseTimeGroupBy:
     def setup(self, shape, ngroups=5, groupby_ncols=1):
         ngroups = translator_groupby_ngroups(ngroups, shape)
         self.df, self.groupby_columns = generate_dataframe(
-            ASV_USE_IMPL,
             "int",
             *shape,
             RAND_LOW,
@@ -122,12 +119,8 @@ class TimeJoin:
     ]
 
     def setup(self, shapes, how, sort):
-        self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
-        )
-        self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
-        )
+        self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
+        self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
 
     def time_join(self, shapes, how, sort):
         # join dataframes on index to get the predictable shape
@@ -143,12 +136,8 @@ class TimeMerge:
     ]
 
     def setup(self, shapes, how, sort):
-        self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
-        )
-        self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
-        )
+        self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
+        self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
 
     def time_merge(self, shapes, how, sort):
         # merge dataframes by index to get the predictable shape
@@ -168,15 +157,11 @@ class TimeConcat:
     ]
 
     def setup(self, shapes, how, axis):
-        self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
-        )
-        self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
-        )
+        self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
+        self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
 
     def time_concat(self, shapes, how, axis):
-        execute(IMPL[ASV_USE_IMPL].concat([self.df1, self.df2], axis=axis, join=how))
+        execute(IMPL.concat([self.df1, self.df2], axis=axis, join=how))
 
 
 class TimeAppend:
@@ -187,12 +172,8 @@ class TimeAppend:
     ]
 
     def setup(self, shapes, sort):
-        self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
-        )
-        self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
-        )
+        self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
+        self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
         if sort:
             self.df1.columns = self.df1.columns[::-1]
 
@@ -209,12 +190,8 @@ class TimeBinaryOp:
     ]
 
     def setup(self, shapes, binary_op, axis):
-        self.df1 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[0], RAND_LOW, RAND_HIGH
-        )
-        self.df2 = generate_dataframe(
-            ASV_USE_IMPL, "int", *shapes[1], RAND_LOW, RAND_HIGH
-        )
+        self.df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
+        self.df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
         self.op = getattr(self.df1, binary_op)
 
     def time_binary_op(self, shapes, binary_op, axis):
@@ -242,9 +219,7 @@ def get_loc(df, loc, axis, item_length):
     )
 
     def setup(self, shape, item_length, loc, is_equal_indices):
-        self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH
-        ).copy()
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH).copy()
         self.loc, self.iloc = self.get_loc(
             self.df, loc, item_length=item_length, axis=1
         )
@@ -297,7 +272,7 @@ class TimeArithmetic:
     ]
 
     def setup(self, shape, axis):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
 
     def time_sum(self, shape, axis):
         execute(self.df.sum(axis=axis))
@@ -351,7 +326,7 @@ class TimeSortValues:
     ]
 
     def setup(self, shape, columns_number, ascending_list):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
         self.columns = random_columns(self.df.columns, columns_number)
         self.ascending = (
             random_booleans(columns_number)
@@ -372,7 +347,7 @@ class TimeDrop:
     ]
 
     def setup(self, shape, axis, drop_ncols):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
         drop_count = (
             int(len(self.df.axes[axis]) * drop_ncols)
             if isinstance(drop_ncols, float)
@@ -392,7 +367,7 @@ class TimeHead:
     ]
 
     def setup(self, shape, head_count):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
         self.head_count = (
             int(head_count * len(self.df.index))
             if isinstance(head_count, float)
@@ -411,7 +386,7 @@ class TimeTail:
     ]
 
     def setup(self, shape, tail_count):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
         self.tail_count = (
             int(tail_count * len(self.df.index))
             if isinstance(tail_count, float)
@@ -430,7 +405,7 @@ class TimeExplode:
 
     def setup(self, shape):
         self.df = generate_dataframe(
-            ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH, gen_unique_key=True
+            "int", *shape, RAND_LOW, RAND_HIGH, gen_unique_key=True
         )
 
     def time_explode(self, shape):
@@ -446,16 +421,15 @@ class TimeFillnaSeries:
     ]
 
     def setup(self, value_type, shape, limit):
-        pd = IMPL[ASV_USE_IMPL]
-        self.series = gen_nan_data(ASV_USE_IMPL, *shape)
+        self.series = gen_nan_data(*shape)
 
         if value_type == "scalar":
             self.value = 18.19
         elif value_type == "dict":
             self.value = {k: k * 1.23 for k in range(shape[0])}
         elif value_type == "Series":
-            self.value = pd.Series(
-                [k * 1.23 for k in range(shape[0])], index=pd.RangeIndex(shape[0])
+            self.value = IMPL.Series(
+                [k * 1.23 for k in range(shape[0])], index=IMPL.RangeIndex(shape[0])
             )
         else:
             assert False
@@ -479,25 +453,24 @@ class TimeFillnaDataFrame:
     ]
 
     def setup(self, value_type, shape, limit):
-        pd = IMPL[ASV_USE_IMPL]
-        self.df = gen_nan_data(ASV_USE_IMPL, *shape)
+        self.df = gen_nan_data(*shape)
         columns = self.df.columns
 
         if value_type == "scalar":
             self.value = 18.19
         elif value_type == "dict":
             self.value = {k: i * 1.23 for i, k in enumerate(columns)}
         elif value_type == "Series":
-            self.value = pd.Series(
+            self.value = IMPL.Series(
                 [i * 1.23 for i in range(len(columns))], index=columns
             )
         elif value_type == "DataFrame":
-            self.value = pd.DataFrame(
+            self.value = IMPL.DataFrame(
                 {
                     k: [i + j * 1.23 for j in range(shape[0])]
                     for i, k in enumerate(columns)
                 },
-                index=pd.RangeIndex(shape[0]),
+                index=IMPL.RangeIndex(shape[0]),
                 columns=columns,
             )
         else:
@@ -517,7 +490,6 @@ class BaseTimeValueCounts:
     def setup(self, shape, ngroups=5, subset=1):
         ngroups = translator_groupby_ngroups(ngroups, shape)
         self.df, self.subset = generate_dataframe(
-            ASV_USE_IMPL,
             "int",
             *shape,
             RAND_LOW,
@@ -585,11 +557,11 @@ class TimeIndexing:
     }
 
     def setup(self, shape, indexer_type):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
         trigger_import(self.df)
 
         self.indexer = self.indexer_getters[indexer_type](self.df)
-        if isinstance(self.indexer, (pd.Series, pd.DataFrame)):
+        if isinstance(self.indexer, (IMPL.Series, IMPL.DataFrame)):
             # HACK: Triggering `dtypes` meta-data computation in advance,
             # so it won't affect the `loc/iloc` time:
             self.indexer.dtypes
@@ -611,7 +583,7 @@ class TimeIndexingColumns:
     params = [get_benchmark_shapes("TimeIndexing")]
 
     def setup(self, shape):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
         trigger_import(self.df)
         self.numeric_indexer = [0, 1]
         self.labels_indexer = self.df.columns[self.numeric_indexer].tolist()
@@ -631,10 +603,12 @@ class TimeMultiIndexing:
     params = [get_benchmark_shapes("TimeMultiIndexing")]
 
     def setup(self, shape):
-        df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
 
-        index = pd.MultiIndex.from_product([df.index[: shape[0] // 2], ["bar", "foo"]])
-        columns = pd.MultiIndex.from_product(
+        index = IMPL.MultiIndex.from_product(
+            [df.index[: shape[0] // 2], ["bar", "foo"]]
+        )
+        columns = IMPL.MultiIndex.from_product(
             [df.columns[: shape[1] // 2], ["buz", "fuz"]]
         )
 
@@ -661,10 +635,10 @@ class TimeResetIndex:
     ]
 
     def setup(self, shape, drop, level):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
 
         if level:
-            index = pd.MultiIndex.from_product(
+            index = IMPL.MultiIndex.from_product(
                 [self.df.index[: shape[0] // 2], ["bar", "foo"]],
                 names=["level_1", "level_2"],
             )
@@ -683,7 +657,7 @@ class TimeAstype:
     ]
 
     def setup(self, shape, dtype, astype_ncolumns):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
         if astype_ncolumns == "all":
             self.astype_arg = dtype
         elif astype_ncolumns == "one":
@@ -702,7 +676,7 @@ class TimeDescribe:
     ]
 
     def setup(self, shape):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
 
     def time_describe(self, shape):
         execute(self.df.describe())
@@ -715,7 +689,7 @@ class TimeProperties:
     ]
 
     def setup(self, shape):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)
 
     def time_shape(self, shape):
        return self.df.shape
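Every hunk above makes the same two moves: `generate_dataframe` and `gen_nan_data` lose their leading `ASV_USE_IMPL` argument, and the `IMPL[ASV_USE_IMPL]` dictionary lookup becomes a plain `IMPL` module reference. The matching change to `asv_bench/benchmarks/utils` is among the files not loaded on this page, but judging from the `MODIN_ASV_USE_IMPL` comment kept at the top of `benchmarks.py`, `IMPL` is presumably resolved once at import time. A minimal sketch of that idea, assuming only the env-var name and the two implementations exercised in `ci.yml` (everything else is illustrative, not taken from the commit):

# Hypothetical sketch, not the actual utils.py change from this commit:
# resolve the implementation module once from the MODIN_ASV_USE_IMPL env
# var instead of threading an implementation name through every benchmark.
import os

_impl_name = os.environ.get("MODIN_ASV_USE_IMPL", "modin")

if _impl_name == "modin":
    import modin.pandas as IMPL
elif _impl_name == "pandas":
    import pandas as IMPL  # pure-pandas baseline, as exercised in ci.yml
else:
    raise ValueError(f"unsupported MODIN_ASV_USE_IMPL value: {_impl_name}")

With `IMPL` bound this way, call sites can use `IMPL.concat`, `IMPL.Series`, or `IMPL.MultiIndex` directly, exactly as the rewritten benchmarks do.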
(Diffs for the remaining 7 changed files were not loaded on this page.)
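Even without those remaining diffs, the net effect matches the commit title: a new benchmark no longer needs any implementation plumbing. A hypothetical benchmark written against the simplified API, following the patterns visible above (the `TimeTranspose` name and its shape key are illustrative, not part of this commit):

# Hypothetical new benchmark under the simplified API; the class name and
# shape key are illustrative. Data generation and timing follow the same
# pattern as the rewritten classes above.
from .utils import (
    RAND_HIGH,
    RAND_LOW,
    execute,
    generate_dataframe,
    get_benchmark_shapes,
)


class TimeTranspose:
    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeTranspose")]

    def setup(self, shape):
        self.df = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH)

    def time_transpose(self, shape):
        execute(self.df.transpose())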
