CLN: ASV Algorithms benchmark #18423

Merged: 4 commits, Nov 25, 2017
asv_bench/benchmarks/algorithms.py (168 changes: 86 additions & 82 deletions)
@@ -1,7 +1,6 @@
 from importlib import import_module

 import numpy as np
-
 import pandas as pd
 from pandas.util import testing as tm

@@ -12,113 +11,118 @@
     except:
         pass


-class Algorithms(object):
+class Factorize(object):

     goal_time = 0.2

-    def setup(self):
-        N = 100000
-        np.random.seed(1234)
+    params = [True, False]
+    param_names = ['sort']

-        self.int_unique = pd.Int64Index(np.arange(N * 5))
-        # cache is_unique
-        self.int_unique.is_unique
+    def setup(self, sort):
+        N = 10**5
+        np.random.seed(1234)
+        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
+        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
+        self.string_idx = tm.makeStringIndex(N)

-        self.int = pd.Int64Index(np.arange(N).repeat(5))
-        self.float = pd.Float64Index(np.random.randn(N).repeat(5))
+    def time_factorize_int(self, sort):
+        self.int_idx.factorize(sort=sort)

-        # Convenience naming.
-        self.checked_add = pd.core.algorithms.checked_add_with_arr
+    def time_factorize_float(self, sort):
+        self.float_idx.factorize(sort=sort)

-        self.arr = np.arange(1000000)
-        self.arrpos = np.arange(1000000)
-        self.arrneg = np.arange(-1000000, 0)
-        self.arrmixed = np.array([1, -1]).repeat(500000)
-        self.strings = tm.makeStringIndex(100000)
+    def time_factorize_string(self, sort):
+        self.string_idx.factorize(sort=sort)

-        self.arr_nan = np.random.choice([True, False], size=1000000)
-        self.arrmixed_nan = np.random.choice([True, False], size=1000000)

-        # match
-        self.uniques = tm.makeStringIndex(1000).values
-        self.all = self.uniques.repeat(10)
+class Duplicated(object):

-    def time_factorize_string(self):
-        self.strings.factorize()
+    goal_time = 0.2

-    def time_factorize_int(self):
-        self.int.factorize()
+    params = ['first', 'last', False]
+    param_names = ['keep']

-    def time_factorize_float(self):
-        self.int.factorize()
+    def setup(self, keep):
+        N = 10**5
+        np.random.seed(1234)
+        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
+        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
+        self.string_idx = tm.makeStringIndex(N)

-    def time_duplicated_int_unique(self):
-        self.int_unique.duplicated()
+    def time_duplicated_int(self, keep):
+        self.int_idx.duplicated(keep=keep)

-    def time_duplicated_int(self):
-        self.int.duplicated()
+    def time_duplicated_float(self, keep):
+        self.float_idx.duplicated(keep=keep)

-    def time_duplicated_float(self):
-        self.float.duplicated()
+    def time_duplicated_string(self, keep):
+        self.string_idx.duplicated(keep=keep)

-    def time_match_strings(self):
-        pd.match(self.all, self.uniques)

-    def time_add_overflow_pos_scalar(self):
-        self.checked_add(self.arr, 1)
+class DuplicatedUniqueIndex(object):

-    def time_add_overflow_neg_scalar(self):
-        self.checked_add(self.arr, -1)
+    goal_time = 0.2

-    def time_add_overflow_zero_scalar(self):
-        self.checked_add(self.arr, 0)
+    def setup(self):
+        N = 10**5
+        self.idx_int_dup = pd.Int64Index(np.arange(N * 5))
+        # cache is_unique
+        self.idx_int_dup.is_unique

-    def time_add_overflow_pos_arr(self):
-        self.checked_add(self.arr, self.arrpos)
+    def time_duplicated_unique_int(self):
+        self.idx_int_dup.duplicated()

-    def time_add_overflow_neg_arr(self):
-        self.checked_add(self.arr, self.arrneg)

-    def time_add_overflow_mixed_arr(self):
-        self.checked_add(self.arr, self.arrmixed)
+class Match(object):

-    def time_add_overflow_first_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
+    goal_time = 0.2

-    def time_add_overflow_second_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
+    def setup(self):
+        np.random.seed(1234)
+        self.uniques = tm.makeStringIndex(1000).values
+        self.all = self.uniques.repeat(10)

-    def time_add_overflow_both_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
-                         b_mask=self.arrmixed_nan)
+    def time_match_string(self):
+        pd.match(self.all, self.uniques)


 class Hashing(object):

     goal_time = 0.2

-    def setup(self):
-        N = 100000
-
-        self.df = pd.DataFrame(
-            {'A': pd.Series(tm.makeStringIndex(100).take(
-                np.random.randint(0, 100, size=N))),
-             'B': pd.Series(tm.makeStringIndex(10000).take(
-                 np.random.randint(0, 10000, size=N))),
-             'D': np.random.randn(N),
-             'E': np.arange(N),
-             'F': pd.date_range('20110101', freq='s', periods=N),
-             'G': pd.timedelta_range('1 day', freq='s', periods=N),
-             })
-        self.df['C'] = self.df['B'].astype('category')
-        self.df.iloc[10:20] = np.nan
-
-    def time_frame(self):
-        hashing.hash_pandas_object(self.df)
-
-    def time_series_int(self):
-        hashing.hash_pandas_object(self.df.E)
-
-    def time_series_string(self):
-        hashing.hash_pandas_object(self.df.B)
-
-    def time_series_categorical(self):
-        hashing.hash_pandas_object(self.df.C)
+    def setup_cache(self):
+        np.random.seed(1234)
+        N = 10**5
+
+        df = pd.DataFrame(
+            {'strings': pd.Series(tm.makeStringIndex(10000).take(
+                np.random.randint(0, 10000, size=N))),
+             'floats': np.random.randn(N),
+             'ints': np.arange(N),
+             'dates': pd.date_range('20110101', freq='s', periods=N),
+             'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
+        df['categories'] = df['strings'].astype('category')
+        df.iloc[10:20] = np.nan
+        return df
+
+    def time_frame(self, df):
+        hashing.hash_pandas_object(df)
+
+    def time_series_int(self, df):
+        hashing.hash_pandas_object(df['ints'])
+
+    def time_series_string(self, df):
+        hashing.hash_pandas_object(df['strings'])
+
+    def time_series_float(self, df):
+        hashing.hash_pandas_object(df['floats'])
+
+    def time_series_categorical(self, df):
+        hashing.hash_pandas_object(df['categories'])
+
+    def time_series_timedeltas(self, df):
+        hashing.hash_pandas_object(df['timedeltas'])
+
+    def time_series_dates(self, df):
+        hashing.hash_pandas_object(df['dates'])
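
A note on the ASV conventions the new classes rely on: when a benchmark class defines `params` and `param_names`, asv runs every `time_*` method once per parameter value and passes that value as an extra argument to `setup` and to the timed method, while the object returned by `setup_cache` is built once and handed to the benchmark methods instead of being rebuilt for every run. Below is a minimal, hand-runnable sketch of the parameterized pattern; the `FactorizeSketch` class and its sizes are illustrative and not part of this PR.

import numpy as np
import pandas as pd


class FactorizeSketch(object):
    # asv runs each time_* method once per value in `params`, passing the
    # value (named 'sort' via param_names) to setup and to the method itself
    params = [True, False]
    param_names = ['sort']

    def setup(self, sort):
        np.random.seed(1234)
        self.idx = pd.Index(np.arange(10**4).repeat(5))

    def time_factorize(self, sort):
        self.idx.factorize(sort=sort)


# the same calls asv would make, driven by hand as a quick smoke test
bench = FactorizeSketch()
for sort in FactorizeSketch.params:
    bench.setup(sort)
    bench.time_factorize(sort)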
asv_bench/benchmarks/binary_ops.py (44 changes: 44 additions & 0 deletions)
@@ -1,5 +1,6 @@
 import numpy as np
 from pandas import DataFrame, Series, date_range
+from pandas.core.algorithms import checked_add_with_arr
 try:
     import pandas.core.computation.expressions as expr
 except ImportError:
@@ -108,3 +109,46 @@ def time_timestamp_ops_diff(self, tz):

     def time_timestamp_ops_diff_with_shift(self, tz):
         self.s - self.s.shift()
+
+
+class AddOverflowScalar(object):
+
+    goal_time = 0.2
+
+    params = [1, -1, 0]
+    param_names = ['scalar']
+
+    def setup(self, scalar):
+        N = 10**6
+        self.arr = np.arange(N)
+
+    def time_add_overflow_scalar(self, scalar):
+        checked_add_with_arr(self.arr, scalar)
+
+
+class AddOverflowArray(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        N = 10**6
+        self.arr = np.arange(N)
+        self.arr_rev = np.arange(-N, 0)
+        self.arr_mixed = np.array([1, -1]).repeat(N / 2)
+        self.arr_nan_1 = np.random.choice([True, False], size=N)
+        self.arr_nan_2 = np.random.choice([True, False], size=N)
+
+    def time_add_overflow_arr_rev(self):
+        checked_add_with_arr(self.arr, self.arr_rev)
+
+    def time_add_overflow_arr_mask_nan(self):
+        checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)
+
+    def time_add_overflow_b_mask_nan(self):
+        checked_add_with_arr(self.arr, self.arr_mixed,
+                             b_mask=self.arr_nan_1)
+
+    def time_add_overflow_both_arg_nan(self):
+        checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
+                             b_mask=self.arr_nan_2)
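
For readers who have not met the private helper these benchmarks exercise: `checked_add_with_arr` is pandas' overflow-checked int64 addition, and the `arr_mask`/`b_mask` arguments appear to mark positions (for example missing values) to exclude from the check. A small interactive sketch follows; the exact exception raised is my assumption about the helper's behaviour, not something stated in this diff.

import numpy as np
from pandas.core.algorithms import checked_add_with_arr

arr = np.array([1, np.iinfo(np.int64).max], dtype='int64')

# plain numpy addition would silently wrap around here; the checked helper
# is expected to raise instead (OverflowError is an assumption on my part)
try:
    checked_add_with_arr(arr, 1)
except OverflowError as exc:
    print('overflow detected:', exc)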