Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: rolling.quantile does not return an interpolated result #16247

Merged
merged 11 commits into from
Jul 10, 2017
185 changes: 185 additions & 0 deletions asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
from .pandas_vb_common import *
import pandas as pd
import numpy as np


class DataframeRolling(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.Ns = 10000
self.df = pd.DataFrame({'a': np.random.random(self.N)})
self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)})
self.wins = 10
self.winl = 1000

def time_rolling_quantile_0(self):
(self.df.rolling(self.wins).quantile(0.0))

def time_rolling_quantile_1(self):
(self.df.rolling(self.wins).quantile(1.0))

def time_rolling_quantile_median(self):
(self.df.rolling(self.wins).quantile(0.5))

def time_rolling_median(self):
(self.df.rolling(self.wins).median())

def time_rolling_median(self):
(self.df.rolling(self.wins).mean())

def time_rolling_max(self):
(self.df.rolling(self.wins).max())

def time_rolling_min(self):
(self.df.rolling(self.wins).min())

def time_rolling_std(self):
(self.df.rolling(self.wins).std())

def time_rolling_count(self):
(self.df.rolling(self.wins).count())

def time_rolling_skew(self):
(self.df.rolling(self.wins).skew())

def time_rolling_kurt(self):
(self.df.rolling(self.wins).kurt())

def time_rolling_sum(self):
(self.df.rolling(self.wins).sum())

def time_rolling_corr(self):
(self.dfs.rolling(self.wins).corr())

def time_rolling_cov(self):
(self.dfs.rolling(self.wins).cov())

def time_rolling_quantile_0_l(self):
(self.df.rolling(self.winl).quantile(0.0))

def time_rolling_quantile_1_l(self):
(self.df.rolling(self.winl).quantile(1.0))

def time_rolling_quantile_median_l(self):
(self.df.rolling(self.winl).quantile(0.5))

def time_rolling_median_l(self):
(self.df.rolling(self.winl).median())

def time_rolling_median_l(self):
(self.df.rolling(self.winl).mean())

def time_rolling_max_l(self):
(self.df.rolling(self.winl).max())

def time_rolling_min_l(self):
(self.df.rolling(self.winl).min())

def time_rolling_std_l(self):
(self.df.rolling(self.wins).std())

def time_rolling_count_l(self):
(self.df.rolling(self.wins).count())

def time_rolling_skew_l(self):
(self.df.rolling(self.wins).skew())

def time_rolling_kurt_l(self):
(self.df.rolling(self.wins).kurt())

def time_rolling_sum_l(self):
(self.df.rolling(self.wins).sum())


class SeriesRolling(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.Ns = 10000
self.df = pd.DataFrame({'a': np.random.random(self.N)})
self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)})
self.sr = self.df.a
self.srs = self.dfs.a
self.wins = 10
self.winl = 1000

def time_rolling_quantile_0(self):
(self.sr.rolling(self.wins).quantile(0.0))

def time_rolling_quantile_1(self):
(self.sr.rolling(self.wins).quantile(1.0))

def time_rolling_quantile_median(self):
(self.sr.rolling(self.wins).quantile(0.5))

def time_rolling_median(self):
(self.sr.rolling(self.wins).median())

def time_rolling_median(self):
(self.sr.rolling(self.wins).mean())

def time_rolling_max(self):
(self.sr.rolling(self.wins).max())

def time_rolling_min(self):
(self.sr.rolling(self.wins).min())

def time_rolling_std(self):
(self.sr.rolling(self.wins).std())

def time_rolling_count(self):
(self.sr.rolling(self.wins).count())

def time_rolling_skew(self):
(self.sr.rolling(self.wins).skew())

def time_rolling_kurt(self):
(self.sr.rolling(self.wins).kurt())

def time_rolling_sum(self):
(self.sr.rolling(self.wins).sum())

def time_rolling_corr(self):
(self.srs.rolling(self.wins).corr())

def time_rolling_cov(self):
(self.srs.rolling(self.wins).cov())

def time_rolling_quantile_0_l(self):
(self.sr.rolling(self.winl).quantile(0.0))

def time_rolling_quantile_1_l(self):
(self.sr.rolling(self.winl).quantile(1.0))

def time_rolling_quantile_median_l(self):
(self.sr.rolling(self.winl).quantile(0.5))

def time_rolling_median_l(self):
(self.sr.rolling(self.winl).median())

def time_rolling_median_l(self):
(self.sr.rolling(self.winl).mean())

def time_rolling_max_l(self):
(self.sr.rolling(self.winl).max())

def time_rolling_min_l(self):
(self.sr.rolling(self.winl).min())

def time_rolling_std_l(self):
(self.sr.rolling(self.wins).std())

def time_rolling_count_l(self):
(self.sr.rolling(self.wins).count())

def time_rolling_skew_l(self):
(self.sr.rolling(self.wins).skew())

def time_rolling_kurt_l(self):
(self.sr.rolling(self.wins).kurt())

def time_rolling_sum_l(self):
(self.sr.rolling(self.wins).sum())
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,11 @@ Plotting

Groupby/Resample/Rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`)

- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`)
- Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`)
- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`)


Sparse
^^^^^^
Expand All @@ -190,6 +192,7 @@ Categorical
^^^^^^^^^^^



Other
^^^^^
- Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`)
15 changes: 13 additions & 2 deletions pandas/_libs/window.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1348,8 +1348,9 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,
bint is_variable
ndarray[int64_t] start, end
ndarray[double_t] output
double vlow, vhigh

if quantile < 0.0 or quantile > 1.0:
if quantile <= 0.0 or quantile >= 1.0:
raise ValueError("quantile value {0} not in [0, 1]".format(quantile))

# we use the Fixed/Variable Indexer here as the
Expand Down Expand Up @@ -1391,7 +1392,17 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,

if nobs >= minp:
idx = int(quantile * <double>(nobs - 1))
output[i] = skiplist.get(idx)

# Single value in skip list
if nobs == 1:
output[i] = skiplist.get(0)

# Interpolated quantile
else:
vlow = skiplist.get(idx)
vhigh = skiplist.get(idx + 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hadn't realized this before: skiplist.get is an O(log win) operation which we now have to perform twice. We also do a skiplist.insert and a skiplist.remove, which are also O(log win), and probably more expensive than skiplist.get, but a pessimistic estimation says the performance of this function will go down by 25%.

I think there are two things to do here:

  1. quantify that slow down by running a couple of benchmarks with and without this PR, and leave a record of the results for further reference.
  2. if the slowdown is significant, we could gain the speed back by doing some changes to the skiplist internals: when the first index is fetched, getting the next one is trivially simple and cheap, so we could add a .get_two() method to IndexableSkiplist and use it here. I think that would be work for another PR, but creating an issue for tracking based on the results of the benchmarks should be part of merging this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ran some very simple performance tests. The new version is indeed 25% slower.

New

In [1]: import pandas
In [2]: pandas.__version__
Out[2]: '0.20.0+19.g36db9bdaf'
In [3]: import numpy as np
In [4]: df = pandas.DataFrame({'a': np.random.random(1000000)})
In [5]: %timeit df.a.rolling(10).quantile(0.5)
1 loop, best of 3: 1.78 s per loop
In [6]: %timeit df.a.rolling(10).median()
1 loop, best of 3: 460 ms per loop

Old

In [1]: import pandas
In [2]: pandas.__version__
Out[2]: '0.20.1'
In [3]: import numpy as np
In [4]: df = pandas.DataFrame({'a': np.random.random(1000000)})
In [5]: %timeit df.a.rolling(10).quantile(0.5)
1 loop, best of 3: 1.4 s per loop
In [6]: %timeit df.a.rolling(10).median()
1 loop, best of 3: 451 ms per loop

YMMV, of course.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for doing this! Seeing this, there is a more obvious approach to speeding this in a future PR: use the C implemented skiplist that median uses, instead of the Cython IndexableSkiplist used by quantile. But that's clearly a different PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jaimefrio the issue is that for median you are only doing a single calculation. when doing many calculations (like here) a skip list is far faster. The actual impl of the skiplist could be improved a lot though (it has python objects inside it).

output[i] = (vlow + (vhigh - vlow) *
(quantile * (nobs - 1) - idx))
else:
output[i] = NaN

Expand Down
11 changes: 9 additions & 2 deletions pandas/core/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -975,8 +975,15 @@ def quantile(self, quantile, **kwargs):

def f(arg, *args, **kwargs):
minp = _use_window(self.min_periods, window)
return _window.roll_quantile(arg, window, minp, indexi,
self.closed, quantile)
if quantile == 1.0:
return _window.roll_max(arg, window, minp, indexi,
self.closed)
elif quantile == 0.0:
return _window.roll_min(arg, window, minp, indexi,
self.closed)
else:
return _window.roll_quantile(arg, window, minp, indexi,
self.closed, quantile)

return self._apply(f, 'quantile', quantile=quantile,
**kwargs)
Expand Down
41 changes: 38 additions & 3 deletions pandas/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,8 +1122,19 @@ def test_rolling_quantile(self):
def scoreatpercentile(a, per):
values = np.sort(a, axis=0)

idx = per / 1. * (values.shape[0] - 1)
return values[int(idx)]
idx = int(per / 1. * (values.shape[0] - 1))

if idx == values.shape[0] - 1:
retval = values[-1]

else:
qlow = float(idx) / float(values.shape[0] - 1)
qhig = float(idx + 1) / float(values.shape[0] - 1)
vlow = values[idx]
vhig = values[idx + 1]
retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow)

return retval

for q in qs:

Expand All @@ -1138,6 +1149,30 @@ def alt(x):

self._check_moment_func(f, alt, name='quantile', quantile=q)

def test_rolling_quantile_np_percentile(self):
# #9413: Tests that rolling window's quantile default behavior
# is analogus to Numpy's percentile
row = 10
col = 5
idx = pd.date_range(20100101, periods=row, freq='B')
df = pd.DataFrame(np.random.rand(row * col).reshape((row, -1)),
index=idx)

df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0)
np_percentile = np.percentile(df, [25, 50, 75], axis=0)

tm.assert_almost_equal(df_quantile.values, np.array(np_percentile))

def test_rolling_quantile_series(self):
# #16211: Tests that rolling window's quantile default behavior
# is analogus to pd.Series' quantile
arr = np.arange(100)
s = pd.Series(arr)
q1 = s.quantile(0.1)
q2 = s.rolling(100).quantile(0.1).iloc[-1]

tm.assert_almost_equal(q1, q2)

def test_rolling_quantile_param(self):
ser = Series([0.0, .1, .5, .9, 1.0])

Expand Down Expand Up @@ -3558,7 +3593,7 @@ def test_ragged_quantile(self):

result = df.rolling(window='2s', min_periods=1).quantile(0.5)
expected = df.copy()
expected['B'] = [0.0, 1, 1.0, 3.0, 3.0]
expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
tm.assert_frame_equal(result, expected)

def test_ragged_std(self):
Expand Down