From a43c1576ce3d94bc82f7cdd63531280ced5a9fa0 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Mon, 10 Jul 2017 12:15:08 +0200 Subject: [PATCH] BUG: rolling.quantile does not return an interpolated result (#16247) --- asv_bench/benchmarks/rolling.py | 185 ++++++++++++++++++++++++++++++++ doc/source/whatsnew/v0.21.0.txt | 5 +- pandas/_libs/window.pyx | 15 ++- pandas/core/window.py | 11 +- pandas/tests/test_window.py | 41 ++++++- 5 files changed, 249 insertions(+), 8 deletions(-) create mode 100644 asv_bench/benchmarks/rolling.py diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py new file mode 100644 index 0000000000000..9da9d0b855323 --- /dev/null +++ b/asv_bench/benchmarks/rolling.py @@ -0,0 +1,185 @@ +from .pandas_vb_common import * +import pandas as pd +import numpy as np + + +class DataframeRolling(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.Ns = 10000 + self.df = pd.DataFrame({'a': np.random.random(self.N)}) + self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) + self.wins = 10 + self.winl = 1000 + + def time_rolling_quantile_0(self): + (self.df.rolling(self.wins).quantile(0.0)) + + def time_rolling_quantile_1(self): + (self.df.rolling(self.wins).quantile(1.0)) + + def time_rolling_quantile_median(self): + (self.df.rolling(self.wins).quantile(0.5)) + + def time_rolling_median(self): + (self.df.rolling(self.wins).median()) + + def time_rolling_median(self): + (self.df.rolling(self.wins).mean()) + + def time_rolling_max(self): + (self.df.rolling(self.wins).max()) + + def time_rolling_min(self): + (self.df.rolling(self.wins).min()) + + def time_rolling_std(self): + (self.df.rolling(self.wins).std()) + + def time_rolling_count(self): + (self.df.rolling(self.wins).count()) + + def time_rolling_skew(self): + (self.df.rolling(self.wins).skew()) + + def time_rolling_kurt(self): + (self.df.rolling(self.wins).kurt()) + + def time_rolling_sum(self): + (self.df.rolling(self.wins).sum()) + + def 
time_rolling_corr(self): + (self.dfs.rolling(self.wins).corr()) + + def time_rolling_cov(self): + (self.dfs.rolling(self.wins).cov()) + + def time_rolling_quantile_0_l(self): + (self.df.rolling(self.winl).quantile(0.0)) + + def time_rolling_quantile_1_l(self): + (self.df.rolling(self.winl).quantile(1.0)) + + def time_rolling_quantile_median_l(self): + (self.df.rolling(self.winl).quantile(0.5)) + + def time_rolling_median_l(self): + (self.df.rolling(self.winl).median()) + + def time_rolling_median_l(self): + (self.df.rolling(self.winl).mean()) + + def time_rolling_max_l(self): + (self.df.rolling(self.winl).max()) + + def time_rolling_min_l(self): + (self.df.rolling(self.winl).min()) + + def time_rolling_std_l(self): + (self.df.rolling(self.wins).std()) + + def time_rolling_count_l(self): + (self.df.rolling(self.wins).count()) + + def time_rolling_skew_l(self): + (self.df.rolling(self.wins).skew()) + + def time_rolling_kurt_l(self): + (self.df.rolling(self.wins).kurt()) + + def time_rolling_sum_l(self): + (self.df.rolling(self.wins).sum()) + + +class SeriesRolling(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.Ns = 10000 + self.df = pd.DataFrame({'a': np.random.random(self.N)}) + self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) + self.sr = self.df.a + self.srs = self.dfs.a + self.wins = 10 + self.winl = 1000 + + def time_rolling_quantile_0(self): + (self.sr.rolling(self.wins).quantile(0.0)) + + def time_rolling_quantile_1(self): + (self.sr.rolling(self.wins).quantile(1.0)) + + def time_rolling_quantile_median(self): + (self.sr.rolling(self.wins).quantile(0.5)) + + def time_rolling_median(self): + (self.sr.rolling(self.wins).median()) + + def time_rolling_median(self): + (self.sr.rolling(self.wins).mean()) + + def time_rolling_max(self): + (self.sr.rolling(self.wins).max()) + + def time_rolling_min(self): + (self.sr.rolling(self.wins).min()) + + def time_rolling_std(self): + (self.sr.rolling(self.wins).std()) + + def 
time_rolling_count(self): + (self.sr.rolling(self.wins).count()) + + def time_rolling_skew(self): + (self.sr.rolling(self.wins).skew()) + + def time_rolling_kurt(self): + (self.sr.rolling(self.wins).kurt()) + + def time_rolling_sum(self): + (self.sr.rolling(self.wins).sum()) + + def time_rolling_corr(self): + (self.srs.rolling(self.wins).corr()) + + def time_rolling_cov(self): + (self.srs.rolling(self.wins).cov()) + + def time_rolling_quantile_0_l(self): + (self.sr.rolling(self.winl).quantile(0.0)) + + def time_rolling_quantile_1_l(self): + (self.sr.rolling(self.winl).quantile(1.0)) + + def time_rolling_quantile_median_l(self): + (self.sr.rolling(self.winl).quantile(0.5)) + + def time_rolling_median_l(self): + (self.sr.rolling(self.winl).median()) + + def time_rolling_median_l(self): + (self.sr.rolling(self.winl).mean()) + + def time_rolling_max_l(self): + (self.sr.rolling(self.winl).max()) + + def time_rolling_min_l(self): + (self.sr.rolling(self.winl).min()) + + def time_rolling_std_l(self): + (self.sr.rolling(self.wins).std()) + + def time_rolling_count_l(self): + (self.sr.rolling(self.wins).count()) + + def time_rolling_skew_l(self): + (self.sr.rolling(self.wins).skew()) + + def time_rolling_kurt_l(self): + (self.sr.rolling(self.wins).kurt()) + + def time_rolling_sum_l(self): + (self.sr.rolling(self.wins).sum()) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 43bfebd0c2e59..1edbf1638d233 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -168,9 +168,11 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) +- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) +- Bug in 
``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) + Sparse ^^^^^^ @@ -191,6 +193,7 @@ Categorical ^^^^^^^^^^^ + Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 3bb8abe26c781..2450eea5500cd 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1348,8 +1348,9 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, bint is_variable ndarray[int64_t] start, end ndarray[double_t] output + double vlow, vhigh - if quantile < 0.0 or quantile > 1.0: + if quantile <= 0.0 or quantile >= 1.0: raise ValueError("quantile value {0} not in [0, 1]".format(quantile)) # we use the Fixed/Variable Indexer here as the @@ -1391,7 +1392,17 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, if nobs >= minp: idx = int(quantile * (nobs - 1)) - output[i] = skiplist.get(idx) + + # Single value in skip list + if nobs == 1: + output[i] = skiplist.get(0) + + # Interpolated quantile + else: + vlow = skiplist.get(idx) + vhigh = skiplist.get(idx + 1) + output[i] = (vlow + (vhigh - vlow) * + (quantile * (nobs - 1) - idx)) else: output[i] = NaN diff --git a/pandas/core/window.py b/pandas/core/window.py index 02b508bb94e4c..57611794c375f 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -975,8 +975,15 @@ def quantile(self, quantile, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return _window.roll_quantile(arg, window, minp, indexi, - self.closed, quantile) + if quantile == 1.0: + return _window.roll_max(arg, window, minp, indexi, + self.closed) + elif quantile == 0.0: + return _window.roll_min(arg, window, minp, indexi, + self.closed) + else: + return _window.roll_quantile(arg, window, minp, indexi, + self.closed, quantile) return self._apply(f, 
'quantile', quantile=quantile, **kwargs) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 9c3765ffdb716..3ba5d2065cddf 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1122,8 +1122,19 @@ def test_rolling_quantile(self): def scoreatpercentile(a, per): values = np.sort(a, axis=0) - idx = per / 1. * (values.shape[0] - 1) - return values[int(idx)] + idx = int(per / 1. * (values.shape[0] - 1)) + + if idx == values.shape[0] - 1: + retval = values[-1] + + else: + qlow = float(idx) / float(values.shape[0] - 1) + qhig = float(idx + 1) / float(values.shape[0] - 1) + vlow = values[idx] + vhig = values[idx + 1] + retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow) + + return retval for q in qs: @@ -1138,6 +1149,30 @@ def alt(x): self._check_moment_func(f, alt, name='quantile', quantile=q) + def test_rolling_quantile_np_percentile(self): + # #9413: Tests that rolling window's quantile default behavior + # is analogous to NumPy's percentile + row = 10 + col = 5 + idx = pd.date_range(20100101, periods=row, freq='B') + df = pd.DataFrame(np.random.rand(row * col).reshape((row, -1)), + index=idx) + + df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0) + np_percentile = np.percentile(df, [25, 50, 75], axis=0) + + tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) + + def test_rolling_quantile_series(self): + # #16211: Tests that rolling window's quantile default behavior + # is analogous to pd.Series' quantile + arr = np.arange(100) + s = pd.Series(arr) + q1 = s.quantile(0.1) + q2 = s.rolling(100).quantile(0.1).iloc[-1] + + tm.assert_almost_equal(q1, q2) + def test_rolling_quantile_param(self): ser = Series([0.0, .1, .5, .9, 1.0]) @@ -3558,7 +3593,7 @@ def test_ragged_quantile(self): result = df.rolling(window='2s', min_periods=1).quantile(0.5) expected = df.copy() - expected['B'] = [0.0, 1, 1.0, 3.0, 3.0] + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def
test_ragged_std(self):