From fa971a229a109c708fe3692a9e63ef8cad6964b6 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Fri, 5 May 2017 08:38:53 +0200 Subject: [PATCH 01/11] BUG: rolling.quantile now returns an interpolated result (#9413) --- pandas/_libs/window.pyx | 15 +++++++++++++- pandas/tests/test_window.py | 39 ++++++++++++++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 3bb8abe26c781..275ff26a7eb35 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1348,6 +1348,7 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, bint is_variable ndarray[int64_t] start, end ndarray[double_t] output + double qlow, qhigh, vlow, vhigh if quantile < 0.0 or quantile > 1.0: raise ValueError("quantile value {0} not in [0, 1]".format(quantile)) @@ -1391,7 +1392,19 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, if nobs >= minp: idx = int(quantile * (nobs - 1)) - output[i] = skiplist.get(idx) + + # Exactly last point + if idx == nobs - 1: + output[i] = skiplist.get(idx) + + # Interpolated quantile + else: + qlow = ( idx) / ((nobs - 1)) + qhigh = ( (idx + 1)) / ((nobs - 1)) + vlow = skiplist.get(idx) + vhigh = skiplist.get(idx + 1) + output[i] = vlow + (vhigh - vlow) * \ + (quantile - qlow) / (qhigh - qlow) else: output[i] = NaN diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 9c3765ffdb716..1d6070a15369f 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1122,8 +1122,19 @@ def test_rolling_quantile(self): def scoreatpercentile(a, per): values = np.sort(a, axis=0) - idx = per / 1. * (values.shape[0] - 1) - return values[int(idx)] + idx = int(per / 1. * (values.shape[0] - 1)) + + if idx == values.shape[0] - 1: + retval = values[-1] + + else: + qlow = float(idx) / float(values.shape[0] - 1) + qhig = float(idx + 1) / float(values.shape[0] - 1) + vlow = values[idx] + vhig = values[idx + 1] + retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow) + + return retval for q in qs: @@ -1138,6 +1149,28 @@ def alt(x): self._check_moment_func(f, alt, name='quantile', quantile=q) + def test_rolling_quantile_np_percentile(self): + # #9413 + row = 10 + col = 5 + idx = pd.date_range(20100101, periods=row, freq='B') + df = pd.DataFrame(np.random.rand(row * col).reshape((row, -1)), + index=idx) + + df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0) + np_percentile = np.percentile(df, [25, 50, 75], axis=0) + + tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) + + def test_rolling_quantile_series(self): + # #16211 + arr = np.arange(100) + s = pd.Series(arr) + q1 = s.quantile(0.1) + q2 = s.rolling(100).quantile(0.1).iloc[-1] + + tm.assert_almost_equal(q1, q2) + def test_rolling_quantile_param(self): ser = Series([0.0, .1, .5, .9, 1.0]) @@ -3558,7 +3591,7 @@ def test_ragged_quantile(self): result = df.rolling(window='2s', min_periods=1).quantile(0.5) expected = df.copy() - expected['B'] = [0.0, 1, 1.0, 3.0, 3.0] + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_std(self): From ad8c0344a6e645c668b58410676ddab1917e3594 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Mon, 8 May 2017 12:38:09 +0200 Subject: [PATCH 02/11] Implemented changes suggested by reviewer @jaimefrio --- doc/source/whatsnew/v0.21.0.txt | 2 ++ pandas/_libs/window.pyx | 12 +++++------- pandas/core/window.py | 11 +++++++++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index d5cc3d6ddca8e..033ab51f41496 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -189,6 +189,8 @@ Numeric Categorical ^^^^^^^^^^^ +- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than Series.quantile() and DataFrame.quantile() + Other ^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 275ff26a7eb35..5a8ff2836a225 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1348,7 +1348,7 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, bint is_variable ndarray[int64_t] start, end ndarray[double_t] output - double qlow, qhigh, vlow, vhigh + double vlow, vhigh if quantile < 0.0 or quantile > 1.0: raise ValueError("quantile value {0} not in [0, 1]".format(quantile)) @@ -1393,18 +1393,16 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, if nobs >= minp: idx = int(quantile * (nobs - 1)) - # Exactly last point - if idx == nobs - 1: - output[i] = skiplist.get(idx) + # Single value in skip list + if nobs == 1: + output[i] = skiplist.get(0) # Interpolated quantile else: - qlow = ( idx) / ((nobs - 1)) - qhigh = ( (idx + 1)) / ((nobs - 1)) vlow = skiplist.get(idx) vhigh = skiplist.get(idx + 1) output[i] = vlow + (vhigh - vlow) * \ - (quantile - qlow) / (qhigh - qlow) + (quantile * (nobs - 1) - idx) else: output[i] = NaN diff --git a/pandas/core/window.py b/pandas/core/window.py index 02b508bb94e4c..57611794c375f 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -975,8 +975,15 @@ def quantile(self, quantile, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return _window.roll_quantile(arg, window, minp, indexi, - self.closed, quantile) + if quantile == 1.0: + return _window.roll_max(arg, window, minp, indexi, + self.closed) + elif quantile == 0.0: + return _window.roll_min(arg, window, minp, indexi, + self.closed) + else: + return _window.roll_quantile(arg, window, minp, indexi, + self.closed, quantile) return self._apply(f, 'quantile', quantile=quantile, **kwargs) From 18da308463f5b146d89e3b8e9bc40d7f0dc89a1d Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Wed, 10 May 2017 09:24:09 +0200 Subject: [PATCH 03/11] quantile 1.0 and 0.0 are handled by the wrapper. --- pandas/_libs/window.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 5a8ff2836a225..2450eea5500cd 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1350,7 +1350,7 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, ndarray[double_t] output double vlow, vhigh - if quantile < 0.0 or quantile > 1.0: + if quantile <= 0.0 or quantile >= 1.0: raise ValueError("quantile value {0} not in [0, 1]".format(quantile)) # we use the Fixed/Variable Indexer here as the @@ -1401,8 +1401,8 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, else: vlow = skiplist.get(idx) vhigh = skiplist.get(idx + 1) - output[i] = vlow + (vhigh - vlow) * \ - (quantile * (nobs - 1) - idx) + output[i] = (vlow + (vhigh - vlow) * + (quantile * (nobs - 1) - idx)) else: output[i] = NaN From 69b0a488f7eea4442a2fc479199d2c27eb1e0852 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Wed, 10 May 2017 12:16:33 +0200 Subject: [PATCH 04/11] Move entry in Whatsnew to Groupby/Resample/Rolling --- doc/source/whatsnew/v0.21.0.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 033ab51f41496..93363b2944f72 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -167,9 +167,11 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) +- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) +- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than Series.quantile() and DataFrame.quantile() + Sparse ^^^^^^ @@ -189,7 +191,6 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than Series.quantile() and DataFrame.quantile() Other From 3acc6fd8b81ce688edf37656dc8e6858b29a4f28 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Mon, 22 May 2017 20:51:55 +0200 Subject: [PATCH 05/11] Added benchmarks for rolling windows --- asv_bench/benchmarks/rolling.py | 185 ++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 asv_bench/benchmarks/rolling.py diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py new file mode 100644 index 0000000000000..9da9d0b855323 --- /dev/null +++ b/asv_bench/benchmarks/rolling.py @@ -0,0 +1,185 @@ +from .pandas_vb_common import * +import pandas as pd +import numpy as np + + +class DataframeRolling(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.Ns = 10000 + self.df = pd.DataFrame({'a': np.random.random(self.N)}) + self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) + self.wins = 10 + self.winl = 1000 + + def time_rolling_quantile_0(self): + (self.df.rolling(self.wins).quantile(0.0)) + + def time_rolling_quantile_1(self): + (self.df.rolling(self.wins).quantile(1.0)) + + def time_rolling_quantile_median(self): + (self.df.rolling(self.wins).quantile(0.5)) + + def time_rolling_median(self): + (self.df.rolling(self.wins).median()) + + def time_rolling_median(self): + (self.df.rolling(self.wins).mean()) + + def time_rolling_max(self): + (self.df.rolling(self.wins).max()) + + def time_rolling_min(self): + (self.df.rolling(self.wins).min()) + + def time_rolling_std(self): + (self.df.rolling(self.wins).std()) + + def time_rolling_count(self): + (self.df.rolling(self.wins).count()) + + def time_rolling_skew(self): + (self.df.rolling(self.wins).skew()) + + def time_rolling_kurt(self): + (self.df.rolling(self.wins).kurt()) + + def time_rolling_sum(self): + (self.df.rolling(self.wins).sum()) + + def time_rolling_corr(self): + (self.dfs.rolling(self.wins).corr()) + + def time_rolling_cov(self): + (self.dfs.rolling(self.wins).cov()) + + def time_rolling_quantile_0_l(self): + (self.df.rolling(self.winl).quantile(0.0)) + + def time_rolling_quantile_1_l(self): + (self.df.rolling(self.winl).quantile(1.0)) + + def time_rolling_quantile_median_l(self): + (self.df.rolling(self.winl).quantile(0.5)) + + def time_rolling_median_l(self): + (self.df.rolling(self.winl).median()) + + def time_rolling_median_l(self): + (self.df.rolling(self.winl).mean()) + + def time_rolling_max_l(self): + (self.df.rolling(self.winl).max()) + + def time_rolling_min_l(self): + (self.df.rolling(self.winl).min()) + + def time_rolling_std_l(self): + (self.df.rolling(self.wins).std()) + + def time_rolling_count_l(self): + (self.df.rolling(self.wins).count()) + + def time_rolling_skew_l(self): + (self.df.rolling(self.wins).skew()) + + def time_rolling_kurt_l(self): + (self.df.rolling(self.wins).kurt()) + + def time_rolling_sum_l(self): + (self.df.rolling(self.wins).sum()) + + +class SeriesRolling(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.Ns = 10000 + self.df = pd.DataFrame({'a': np.random.random(self.N)}) + self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) + self.sr = self.df.a + self.srs = self.dfs.a + self.wins = 10 + self.winl = 1000 + + def time_rolling_quantile_0(self): + (self.sr.rolling(self.wins).quantile(0.0)) + + def time_rolling_quantile_1(self): + (self.sr.rolling(self.wins).quantile(1.0)) + + def time_rolling_quantile_median(self): + (self.sr.rolling(self.wins).quantile(0.5)) + + def time_rolling_median(self): + (self.sr.rolling(self.wins).median()) + + def time_rolling_median(self): + (self.sr.rolling(self.wins).mean()) + + def time_rolling_max(self): + (self.sr.rolling(self.wins).max()) + + def time_rolling_min(self): + (self.sr.rolling(self.wins).min()) + + def time_rolling_std(self): + (self.sr.rolling(self.wins).std()) + + def time_rolling_count(self): + (self.sr.rolling(self.wins).count()) + + def time_rolling_skew(self): + (self.sr.rolling(self.wins).skew()) + + def time_rolling_kurt(self): + (self.sr.rolling(self.wins).kurt()) + + def time_rolling_sum(self): + (self.sr.rolling(self.wins).sum()) + + def time_rolling_corr(self): + (self.srs.rolling(self.wins).corr()) + + def time_rolling_cov(self): + (self.srs.rolling(self.wins).cov()) + + def time_rolling_quantile_0_l(self): + (self.sr.rolling(self.winl).quantile(0.0)) + + def time_rolling_quantile_1_l(self): + (self.sr.rolling(self.winl).quantile(1.0)) + + def time_rolling_quantile_median_l(self): + (self.sr.rolling(self.winl).quantile(0.5)) + + def time_rolling_median_l(self): + (self.sr.rolling(self.winl).median()) + + def time_rolling_median_l(self): + (self.sr.rolling(self.winl).mean()) + + def time_rolling_max_l(self): + (self.sr.rolling(self.winl).max()) + + def time_rolling_min_l(self): + (self.sr.rolling(self.winl).min()) + + def time_rolling_std_l(self): + (self.sr.rolling(self.wins).std()) + + def time_rolling_count_l(self): + (self.sr.rolling(self.wins).count()) + + def time_rolling_skew_l(self): + (self.sr.rolling(self.wins).skew()) + + def time_rolling_kurt_l(self): + (self.sr.rolling(self.wins).kurt()) + + def time_rolling_sum_l(self): + (self.sr.rolling(self.wins).sum()) From b9dc329993d15bf13b81f4c8753ca46d896be908 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Mon, 8 May 2017 12:38:09 +0200 Subject: [PATCH 06/11] Implemented changes suggested by reviewer @jaimefrio --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 93363b2944f72..3e59fb18d72b0 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -191,6 +191,7 @@ Numeric Categorical ^^^^^^^^^^^ +- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than Series.quantile() and DataFrame.quantile() Other From b94bda23b3f62d283adb80b42db7418153988d8e Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Wed, 10 May 2017 09:24:09 +0200 Subject: [PATCH 07/11] quantile 1.0 and 0.0 are handled by the wrapper. From 79e905f2401dbf561738bcbd2126e4591cfae0fe Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Wed, 10 May 2017 12:16:33 +0200 Subject: [PATCH 08/11] Move entry in Whatsnew to Groupby/Resample/Rolling --- doc/source/whatsnew/v0.21.0.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3e59fb18d72b0..93363b2944f72 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -191,7 +191,6 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than Series.quantile() and DataFrame.quantile() Other From fa4e036fc7bbb218dc9aab6186fb6217b9e62559 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Mon, 22 May 2017 20:51:55 +0200 Subject: [PATCH 09/11] Added benchmarks for rolling windows From b99e2be63b16a18035552fc8cf850e50c8432f20 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Fri, 7 Jul 2017 17:24:50 +0200 Subject: [PATCH 10/11] Fixed whatsnew syntax and comments on new tests. --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/tests/test_window.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 93363b2944f72..5fcbfd8b571c8 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -170,7 +170,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) -- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than Series.quantile() and DataFrame.quantile() +- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) Sparse diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 1d6070a15369f..0f8554c91cabf 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1150,7 +1150,8 @@ def alt(x): self._check_moment_func(f, alt, name='quantile', quantile=q) def test_rolling_quantile_np_percentile(self): - # #9413 + # #9413: Tests that rolling window's quantile default behavior + # is analogus to Numpy's percentile row = 10 col = 5 idx = pd.date_range(20100101, periods=row, freq='B') @@ -1163,7 +1164,8 @@ def test_rolling_quantile_np_percentile(self): tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) def test_rolling_quantile_series(self): - # #16211 + # #16211: Tests that rolling window's quantile default behavior + # is analogus to pd.Series' quantile arr = np.arange(100) s = pd.Series(arr) q1 = s.quantile(0.1) From 4080942107c217aef2ad54defd09928f6469f7fc Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Fri, 7 Jul 2017 18:18:16 +0200 Subject: [PATCH 11/11] Fixed trailing whitespace to make the linter happy --- pandas/tests/test_window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 0f8554c91cabf..3ba5d2065cddf 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1151,7 +1151,7 @@ def alt(x): def test_rolling_quantile_np_percentile(self): # #9413: Tests that rolling window's quantile default behavior - # is analogus to Numpy's percentile + # is analogus to Numpy's percentile row = 10 col = 5 idx = pd.date_range(20100101, periods=row, freq='B')