From 8f4e34b82f18e8782d8b300444e3306238795d1f Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 25 Nov 2017 22:34:53 +0900 Subject: [PATCH 1/4] BUG: Fix inaccurate rolling.var calculation --- pandas/_libs/window.pyx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 4d5ebdc0c581a..000ebc6e4bbbc 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -661,9 +661,9 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] + 1 - delta = (val - mean_x[0]) + delta = val - mean_x[0] mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] cdef inline void remove_var(double val, double *nobs, double *mean_x, @@ -675,9 +675,11 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0] else: mean_x[0] = 0 ssqdm_x[0] = 0 @@ -689,7 +691,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, Numerically stable implementation using Welford's method. """ cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N @@ -760,10 +762,12 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Adding one observation and removing another one delta = val - prev - prev -= mean_x + mean_x_old = mean_x + mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: add_var(val, &nobs, &mean_x, &ssqdm_x) From 47ef5fbcd11db0fbe8b57273b1cad3a88a40bdf8 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 25 Nov 2017 22:36:21 +0900 Subject: [PATCH 2/4] TST: Add test for rolling.corr with zero variance --- pandas/tests/test_window.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 742b8a5ac9a55..d85cf082d0a9e 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2482,6 +2482,14 @@ def test_rolling_corr_pairwise(self): self._check_pairwise_moment('rolling', 'corr', window=10, min_periods=5) + @pytest.mark.parametrize('window', range(7)) + def test_rolling_corr_with_zero_variance(self, window): + # GH 18430 + s = pd.Series(np.zeros(20)) + other = pd.Series(np.arange(20)) + + assert s.rolling(window=window).corr(other=other).isna().all() + def _check_pairwise_moment(self, dispatch, name, **kwargs): def get_result(obj, obj2=None): return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) From b8e84e3688ce3eb71692104037e02608bb9ac7ae Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 25 Nov 2017 23:52:55 +0900 Subject: [PATCH 3/4] DOC: Add whatsnew note --- doc/source/whatsnew/v0.21.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 0ab536f2898c7..cd2c59ebb89ae 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -98,7 +98,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in ``rolling.var`` which calculation is inaccurate with a zero-value array (:issue:`18430`) - - From b31558d121d329399c7f8b0eddd629405e6726a5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 25 Nov 2017 16:10:52 -0500 Subject: [PATCH 4/4] add more refs to welfords --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/_libs/window.pyx | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 98e5fcf00050c..976f3524e3c71 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -103,7 +103,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) - Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequecy is 12h or higher (:issue:`15549`) - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) -- Bug in ``rolling.var`` which calculation is inaccurate with a zero-value array (:issue:`18430`) +- Bug in ``rolling.var`` where calculation is inaccurate with a zero-valued array (:issue:`18430`) - - diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 000ebc6e4bbbc..95df5a07a390b 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -661,6 +661,8 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] + 1 + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance delta = val - mean_x[0] mean_x[0] = mean_x[0] + delta / nobs[0] ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] @@ -751,6 +753,9 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, add_var(input[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # After the first window, observations can both be added and # removed for i from win <= i < N: