From f869ec39574fe636921284f8e125df03512ec277 Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Fri, 23 Feb 2018 13:53:48 -0500 Subject: [PATCH] Modify weighted quantile (#316) The quantile method for frequency weights is now equivalent to the unweighted method with a vector of repeated values The quantile method for non frequency weight now removes zeros. --- src/weights.jl | 72 ++++++++++--------- test/weights.jl | 180 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 163 insertions(+), 89 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index 7bb90ae52..9017d29bd 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -552,25 +552,23 @@ wmedian(v::RealVector, w::AbstractWeights{<:Real}) = median(v, w) ###### Weighted quantile ##### -# http://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample -# In the non weighted version, we compute a vector of index h(N, p) -# and take interpolation between floor and ceil of this index -# Here there is a supplementary function from index to weighted index k -> Sk """ quantile(v, w::AbstractWeights, p) -Compute the weighted quantiles of a vector `x` at a specified set of probability +Compute the weighted quantiles of a vector `v` at a specified set of probability values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`). Weights must not be negative. The weights and data vectors must have the same length. -The quantile for `p` is defined as follows. Denoting -``S_k = (k-1)w_k + (n-1) \\sum_{i= N - # out was initialized with maximum v - return out - end - k += 1 - Skold, vkold = Sk, vk - vk, wk = vw[k] - Sk = (k - 1) * wk + (N - 1) * cumulative_weight + h = p[i] * (wsum - vw[1][2]) + vw[1][2] + end + while Sk <= h + k += 1 + if k > N + # out was initialized with maximum v + return out end - # in particular, Sk is different from Skold - g = (h - Skold) / (Sk - Skold) - out[ppermute[i]] = vkold + g * (vk - vkold) + Skold, vkold = Sk, vk + vk, wk = vw[k] + Sk += wk + end + if isa(w, FrequencyWeights) + out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold) + else + out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold) end end return out diff --git a/test/weights.jl b/test/weights.jl index 3a237e62d..5cc9a52ea 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -290,7 +290,80 @@ end @test_throws ErrorException median(data, f(wt)) end -@testset "Quantile $f" for f in weight_funcs + +# Quantile fweights +@testset "Quantile fweights" begin + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + ) + wt = ( + [3, 1, 1, 1, 3], + [1, 1, 1, 1, 1], + [3, 1, 1, 1, 3, 3], + [1, 1, 1, 3, 3, 3], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 0, 2, 2, 1, 6], + [1, 1, 8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [1, 2, 3, 4, 5, 6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [1, 2, 3, 2, 1], + [0, 1, 1, 1, 1], + ) + p = [0.0, 0.25, 0.5, 0.75, 1.0] + function _rep(x::AbstractVector, lengths::AbstractVector{Int}) + res = similar(x, sum(lengths)) + i = 1 + for idx in 1:length(x) + tmp = x[idx] + for kdx in 1:lengths[idx] + res[i] = tmp + i += 1 + end + end + return res + end + # quantile with fweights is the same as repeated vectors + for i = 1:length(data) + @test quantile(data[i], fweights(wt[i]), p) ≈ quantile(_rep(data[i], wt[i]), p) + end + # quantile with fweights = 1 is the same as quantile + for i = 1:length(data) + @test quantile(data[i], fweights(ones(wt[i])), p) ≈ quantile(data[i], p) + end + + # Issue #313 + @test quantile([1, 2, 3, 4, 5], fweights([0,1,2,1,0]), p) ≈ quantile([2, 3, 3, 4], p) + @test quantile([1, 2], fweights([1, 1]), 0.25) ≈ 1.25 + @test quantile([1, 2], fweights([2, 2]), 0.25) ≈ 1.0 +end + +@testset "Quantile aweights, pweights and weights" for f in (aweights, pweights, weights) data = ( [7, 1, 2, 4, 10], [7, 1, 2, 4, 10], @@ -313,81 +386,84 @@ end [-10, 1, 1, -10, -10], ) wt = ( - f([1, 1/3, 1/3, 1/3, 1]), - f([1, 1, 1, 1, 1]), - f([1, 1/3, 1/3, 1/3, 1, 1]), - f([1/3, 1/3, 1/3, 1, 1, 1]), - f([30, 191, 9, 0]), - f([10, 1, 1, 1, 9]), - f([10, 1, 1, 1, 900]), - f([1, 3, 5, 4, 2]), - f([2, 2, 5, 1, 2, 2, 1, 6]), - f([0.1, 0.1, 0.8]), - f([5, 5, 4, 1]), - f([30, 56, 144, 24, 55, 43, 67]), - f([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), - f([12]), - f([7, 1, 1, 1, 6]), - f([1, 0, 0, 0, 2]), - f([1, 2, 3, 4, 5]), - f([0.1, 0.2, 0.3, 0.2, 0.1]), - f([1, 1, 1, 1, 1]), + [1, 1/3, 1/3, 1/3, 1], + [1, 1, 1, 1, 1], + [1, 1/3, 1/3, 1/3, 1, 1], + [1/3, 1/3, 1/3, 1, 1, 1], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 1, 2, 2, 1, 6], + [0.1, 0.1, 0.8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [0.1, 0.2, 0.3, 0.2, 0.1], + [1, 1, 1, 1, 1], ) quantile_answers = ( - [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], - [1.0,2.0,4.0,7.0,10.0], - [1.0,4.75,8.0,10.833333333333334,15.0], - [1.0,4.75,8.0,10.833333333333334,15.0], - [0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0], - [1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0], - [1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0], - [30.0,38.75,45.714285714285715,52.85714285714286,60.0], - [0.3,0.6903846153846154,1.484,1.7,2.0], - [1.0,2.0,2.0,2.0,2.0], - [2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7], - [45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0], - [2.0,2.0,2.0,2.0,2.0], - [2.3,2.3,2.3,2.3,2.3], - [-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0], - [1.0,1.75,4.25,4.625,5.0], - [1.0,1.625,2.3333333333333335,3.25,5.0], - [-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0], - [-10.0,-10.0,-10.0,1.0,1.0] + [1.0, 4.0, 6.0, 8.0, 10.0], + [1.0, 2.0, 4.0, 7.0, 10.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [0.0, 2.6178010, 5.2356021, 7.8534031, 20.0], + [1.0, 4.0, 4.3333333, 4.6666667, 5.0], + [1.0, 4.2475, 4.4983333, 4.7491667, 5.0], + [30.0, 37.5, 44.0, 51.25, 60.0], + [0.3, 0.7, 1.3, 1.7, 2.0], + [1.0, 2.0, 2.0, 2.0, 2.0], + [2.8, 3.15, 3.4, 3.56, 3.7], + [45.0, 62.149253, 102.875, 117.4097222, 125.0], + [2.0, 2.0, 2.0, 2.0, 2.0], + [2.3, 2.3, 2.3, 2.3, 2.3], + [-10.0, -2.7857143, -2.4285714, -2.0714286, 2.0], + [1.0, 2.0, 3.0, 4.0, 5.0], + [1.0, 1.625, 2.3333333, 3.25, 5.0], + [-2.0, -1.3333333, 0.5, 2.5, 6.0], + [-10.0, -10.0, -10.0, 1.0, 1.0] ) p = [0.0, 0.25, 0.5, 0.75, 1.0] srand(10) for i = 1:length(data) - @test quantile(data[i], wt[i], p) ≈ quantile_answers[i] + @test quantile(data[i], f(wt[i]), p) ≈ quantile_answers[i] atol = 1e-5 for j = 1:10 # order of p does not matter reorder = sortperm(rand(length(p))) - @test quantile(data[i], wt[i], p[reorder]) ≈ quantile_answers[i][reorder] + @test quantile(data[i], f(wt[i]), p[reorder]) ≈ quantile_answers[i][reorder] atol = 1e-5 end for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], f(wt[i][reorder]), p) ≈ quantile_answers[i] + @test quantile(data[i][reorder], f(wt[i][reorder]), p) ≈ quantile_answers[i] atol = 1e-5 end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], f(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) + @test quantile(data[i], f(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) atol = 1e-5 for j = 1:10 prandom = rand(4) - @test quantile(data[i], f(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) + @test quantile(data[i], f(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) atol = 1e-5 end end - - # other syntaxes + # test zeros are removed + for i = 1:length(data) + @test quantile(vcat(1.0, data[i]), f(vcat(0.0, wt[i])), p) ≈ quantile_answers[i] atol = 1e-5 + end + # Syntax v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] - answer = 6.181818181818182 - @test quantile(data[1], f(w), 0.5) ≈ answer - @test wquantile(data[1], f(w), [0.5]) ≈ [answer] - @test wquantile(data[1], f(w), 0.5) ≈ answer - @test wquantile(data[1], w, [0.5]) ≈ [answer] - @test wquantile(data[1], w, 0.5) ≈ answer + answer = 6.0 + @test quantile(data[1], f(w), 0.5) ≈ answer atol = 1e-5 + @test wquantile(data[1], f(w), [0.5]) ≈ [answer] atol = 1e-5 + @test wquantile(data[1], f(w), 0.5) ≈ answer atol = 1e-5 + @test wquantile(data[1], w, [0.5]) ≈ [answer] atol = 1e-5 + @test wquantile(data[1], w, 0.5) ≈ answer atol = 1e-5 end end # @testset StatsBase.Weights