Skip to content

Commit

Permalink
Modify weighted quantile (#316)
Browse files Browse the repository at this point in the history
The quantile method for frequency weights is now equivalent to the unweighted method with a vector of repeated values
The quantile method for non frequency weight now removes zeros.
  • Loading branch information
matthieugomez authored and nalimilan committed Feb 23, 2018
1 parent 9f1d7aa commit f869ec3
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 89 deletions.
72 changes: 35 additions & 37 deletions src/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -552,25 +552,23 @@ wmedian(v::RealVector, w::AbstractWeights{<:Real}) = median(v, w)

###### Weighted quantile #####

# http://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample
# In the non weighted version, we compute a vector of index h(N, p)
# and take interpolation between floor and ceil of this index
# Here there is a supplementary function from index to weighted index k -> Sk

"""
quantile(v, w::AbstractWeights, p)
Compute the weighted quantiles of a vector `x` at a specified set of probability
Compute the weighted quantiles of a vector `v` at a specified set of probability
values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`).
Weights must not be negative. The weights and data vectors must have the same length.
The quantile for `p` is defined as follows. Denoting
``S_k = (k-1)w_k + (n-1) \\sum_{i<k}w_i``, define ``x_{k+1}`` the smallest element of `x`
such that ``S_{k+1}/S_{n}`` is strictly superior to `p`. The function returns
``(1-\\gamma) x_k + \\gamma x_{k+1}`` with ``\\gamma = (pS_n- S_k)/(S_{k+1}-S_k)``.
This corresponds to R-7, Excel, SciPy-(1,1) and Maple-6 when `w` contains only ones
(see [Wikipedia](https://en.wikipedia.org/wiki/Quantile)).
With [`FrequencyWeights`](@ref), the function returns the same result as
`quantile` for a vector with repeated values.
With non `FrequencyWeights`, denote ``N`` the length of the vector, ``w`` the vector of weights,
``h = p (\\sum_{i<= N}w_i - w_1) + w_1`` the cumulative weight corresponding to the
probability ``p`` and ``S_k = \\sum_{i<=k}w_i`` the cumulative weight for each
observation, define ``v_{k+1}`` the smallest element of `v` such that ``S_{k+1}``
is strictly superior to ``h``. The weighted ``p`` quantile is given by ``v_k + \\gamma (v_{k+1} -v_k)``
with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)``. In particular, when `w` is a vector
of ones, the function returns the same result as `quantile`.
"""
function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where {V,W<:Real}
# checks
Expand All @@ -584,47 +582,47 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where
x < 0 && error("weight vector cannot contain negative entries")
end

# full sort
vw = sort!(collect(zip(v, w.values)))

wsum = w.sum
# remove zeros weights and sort
wsum = sum(w)
nz = .!iszero.(w)
vw = sort!(collect(zip(view(v, nz), view(w, nz))))
N = length(vw)

# prepare percentiles
ppermute = sortperm(p)
p = p[ppermute]
p = bound_quantiles(p)

# prepare out vector
N = length(vw)
out = Vector{typeof(zero(V)/1)}(uninitialized, length(p))
fill!(out, vw[end][1])

# start looping on quantiles
cumulative_weight, Sk, Skold = zero(W), zero(W), zero(W)
Sk, Skold = zero(W), zero(W)
vk, vkold = zero(V), zero(V)
k = 1
k = 0

for i in 1:length(p)
h = p[i] * (N - 1) * wsum
if h == 0
# happens when N or p or wsum equal zero
out[ppermute[i]] = vw[1][1]
if isa(w, FrequencyWeights)
h = p[i] * (wsum - 1) + 1
else
while Sk <= h
# happens in particular when k == 1
vk, wk = vw[k]
cumulative_weight += wk
if k >= N
# out was initialized with maximum v
return out
end
k += 1
Skold, vkold = Sk, vk
vk, wk = vw[k]
Sk = (k - 1) * wk + (N - 1) * cumulative_weight
h = p[i] * (wsum - vw[1][2]) + vw[1][2]
end
while Sk <= h
k += 1
if k > N
# out was initialized with maximum v
return out
end
# in particular, Sk is different from Skold
g = (h - Skold) / (Sk - Skold)
out[ppermute[i]] = vkold + g * (vk - vkold)
Skold, vkold = Sk, vk
vk, wk = vw[k]
Sk += wk
end
if isa(w, FrequencyWeights)
out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold)
else
out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold)
end
end
return out
Expand Down
180 changes: 128 additions & 52 deletions test/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,80 @@ end
@test_throws ErrorException median(data, f(wt))
end

@testset "Quantile $f" for f in weight_funcs

# Quantile fweights
@testset "Quantile fweights" begin
data = (
[7, 1, 2, 4, 10],
[7, 1, 2, 4, 10],
[7, 1, 2, 4, 10, 15],
[1, 2, 4, 7, 10, 15],
[0, 10, 20, 30],
[1, 2, 3, 4, 5],
[1, 2, 3, 4, 5],
[30, 40, 50, 60, 35],
[2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7],
[1, 2, 2],
[3.7, 3.3, 3.5, 2.8],
[100, 125, 123, 60, 45, 56, 66],
[2, 2, 2, 2, 2, 2],
[2.3],
[-2, -3, 1, 2, -10],
[1, 2, 3, 4, 5],
[5, 4, 3, 2, 1],
[-2, 2, -1, 3, 6],
[-10, 1, 1, -10, -10],
)
wt = (
[3, 1, 1, 1, 3],
[1, 1, 1, 1, 1],
[3, 1, 1, 1, 3, 3],
[1, 1, 1, 3, 3, 3],
[30, 191, 9, 0],
[10, 1, 1, 1, 9],
[10, 1, 1, 1, 900],
[1, 3, 5, 4, 2],
[2, 2, 5, 0, 2, 2, 1, 6],
[1, 1, 8],
[5, 5, 4, 1],
[30, 56, 144, 24, 55, 43, 67],
[1, 2, 3, 4, 5, 6],
[12],
[7, 1, 1, 1, 6],
[1, 0, 0, 0, 2],
[1, 2, 3, 4, 5],
[1, 2, 3, 2, 1],
[0, 1, 1, 1, 1],
)
p = [0.0, 0.25, 0.5, 0.75, 1.0]
function _rep(x::AbstractVector, lengths::AbstractVector{Int})
res = similar(x, sum(lengths))
i = 1
for idx in 1:length(x)
tmp = x[idx]
for kdx in 1:lengths[idx]
res[i] = tmp
i += 1
end
end
return res
end
# quantile with fweights is the same as repeated vectors
for i = 1:length(data)
@test quantile(data[i], fweights(wt[i]), p) quantile(_rep(data[i], wt[i]), p)
end
# quantile with fweights = 1 is the same as quantile
for i = 1:length(data)
@test quantile(data[i], fweights(ones(wt[i])), p) quantile(data[i], p)
end

# Issue #313
@test quantile([1, 2, 3, 4, 5], fweights([0,1,2,1,0]), p) quantile([2, 3, 3, 4], p)
@test quantile([1, 2], fweights([1, 1]), 0.25) 1.25
@test quantile([1, 2], fweights([2, 2]), 0.25) 1.0
end

@testset "Quantile aweights, pweights and weights" for f in (aweights, pweights, weights)
data = (
[7, 1, 2, 4, 10],
[7, 1, 2, 4, 10],
Expand All @@ -313,81 +386,84 @@ end
[-10, 1, 1, -10, -10],
)
wt = (
f([1, 1/3, 1/3, 1/3, 1]),
f([1, 1, 1, 1, 1]),
f([1, 1/3, 1/3, 1/3, 1, 1]),
f([1/3, 1/3, 1/3, 1, 1, 1]),
f([30, 191, 9, 0]),
f([10, 1, 1, 1, 9]),
f([10, 1, 1, 1, 900]),
f([1, 3, 5, 4, 2]),
f([2, 2, 5, 1, 2, 2, 1, 6]),
f([0.1, 0.1, 0.8]),
f([5, 5, 4, 1]),
f([30, 56, 144, 24, 55, 43, 67]),
f([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]),
f([12]),
f([7, 1, 1, 1, 6]),
f([1, 0, 0, 0, 2]),
f([1, 2, 3, 4, 5]),
f([0.1, 0.2, 0.3, 0.2, 0.1]),
f([1, 1, 1, 1, 1]),
[1, 1/3, 1/3, 1/3, 1],
[1, 1, 1, 1, 1],
[1, 1/3, 1/3, 1/3, 1, 1],
[1/3, 1/3, 1/3, 1, 1, 1],
[30, 191, 9, 0],
[10, 1, 1, 1, 9],
[10, 1, 1, 1, 900],
[1, 3, 5, 4, 2],
[2, 2, 5, 1, 2, 2, 1, 6],
[0.1, 0.1, 0.8],
[5, 5, 4, 1],
[30, 56, 144, 24, 55, 43, 67],
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
[12],
[7, 1, 1, 1, 6],
[1, 0, 0, 0, 2],
[1, 2, 3, 4, 5],
[0.1, 0.2, 0.3, 0.2, 0.1],
[1, 1, 1, 1, 1],
)
quantile_answers = (
[1.0,3.6000000000000005,6.181818181818182,8.2,10.0],
[1.0,2.0,4.0,7.0,10.0],
[1.0,4.75,8.0,10.833333333333334,15.0],
[1.0,4.75,8.0,10.833333333333334,15.0],
[0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0],
[1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0],
[1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0],
[30.0,38.75,45.714285714285715,52.85714285714286,60.0],
[0.3,0.6903846153846154,1.484,1.7,2.0],
[1.0,2.0,2.0,2.0,2.0],
[2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7],
[45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0],
[2.0,2.0,2.0,2.0,2.0],
[2.3,2.3,2.3,2.3,2.3],
[-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0],
[1.0,1.75,4.25,4.625,5.0],
[1.0,1.625,2.3333333333333335,3.25,5.0],
[-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0],
[-10.0,-10.0,-10.0,1.0,1.0]
[1.0, 4.0, 6.0, 8.0, 10.0],
[1.0, 2.0, 4.0, 7.0, 10.0],
[1.0, 4.75, 7.5, 10.4166667, 15.0],
[1.0, 4.75, 7.5, 10.4166667, 15.0],
[0.0, 2.6178010, 5.2356021, 7.8534031, 20.0],
[1.0, 4.0, 4.3333333, 4.6666667, 5.0],
[1.0, 4.2475, 4.4983333, 4.7491667, 5.0],
[30.0, 37.5, 44.0, 51.25, 60.0],
[0.3, 0.7, 1.3, 1.7, 2.0],
[1.0, 2.0, 2.0, 2.0, 2.0],
[2.8, 3.15, 3.4, 3.56, 3.7],
[45.0, 62.149253, 102.875, 117.4097222, 125.0],
[2.0, 2.0, 2.0, 2.0, 2.0],
[2.3, 2.3, 2.3, 2.3, 2.3],
[-10.0, -2.7857143, -2.4285714, -2.0714286, 2.0],
[1.0, 2.0, 3.0, 4.0, 5.0],
[1.0, 1.625, 2.3333333, 3.25, 5.0],
[-2.0, -1.3333333, 0.5, 2.5, 6.0],
[-10.0, -10.0, -10.0, 1.0, 1.0]
)
p = [0.0, 0.25, 0.5, 0.75, 1.0]

srand(10)
for i = 1:length(data)
@test quantile(data[i], wt[i], p) quantile_answers[i]
@test quantile(data[i], f(wt[i]), p) quantile_answers[i] atol = 1e-5
for j = 1:10
# order of p does not matter
reorder = sortperm(rand(length(p)))
@test quantile(data[i], wt[i], p[reorder]) quantile_answers[i][reorder]
@test quantile(data[i], f(wt[i]), p[reorder]) quantile_answers[i][reorder] atol = 1e-5
end
for j = 1:10
# order of w does not matter
reorder = sortperm(rand(length(data[i])))
@test quantile(data[i][reorder], f(wt[i][reorder]), p) quantile_answers[i]
@test quantile(data[i][reorder], f(wt[i][reorder]), p) quantile_answers[i] atol = 1e-5
end
end
# w = 1 corresponds to base quantile
for i = 1:length(data)
@test quantile(data[i], f(ones(Int64, length(data[i]))), p) quantile(data[i], p)
@test quantile(data[i], f(ones(Int64, length(data[i]))), p) quantile(data[i], p) atol = 1e-5
for j = 1:10
prandom = rand(4)
@test quantile(data[i], f(ones(Int64, length(data[i]))), prandom) quantile(data[i], prandom)
@test quantile(data[i], f(ones(Int64, length(data[i]))), prandom) quantile(data[i], prandom) atol = 1e-5
end
end

# other syntaxes
# test zeros are removed
for i = 1:length(data)
@test quantile(vcat(1.0, data[i]), f(vcat(0.0, wt[i])), p) quantile_answers[i] atol = 1e-5
end
# Syntax
v = [7, 1, 2, 4, 10]
w = [1, 1/3, 1/3, 1/3, 1]
answer = 6.181818181818182
@test quantile(data[1], f(w), 0.5) answer
@test wquantile(data[1], f(w), [0.5]) [answer]
@test wquantile(data[1], f(w), 0.5) answer
@test wquantile(data[1], w, [0.5]) [answer]
@test wquantile(data[1], w, 0.5) answer
answer = 6.0
@test quantile(data[1], f(w), 0.5) answer atol = 1e-5
@test wquantile(data[1], f(w), [0.5]) [answer] atol = 1e-5
@test wquantile(data[1], f(w), 0.5) answer atol = 1e-5
@test wquantile(data[1], w, [0.5]) [answer] atol = 1e-5
@test wquantile(data[1], w, 0.5) answer atol = 1e-5
end

end # @testset StatsBase.Weights

0 comments on commit f869ec3

Please sign in to comment.