JuliaStats · nalimilan · Feb 23, 2018 · Nov 9, 2017 · Nov 9, 2017 · Nov 9, 2017
diff --git a/src/weights.jl b/src/weights.jl
@@ -537,25 +537,22 @@ wmedian(v::RealVector, w::AbstractWeights{<:Real}) = median(v, w)
 
 ###### Weighted quantile #####
 
-# http://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample
-# In the non weighted version, we compute a vector of index h(N, p)
-# and take interpolation between floor and ceil of this index
-# Here there is a supplementary function from index to weighted index k -> Sk
 
 """
     quantile(v, w::AbstractWeights, p)
 
-Compute the weighted quantiles of a vector `x` at a specified set of probability
-values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`).
+Compute the weighted quantiles of a vector ``x`` at a specified set of probability
+values ``p``, using weights given by a weight vector ``w`` (of type `AbstractWeights`).
 Weights must not be negative. The weights and data vectors must have the same length.
 
-The quantile for `p` is defined as follows. Denoting
-``S_k = (k-1)w_k + (n-1) \\sum_{i<k}w_i``, define ``x_{k+1}`` the smallest element of `x`
-such that ``S_{k+1}/S_{n}`` is strictly superior to `p`. The function returns
-``(1-\\gamma) x_k + \\gamma x_{k+1}`` with  ``\\gamma = (pS_n- S_k)/(S_{k+1}-S_k)``.
-
-This corresponds to  R-7, Excel, SciPy-(1,1) and Maple-6 when `w` contains only ones
-(see [Wikipedia](https://en.wikipedia.org/wiki/Quantile)).
+With [FrequencyWeights](@ref FrequencyWeights), the function returns the same result as 
+`quantile` for a vector with repeated values.
+With non FrequencyWeights,  denote N the length of the vector, w the vector of weights, ``h = p (\\sum_{i<= N}w_i - w_1) + w_1`` the cumulative weight corresponding to the probability `p` and 
+ ``S_k = \\sum_{i<=k}w_i`` the cumulative weight for to each observation,
+  define ``x_{k+1}`` the smallest element of ``x`` such that ``S_{k+1}`` is strictly 
+  superior to ``h``. The function returns``x_k + \\gamma (x_{k+1} -x_k)`` 
+  with  ``\\gamma = (h - S_k)/(S_{k+1}-S_k)``. In particular, when ``w`` is a vector
+   of one, the function returns the same result as `quantile`.
 """
 function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where {V,W<:Real}
     # checks
@@ -569,47 +566,47 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where
         x < 0 && error("weight vector cannot contain negative entries")
     end
 
-    # full sort
-    vw = sort!(collect(zip(v, w.values)))
 
-    wsum = w.sum
+    #remove zeros weights and sort
+    wsum = sum(w.value)
+    nz = .!iszero.(w.values)
+    vw = sort!(collect(zip(view(v, nz), view(wvalues, nz))))
+    N = length(vw)
 
     # prepare percentiles
     ppermute = sortperm(p)
     p = p[ppermute]
     p = bound_quantiles(p)
 
     # prepare out vector
-    N = length(vw)
     out = Vector{typeof(zero(V)/1)}(length(p))
     fill!(out, vw[end][1])
 
     # start looping on quantiles
-    cumulative_weight, Sk, Skold =  zero(W), zero(W), zero(W)
-    vk, vkold = zero(V), zero(V)
-    k = 1
+    Sk, Skold = zero(W), zero(W)
+    vk, vkold= zero(V), zero(V)
+    k = 0
+
     for i in 1:length(p)
-        h = p[i] * (N - 1) * wsum
-        if h == 0
-            # happens when N or p or wsum equal zero
-            out[ppermute[i]] = vw[1][1]
+        if isa(w, FrequencyWeights)
+            h = p[i] * (wsum - 1) + 1
         else
-            while Sk <= h
-                # happens in particular when k == 1
-                vk, wk = vw[k]
-                cumulative_weight += wk
-                if k >= N
-                    # out was initialized with maximum v
-                    return out
-                end
-                k += 1
-                Skold, vkold = Sk, vk
-                vk, wk = vw[k]
-                Sk = (k - 1) * wk + (N - 1) * cumulative_weight
+            h = p[i] * (wsum - vw[1][2]) + vw[1][2]
+        end
+        while Sk <= h
+            k += 1
+            if k > N
+               # out was initialized with maximum v
+               return out
             end
-            # in particular, Sk is different from Skold
-            g = (h - Skold) / (Sk - Skold)
-            out[ppermute[i]] = vkold + g * (vk - vkold)
+            Skold, vkold = Sk, vk
+            vk, wk = vw[k]
+            Sk += wk
+        end
+        if isa(w, FrequencyWeights)
+            out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold)
+        else
+            out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold)
         end
     end
     return out

diff --git a/test/weights.jl b/test/weights.jl
@@ -261,7 +261,9 @@ end
     @test_throws ErrorException median(data, f(wt))
 end
 
-@testset "Quantile $f" for f in weight_funcs
+
+# Quantile fweights
+@testset "Quantile fweights" begin
     data = (
         [7, 1, 2, 4, 10],
         [7, 1, 2, 4, 10],
@@ -284,81 +286,155 @@ end
         [-10, 1, 1, -10, -10],
     )
     wt = (
-        f([1, 1/3, 1/3, 1/3, 1]),
-        f([1, 1, 1, 1, 1]),
-        f([1, 1/3, 1/3, 1/3, 1, 1]),
-        f([1/3, 1/3, 1/3, 1, 1, 1]),
-        f([30, 191, 9, 0]),
-        f([10, 1, 1, 1, 9]),
-        f([10, 1, 1, 1, 900]),
-        f([1, 3, 5, 4, 2]),
-        f([2, 2, 5, 1, 2, 2, 1, 6]),
-        f([0.1, 0.1, 0.8]),
-        f([5, 5, 4, 1]),
-        f([30, 56, 144, 24, 55, 43, 67]),
-        f([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]),
-        f([12]),
-        f([7, 1, 1, 1, 6]),
-        f([1, 0, 0, 0, 2]),
-        f([1, 2, 3, 4, 5]),
-        f([0.1, 0.2, 0.3, 0.2, 0.1]),
-        f([1, 1, 1, 1, 1]),
+        Int[3, 1, 1, 1, 3],
+        Int[1, 1, 1, 1, 1],
+        Int[3, 1, 1, 1, 3, 3],
+        Int[1, 1, 1, 3, 3, 3],
+        Int[30, 191, 9, 0],
+        Int[10, 1, 1, 1, 9],
+        Int[10, 1, 1, 1, 900],
+        Int[1, 3, 5, 4, 2],
+        Int[2, 2, 5, 0, 2, 2, 1, 6],
+        Int[1, 1, 8],
+        Int[5, 5, 4, 1],
+        Int[30, 56, 144, 24, 55, 43, 67],
+        Int[1, 2, 3, 4, 5, 6],
+        Int[12],
+        Int[7, 1, 1, 1, 6],
+        Int[1, 0, 0, 0, 2],
+        Int[1, 2, 3, 4, 5],
+        Int[1, 2, 3, 2, 1],
+        Int[0, 1, 1, 1, 1],
+    )
+    p = [0.0, 0.25, 0.5, 0.75, 1.0]
+    function _rep(x::AbstractVector, lengths::AbstractVector{Int})
+        res = similar(x, sum(lengths))
+        i = 1
+        for idx in 1:length(x)
+            tmp = x[idx]
+            for kdx in 1:lengths[idx]
+                res[i] = tmp
+                i += 1
+            end
+        end
+        return res
+    end
+    # quantile with fweights is the same as repeated vectors
+    for i = 1:length(data)
+        @test quantile(data[i], fweights(wt[i]), p) ≈ quantile(_rep(data[i], wt[i]), p)
+    end
+    # quantile with fweights = 1  is the same as quantile
+    for i = 1:length(data)
+        @test quantile(data[i], fweights(ones(wt[i])), p) ≈ quantile(data[i], p)
+    end
+
+    # Issue #313
+    @test quantile(x, fweights([0,1,2,1,0]), p) ≈ quantile([2, 3, 3, 4], p)
+    @test quantile([1, 2], fweights([1, 1]), 0.25) ≈ 1.25
+    @test quantile([1, 2], fweights([2, 2]), 0.25) ≈ 1.0
+end
+
+@testset "Quantile aweights" begin
+    data = (
+        [7, 1, 2, 4, 10],
+        [7, 1, 2, 4, 10],
+        [7, 1, 2, 4, 10, 15],
+        [1, 2, 4, 7, 10, 15],
+        [0, 10, 20, 30],
+        [1, 2, 3, 4, 5],
+        [1, 2, 3, 4, 5],
+        [30, 40, 50, 60, 35],
+        [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7],
+        [1, 2, 2],
+        [3.7, 3.3, 3.5, 2.8],
+        [100, 125, 123, 60, 45, 56, 66],
+        [2, 2, 2, 2, 2, 2],
+        [2.3],
+        [-2, -3, 1, 2, -10],
+        [1, 2, 3, 4, 5],
+        [5, 4, 3, 2, 1],
+        [-2, 2, -1, 3, 6],
+        [-10, 1, 1, -10, -10],
+    )
+    wt = (
+        [1, 1/3, 1/3, 1/3, 1],
+        [1, 1, 1, 1, 1],
+        [1, 1/3, 1/3, 1/3, 1, 1],
+        [1/3, 1/3, 1/3, 1, 1, 1],
+        [30, 191, 9, 0],
+        [10, 1, 1, 1, 9],
+        [10, 1, 1, 1, 900],
+        [1, 3, 5, 4, 2],
+        [2, 2, 5, 1, 2, 2, 1, 6],
+        [0.1, 0.1, 0.8],
+        [5, 5, 4, 1],
+        [30, 56, 144, 24, 55, 43, 67],
+        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+        [12],
+        [7, 1, 1, 1, 6],
+        [1, 0, 0, 0, 2],
+        [1, 2, 3, 4, 5],
+        [0.1, 0.2, 0.3, 0.2, 0.1],
+        [1, 1, 1, 1, 1],
     )
     quantile_answers = (
-        [1.0,3.6000000000000005,6.181818181818182,8.2,10.0],
-        [1.0,2.0,4.0,7.0,10.0],
-        [1.0,4.75,8.0,10.833333333333334,15.0],
-        [1.0,4.75,8.0,10.833333333333334,15.0],
-        [0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0],
-        [1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0],
-        [1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0],
-        [30.0,38.75,45.714285714285715,52.85714285714286,60.0],
-        [0.3,0.6903846153846154,1.484,1.7,2.0],
-        [1.0,2.0,2.0,2.0,2.0],
-        [2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7],
-        [45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0],
-        [2.0,2.0,2.0,2.0,2.0],
-        [2.3,2.3,2.3,2.3,2.3],
-        [-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0],
-        [1.0,1.75,4.25,4.625,5.0],
-        [1.0,1.625,2.3333333333333335,3.25,5.0],
-        [-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0],
-        [-10.0,-10.0,-10.0,1.0,1.0]
+        [1.0, 4.0, 6.0, 8.0, 10.0],
+        [1.0, 2.0, 4.0, 7.0, 10.0],
+        [1.0, 4.75, 7.5, 10.4167, 15.0],
+        [1.0, 4.75, 7.5, 10.4167, 15.0],
+        [0.0, 2.6178, 5.2356, 7.8534, 20.0],
+        [1.0, 4.0, 4.33333, 4.66667, 5.0],
+        [1.0, 4.2475, 4.49833, 4.74917, 5.0],
+        [30.0, 37.5, 44.0, 51.25, 60.0],
+        [0.3, 0.7, 1.3, 1.7, 2.0],
+        [1.0, 2.0, 2.0, 2.0, 2.0],
+        [2.8, 3.15, 3.4, 3.56, 3.7],
+        [45.0, 62.1493, 102.875, 117.41, 125.0],
+        [2.0, 2.0, 2.0, 2.0, 2.0],
+        [2.3, 2.3, 2.3, 2.3, 2.3],
+        [-10.0, -2.78571, -2.42857, -2.07143, 2.0],
+        [1.0, 2.0, 3.0, 4.0, 5.0],
+        [1.0, 1.625, 2.33333, 3.25, 5.0],
+        [-2.0, -1.33333, 0.5, 2.5, 6.0],
+        [-10.0, -10.0, -10.0, 1.0, 1.0]
     )
     p = [0.0, 0.25, 0.5, 0.75, 1.0]
 
     srand(10)
     for i = 1:length(data)
-        @test quantile(data[i], wt[i], p) ≈ quantile_answers[i]
+        @test quantile(data[i], aweights(wt[i]), p) ≈ quantile_answers[i] atol = 1e-3
         for j = 1:10
             # order of p does not matter
             reorder = sortperm(rand(length(p)))
-            @test quantile(data[i], wt[i], p[reorder]) ≈ quantile_answers[i][reorder]
+            @test quantile(data[i], aweights(wt[i]), p[reorder]) ≈ quantile_answers[i][reorder] atol = 1e-3
         end
         for j = 1:10
             # order of w does not matter
             reorder = sortperm(rand(length(data[i])))
-            @test quantile(data[i][reorder], f(wt[i][reorder]), p) ≈ quantile_answers[i]
+            @test quantile(data[i][reorder], aweights(wt[i][reorder]), p) ≈ quantile_answers[i] atol = 1e-3
         end
     end
     # w = 1 corresponds to base quantile
     for i = 1:length(data)
-        @test quantile(data[i], f(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p)
+        @test quantile(data[i], aweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) atol = 1e-3
         for j = 1:10
             prandom = rand(4)
-            @test quantile(data[i], f(ones(Int64, length(data[i]))),  prandom) ≈ quantile(data[i], prandom)
+            @test quantile(data[i], aweights(ones(Int64, length(data[i]))),  prandom) ≈ quantile(data[i], prandom) atol = 1e-3
         end
     end
-
-    # other syntaxes
+    # test zeros are removed
+    for i = 1:length(data)
+        @test quantile(vcat(1.0, data[i]), aweights(vcat(0.0, wt[i])), p) ≈ quantile_answers[i] atol = 1e-3
+    end
+    # Syntax
     v = [7, 1, 2, 4, 10]
     w = [1, 1/3, 1/3, 1/3, 1]
-    answer = 6.181818181818182
-    @test quantile(data[1], f(w), 0.5)    ≈  answer
-    @test wquantile(data[1], f(w), [0.5]) ≈ [answer]
-    @test wquantile(data[1], f(w), 0.5)   ≈  answer
-    @test wquantile(data[1], w, [0.5])          ≈ [answer]
-    @test wquantile(data[1], w, 0.5)            ≈  answer
+    answer = 6.0
+    @test quantile(data[1], aweights(w), 0.5)    ≈  answer atol = 1e-3
+    @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] atol = 1e-3
+    @test wquantile(data[1], aweights(w), 0.5)   ≈  answer atol = 1e-3
+    @test wquantile(data[1], w, [0.5])          ≈ [answer] atol = 1e-3
+    @test wquantile(data[1], w, 0.5)            ≈  answer atol = 1e-3
 end
 
-end # @testset StatsBase.Weights
+end # @testset StatsBase.Weights