diff --git a/Project.toml b/Project.toml index 69967992b..215bf0b43 100644 --- a/Project.toml +++ b/Project.toml @@ -13,8 +13,9 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [extras] +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["DelimitedFiles", "Test"] +test = ["Dates", "DelimitedFiles", "Test"] diff --git a/docs/src/weights.md b/docs/src/weights.md index e8322bfaa..fe2770105 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -50,6 +50,72 @@ w = Weights([1., 2., 3.]) w = weights([1., 2., 3.]) ``` +### Exponential weights: `eweights` + +Exponential weights are a common form of temporal weights which assign exponentially decreasing +weights to past observations. + +If `t` is a vector of temporal indices then for each index `i` we compute the weight as: + +``λ (1 - λ)^{1 - i}`` + +``λ`` is a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. +As this value approaches 0, the resulting weights will be almost equal, +while values closer to 1 will put greater weight on the tail elements of the vector. + +# Examples + +```julia-repl +julia> eweights(1:10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` + +Simply passing the number of observations `n` is equivalent to passing in `1:n`. + +```julia-repl +julia> eweights(10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` + +Finally, you can construct exponential weights from an arbitrary subset of timestamps within a larger range. + +```julia-repl +julia> t +2019-01-01T01:00:00:2 hours:2019-01-01T05:00:00 + +julia> r +2019-01-01T01:00:00:1 hour:2019-01-02T01:00:00 + +julia> eweights(t, r, 0.3) +3-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.6122448979591837 + 1.249479383590171 +``` + +NOTE: This is equivalent to `eweights(something.(indexin(t, r)), 0.3)`, which is saying that for each value in `t` return the corresponding index for that value in `r`. +Since `indexin` returns `nothing` if there is no corresponding value from `t` in `r` we use `something` to eliminate that possibility. + ## Methods `AbstractWeights` implements the following methods: @@ -70,5 +136,6 @@ Weights aweights fweights pweights +eweights weights -``` \ No newline at end of file +``` diff --git a/src/StatsBase.jl b/src/StatsBase.jl index af6a47aa8..102af4ab9 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -34,6 +34,7 @@ export aweights, # construct an AnalyticWeights vector fweights, # construct a FrequencyWeights vector pweights, # construct a ProbabilityWeights vector + eweights, # construct an exponential Weights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage wmean, # weighted mean diff --git a/src/weights.jl b/src/weights.jl index 174753370..d2548c17f 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -193,6 +193,63 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) end end +""" + eweights(t::AbstractVector{<:Integer}, λ::Real) + eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real) where T + eweights(n::Integer, λ::Real) + +Construct a [`Weights`](@ref) vector which assigns exponentially decreasing weights to past +observations, which in this case corresponds to larger integer values `i` in `t`. +If an integer `n` is provided, weights are generated for values from 1 to `n` +(equivalent to `t = 1:n`). + +For each element `i` in `t` the weight value is computed as: + +``λ (1 - λ)^{1 - i}`` + +# Arguments + +- `t::AbstractVector`: temporal indices or timestamps +- `r::StepRange`: a larger range to use when constructing weights from a subset of timestamps +- `n::Integer`: if provided instead of `t`, temporal indices are taken to be `1:n` +- `λ::Real`: a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. + As this value approaches 0, the resulting weights will be almost equal, + while values closer to 1 will put greater weight on the tail elements of the vector. + +# Examples +```julia-repl +julia> eweights(1:10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` +""" +function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer + 0 < λ <= 1 || throw(ArgumentError("Smoothing factor must be between 0 and 1")) + + w0 = map(t) do i + i > 0 || throw(ArgumentError("Time indices must be non-zero positive integers")) + λ * (1 - λ)^(1 - i) + end + + s = sum(w0) + Weights(w0, s) +end + +eweights(n::Integer, λ::Real) = eweights(1:n, λ) +eweights(t::AbstractVector, r::AbstractRange, λ::Real) = + eweights(something.(indexin(t, r)), λ) + +# NOTE: No variance correction is implemented for exponential weights + ##### Equality tests ##### for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) @@ -481,7 +538,7 @@ _mean(A::AbstractArray{T}, w::AbstractWeights{W}, dims::Int) where {T,W} = Compute the weighted quantiles of a vector `v` at a specified set of probability values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`). Weights must not be negative. The weights and data vectors must have the same length. -`NaN` is returned if `x` contains any `NaN` values. An error is raised if `w` contains +`NaN` is returned if `x` contains any `NaN` values. An error is raised if `w` contains any `NaN` values. With [`FrequencyWeights`](@ref), the function returns the same result as @@ -502,15 +559,15 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where all(x -> 0 <= x <= 1, p) || throw(ArgumentError("input probability out of [0,1] range")) w.sum == 0 && throw(ArgumentError("weight vector cannot sum to zero")) - length(v) == length(w) || throw(ArgumentError("data and weight vectors must be the same size," * + length(v) == length(w) || throw(ArgumentError("data and weight vectors must be the same size," * "got $(length(v)) and $(length(w))")) for x in w.values isnan(x) && throw(ArgumentError("weight vector cannot contain NaN entries")) x < 0 && throw(ArgumentError("weight vector cannot contain negative entries")) end - isa(w, FrequencyWeights) && !(eltype(w) <: Integer) && any(!isinteger, w) && - throw(ArgumentError("The values of the vector of `FrequencyWeights` must be numerically" * + isa(w, FrequencyWeights) && !(eltype(w) <: Integer) && any(!isinteger, w) && + throw(ArgumentError("The values of the vector of `FrequencyWeights` must be numerically" * "equal to integers. Use `ProbabilityWeights` or `AnalyticWeights` instead.")) # remove zeros weights and sort diff --git a/test/runtests.jl b/test/runtests.jl index dac21a0c8..500539c74 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ using StatsBase +using Dates using LinearAlgebra using Random using Statistics diff --git a/test/weights.jl b/test/weights.jl index fa8f40be4..ab40cf667 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -447,4 +447,55 @@ end @test round(mean(Union{Int,Missing}[1,2], weights([1,2])), digits=3) ≈ 1.667 end +@testset "Exponential Weights" begin + @testset "Usage" begin + θ = 5.25 + λ = 1 - exp(-1 / θ) # simple conversion for the more common/readable method + v = [λ*(1-λ)^(1-i) for i = 1:4] + w = Weights(v) + + @test round.(w, digits=4) == [0.1734, 0.2098, 0.2539, 0.3071] + + @testset "basic" begin + @test eweights(1:4, λ) ≈ w + end + + @testset "1:n" begin + @test eweights(4, λ) ≈ w + end + + @testset "indexin" begin + v = [λ*(1-λ)^(1-i) for i = 1:10] + + # Test that we should be able to skip indices easily + @test eweights([1, 3, 5, 7], 1:10, λ) ≈ Weights(v[[1, 3, 5, 7]]) + + # This should also work with actual time types + t1 = DateTime(2019, 1, 1, 1) + tx = t1 + Hour(7) + tn = DateTime(2019, 1, 2, 1) + + @test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ) ≈ Weights(v[[1, 3, 5, 7]]) + end + end + + @testset "Empty" begin + @test eweights(0, 0.3) == Weights(Float64[]) + @test eweights(1:0, 0.3) == Weights(Float64[]) + @test eweights(Int[], 1:10, 0.4) == Weights(Float64[]) + end + + @testset "Failure Conditions" begin + # λ > 1.0 + @test_throws ArgumentError eweights(1, 1.1) + + # time indices are not all positive non-zero integers + @test_throws ArgumentError eweights([0, 1, 2, 3], 0.3) + + # Passing in an array of bools will work because Bool <: Integer, + # but any `false` values will trigger the same argument error as 0.0 + @test_throws ArgumentError eweights([true, false, true, true], 0.3) + end +end + end # @testset StatsBase.Weights