From b2c01a5ee93ce44283f31c033a71f3fa9abdd0dc Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Tue, 17 Jul 2018 16:07:04 -0700 Subject: [PATCH 001/105] Add exponential weights Co-authored-by: Alex Arslan --- docs/src/weights.md | 14 +++++++++++- src/StatsBase.jl | 2 ++ src/weights.jl | 52 ++++++++++++++++++++++++++++++++++++++++++++- test/weights.jl | 20 +++++++++++++++++ 4 files changed, 86 insertions(+), 2 deletions(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index e8322bfa..424d207f 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -41,6 +41,16 @@ w = ProbabilityWeights([0.2, 0.1, 0.3]) w = pweights([0.2, 0.1, 0.3]) ``` +### `ExponentialWeights` + +Exponential weights are a common form of temporal weights which assign exponentially decreasing +weight to past observations. + +```julia +w = ExponentialWeights([0.1837, 0.2222, 0.2688, 0.3253]) +w = eweights(4, 0.173) # construction based on length and rate parameter +``` + ### `Weights` The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights` and `ProbabilityWeights`. 
@@ -66,9 +76,11 @@ The following constructors are provided: AnalyticWeights FrequencyWeights ProbabilityWeights +ExponentialWeights Weights aweights fweights pweights +eweights weights -``` \ No newline at end of file +``` diff --git a/src/StatsBase.jl b/src/StatsBase.jl index af6a47aa..46cd90aa 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -30,10 +30,12 @@ export AnalyticWeights, # to represent an analytic/precision/reliability weight vector FrequencyWeights, # to representing a frequency/case/repeat weight vector ProbabilityWeights, # to representing a probability/sampling weight vector + ExponentialWeights, # to represent an exponential weight vector weights, # construct a generic Weights vector aweights, # construct an AnalyticWeights vector fweights, # construct a FrequencyWeights vector pweights, # construct a ProbabilityWeights vector + eweights, # construct an ExponentialWeights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage wmean, # weighted mean diff --git a/src/weights.jl b/src/weights.jl index 17475337..0e2f6af2 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -193,9 +193,59 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) end end +@weights ExponentialWeights + +@doc """ + ExponentialWeights(vs, wsum=sum(vs)) + +Construct an `ExponentialWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +Exponential weights are a common form of temporal weights which assign exponentially +decreasing weight to past observations, which in this case corresponds to the front of +the vector. That is, newer observations are assumed to be at the end. +""" ExponentialWeights + +""" + eweights(n, λ) + +Construct an [`ExponentialWeights`](@ref) vector with length `n`, +where each element in position ``i`` is set to ``λ (1 - λ)^{1 - i}``. + +``λ`` is a smoothing factor or rate parameter such that ``0 < λ \\leq 1``. 
+As this value approaches 0, the resulting weights will be almost equal, +while values closer to 1 will put greater weight on the tail elements of the vector. + +# Examples + +```julia-repl +julia> eweights(10, 0.3) +10-element ExponentialWeights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` +""" +function eweights(n::Integer, λ::Real) + n > 0 || throw(ArgumentError("cannot construct exponential weights of length < 1")) + 0 < λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) + w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) + s = sum(w0) + ExponentialWeights{typeof(s), eltype(w0), typeof(w0)}(w0, s) +end + +# NOTE: No variance correction is implemented for exponential weights + ##### Equality tests ##### -for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) +for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, ExponentialWeights, Weights) @eval begin Base.isequal(x::$w, y::$w) = isequal(x.sum, y.sum) && isequal(x.values, y.values) Base.:(==)(x::$w, y::$w) = (x.sum == y.sum) && (x.values == y.values) diff --git a/test/weights.jl b/test/weights.jl index fa8f40be..ecd07f90 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -2,6 +2,8 @@ using StatsBase using LinearAlgebra, Random, SparseArrays, Test @testset "StatsBase.Weights" begin +# NOTE: Do not add eweights here, as its methods don't match those of the others, so the +# tests below don't make sense for it weight_funcs = (weights, aweights, fweights, pweights) # Construction @@ -447,4 +449,22 @@ end @test round(mean(Union{Int,Missing}[1,2], weights([1,2])), digits=3) ≈ 1.667 end +@testset "ExponentialWeights" begin + @testset "Basic Usage" begin + θ = 5.25 + λ = 1 - exp(-1 / θ) # simple conversion for the more common/readable method + + v = [λ*(1-λ)^(1-i) for i = 1:4] + w = 
ExponentialWeights(v) + + @test round.(w, digits=4) == [0.1734, 0.2098, 0.2539, 0.3071] + @test eweights(4, λ) ≈ w + end + + @testset "Failure Conditions" begin + @test_throws ArgumentError eweights(0, 0.3) + @test_throws ArgumentError eweights(1, 1.1) + end +end + end # @testset StatsBase.Weights From f701581313d33fd4efa4c10aa576613e550a75f7 Mon Sep 17 00:00:00 2001 From: rofinn Date: Tue, 28 May 2019 16:02:05 -0500 Subject: [PATCH 002/105] Clean eweights code to use the default `Weights` type and support alternate methods. --- Project.toml | 3 +- docs/src/weights.md | 17 +++++----- src/StatsBase.jl | 3 +- src/weights.jl | 80 +++++++++++++++++++++++++++++++-------------- test/runtests.jl | 1 + test/weights.jl | 40 ++++++++++++++++++++--- 6 files changed, 102 insertions(+), 42 deletions(-) diff --git a/Project.toml b/Project.toml index 69967992..215bf0b4 100644 --- a/Project.toml +++ b/Project.toml @@ -13,8 +13,9 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [extras] +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["DelimitedFiles", "Test"] +test = ["Dates", "DelimitedFiles", "Test"] diff --git a/docs/src/weights.md b/docs/src/weights.md index 424d207f..94f02eba 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -41,23 +41,22 @@ w = ProbabilityWeights([0.2, 0.1, 0.3]) w = pweights([0.2, 0.1, 0.3]) ``` -### `ExponentialWeights` +### `Weights` -Exponential weights are a common form of temporal weights which assign exponentially decreasing -weight to past observations. +The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights` and `ProbabilityWeights`. 
```julia -w = ExponentialWeights([0.1837, 0.2222, 0.2688, 0.3253]) -w = eweights(4, 0.173) # construction based on length and rate parameter +w = Weights([1., 2., 3.]) +w = weights([1., 2., 3.]) ``` -### `Weights` +### `eweights` -The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights` and `ProbabilityWeights`. +Exponential weights are a common form of temporal weights which assign exponentially decreasing +weight to past observations. ```julia -w = Weights([1., 2., 3.]) -w = weights([1., 2., 3.]) +w = eweights(4, 0.173) # construction based on length and rate parameter ``` ## Methods diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 46cd90aa..102af4ab 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -30,12 +30,11 @@ export AnalyticWeights, # to represent an analytic/precision/reliability weight vector FrequencyWeights, # to representing a frequency/case/repeat weight vector ProbabilityWeights, # to representing a probability/sampling weight vector - ExponentialWeights, # to represent an exponential weight vector weights, # construct a generic Weights vector aweights, # construct an AnalyticWeights vector fweights, # construct a FrequencyWeights vector pweights, # construct a ProbabilityWeights vector - eweights, # construct an ExponentialWeights vector + eweights, # construct an exponential Weights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage wmean, # weighted mean diff --git a/src/weights.jl b/src/weights.jl index 0e2f6af2..06b80365 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -193,34 +193,42 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) end end -@weights ExponentialWeights - -@doc """ - ExponentialWeights(vs, wsum=sum(vs)) - -Construct an `ExponentialWeights` vector with weight values `vs`. -A precomputed sum may be provided as `wsum`. 
+""" + eweights(t::AbstractVector{<:Integer}, λ::Real) + eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real) where T + eweights(n::Integer, λ::Real) -Exponential weights are a common form of temporal weights which assign exponentially -decreasing weight to past observations, which in this case corresponds to the front of -the vector. That is, newer observations are assumed to be at the end. -""" ExponentialWeights +Construct [`Weights`](@ref) vector which assigns exponentially decreasing weights to past +observations, which in this case corresponds to larger integer values `i` in `t`. -""" - eweights(n, λ) +For each element `i` in `t` the weight value is computed as: -Construct an [`ExponentialWeights`](@ref) vector with length `n`, -where each element in position ``i`` is set to ``λ (1 - λ)^{1 - i}``. +``λ (1 - λ)^{1 - i}`` ``λ`` is a smoothing factor or rate parameter such that ``0 < λ \\leq 1``. As this value approaches 0, the resulting weights will be almost equal, while values closer to 1 will put greater weight on the tail elements of the vector. # Examples +```julia-repl +julia> eweights(1:10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` +Simply passing the number of observations `n` is equivalent to passing in `1:n`. ```julia-repl julia> eweights(10, 0.3) -10-element ExponentialWeights{Float64,Float64,Array{Float64,1}}: +10-element Weights{Float64,Float64,Array{Float64,1}}: 0.3 0.42857142857142855 0.6122448979591837 @@ -232,20 +240,42 @@ julia> eweights(10, 0.3) 5.203995766722913 7.434279666747019 ``` + +Finally, passing arbitrary times and a step range is equivalent to passing +`something.(indexin(t, r))`. 
+```julia-repl +julia> eweights([1, 3, 5], 1:10, 0.3) +3-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.6122448979591837 + 1.249479383590171 +``` """ +function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer + 0 < λ <= 1 || throw(ArgumentError("Smoothing factor must be between 0 and 1")) + + w0 = map(t) do i + i > 0 || throw(ArgumentError("Time indices must be non-zero positive integers")) + λ * (1 - λ)^(1 - i) + end + + s = sum(w0) + Weights{typeof(s), eltype(w0), typeof(w0)}(w0, s) +end + function eweights(n::Integer, λ::Real) n > 0 || throw(ArgumentError("cannot construct exponential weights of length < 1")) - 0 < λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) - w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) - s = sum(w0) - ExponentialWeights{typeof(s), eltype(w0), typeof(w0)}(w0, s) + eweights(1:n, λ) end +eweights(t::AbstractVector, r::AbstractRange, λ::Real) = + eweights(something.(indexin(t, r)), λ) + # NOTE: No variance correction is implemented for exponential weights ##### Equality tests ##### -for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, ExponentialWeights, Weights) +for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) @eval begin Base.isequal(x::$w, y::$w) = isequal(x.sum, y.sum) && isequal(x.values, y.values) Base.:(==)(x::$w, y::$w) = (x.sum == y.sum) && (x.values == y.values) @@ -531,7 +561,7 @@ _mean(A::AbstractArray{T}, w::AbstractWeights{W}, dims::Int) where {T,W} = Compute the weighted quantiles of a vector `v` at a specified set of probability values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`). Weights must not be negative. The weights and data vectors must have the same length. -`NaN` is returned if `x` contains any `NaN` values. An error is raised if `w` contains +`NaN` is returned if `x` contains any `NaN` values. An error is raised if `w` contains any `NaN` values. 
With [`FrequencyWeights`](@ref), the function returns the same result as @@ -552,15 +582,15 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where all(x -> 0 <= x <= 1, p) || throw(ArgumentError("input probability out of [0,1] range")) w.sum == 0 && throw(ArgumentError("weight vector cannot sum to zero")) - length(v) == length(w) || throw(ArgumentError("data and weight vectors must be the same size," * + length(v) == length(w) || throw(ArgumentError("data and weight vectors must be the same size," * "got $(length(v)) and $(length(w))")) for x in w.values isnan(x) && throw(ArgumentError("weight vector cannot contain NaN entries")) x < 0 && throw(ArgumentError("weight vector cannot contain negative entries")) end - isa(w, FrequencyWeights) && !(eltype(w) <: Integer) && any(!isinteger, w) && - throw(ArgumentError("The values of the vector of `FrequencyWeights` must be numerically" * + isa(w, FrequencyWeights) && !(eltype(w) <: Integer) && any(!isinteger, w) && + throw(ArgumentError("The values of the vector of `FrequencyWeights` must be numerically" * "equal to integers. 
Use `ProbabilityWeights` or `AnalyticWeights` instead.")) # remove zeros weights and sort diff --git a/test/runtests.jl b/test/runtests.jl index dac21a0c..500539c7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ using StatsBase +using Dates using LinearAlgebra using Random using Statistics diff --git a/test/weights.jl b/test/weights.jl index ecd07f90..fab085d0 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -449,21 +449,51 @@ end @test round(mean(Union{Int,Missing}[1,2], weights([1,2])), digits=3) ≈ 1.667 end -@testset "ExponentialWeights" begin - @testset "Basic Usage" begin +@testset "Exponential Weights" begin + @testset "Usage" begin θ = 5.25 λ = 1 - exp(-1 / θ) # simple conversion for the more common/readable method - v = [λ*(1-λ)^(1-i) for i = 1:4] - w = ExponentialWeights(v) + w = Weights(v) @test round.(w, digits=4) == [0.1734, 0.2098, 0.2539, 0.3071] - @test eweights(4, λ) ≈ w + + @testset "basic" begin + @test eweights(1:4, λ) ≈ w + end + + @testset "1:n" begin + @test eweights(4, λ) ≈ w + end + + @testset "indexin" begin + v = [λ*(1-λ)^(1-i) for i = 1:10] + + # Test that we should be able to skip indices easily + @test eweights([1, 3, 5, 7], 1:10, λ) ≈ Weights(v[[1, 3, 5, 7]]) + + # This should also work with actual time types + t1 = DateTime(2019, 1, 1, 1) + tx = t1 + Hour(7) + tn = DateTime(2019, 1, 2, 1) + + @test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ) ≈ Weights(v[[1, 3, 5, 7]]) + end end @testset "Failure Conditions" begin + # n == 0 @test_throws ArgumentError eweights(0, 0.3) + + # λ > 1.0 @test_throws ArgumentError eweights(1, 1.1) + + # time indices are not all positive non-zero integers + @test_throws ArgumentError eweights([0, 1, 2, 3], 0.3) + + # Passing in an array of bools will work because Bool <: Integer, + # but any `false` values will trigger the same argument error as 0.0 + @test_throws ArgumentError eweights([true, false, true, true], 0.3) end end From bc9cdd0cce5f28ba71f335df6667cd6d969e9d26 Mon Sep 17 
00:00:00 2001 From: rofinn Date: Wed, 29 May 2019 11:57:57 -0500 Subject: [PATCH 003/105] Move some of the documentation to the markdown files. --- docs/src/weights.md | 50 ++++++++++++++++++++++++++++++++++++++++++--- src/weights.jl | 26 ----------------------- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index 94f02eba..214610d5 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -55,8 +55,53 @@ w = weights([1., 2., 3.]) Exponential weights are a common form of temporal weights which assign exponentially decreasing weight to past observations. -```julia -w = eweights(4, 0.173) # construction based on length and rate parameter +For each element `i` in `t` the weight value is computed as: + +``λ (1 - λ)^{1 - i}`` + +``λ`` is a smoothing factor or rate parameter such that ``0 < λ \\leq 1``. +As this value approaches 0, the resulting weights will be almost equal, +while values closer to 1 will put greater weight on the tail elements of the vector. + +# Examples +```julia-repl +julia> eweights(1:10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` + +Simply passing the number of observations `n` is equivalent to passing in `1:n`. +```julia-repl +julia> eweights(10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` + +Finally, passing arbitrary times and a step range is equivalent to passing `something.(indexin(t, r))`. 
+```julia-repl +julia> eweights([1, 3, 5], 1:10, 0.3) +3-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.6122448979591837 + 1.249479383590171 ``` ## Methods @@ -75,7 +120,6 @@ The following constructors are provided: AnalyticWeights FrequencyWeights ProbabilityWeights -ExponentialWeights Weights aweights fweights diff --git a/src/weights.jl b/src/weights.jl index 06b80365..cea63817 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -224,32 +224,6 @@ julia> eweights(1:10, 0.3) 5.203995766722913 7.434279666747019 ``` - -Simply passing the number of observations `n` is equivalent to passing in `1:n`. -```julia-repl -julia> eweights(10, 0.3) -10-element Weights{Float64,Float64,Array{Float64,1}}: - 0.3 - 0.42857142857142855 - 0.6122448979591837 - 0.8746355685131197 - 1.249479383590171 - 1.7849705479859588 - 2.549957925694227 - 3.642797036706039 - 5.203995766722913 - 7.434279666747019 -``` - -Finally, passing arbitrary times and a step range is equivalent to passing -`something.(indexin(t, r))`. 
-```julia-repl -julia> eweights([1, 3, 5], 1:10, 0.3) -3-element Weights{Float64,Float64,Array{Float64,1}}: - 0.3 - 0.6122448979591837 - 1.249479383590171 -``` """ function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer 0 < λ <= 1 || throw(ArgumentError("Smoothing factor must be between 0 and 1")) From 22c0f62bfa20864a7058f179b76bd1fd88fd00b1 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Mon, 3 Jun 2019 10:44:39 -0500 Subject: [PATCH 004/105] Update docs/src/weights.md Co-Authored-By: Milan Bouchet-Valat --- docs/src/weights.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index 214610d5..40f64161 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -50,7 +50,7 @@ w = Weights([1., 2., 3.]) w = weights([1., 2., 3.]) ``` -### `eweights` +### Exponential weights: `eweights` Exponential weights are a common form of temporal weights which assign exponentially decreasing weight to past observations. From 6fdf82d347796141fc1af0feecc48aaa85041034 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Mon, 3 Jun 2019 10:46:24 -0500 Subject: [PATCH 005/105] Update docs/src/weights.md Co-Authored-By: Milan Bouchet-Valat --- docs/src/weights.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index 40f64161..8dd85cf1 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -59,7 +59,7 @@ For each element `i` in `t` the weight value is computed as: ``λ (1 - λ)^{1 - i}`` -``λ`` is a smoothing factor or rate parameter such that ``0 < λ \\leq 1``. +``λ`` is a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. As this value approaches 0, the resulting weights will be almost equal, while values closer to 1 will put greater weight on the tail elements of the vector. 
From 71296bdc13fa454aa2e771a1cc39a35392c3bde8 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Mon, 3 Jun 2019 10:52:13 -0500 Subject: [PATCH 006/105] Update docs/src/weights.md Co-Authored-By: Milan Bouchet-Valat --- docs/src/weights.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index 8dd85cf1..e375148d 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -53,7 +53,7 @@ w = weights([1., 2., 3.]) ### Exponential weights: `eweights` Exponential weights are a common form of temporal weights which assign exponentially decreasing -weight to past observations. +weights to past observations. For each element `i` in `t` the weight value is computed as: From e9e2796b1d79c96f01c732031662794dc33a9a63 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Mon, 3 Jun 2019 10:52:25 -0500 Subject: [PATCH 007/105] Update src/weights.jl Co-Authored-By: Milan Bouchet-Valat --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index cea63817..59e9d84a 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -205,7 +205,7 @@ For each element `i` in `t` the weight value is computed as: ``λ (1 - λ)^{1 - i}`` -``λ`` is a smoothing factor or rate parameter such that ``0 < λ \\leq 1``. +``λ`` is a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. As this value approaches 0, the resulting weights will be almost equal, while values closer to 1 will put greater weight on the tail elements of the vector. 
From 8bb948f5d84bea42120a05c683ebd0763cd47dde Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Mon, 3 Jun 2019 11:41:56 -0500 Subject: [PATCH 008/105] Update src/weights.jl Co-Authored-By: Milan Bouchet-Valat --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index 59e9d84a..c3f0f950 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -234,7 +234,7 @@ function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer end s = sum(w0) - Weights{typeof(s), eltype(w0), typeof(w0)}(w0, s) + Weights(w0, s) end function eweights(n::Integer, λ::Real) From 96275d8f94270f43f67307eaccab20bdc498c261 Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 3 Jun 2019 11:57:36 -0500 Subject: [PATCH 009/105] Minor cleanup --- src/weights.jl | 6 +----- test/weights.jl | 11 ++++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index c3f0f950..37a94943 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -237,11 +237,7 @@ function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer Weights(w0, s) end -function eweights(n::Integer, λ::Real) - n > 0 || throw(ArgumentError("cannot construct exponential weights of length < 1")) - eweights(1:n, λ) -end - +eweights(n::Integer, λ::Real) = eweights(1:n, λ) eweights(t::AbstractVector, r::AbstractRange, λ::Real) = eweights(something.(indexin(t, r)), λ) diff --git a/test/weights.jl b/test/weights.jl index fab085d0..ab40cf66 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -2,8 +2,6 @@ using StatsBase using LinearAlgebra, Random, SparseArrays, Test @testset "StatsBase.Weights" begin -# NOTE: Do not add eweights here, as its methods don't match those of the others, so the -# tests below don't make sense for it weight_funcs = (weights, aweights, fweights, pweights) # Construction @@ -481,10 +479,13 @@ end end end - @testset "Failure Conditions" begin - # n == 0 - @test_throws ArgumentError eweights(0, 0.3) + @testset "Empty" 
begin + @test eweights(0, 0.3) == Weights(Float64[]) + @test eweights(1:0, 0.3) == Weights(Float64[]) + @test eweights(Int[], 1:10, 0.4) == Weights(Float64[]) + end + @testset "Failure Conditions" begin # λ > 1.0 @test_throws ArgumentError eweights(1, 1.1) From cb0962f897d311e6578cffb8e157832c7ebe2c1c Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 20 Jun 2019 16:16:41 -0500 Subject: [PATCH 010/105] More docs cleanup. --- docs/src/weights.md | 18 +++++++++++++++--- src/weights.jl | 11 ++++++++--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index e375148d..fe277010 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -55,7 +55,7 @@ w = weights([1., 2., 3.]) Exponential weights are a common form of temporal weights which assign exponentially decreasing weights to past observations. -For each element `i` in `t` the weight value is computed as: +If `t` is a vector of temporal indices then for each index `i` we compute the weight as: ``λ (1 - λ)^{1 - i}`` @@ -64,6 +64,7 @@ As this value approaches 0, the resulting weights will be almost equal, while values closer to 1 will put greater weight on the tail elements of the vector. # Examples + ```julia-repl julia> eweights(1:10, 0.3) 10-element Weights{Float64,Float64,Array{Float64,1}}: @@ -80,6 +81,7 @@ julia> eweights(1:10, 0.3) ``` Simply passing the number of observations `n` is equivalent to passing in `1:n`. + ```julia-repl julia> eweights(10, 0.3) 10-element Weights{Float64,Float64,Array{Float64,1}}: @@ -95,15 +97,25 @@ julia> eweights(10, 0.3) 7.434279666747019 ``` -Finally, passing arbitrary times and a step range is equivalent to passing `something.(indexin(t, r))`. +Finally, you can construct exponential weights from an arbitrary subset of timestamps within a larger range. 
+ ```julia-repl -julia> eweights([1, 3, 5], 1:10, 0.3) +julia> t +2019-01-01T01:00:00:2 hours:2019-01-01T05:00:00 + +julia> r +2019-01-01T01:00:00:1 hour:2019-01-02T01:00:00 + +julia> eweights(t, r, 0.3) 3-element Weights{Float64,Float64,Array{Float64,1}}: 0.3 0.6122448979591837 1.249479383590171 ``` +NOTE: This is equivalent to `eweights(something.(indexin(t, r)), 0.3)`, which is saying that for each value in `t` return the corresponding index for that value in `r`. +Since `indexin` returns `nothing` if there is no corresponding value from `t` in `r` we use `something` to eliminate that possibility. + ## Methods `AbstractWeights` implements the following methods: diff --git a/src/weights.jl b/src/weights.jl index 37a94943..63de259b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -205,9 +205,14 @@ For each element `i` in `t` the weight value is computed as: ``λ (1 - λ)^{1 - i}`` -``λ`` is a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. -As this value approaches 0, the resulting weights will be almost equal, -while values closer to 1 will put greater weight on the tail elements of the vector. +# Arguments + +- `t::AbstractVector`: temporal indices or timestamps +- `r::StepRange`: a larger range to use when constructing weights from a subset of timestamps +- `n::Integer`: number of temporal indices to assume for the exponential weighting +- `λ::Real`: a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. + As this value approaches 0, the resulting weights will be almost equal, + while values closer to 1 will put greater weight on the tail elements of the vector. 
# Examples ```julia-repl From b132deb38eaf47772d63535bf31ed999e254344c Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Tue, 25 Jun 2019 10:06:11 -0500 Subject: [PATCH 011/105] Update src/weights.jl Co-Authored-By: Milan Bouchet-Valat --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index 63de259b..873e6d69 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -198,7 +198,7 @@ end eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real) where T eweights(n::Integer, λ::Real) -Construct [`Weights`](@ref) vector which assigns exponentially decreasing weights to past +Construct a [`Weights`](@ref) vector which assigns exponentially decreasing weights to past observations, which in this case corresponds to larger integer values `i` in `t`. For each element `i` in `t` the weight value is computed as: From bbc3850a9e581e0cd4940e18a3c8262501e7f846 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Tue, 25 Jun 2019 10:06:29 -0500 Subject: [PATCH 012/105] Update src/weights.jl Co-Authored-By: Milan Bouchet-Valat --- src/weights.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/weights.jl b/src/weights.jl index 873e6d69..bdad988b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -200,6 +200,8 @@ end Construct a [`Weights`](@ref) vector which assigns exponentially decreasing weights to past observations, which in this case corresponds to larger integer values `i` in `t`. +If an integer `n` is provided, weights are generated for values from 1 to `n` +(equivalent to `t = 1:n`). 
For each element `i` in `t` the weight value is computed as: From 02e0c5e410a8c6d9ca4d10944bada5b66965d9f4 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Tue, 25 Jun 2019 10:06:40 -0500 Subject: [PATCH 013/105] Update src/weights.jl Co-Authored-By: Milan Bouchet-Valat --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index bdad988b..d2548c17 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -211,7 +211,7 @@ For each element `i` in `t` the weight value is computed as: - `t::AbstractVector`: temporal indices or timestamps - `r::StepRange`: a larger range to use when constructing weights from a subset of timestamps -- `n::Integer`: number of temporal indices to assume for the exponential weighting +- `n::Integer`: if provided instead of `t`, temporal indices are taken to be `1:n` - `λ::Real`: a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. As this value approaches 0, the resulting weights will be almost equal, while values closer to 1 will put greater weight on the tail elements of the vector. 
From e2ceeec803e4cec70d59d8a86a6d2445e2707158 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Wed, 3 Jul 2019 17:03:09 -0500 Subject: [PATCH 014/105] Bump minor version number (#502) Since `eweights` is now being exported we should bump the minor version number --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 215bf0b4..7e721fff 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.30.0" +version = "0.31.0" [deps] DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" From fde3142e416c89ed5e7c4e4f76e3e81b7b348cca Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 9 Jul 2019 22:35:38 +0200 Subject: [PATCH 015/105] Fix incorrect heading level in exponential weights docs (#503) --- docs/src/weights.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index fe277010..4653af76 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -63,8 +63,7 @@ If `t` is a vector of temporal indices then for each index `i` we compute the we As this value approaches 0, the resulting weights will be almost equal, while values closer to 1 will put greater weight on the tail elements of the vector. -# Examples - +For example, the following call generates exponential weights for ten observations with ``λ = 0.3``. 
```julia-repl julia> eweights(1:10, 0.3) 10-element Weights{Float64,Float64,Array{Float64,1}}: From 7665328b8266c976e9108fa1c149c3a52bcad681 Mon Sep 17 00:00:00 2001 From: quinnj Date: Thu, 23 May 2019 21:36:16 -0600 Subject: [PATCH 016/105] Add DataAPI as a dependency to share describe with other packages --- Project.toml | 1 + src/StatsBase.jl | 3 +-- src/scalarstats.jl | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index 7e721fff..a812adf3 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" version = "0.31.0" [deps] +DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 102af4ab..4419e550 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -1,10 +1,9 @@ -__precompile__() - module StatsBase import Base: length, isempty, eltype, values, sum, show, maximum, minimum, extrema import Base.Cartesian: @nloops, @nref, @nextract using Base: @irrational, @propagate_inbounds +using DataAPI import DataStructures: heapify!, heappop!, percolate_down! using SortingAlgorithms using Missings diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 40a0edf3..41486c41 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -648,12 +648,12 @@ Pretty-print the summary statistics provided by [`summarystats`](@ref): the mean, minimum, 25th percentile, median, 75th percentile, and maximum. 
""" -describe(a::AbstractArray) = describe(stdout, a) -function describe(io::IO, a::AbstractArray{T}) where T<:Union{Real,Missing} +DataAPI.describe(x) = describe(stdout, x) +function DataAPI.describe(io::IO, a::AbstractArray{T}) where T<:Union{Real,Missing} show(io, summarystats(a)) println(io, "Type: $(string(eltype(a)))") end -function describe(io::IO, a::AbstractArray) +function DataAPI.describe(io::IO, a::AbstractArray) println(io, "Summary Stats:") println(io, "Length: $(length(a))") println(io, "Type: $(string(eltype(a)))") From 33182a3decb2abd2216f50438f8c3db7a8ad4938 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Tue, 16 Jul 2019 23:22:04 -0600 Subject: [PATCH 017/105] fix --- src/StatsBase.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 4419e550..6a250547 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -4,6 +4,7 @@ import Base: length, isempty, eltype, values, sum, show, maximum, minimum, extre import Base.Cartesian: @nloops, @nref, @nextract using Base: @irrational, @propagate_inbounds using DataAPI +import DataAPI: describe import DataStructures: heapify!, heappop!, percolate_down! 
using SortingAlgorithms using Missings From 2e91d3730ac775c0ea74f3718c77892b5ad621b7 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Wed, 17 Jul 2019 13:59:19 -0600 Subject: [PATCH 018/105] Drop 0.7 support --- .travis.yml | 2 +- Project.toml | 3 +++ appveyor.yml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0197a78b..bba54c4f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,8 @@ os: - linux - osx julia: - - 0.7 - 1.0 + - 1.2 - nightly notifications: email: false diff --git a/Project.toml b/Project.toml index a812adf3..758c821c 100644 --- a/Project.toml +++ b/Project.toml @@ -20,3 +20,6 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Dates", "DelimitedFiles", "Test"] + +[compat] +julia = "1" \ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml index e3488828..c9bc717e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,7 +1,7 @@ environment: matrix: - - julia_version: 0.7 - julia_version: 1.0 + - julia_version: 1.2 - julia_version: nightly platform: From 0faf210f5fb84d361bd101404aadaef54376fcbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Besan=C3=A7on?= Date: Tue, 30 Jul 2019 15:02:24 +0200 Subject: [PATCH 019/105] version bump (#507) * version bump * fix test warnings --- Project.toml | 5 +++-- test/sampling.jl | 10 +++++----- test/scalarstats.jl | 3 ++- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 758c821c..521c3831 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.31.0" +authors = ["JuliaStats"] +version = "0.32.0" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" @@ -22,4 +23,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" test = ["Dates", "DelimitedFiles", "Test"] [compat] -julia = "1" \ No newline at end of file +julia = "1" diff --git a/test/sampling.jl b/test/sampling.jl index 4f307d87..c3baa015 100644 --- 
a/test/sampling.jl +++ b/test/sampling.jl @@ -78,13 +78,13 @@ test_rng_use(sample, 1:10, 10) @testset "sampling pairs" begin - Random.seed!(1) + rng = Random.MersenneTwister(1) - @test samplepair(2) === (1, 2) - @test samplepair(10) === (8, 2) + @test samplepair(rng, 2) === (1, 2) + @test samplepair(rng, 10) === (8, 2) - @test samplepair([3, 4, 2, 6, 8]) === (2, 6) - @test samplepair([1, 2]) === (1, 2) + @test samplepair(rng, [3, 4, 2, 6, 8]) === (2, 6) + @test samplepair(rng, [1, 2]) === (1, 2) end test_rng_use(samplepair, 1000) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 4418f0fc..f163988f 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -1,6 +1,7 @@ using StatsBase using Test using DelimitedFiles +using Statistics ##### Location @@ -113,7 +114,7 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test mad((x for x in (1, 2.1)), normalize=false) ≈ 0.55 @test mad(Any[1, 2.1], normalize=false) ≈ 0.55 @test mad(Union{Int,Missing}[1, 2], normalize=false) ≈ 0.5 -@test_throws ArgumentError mad(Int[]) +@test_throws ArgumentError mad(Int[], normalize = true) # Issue 197 @test mad(1:2, normalize=true) ≈ 0.7413011092528009 From 416af5561d3efd11c78c82c269a9d7c7d26c681d Mon Sep 17 00:00:00 2001 From: Ian Fiske <135570+ianfiske@users.noreply.github.com> Date: Wed, 7 Aug 2019 15:33:24 -0400 Subject: [PATCH 020/105] Add rle support for BitArray (#511) --- src/misc.jl | 2 +- test/misc.jl | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/misc.jl b/src/misc.jl index 8d09e311..279c4ac0 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -18,7 +18,7 @@ julia> rle([1,1,1,2,2,3,3,3,3,2,2,2]) ([1, 2, 3, 2], [3, 2, 4, 3]) ``` """ -function rle(v::Vector{T}) where T +function rle(v::AbstractVector{T}) where T n = length(v) vals = T[] lens = Int[] diff --git a/test/misc.jl b/test/misc.jl index 6773d2c2..38204d18 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -15,6 +15,11 @@ vals, lens = rle(z) @test lens == [2, 2, 1, 1, 3] @test 
inverse_rle(vals, lens) == z +z = BitArray([true, true, false, false, true]) +(vals, lens) = rle(z) +@test vals == [true, false, true] +@test lens == [2, 2, 1] + # levelsmap a = [1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3, 3, 3, 2] b = [true, false, false, true, false, true, true, false] From fde620993a3037cc236f0c1904ece54d2e416b5c Mon Sep 17 00:00:00 2001 From: John Zito Date: Fri, 23 Aug 2019 08:48:32 -0500 Subject: [PATCH 021/105] extend ecdf to a weighted sample (#500) --- src/empirical.jl | 33 ++++++++++++++++++++++----------- test/empirical.jl | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/empirical.jl b/src/empirical.jl index 3e9c31ef..9bd3a175 100644 --- a/src/empirical.jl +++ b/src/empirical.jl @@ -3,43 +3,49 @@ ## Empirical CDF -struct ECDF{T <: AbstractVector{<:Real}} +struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}} sorted_values::T + weights::W end function (ecdf::ECDF)(x::Real) - searchsortedlast(ecdf.sorted_values, x) / length(ecdf.sorted_values) + n = searchsortedlast(ecdf.sorted_values, x) + evenweights = isempty(ecdf.weights) + weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights) + partialsum = evenweights ? n : sum(view(ecdf.weights, 1:n)) + partialsum / weightsum end function (ecdf::ECDF)(v::RealVector) + evenweights = isempty(ecdf.weights) + weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights) ord = sortperm(v) m = length(v) r = similar(ecdf.sorted_values, m) - r0 = 0 + r0 = zero(weightsum) i = 1 - n = length(ecdf.sorted_values) - for x in ecdf.sorted_values + for (j, x) in enumerate(ecdf.sorted_values) while i <= m && x > v[ord[i]] r[ord[i]] = r0 i += 1 end - r0 += 1 + r0 += evenweights ? 
1 : ecdf.weights[j] if i > m break end end while i <= m - r[ord[i]] = n + r[ord[i]] = weightsum i += 1 end - return r / n + return r / weightsum end """ - ecdf(X) + ecdf(X; weights::AbstractWeights) Return an empirical cumulative distribution function (ECDF) based on a vector of samples -given in `X`. +given in `X`. Optionally providing `weights` returns a weighted ECDF. Note: this function that returns a callable composite type, which can then be applied to evaluate CDF values on other samples. @@ -47,7 +53,12 @@ evaluate CDF values on other samples. `extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which function is inside the interval ``(0,1)``; the function is defined for the whole real line. """ -ecdf(X::RealVector{T}) where T<:Real = ECDF(sort(X)) +function ecdf(X::RealVector; weights::AbstractVector{<:Real}=Weights(Float64[])) + isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(X)) and $(length(weights))")) + ord = sortperm(X) + ECDF(X[ord], isempty(weights) ? 
weights : Weights(weights[ord])) +end minimum(ecdf::ECDF) = first(ecdf.sorted_values) diff --git a/test/empirical.jl b/test/empirical.jl index 42582ff1..a0a8f771 100644 --- a/test/empirical.jl +++ b/test/empirical.jl @@ -13,3 +13,41 @@ using Test @test fnecdf([zeros(5000); ones(5000)]) == [zeros(5000); ones(5000)] @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 0.5) end + +@testset "Weighted ECDF" begin + x = randn(10000000) + w1 = rand(10000000) + w2 = weights(w1) + fnecdf = ecdf(x, weights=w1) + fnecdfalt = ecdf(x, weights=w2) + @test fnecdf.sorted_values == fnecdfalt.sorted_values + @test fnecdf.weights == fnecdfalt.weights + @test fnecdf.weights != w1 # check that w wasn't accidentally modified in place + @test fnecdfalt.weights != w2 + y = [-1.96, -1.644854, -1.281552, -0.6744898, 0, 0.6744898, 1.281552, 1.644854, 1.96] + @test isapprox(fnecdf(y), [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975], atol=1e-3) + @test isapprox(fnecdf(1.96), 0.975, atol=1e-3) + @test fnecdf(y) ≈ map(fnecdf, y) + @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x) + fnecdf = ecdf([1.0, 0.5], weights=weights([3, 1])) + @test fnecdf(0.75) == 0.25 + @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 1.0) + @test_throws ArgumentError ecdf(rand(8), weights=weights(rand(10))) + # Check frequency weights + v = randn(100) + r = rand(1:100, 100) + vv = vcat(fill.(v, r)...)
# repeat elements of v according to r + fw = fweights(r) + frecdf1 = ecdf(v, weights=fw) + frecdf2 = ecdf(vv) + @test frecdf1(y) ≈ frecdf2(y) + # Check probability weights + a = randn(100) + b = rand(100) + b̃ = abs(10randn()) * b + bw1 = pweights(b) + bw2 = pweights(b̃) + precdf1 = ecdf(a, weights=bw1) + precdf2 = ecdf(a, weights=bw2) + @test precdf1(y) ≈ precdf2(y) +end From b3f9d192ab5125c1cb437d7e26e5c3258d27f82c Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Sat, 7 Sep 2019 20:50:04 +0200 Subject: [PATCH 022/105] Fix docs (#519) --- .travis.yml | 1 - docs/make.jl | 2 +- docs/src/means.md | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index bba54c4f..441c172b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,4 +27,3 @@ jobs: Pkg.instantiate()' - julia --project=docs/ docs/make.jl after_success: skip - diff --git a/docs/make.jl b/docs/make.jl index 83712031..11e56130 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,4 @@ -using Documenter, StatsBase, Statistics, Random +using Documenter, StatsBase, Statistics, Random, LinearAlgebra # Workaround for JuliaLang/julia/pull/28625 if Base.HOME_PROJECT[] !== nothing diff --git a/docs/src/means.md b/docs/src/means.md index 47b9ff08..638f505a 100644 --- a/docs/src/means.md +++ b/docs/src/means.md @@ -12,6 +12,6 @@ The `mean` and `mean!` functions are also extended to accept a weight vector of `AbstractWeights` to compute weighted mean. ```@docs -Statistics.mean(A::AbstractArray, w::AbstractWeights) -Statistics.mean!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dim::Int) +mean +mean! 
``` From e7333de3fc769e29fb34d10aa104f286e4ad2fb2 Mon Sep 17 00:00:00 2001 From: evalparse Date: Thu, 19 Sep 2019 18:34:21 +1000 Subject: [PATCH 023/105] rle for Union{Missing, T} (#521) --- src/misc.jl | 2 +- test/misc.jl | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/misc.jl b/src/misc.jl index 279c4ac0..a450b32f 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -31,7 +31,7 @@ function rle(v::AbstractVector{T}) where T i = 2 @inbounds while i <= n vi = v[i] - if vi == cv + if isequal(vi, cv) cl += 1 else push!(vals, cv) diff --git a/test/misc.jl b/test/misc.jl index 38204d18..63fc24df 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -20,6 +20,12 @@ z = BitArray([true, true, false, false, true]) @test vals == [true, false, true] @test lens == [2, 2, 1] +z = [1, 1, 2, missing, 2, 3, 1, missing, missing, 3, 3, 3, 3] +vals, lens = rle(z) +@test isequal(vals, [1, 2, missing, 2, 3, 1, missing, 3]) +@test lens == [2, 1, 1, 1, 1, 1, 2, 4] +@test isequal(inverse_rle(vals, lens), z) + # levelsmap a = [1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3, 3, 3, 2] b = [true, false, false, true, false, true, true, false] From 8f422bdcf0d1c4763372e35df7f1f310c7201a40 Mon Sep 17 00:00:00 2001 From: Luca Bittarello Date: Thu, 19 Sep 2019 10:34:43 +0200 Subject: [PATCH 024/105] Add UnitWeights (#515) --- docs/src/weights.md | 11 +++++ src/StatsBase.jl | 4 +- src/weights.jl | 108 ++++++++++++++++++++++++++++++++++++++++++-- test/weights.jl | 48 ++++++++++++++++++++ 4 files changed, 167 insertions(+), 4 deletions(-) diff --git a/docs/src/weights.md b/docs/src/weights.md index 4653af76..73f01e0e 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -41,6 +41,17 @@ w = ProbabilityWeights([0.2, 0.1, 0.3]) w = pweights([0.2, 0.1, 0.3]) ``` +### `UnitWeights` + +Unit weights are a special case in which all observations are given a weight equal to `1`. Using such weights is equivalent to computing unweighted statistics. 
+ +This type can notably be used when implementing an algorithm so that only a weighted variant has to be written. The unweighted variant is then obtained by passing a `UnitWeights` object. This is very efficient since no weights vector is actually allocated. + +```julia +w = uweights(3) +w = uweights(Float64, 3) +``` + ### `Weights` The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights` and `ProbabilityWeights`. diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 6a250547..8ed42363 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -1,6 +1,6 @@ module StatsBase -import Base: length, isempty, eltype, values, sum, show, maximum, minimum, extrema +import Base: length, size, isempty, values, sum, show, maximum, minimum, extrema import Base.Cartesian: @nloops, @nref, @nextract using Base: @irrational, @propagate_inbounds using DataAPI @@ -30,11 +30,13 @@ export AnalyticWeights, # to represent an analytic/precision/reliability weight vector FrequencyWeights, # to representing a frequency/case/repeat weight vector ProbabilityWeights, # to representing a probability/sampling weight vector + UnitWeights, # to represent a uniform weight vector weights, # construct a generic Weights vector aweights, # construct an AnalyticWeights vector fweights, # construct a FrequencyWeights vector pweights, # construct a ProbabilityWeights vector eweights, # construct an exponential Weights vector + uweights, # construct a UnitWeights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage wmean, # weighted mean diff --git a/src/weights.jl b/src/weights.jl index d2548c17..5affc704 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -17,14 +17,13 @@ macro weights(name) end end -eltype(wv::AbstractWeights) = eltype(wv.values) length(wv::AbstractWeights) = length(wv.values) values(wv::AbstractWeights) = wv.values
sum(wv::AbstractWeights) = wv.sum isempty(wv::AbstractWeights) = isempty(wv.values) +size(wv::AbstractWeights) = size(wv.values) Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) -Base.size(wv::AbstractWeights) = size(wv.values) @propagate_inbounds function Base.setindex!(wv::AbstractWeights, v::Real, i::Int) s = v - wv[i] @@ -179,7 +178,7 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ varcorrection(w::ProbabilityWeights, corrected=false) -* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}``, where ``n`` equals `count(!iszero, w)` * `corrected=false`: ``\\frac{1}{\\sum w}`` """ @inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) @@ -250,6 +249,72 @@ eweights(t::AbstractVector, r::AbstractRange, λ::Real) = # NOTE: No variance correction is implemented for exponential weights +struct UnitWeights{T<:Real} <: AbstractWeights{Int, T, V where V<:Vector{T}} + len::Int +end + +@doc """ + UnitWeights{T}(s) + +Construct a `UnitWeights` vector with length `s` and weight elements of type `T`. +All weight elements are identically one. +""" UnitWeights + +values(wv::UnitWeights{T}) where T = fill(one(T), length(wv)) +sum(wv::UnitWeights{T}) where T = convert(T, length(wv)) +isempty(wv::UnitWeights) = iszero(wv.len) +length(wv::UnitWeights) = wv.len +size(wv::UnitWeights) = Tuple(length(wv)) + +@propagate_inbounds function Base.getindex(wv::UnitWeights{T}, i::Integer) where T + @boundscheck checkbounds(wv, i) + one(T) +end + +@propagate_inbounds function Base.getindex(wv::UnitWeights{T}, i::AbstractArray{<:Int}) where T + @boundscheck checkbounds(wv, i) + fill(one(T), size(i)) +end + +Base.getindex(wv::UnitWeights{T}, ::Colon) where T = fill(one(T), length(wv)) + +""" + uweights(s::Integer) + uweights(::Type{T}, s::Integer) where T<:Real + +Construct a `UnitWeights` vector with length `s` and weight elements of type `T`. 
+All weight elements are identically one. + +# Examples +```julia-repl +julia> uweights(3) +3-element UnitWeights{Int64}: + 1 + 1 + 1 + +julia> uweights(Float64, 3) +3-element UnitWeights{Float64}: + 1.0 + 1.0 + 1.0 +``` +""" +uweights(s::Int) = UnitWeights{Int}(s) +uweights(::Type{T}, s::Int) where {T<:Real} = UnitWeights{T}(s) + +""" + varcorrection(w::UnitWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{n - 1}``, where ``n`` is the length of the weight vector +* `corrected=false`: ``\\frac{1}{n}``, where ``n`` is the length of the weight vector + +This definition is equivalent to the correction applied to unweighted data. +""" +@inline function varcorrection(w::UnitWeights, corrected::Bool=false) + corrected ? (1 / (w.len - 1)) : (1 / w.len) +end + ##### Equality tests ##### for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) @@ -259,6 +324,9 @@ for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) end end +Base.isequal(x::UnitWeights, y::UnitWeights) = isequal(x.len, y.len) +Base.:(==)(x::UnitWeights, y::UnitWeights) = (x.len == y.len) + Base.isequal(x::AbstractWeights, y::AbstractWeights) = false Base.:(==)(x::AbstractWeights, y::AbstractWeights) = false @@ -279,6 +347,17 @@ Base.sum(v::BitArray, w::AbstractWeights) = wsum(v, values(w)) Base.sum(v::SparseArrays.SparseMatrixCSC, w::AbstractWeights) = wsum(v, values(w)) Base.sum(v::AbstractArray, w::AbstractWeights) = dot(v, values(w)) +for v in (AbstractArray{<:Number}, BitArray, SparseArrays.SparseMatrixCSC, AbstractArray) + @eval begin + function Base.sum(v::$v, w::UnitWeights) + if length(v) != length(w) + throw(DimensionMismatch("Inconsistent array dimension.")) + end + return sum(v) + end + end +end + ## wsum along dimension # # Brief explanation of the algorithm: @@ -475,6 +554,11 @@ function wsum(A::AbstractArray{T}, w::AbstractVector{W}, dim::Int) where {T<:Num _wsum!(similar(A, wsumtype(T,W), Base.reduced_indices(axes(A), dim)), A, w, dim, true) end
+function wsum(A::AbstractArray{<:Number}, w::UnitWeights, dim::Int) + size(A, dim) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return sum(A, dims=dim) +end + # extended sum! and wsum Base.sum!(R::AbstractArray, A::AbstractArray, w::AbstractWeights{<:Real}, dim::Int; init::Bool=true) = @@ -482,6 +566,10 @@ Base.sum!(R::AbstractArray, A::AbstractArray, w::AbstractWeights{<:Real}, dim::I Base.sum(A::AbstractArray{<:Number}, w::AbstractWeights{<:Real}, dim::Int) = wsum(A, values(w), dim) +function Base.sum(A::AbstractArray{<:Number}, w::UnitWeights, dim::Int) + size(A, dim) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return sum(A, dims=dim) +end ###### Weighted means ##### @@ -530,6 +618,15 @@ _mean(A::AbstractArray, w::AbstractWeights, dims::Nothing) = _mean(A::AbstractArray{T}, w::AbstractWeights{W}, dims::Int) where {T,W} = _mean!(similar(A, wmeantype(T, W), Base.reduced_indices(axes(A), dims)), A, w, dims) +function _mean(A::AbstractArray, w::UnitWeights, dims::Nothing) + length(A) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return mean(A) +end + +function _mean(A::AbstractArray, w::UnitWeights, dims::Int) + size(A, dims) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return mean(A, dims=dims) +end ###### Weighted quantile ##### """ @@ -619,6 +716,11 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where return out end +function quantile(v::RealVector, w::UnitWeights, p::RealVector) + length(v) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return quantile(v, p) +end + quantile(v::RealVector, w::AbstractWeights{<:Real}, p::Number) = quantile(v, w, [p])[1] diff --git a/test/weights.jl b/test/weights.jl index ab40cf66..5b3fd8ba 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -96,6 +96,23 @@ end @test x == y end +@testset "Unit weights" begin + wv = uweights(Float64, 3) + @test 
wv[1] === 1. + @test wv[1:3] == fill(1.0, 3) + @test wv[:] == fill(1.0, 3) + @test !isempty(wv) + @test length(wv) === 3 + @test size(wv) === (3,) + @test sum(wv) === 3. + @test values(wv) == fill(1.0, 3) + @test StatsBase.varcorrection(wv) == 1/3 + @test !isequal(wv, fweights(fill(1.0, 3))) + @test isequal(wv, uweights(3)) + @test wv != fweights(fill(1.0, 3)) + @test wv == uweights(3) +end + ## wsum x = [6., 8., 9.] w = [2., 3., 4.] @@ -447,6 +464,37 @@ end @test round(mean(Union{Int,Missing}[1,2], weights([1,2])), digits=3) ≈ 1.667 end +@testset "Sum, mean, quantiles and variance for unit weights" begin + wt = uweights(Float64, 3) + + @test sum([1.0, 2.0, 3.0], wt) ≈ 6.0 + @test mean([1.0, 2.0, 3.0], wt) ≈ 2.0 + + @test sum(a, wt, 1) ≈ sum(a, dims=1) + @test sum(a, wt, 2) ≈ sum(a, dims=2) + @test sum(a, wt, 3) ≈ sum(a, dims=3) + + @test wsum(a, wt, 1) ≈ sum(a, dims=1) + @test wsum(a, wt, 2) ≈ sum(a, dims=2) + @test wsum(a, wt, 3) ≈ sum(a, dims=3) + + @test mean(a, wt, dims=1) ≈ mean(a, dims=1) + @test mean(a, wt, dims=2) ≈ mean(a, dims=2) + @test mean(a, wt, dims=3) ≈ mean(a, dims=3) + + @test_throws DimensionMismatch sum(a, wt) + @test_throws DimensionMismatch sum(a, wt, 4) + @test_throws DimensionMismatch wsum(a, wt, 4) + @test_throws DimensionMismatch mean(a, wt, dims=4) + + @test quantile([1.0, 4.0, 6.0, 8.0, 10.0], uweights(5), [0.5]) ≈ [6.0] + @test quantile([1.0, 4.0, 6.0, 8.0, 10.0], uweights(5), 0.5) ≈ 6.0 + @test median([1.0, 4.0, 6.0, 8.0, 10.0], uweights(5)) ≈ 6.0 + + @test var(a, uweights(Float64, 27), corrected=false) ≈ var(a, corrected=false) + @test var(a, uweights(Float64, 27), corrected=true) ≈ var(a, corrected= true) +end + @testset "Exponential Weights" begin @testset "Usage" begin θ = 5.25 From 2fd192af3ba3eede886ef6f8791cc65e1a4af8aa Mon Sep 17 00:00:00 2001 From: Yueh-Hua Tu Date: Thu, 19 Sep 2019 19:28:31 +0800 Subject: [PATCH 025/105] Provide standardize API for 1D array (#490) --- src/transformations.jl | 317 
+++++++++++++++++++++++++--------------- test/transformations.jl | 170 +++++++++++++++++---- 2 files changed, 344 insertions(+), 143 deletions(-) diff --git a/src/transformations.jl b/src/transformations.jl index 4d15353a..c3967fc1 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -8,63 +8,72 @@ abstract type AbstractDataTransform end Apply transformation `t` to vector or matrix `x` in place. """ -transform!(t::AbstractDataTransform, x::AbstractArray{<:Real,1}) = transform!(x, t, x) -transform!(t::AbstractDataTransform, x::AbstractArray{<:Real,2}) = transform!(x, t, x) +transform!(t::AbstractDataTransform, x::AbstractMatrix{<:Real}) = + transform!(x, t, x) +transform!(t::AbstractDataTransform, x::AbstractVector{<:Real}) = + (transform!(t, reshape(x, :, 1)); x) """ transform(t::AbstractDataTransform, x) -Return a row-standardized vector or matrix `x` using `t` transformation. +Return a standardized vector or matrix `x` using `t` transformation. """ -transform(t::AbstractDataTransform, x::AbstractArray{<:Real,1}) = transform!(similar(x), t, x) -transform(t::AbstractDataTransform, x::AbstractArray{<:Real,2}) = transform!(similar(x), t, x) +transform(t::AbstractDataTransform, x::AbstractMatrix{<:Real}) = + transform!(similar(x), t, x) +transform(t::AbstractDataTransform, x::AbstractVector{<:Real}) = + vec(transform(t, reshape(x, :, 1))) # reconstruct the original data from transformed values """ reconstruct!(t::AbstractDataTransform, y) -Perform an in-place reconstruction into an original data scale from a row-transformed +Perform an in-place reconstruction into an original data scale from a transformed vector or matrix `y` using `t` transformation. 
""" -reconstruct!(t::AbstractDataTransform, y::AbstractArray{<:Real,1}) = reconstruct!(y, t, y) -reconstruct!(t::AbstractDataTransform, y::AbstractArray{<:Real,2}) = reconstruct!(y, t, y) +reconstruct!(t::AbstractDataTransform, y::AbstractMatrix{<:Real}) = + reconstruct!(y, t, y) +reconstruct!(t::AbstractDataTransform, y::AbstractVector{<:Real}) = + (reconstruct!(t, reshape(y, :, 1)); y) """ reconstruct(t::AbstractDataTransform, y) -Return a reconstruction of an originally scaled data from a row-transformed vector +Return a reconstruction of an originally scaled data from a transformed vector or matrix `y` using `t` transformation. """ -reconstruct(t::AbstractDataTransform, y::AbstractArray{<:Real,1}) = reconstruct!(similar(y), t, y) -reconstruct(t::AbstractDataTransform, y::AbstractArray{<:Real,2}) = reconstruct!(similar(y), t, y) +reconstruct(t::AbstractDataTransform, y::AbstractMatrix{<:Real}) = + reconstruct!(similar(y), t, y) +reconstruct(t::AbstractDataTransform, y::AbstractVector{<:Real}) = + vec(reconstruct(t, reshape(y, :, 1))) """ Standardization (Z-score transformation) """ struct ZScoreTransform{T<:Real} <: AbstractDataTransform - dim::Int + len::Int + dims::Int mean::Vector{T} scale::Vector{T} - function ZScoreTransform(d::Int, m::Vector{T}, s::Vector{T}) where T + function ZScoreTransform(l::Int, dims::Int, m::Vector{T}, s::Vector{T}) where T lenm = length(m) lens = length(s) - lenm == d || lenm == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - lens == d || lens == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - new{T}(d, m, s) + lenm == l || lenm == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + lens == l || lens == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + new{T}(l, dims, m, s) end end function Base.getproperty(t::ZScoreTransform, p::Symbol) if p === :indim || p === :outdim - return t.dim + return t.len else return getfield(t, p) end end """ - fit(ZScoreTransform, X; center=true, scale=true) + 
fit(ZScoreTransform, X; dims=nothing, center=true, scale=true) Fit standardization parameters to `X` and return a `ZScoreTransform` transformation object. @@ -74,6 +83,9 @@ Fit standardization parameters to `X` and return a `ZScoreTransform` transformat # Keyword arguments +* `dims`: if `1` fit standardization parameters in column-wise fashion; + if `2` fit in row-wise fashion. The default is `nothing`, which is equivalent to `dims=2` with a deprecation warning. + * `center`: if `true` (the default) center data so that its mean is zero. * `scale`: if `true` (the default) scale the data so that its variance is equal to one. @@ -88,7 +100,7 @@ julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] 0.0 -0.5 0.5 0.0 1.0 2.0 -julia> dt = fit(ZScoreTransform, X) +julia> dt = fit(ZScoreTransform, X, dims=2) ZScoreTransform{Float64}(2, [0.0, 1.0], [0.5, 1.0]) julia> StatsBase.transform(dt, X) @@ -97,67 +109,101 @@ julia> StatsBase.transform(dt, X) -1.0 0.0 1.0 ``` """ -function fit(::Type{ZScoreTransform}, X::AbstractArray{<:Real,2}; center::Bool=true, scale::Bool=true) - d, n = size(X) - n >= 2 || error("X must contain at least two columns.") - +function fit(::Type{ZScoreTransform}, X::AbstractMatrix{<:Real}; + dims::Union{Integer,Nothing}=nothing, center::Bool=true, scale::Bool=true) + if dims == 1 + n, l = size(X) + n >= 2 || error("X must contain at least two rows.") + m, s = mean_and_std(X, 1) + elseif dims == 2 + l, n = size(X) + n >= 2 || error("X must contain at least two columns.") + m, s = mean_and_std(X, 2) + elseif dims === nothing + Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) + m, s = mean_and_std(X, 2) + else + throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) + end T = eltype(X) - m, s = mean_and_std(X, 2) - - return ZScoreTransform(d, (center ? vec(m) : zeros(T, 0)), - (scale ? vec(s) : zeros(T, 0))) + return ZScoreTransform(l, dims, (center ? vec(m) : zeros(T, 0)), + (scale ? 
vec(s) : zeros(T, 0))) end -function transform!(y::AbstractVecOrMat{<:Real}, t::ZScoreTransform, x::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(y,2) - size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) +function fit(::Type{ZScoreTransform}, X::AbstractVector{<:Real}; + dims::Union{Integer,Nothing}=nothing, center::Bool=true, scale::Bool=true) + if dims == nothing + Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) + elseif dims != 1 + throw(DomainError(dims, "fit only accepts dims=1 over a vector. Try fit(t, x, dims=1).")) + end - m = t.mean - s = t.scale + T = eltype(X) + m, s = mean_and_std(X) + return ZScoreTransform(1, dims, (center ? [m] : zeros(T, 0)), + (scale ? [s] : zeros(T, 0))) +end - if isempty(m) - if isempty(s) - if x !== y - copyto!(y, x) +function transform!(y::AbstractMatrix{<:Real}, t::ZScoreTransform, x::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent dimensions.")) + n = size(y,1) + size(x,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) + + m = t.mean + s = t.scale + + if isempty(m) + if isempty(s) + if x !== y + copyto!(y, x) + end + else + broadcast!(/, y, x, s') end else - broadcast!(/, y, x, s) - end - else - if isempty(s) - broadcast!(-, y, x, m) - else - broadcast!((x,m,s)->(x-m)/s, y, x, m, s) + if isempty(s) + broadcast!(-, y, x, m') + else + broadcast!((x,m,s)->(x-m)/s, y, x, m', s') + end end + elseif t.dims == 2 + t_ = ZScoreTransform(t.len, 1, t.mean, t.scale) + transform!(y', t_, x') end return y end -function reconstruct!(x::AbstractVecOrMat{<:Real}, t::ZScoreTransform, y::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(y,2) - size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) - - m = t.mean - s = 
t.scale - - if isempty(m) - if isempty(s) - if y !== x - copyto!(x, y) +function reconstruct!(x::AbstractMatrix{<:Real}, t::ZScoreTransform, y::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent dimensions.")) + n = size(y,1) + size(x,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) + + m = t.mean + s = t.scale + + if isempty(m) + if isempty(s) + if y !== x + copyto!(x, y) + end + else + broadcast!(*, x, y, s') end else - broadcast!(*, x, y, s) - end - else - if isempty(s) - broadcast!(+, x, y, m) - else - broadcast!((y,m,s)->y*s+m, x, y, m, s) + if isempty(s) + broadcast!(+, x, y, m') + else + broadcast!((y,m,s)->y*s+m, x, y, m', s') + end end + elseif t.dims == 2 + t_ = ZScoreTransform(t.len, 1, t.mean, t.scale) + reconstruct!(x', t_, y') end return x end @@ -166,23 +212,24 @@ end Unit range normalization """ struct UnitRangeTransform{T<:Real} <: AbstractDataTransform - dim::Int + len::Int + dims::Int unit::Bool min::Vector{T} scale::Vector{T} - function UnitRangeTransform(d::Int, unit::Bool, min::Vector{T}, max::Vector{T}) where {T} + function UnitRangeTransform(l::Int, dims::Int, unit::Bool, min::Vector{T}, max::Vector{T}) where {T} lenmin = length(min) lenmax = length(max) - lenmin == d || lenmin == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - lenmax == d || lenmax == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - new{T}(d, unit, min, max) + lenmin == l || lenmin == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + lenmax == l || lenmax == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + new{T}(l, dims, unit, min, max) end end function Base.getproperty(t::UnitRangeTransform, p::Symbol) if p === :indim || p === :outdim - return t.dim + return t.len else return getfield(t, p) end @@ -190,7 +237,7 @@ end # fit a unit transform """ - fit(UnitRangeTransform, X; center=true, scale=true) + fit(UnitRangeTransform, X; dims=nothing, 
unit=true) Fit a scaling parameters to `X` and return transformation description. @@ -200,9 +247,10 @@ Fit a scaling parameters to `X` and return transformation description. # Keyword arguments -* `center`: if `true` (the default) centere data around zero. +* `dims`: if `1` fit standardization parameters in column-wise fashion; + if `2` fit in row-wise fashion. The default is `nothing`. -* `scale`: if `true` (the default) perform variance scaling. +* `unit`: if `true` (the default) shift the minimum data to zero. # Examples @@ -214,7 +262,7 @@ julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] 0.0 -0.5 0.5 0.0 1.0 2.0 -julia> dt = fit(UnitRangeTransform, X) +julia> dt = fit(UnitRangeTransform, X, dims=2) UnitRangeTransform{Float64}(2, true, [-0.5, 0.0], [1.0, 0.5]) julia> StatsBase.transform(dt, X) @@ -223,64 +271,101 @@ julia> StatsBase.transform(dt, X) 0.0 0.5 1.0 ``` """ -function fit(::Type{UnitRangeTransform}, X::AbstractArray{<:Real,2}; unit::Bool=true) - d, n = size(X) - - tmin = X[:, 1] - tmax = X[:, 1] - for j = 2:n - @inbounds for i = 1:d - if X[i, j] < tmin[i] - tmin[i] = X[i, j] - elseif X[i, j] > tmax[i] - tmax[i] = X[i, j] +function fit(::Type{UnitRangeTransform}, X::AbstractMatrix{<:Real}; + dims::Union{Integer,Nothing}=nothing, unit::Bool=true) + if dims == 1 + l, tmin, tmax = _compute_extrema(X) + elseif dims == 2 + l, tmin, tmax = _compute_extrema(X') + elseif dims == nothing + Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) + l, tmin, tmax = _compute_extrema(X') + else + throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) + end + + for i = 1:l + @inbounds tmax[i] = 1 / (tmax[i] - tmin[i]) + end + return UnitRangeTransform(l, dims, unit, tmin, tmax) +end + +function _compute_extrema(X::AbstractMatrix{<:Real}) + n, l = size(X) + tmin = X[1, :] + tmax = X[1, :] + for j = 1:l + @inbounds for i = 2:n + if X[i, j] < tmin[j] + tmin[j] = X[i, j] + elseif X[i, j] > tmax[j] + tmax[j] = X[i, j] end end end - for i = 1:d - 
@inbounds tmax[i] = 1 / (tmax[i] - tmin[i]) + return l, tmin, tmax +end + +function fit(::Type{UnitRangeTransform}, X::AbstractVector{<:Real}; + dims::Union{Integer,Nothing}=nothing, unit::Bool=true) + if dims != 1 + throw(DomainError(dims, "fit only accept dims=1 over a vector. Try fit(t, x, dims=1).")) end - return UnitRangeTransform(d, unit, tmin, tmax) + + l, tmin, tmax = _compute_extrema(reshape(X, :, 1)) + tmax = 1 / (tmax - tmin) + return UnitRangeTransform(1, dims, unit, vec(tmin), vec(tmax)) end -function transform!(y::AbstractVecOrMat{<:Real}, t::UnitRangeTransform, x::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(x,2) - size(y,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) +function transform!(y::AbstractMatrix{<:Real}, t::UnitRangeTransform, x::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent dimensions.")) + n = size(x,1) + size(y,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) - tmin = t.min - tscale = t.scale + tmin = t.min + tscale = t.scale - if t.unit - broadcast!((x,s,m) -> (x-m)*s, y, x, tscale, tmin) - else - broadcast!(*, y, x, tscale) + if t.unit + broadcast!((x,s,m)->(x-m)*s, y, x, tscale', tmin') + else + broadcast!(*, y, x, tscale') + end + elseif t.dims == 2 + t_ = UnitRangeTransform(t.len, 1, t.unit, t.min, t.scale) + transform!(y', t_, x') end return y end -function reconstruct!(x::AbstractVecOrMat{<:Real}, t::UnitRangeTransform, y::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(y,2) - size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) +function reconstruct!(x::AbstractMatrix{<:Real}, t::UnitRangeTransform, y::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent 
dimensions.")) + n = size(y,1) + size(x,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) - tmin = t.min - tscale = t.scale + tmin = t.min + tscale = t.scale - if t.unit - broadcast!((y,s,m) -> y/s + m, x, y, tscale, tmin) - else - broadcast!(/, x, y, tscale) + if t.unit + broadcast!((y,s,m)->y/s+m, x, y, tscale', tmin') + else + broadcast!(/, x, y, tscale') + end + elseif t.dims == 2 + t_ = UnitRangeTransform(t.len, 1, t.unit, t.min, t.scale) + reconstruct!(x', t_, y') end return x end """ - standardize(DT, X; kwargs...) + standardize(DT, X; dims=nothing, kwargs...) -Return a row-standardized matrix `X` using `DT` transformation which is a subtype of `AbstractDataTransform`: + Return a standardized copy of vector or matrix `X` along dimensions `dims` + using transformation `DT` which is a subtype of `AbstractDataTransform`: - `ZScoreTransform` - `UnitRangeTransform` @@ -290,17 +375,17 @@ Return a row-standardized matrix `X` using `DT` transformation which is a subtyp ```jldoctest julia> using StatsBase -julia> standardize(ZScoreTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0]) +julia> standardize(ZScoreTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0], dims=2) 2×3 Array{Float64,2}: 0.0 -1.0 1.0 -1.0 0.0 1.0 -julia> standardize(UnitRangeTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0]) +julia> standardize(UnitRangeTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0], dims=2) 2×3 Array{Float64,2}: 0.5 0.0 1.0 0.0 0.5 1.0 ``` """ -function standardize(::Type{DT}, X::AbstractArray{<:Real,2}; kwargs...) where {DT<:AbstractDataTransform} +function standardize(::Type{DT}, X::AbstractVecOrMat{<:Real}; kwargs...) where {DT <: AbstractDataTransform} return transform(fit(DT, X; kwargs...), X) end diff --git a/test/transformations.jl b/test/transformations.jl index 393fb27e..7d8e2b0a 100644 --- a/test/transformations.jl +++ b/test/transformations.jl @@ -1,66 +1,182 @@ using StatsBase -import StatsBase: transform, reconstruct +import StatsBase: transform, reconstruct, transform!, reconstruct! 
using Statistics using Test @testset "Transformations" begin + # matrix X = rand(5, 8) + X_ = copy(X) - t = fit(ZScoreTransform, X; center=false, scale=false) + t = fit(ZScoreTransform, X, dims=1, center=false, scale=false) Y = transform(t, X) @test isa(t, AbstractDataTransform) @test isempty(t.mean) @test isempty(t.scale) @test isequal(X, Y) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(ZScoreTransform, X; center=false) + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, center=false) Y = transform(t, X) @test isempty(t.mean) - @test length(t.scale) == 5 - @test Y ≈ X ./ std(X, dims=2) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] + @test length(t.scale) == 8 + @test Y ≈ X ./ std(X, dims=1) @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(ZScoreTransform, X; scale=false) + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, scale=false) Y = transform(t, X) - @test length(t.mean) == 5 + @test length(t.mean) == 8 @test isempty(t.scale) - @test Y ≈ X .- mean(X, dims=2) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] + @test Y ≈ X .- mean(X, dims=1) @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(ZScoreTransform, X) + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1) + Y = transform(t, X) + @test length(t.mean) == 8 + @test length(t.scale) == 8 + @test Y ≈ (X .- mean(X, dims=1)) ./ std(X, dims=1) + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(ZScoreTransform, X, dims=1) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=2) Y = transform(t, X) @test 
length(t.mean) == 5 @test length(t.scale) == 5 @test Y ≈ (X .- mean(X, dims=2)) ./ std(X, dims=2) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(ZScoreTransform, X, dims=2) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1, unit=false) + Y = transform(t, X) + @test length(t.min) == 8 + @test length(t.scale) == 8 + @test Y ≈ X ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(UnitRangeTransform, X) + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1) + Y = transform(t, X) + @test isa(t, AbstractDataTransform) + @test length(t.min) == 8 + @test length(t.scale) == 8 + @test Y ≈ (X .- minimum(X, dims=1)) ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(UnitRangeTransform, X, dims=1) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=2) Y = transform(t, X) @test isa(t, AbstractDataTransform) @test length(t.min) == 5 @test length(t.scale) == 5 @test Y ≈ (X .- minimum(X, dims=2)) ./ (maximum(X, dims=2) .- minimum(X, dims=2)) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(UnitRangeTransform, X; unit=false) + # vector + X = rand(10) + X_ = copy(X) + + t = fit(ZScoreTransform, X, dims=1, center=false, scale=false) Y = transform(t, X) - @test length(t.min) == 5 - @test length(t.scale) == 5 - @test Y ≈ X ./ (maximum(X, dims=2) .- minimum(X, dims=2)) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] + 
@test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, center=false) + Y = transform(t, X) + @test Y ≈ X ./ std(X, dims=1) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, scale=false) + Y = transform(t, X) + @test Y ≈ X .- mean(X, dims=1) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1) + Y = transform(t, X) + @test Y ≈ (X .- mean(X, dims=1)) ./ std(X, dims=1) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(ZScoreTransform, X, dims=1) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1) + Y = transform(t, X) + @test Y ≈ (X .- minimum(X, dims=1)) ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test transform(t, X) ≈ Y @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1, unit=false) + Y = transform(t, X) + @test Y ≈ X ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(UnitRangeTransform, X, dims=1, unit=false) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - @test Y == standardize(UnitRangeTransform, X; unit=false) end From 28b59dfb27620d9830244b319d00315080a4e66b Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 19 Sep 2019 15:25:03 +0200 Subject: [PATCH 026/105] Improve 
transformation docs (#491) --- docs/src/transformations.md | 2 +- src/transformations.jl | 24 +++++++++--------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/docs/src/transformations.md b/docs/src/transformations.md index 3c4b973f..f5ec2275 100644 --- a/docs/src/transformations.md +++ b/docs/src/transformations.md @@ -24,7 +24,7 @@ fit(::Type{ZScoreTransform}, X::AbstractArray{<:Real,2}; center::Bool=true, scal ## Unit range normalization -**Unit range normalization* is an alternative data transformation which scales features +**Unit range normalization** is an alternative data transformation which scales features to lie in the interval `[0; 1]`. Unit range normalization can be performed using `fit(UnitRangeTransform, ...)`. diff --git a/src/transformations.jl b/src/transformations.jl index c3967fc1..3b581bb3 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -16,7 +16,7 @@ transform!(t::AbstractDataTransform, x::AbstractVector{<:Real}) = """ transform(t::AbstractDataTransform, x) -Return a standardized vector or matrix `x` using `t` transformation. +Return a standardized copy of vector or matrix `x` using transformation `t`. """ transform(t::AbstractDataTransform, x::AbstractMatrix{<:Real}) = transform!(similar(x), t, x) @@ -28,7 +28,7 @@ transform(t::AbstractDataTransform, x::AbstractVector{<:Real}) = reconstruct!(t::AbstractDataTransform, y) Perform an in-place reconstruction into an original data scale from a transformed -vector or matrix `y` using `t` transformation. +vector or matrix `y` using transformation `t`. """ reconstruct!(t::AbstractDataTransform, y::AbstractMatrix{<:Real}) = reconstruct!(y, t, y) @@ -39,7 +39,7 @@ reconstruct!(t::AbstractDataTransform, y::AbstractVector{<:Real}) = reconstruct(t::AbstractDataTransform, y) Return a reconstruction of an originally scaled data from a transformed vector -or matrix `y` using `t` transformation. +or matrix `y` using transformation `t`. 
""" reconstruct(t::AbstractDataTransform, y::AbstractMatrix{<:Real}) = reconstruct!(similar(y), t, y) @@ -47,7 +47,7 @@ reconstruct(t::AbstractDataTransform, y::AbstractVector{<:Real}) = vec(reconstruct(t, reshape(y, :, 1))) """ - Standardization (Z-score transformation) +Standardization (Z-score transformation) """ struct ZScoreTransform{T<:Real} <: AbstractDataTransform len::Int @@ -75,11 +75,8 @@ end """ fit(ZScoreTransform, X; dims=nothing, center=true, scale=true) -Fit standardization parameters to `X` and return a `ZScoreTransform` transformation object. - -# Arguments - -* `data`: matrix of samples to fit transformation parameters. +Fit standardization parameters to vector or matrix `X` +and return a `ZScoreTransform` transformation object. # Keyword arguments @@ -209,7 +206,7 @@ function reconstruct!(x::AbstractMatrix{<:Real}, t::ZScoreTransform, y::Abstract end """ - Unit range normalization +Unit range normalization """ struct UnitRangeTransform{T<:Real} <: AbstractDataTransform len::Int @@ -239,11 +236,8 @@ end """ fit(UnitRangeTransform, X; dims=nothing, unit=true) -Fit a scaling parameters to `X` and return transformation description. - -# Arguments - -* `data`: matrix of samples to fit transformation parameters. +Fit a scaling parameters to vector or matrix `X` +and return a `UnitRangeTransform` transformation object. # Keyword arguments From 2082d44ca15589e899bbce75259f1d4cb7468cda Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 20 Sep 2019 14:58:57 +0200 Subject: [PATCH 027/105] Remove pkg.julialang.org badges (#525) These haven't worked for ages. --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 063aa543..7b0710ee 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,6 @@ *StatsBase.jl* is a Julia package that provides basic support for statistics. 
Particularly, it implements a variety of statistics-related functions, such as scalar statistics, high-order moment computation, counting, ranking, covariances, sampling, and empirical density estimation. -- **Current Release**: - [![StatsBase](http://pkg.julialang.org/badges/StatsBase_0.5.svg)](http://pkg.julialang.org/?pkg=StatsBase) - [![StatsBase](http://pkg.julialang.org/badges/StatsBase_0.6.svg)](http://pkg.julialang.org/?pkg=StatsBase) - **Build & Testing Status:** [![Build Status](https://travis-ci.org/JuliaStats/StatsBase.jl.svg?branch=master)](https://travis-ci.org/JuliaStats/StatsBase.jl) [![Build status](https://ci.appveyor.com/api/projects/status/fsut3j3onulvws1w?svg=true)](https://ci.appveyor.com/project/nalimilan/statsbase-jl) From 95b794a9db17931f24443089c789bf0f795bb8be Mon Sep 17 00:00:00 2001 From: Jan Weidner Date: Fri, 20 Sep 2019 15:11:28 +0200 Subject: [PATCH 028/105] add docstring to Histogram (#495) * add docstring to Histogram * fix spelling in Histogram docs * fix spelling in Histogram docs * Cut at 92 chars, reword a bit * Improve examples in Histogram docs * fix --- docs/src/empirical.md | 6 ++-- src/hist.jl | 64 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/docs/src/empirical.md b/docs/src/empirical.md index 30faf21c..e015804c 100644 --- a/docs/src/empirical.md +++ b/docs/src/empirical.md @@ -2,9 +2,9 @@ ## Histograms -The `Histogram` type represents data that has been tabulated into intervals -(known as *bins*) along the real line, or in higher dimensions, over the real -plane. +```@docs +Histogram +``` Histograms can be fitted to data using the `fit` method. 
diff --git a/src/hist.jl b/src/hist.jl index b6373089..e8a348e4 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -115,6 +115,70 @@ end abstract type AbstractHistogram{T<:Real,N,E} end # N-dimensional histogram object +""" + Histogram <: AbstractHistogram + +The `Histogram` type represents data that has been tabulated into intervals +(known as *bins*) along the real line, or in higher dimensions, over a real space. +Histograms can be fitted to data using the `fit` method. + +# Fields +* edges: An iterator that contains the boundaries of the bins in each dimension. +* weights: An array that contains the weight of each bin. +* closed: A symbol with value `:right` or `:left` indicating on which side bins + (half-open intervals or higher-dimensional analogues thereof) are closed. + See below for an example. +* isdensity: There are two interpretations of a `Histogram`. If `isdensity=false` the weight of a bin corresponds to the amount of a quantity in the bin. + If `isdensity=true` then it corresponds to the density (amount / volume) of the quantity in the bin. See below for an example. 
+ +# Examples +## Example illustrating `closed` +```jldoctest +julia> fit(Histogram, [2.], 1:3, closed=:left) +Histogram{Int64,1,Tuple{UnitRange{Int64}}} +edges: + 1:3 +weights: [0, 1] +closed: left +isdensity: false + +julia> fit(Histogram, [2.], 1:3, closed=:right) +Histogram{Int64,1,Tuple{UnitRange{Int64}}} +edges: + 1:3 +weights: [1, 0] +closed: right +isdensity: false +``` +## Example illustrating `isdensity` +```julia +julia> using StatsBase, LinearAlgebra + +julia> bins = [0,1,7]; # a small and a large bin + +julia> obs = [0.5, 1.5, 1.5, 2.5]; # one observation in the small bin and three in the large + +julia> h = fit(Histogram, obs, bins) +Histogram{Int64,1,Tuple{Array{Int64,1}}} +edges: + [0, 1, 7] +weights: [1, 3] +closed: left +isdensity: false + +julia> # observe isdensity = false and the weights field records the number of observations in each bin + +julia> normalize(h, mode=:density) +Histogram{Float64,1,Tuple{Array{Int64,1}}} +edges: + [0, 1, 7] +weights: [1.0, 0.5] +closed: left +isdensity: true + +julia> # observe isdensity = true and weights tells us the number of observation per binsize in each bin +``` +""" mutable struct Histogram{T<:Real,N,E} <: AbstractHistogram{T,N,E} edges::E weights::Array{T,N} From b039107860d2ce0213f5bc27e819f53c1b200bf2 Mon Sep 17 00:00:00 2001 From: Luca Bittarello Date: Tue, 1 Oct 2019 13:41:40 +0200 Subject: [PATCH 029/105] Simplify weights (#526) --- src/deprecates.jl | 4 + src/weights.jl | 95 +++++++++++----------- test/weights.jl | 199 ++++++++++++++++++++++++---------------------- 3 files changed, 151 insertions(+), 147 deletions(-) diff --git a/src/deprecates.jl b/src/deprecates.jl index cb9c55c9..9e409b33 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -35,3 +35,7 @@ end @deprecate wmedian(v::RealVector, w::RealVector) median(v, weights(w)) @deprecate quantile(v::AbstractArray{<:Real}) quantile(v, [.0, .25, .5, .75, 1.0]) + +### Deprecated September 2019 +@deprecate sum(A::AbstractArray, 
w::AbstractWeights, dims::Int) sum(A, w, dims=dims) +@deprecate values(wv::AbstractWeights) convert(Vector, wv) diff --git a/src/weights.jl b/src/weights.jl index 5affc704..e5df6b73 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -1,4 +1,4 @@ -###### Weight vector ##### +##### Weight vector ##### abstract type AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} end """ @@ -18,12 +18,24 @@ macro weights(name) end length(wv::AbstractWeights) = length(wv.values) -values(wv::AbstractWeights) = wv.values sum(wv::AbstractWeights) = wv.sum isempty(wv::AbstractWeights) = isempty(wv.values) size(wv::AbstractWeights) = size(wv.values) -Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) +Base.convert(::Type{Vector}, wv::AbstractWeights) = convert(Vector, wv.values) + +@propagate_inbounds function Base.getindex(wv::AbstractWeights, i::Integer) + @boundscheck checkbounds(wv, i) + @inbounds wv.values[i] +end + +@propagate_inbounds function Base.getindex(wv::W, i::AbstractArray) where W <: AbstractWeights + @boundscheck checkbounds(wv, i) + @inbounds v = wv.values[i] + W(v, sum(v)) +end + +Base.getindex(wv::W, ::Colon) where {W <: AbstractWeights} = W(copy(wv.values), sum(wv)) @propagate_inbounds function Base.setindex!(wv::AbstractWeights, v::Real, i::Int) s = v - wv[i] @@ -247,7 +259,7 @@ eweights(n::Integer, λ::Real) = eweights(1:n, λ) eweights(t::AbstractVector, r::AbstractRange, λ::Real) = eweights(something.(indexin(t, r)), λ) -# NOTE: No variance correction is implemented for exponential weights +# NOTE: no variance correction is implemented for exponential weights struct UnitWeights{T<:Real} <: AbstractWeights{Int, T, V where V<:Vector{T}} len::Int @@ -260,12 +272,13 @@ Construct a `UnitWeights` vector with length `s` and weight elements of type `T` All weight elements are identically one. 
""" UnitWeights -values(wv::UnitWeights{T}) where T = fill(one(T), length(wv)) sum(wv::UnitWeights{T}) where T = convert(T, length(wv)) isempty(wv::UnitWeights) = iszero(wv.len) length(wv::UnitWeights) = wv.len size(wv::UnitWeights) = Tuple(length(wv)) +Base.convert(::Type{Vector}, wv::UnitWeights{T}) where {T} = ones(T, length(wv)) + @propagate_inbounds function Base.getindex(wv::UnitWeights{T}, i::Integer) where T @boundscheck checkbounds(wv, i) one(T) @@ -273,10 +286,10 @@ end @propagate_inbounds function Base.getindex(wv::UnitWeights{T}, i::AbstractArray{<:Int}) where T @boundscheck checkbounds(wv, i) - fill(one(T), size(i)) + UnitWeights{T}(length(i)) end -Base.getindex(wv::UnitWeights{T}, ::Colon) where T = fill(one(T), length(wv)) +Base.getindex(wv::UnitWeights{T}, ::Colon) where {T} = UnitWeights{T}(wv.len) """ uweights(s::Integer) @@ -315,7 +328,7 @@ This definition is equivalent to the correction applied to unweighted data. corrected ? (1 / (w.len - 1)) : (1 / w.len) end -##### Equality tests ##### +#### Equality tests ##### for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) @eval begin @@ -341,22 +354,7 @@ Compute the weighted sum of an array `v` with weights `w`, optionally over the d """ wsum(v::AbstractVector, w::AbstractVector) = dot(v, w) wsum(v::AbstractArray, w::AbstractVector) = dot(vec(v), w) - -# Note: the methods for BitArray and SparseMatrixCSC are to avoid ambiguities -Base.sum(v::BitArray, w::AbstractWeights) = wsum(v, values(w)) -Base.sum(v::SparseArrays.SparseMatrixCSC, w::AbstractWeights) = wsum(v, values(w)) -Base.sum(v::AbstractArray, w::AbstractWeights) = dot(v, values(w)) - -for v in (AbstractArray{<:Number}, BitArray, SparseArrays.SparseMatrixCSC, AbstractArray) - @eval begin - function Base.sum(v::$v, w::UnitWeights) - if length(v) != length(w) - throw(DimensionMismatch("Inconsistent array dimension.")) - end - return sum(v) - end - end -end +wsum(v::AbstractArray, w::AbstractVector, dims::Colon) = wsum(v, w) 
## wsum along dimension # @@ -392,7 +390,6 @@ end # (d) A is a general dense array with eltype <: BlasReal: # dim <= 2: delegate to (a) and (b) # otherwise, decompose A into multiple pages -# function _wsum1!(R::AbstractArray, A::AbstractVector, w::AbstractVector, init::Bool) r = wsum(A, w) @@ -455,7 +452,8 @@ function _wsumN!(R::StridedArray{T}, A::DenseArray{T,N}, w::StridedVector{T}, di return R end -# General Cartesian-based weighted sum across dimensions +## general Cartesian-based weighted sum across dimensions + @generated function _wsum_general!(R::AbstractArray{RT}, f::supertype(typeof(abs)), A::AbstractArray{T,N}, w::AbstractVector{WT}, dim::Int, init::Bool) where {T,RT,WT,N} quote @@ -512,7 +510,6 @@ end end end - # N = 1 _wsum!(R::StridedArray{T}, A::DenseArray{T,1}, w::StridedVector{T}, dim::Int, init::Bool) where {T<:BlasReal} = _wsum1!(R, A, w, init) @@ -533,7 +530,6 @@ _wsum!(R::AbstractArray, A::AbstractArray, w::AbstractVector, dim::Int, init::Bo wsumtype(::Type{T}, ::Type{W}) where {T,W} = typeof(zero(T) * zero(W) + zero(T) * zero(W)) wsumtype(::Type{T}, ::Type{T}) where {T<:BlasReal} = T - """ wsum!(R, A, w, dim; init=true) @@ -559,19 +555,21 @@ function wsum(A::AbstractArray{<:Number}, w::UnitWeights, dim::Int) return sum(A, dims=dim) end -# extended sum! and wsum +## extended sum! 
and wsum Base.sum!(R::AbstractArray, A::AbstractArray, w::AbstractWeights{<:Real}, dim::Int; init::Bool=true) = - wsum!(R, A, values(w), dim; init=init) + wsum!(R, A, w, dim; init=init) -Base.sum(A::AbstractArray{<:Number}, w::AbstractWeights{<:Real}, dim::Int) = wsum(A, values(w), dim) +Base.sum(A::AbstractArray, w::AbstractWeights{<:Real}; dims::Union{Colon,Int}=:) = + wsum(A, w, dims) -function Base.sum(A::AbstractArray{<:Number}, w::UnitWeights, dim::Int) - size(A, dim) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) - return sum(A, dims=dim) +function Base.sum(A::AbstractArray, w::UnitWeights; dims::Union{Colon,Int}=:) + a = (dims === :) ? length(A) : size(A, dims) + a != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return sum(A, dims=dims) end -###### Weighted means ##### +##### Weighted means ##### """ wmean(v, w::AbstractVector) @@ -589,9 +587,10 @@ end Compute the weighted mean of array `A` with weight vector `w` (of type `AbstractWeights`) along dimension `dims`, and write results to `R`. 
""" -mean!(R::AbstractArray, A::AbstractArray, w::AbstractWeights; - dims::Union{Nothing,Int}=nothing) = _mean!(R, A, w, dims) -_mean!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dims::Nothing) = throw(ArgumentError("dims argument must be provided")) +mean!(R::AbstractArray, A::AbstractArray, w::AbstractWeights; dims::Union{Nothing,Int}=nothing) = + _mean!(R, A, w, dims) +_mean!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dims::Nothing) = + throw(ArgumentError("dims argument must be provided")) _mean!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dims::Int) = rmul!(Base.sum!(R, A, w, dims), inv(sum(w))) @@ -611,24 +610,21 @@ w = rand(n) mean(x, weights(w)) ``` """ -mean(A::AbstractArray, w::AbstractWeights; dims::Union{Nothing,Int}=nothing) = +mean(A::AbstractArray, w::AbstractWeights; dims::Union{Colon,Int}=:) = _mean(A, w, dims) -_mean(A::AbstractArray, w::AbstractWeights, dims::Nothing) = +_mean(A::AbstractArray, w::AbstractWeights, dims::Colon) = sum(A, w) / sum(w) _mean(A::AbstractArray{T}, w::AbstractWeights{W}, dims::Int) where {T,W} = _mean!(similar(A, wmeantype(T, W), Base.reduced_indices(axes(A), dims)), A, w, dims) -function _mean(A::AbstractArray, w::UnitWeights, dims::Nothing) - length(A) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) - return mean(A) -end - -function _mean(A::AbstractArray, w::UnitWeights, dims::Int) - size(A, dims) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) +function mean(A::AbstractArray, w::UnitWeights; dims::Union{Colon,Int}=:) + a = (dims === :) ? 
length(A) : size(A, dims) + a != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) return mean(A, dims=dims) end -###### Weighted quantile ##### +##### Weighted quantile ##### + """ quantile(v, w::AbstractWeights, p) @@ -723,9 +719,8 @@ end quantile(v::RealVector, w::AbstractWeights{<:Real}, p::Number) = quantile(v, w, [p])[1] +##### Weighted median ##### - -###### Weighted median ##### """ median(v::RealVector, w::AbstractWeights) diff --git a/test/weights.jl b/test/weights.jl index 5b3fd8ba..4391cb51 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -4,7 +4,8 @@ using LinearAlgebra, Random, SparseArrays, Test @testset "StatsBase.Weights" begin weight_funcs = (weights, aweights, fweights, pweights) -# Construction +## Construction + @testset "$f" for f in weight_funcs @test isa(f([1, 2, 3]), AbstractWeights{Int}) @test isa(f([1., 2., 3.]), AbstractWeights{Float64}) @@ -17,7 +18,7 @@ weight_funcs = (weights, aweights, fweights, pweights) wv = f(w) @test eltype(wv) === Float64 @test length(wv) === 3 - @test values(wv) === w + @test values(wv) == w @test sum(wv) === 6.0 @test !isempty(wv) @@ -25,7 +26,7 @@ weight_funcs = (weights, aweights, fweights, pweights) bv = f(b) @test eltype(bv) === Bool @test length(bv) === 3 - @test values(bv) === b + @test values(bv) == b @test sum(bv) === 3 @test !isempty(bv) @@ -114,117 +115,124 @@ end end ## wsum -x = [6., 8., 9.] -w = [2., 3., 4.] -p = [1. 2. ; 3. 4.] -q = [1., 2., 3., 4.] -@test wsum(Float64[], Float64[]) === 0.0 -@test wsum(x, w) === 72.0 -@test wsum(p, q) === 29.0 +@testset "wsum" begin + x = [6., 8., 9.] + w = [2., 3., 4.] + p = [1. 2. ; 3. 4.] + q = [1., 2., 3., 4.] 
+ + @test wsum(Float64[], Float64[]) === 0.0 + @test wsum(x, w) === 72.0 + @test wsum(p, q) === 29.0 + + ## wsum along dimension -## wsum along dimension -@test wsum(x, w, 1) == [72.0] + @test wsum(x, w, 1) == [72.0] -x = rand(6, 8) -w1 = rand(6) -w2 = rand(8) + x = rand(6, 8) + w1 = rand(6) + w2 = rand(8) -@test size(wsum(x, w1, 1)) == (1, 8) -@test size(wsum(x, w2, 2)) == (6, 1) + @test size(wsum(x, w1, 1)) == (1, 8) + @test size(wsum(x, w2, 2)) == (6, 1) -@test wsum(x, w1, 1) ≈ sum(x .* w1, dims = 1) -@test wsum(x, w2, 2) ≈ sum(x .* w2', dims = 2) + @test wsum(x, w1, 1) ≈ sum(x .* w1, dims=1) + @test wsum(x, w2, 2) ≈ sum(x .* w2', dims=2) -x = rand(6, 5, 4) -w1 = rand(6) -w2 = rand(5) -w3 = rand(4) + x = rand(6, 5, 4) + w1 = rand(6) + w2 = rand(5) + w3 = rand(4) -@test size(wsum(x, w1, 1)) == (1, 5, 4) -@test size(wsum(x, w2, 2)) == (6, 1, 4) -@test size(wsum(x, w3, 3)) == (6, 5, 1) + @test size(wsum(x, w1, 1)) == (1, 5, 4) + @test size(wsum(x, w2, 2)) == (6, 1, 4) + @test size(wsum(x, w3, 3)) == (6, 5, 1) -@test wsum(x, w1, 1) ≈ sum(x .* w1, dims = 1) -@test wsum(x, w2, 2) ≈ sum(x .* w2', dims = 2) -@test wsum(x, w3, 3) ≈ sum(x .* reshape(w3, 1, 1, 4), dims = 3) + @test wsum(x, w1, 1) ≈ sum(x .* w1, dims=1) + @test wsum(x, w2, 2) ≈ sum(x .* w2', dims=2) + @test wsum(x, w3, 3) ≈ sum(x .* reshape(w3, 1, 1, 4), dims=3) -v = view(x, 2:4, :, :) + v = view(x, 2:4, :, :) -@test wsum(v, w1[1:3], 1) ≈ sum(v .* w1[1:3], dims = 1) -@test wsum(v, w2, 2) ≈ sum(v .* w2', dims = 2) -@test wsum(v, w3, 3) ≈ sum(v .* reshape(w3, 1, 1, 4), dims = 3) + @test wsum(v, w1[1:3], 1) ≈ sum(v .* w1[1:3], dims=1) + @test wsum(v, w2, 2) ≈ sum(v .* w2', dims=2) + @test wsum(v, w3, 3) ≈ sum(v .* reshape(w3, 1, 1, 4), dims=3) -## wsum for Arrays with non-BlasReal elements -x = rand(1:100, 6, 8) -w1 = rand(6) -w2 = rand(8) + ## wsum for Arrays with non-BlasReal elements -@test wsum(x, w1, 1) ≈ sum(x .* w1, dims = 1) -@test wsum(x, w2, 2) ≈ sum(x .* w2', dims = 2) + x = rand(1:100, 6, 8) + w1 = 
rand(6) + w2 = rand(8) -## wsum! -x = rand(6) -w = rand(6) + @test wsum(x, w1, 1) ≈ sum(x .* w1, dims=1) + @test wsum(x, w2, 2) ≈ sum(x .* w2', dims=2) -r = ones(1) -@test wsum!(r, x, w, 1; init=true) === r -@test r ≈ [dot(x, w)] + ## wsum! -r = ones(1) -@test wsum!(r, x, w, 1; init=false) === r -@test r ≈ [dot(x, w) + 1.0] + x = rand(6) + w = rand(6) -x = rand(6, 8) -w1 = rand(6) -w2 = rand(8) + r = ones(1) + @test wsum!(r, x, w, 1; init=true) === r + @test r ≈ [dot(x, w)] -r = ones(1, 8) -@test wsum!(r, x, w1, 1; init=true) === r -@test r ≈ sum(x .* w1, dims = 1) + r = ones(1) + @test wsum!(r, x, w, 1; init=false) === r + @test r ≈ [dot(x, w) + 1.0] -r = ones(1, 8) -@test wsum!(r, x, w1, 1; init=false) === r -@test r ≈ sum(x .* w1, dims = 1) .+ 1.0 + x = rand(6, 8) + w1 = rand(6) + w2 = rand(8) -r = ones(6) -@test wsum!(r, x, w2, 2; init=true) === r -@test r ≈ sum(x .* w2', dims = 2) + r = ones(1, 8) + @test wsum!(r, x, w1, 1; init=true) === r + @test r ≈ sum(x .* w1, dims=1) -r = ones(6) -@test wsum!(r, x, w2, 2; init=false) === r -@test r ≈ sum(x .* w2', dims = 2) .+ 1.0 + r = ones(1, 8) + @test wsum!(r, x, w1, 1; init=false) === r + @test r ≈ sum(x .* w1, dims=1) .+ 1.0 -x = rand(8, 6, 5) -w1 = rand(8) -w2 = rand(6) -w3 = rand(5) + r = ones(6) + @test wsum!(r, x, w2, 2; init=true) === r + @test r ≈ sum(x .* w2', dims=2) -r = ones(1, 6, 5) -@test wsum!(r, x, w1, 1; init=true) === r -@test r ≈ sum(x .* w1, dims = 1) + r = ones(6) + @test wsum!(r, x, w2, 2; init=false) === r + @test r ≈ sum(x .* w2', dims=2) .+ 1.0 -r = ones(1, 6, 5) -@test wsum!(r, x, w1, 1; init=false) === r -@test r ≈ sum(x .* w1, dims = 1) .+ 1.0 + x = rand(8, 6, 5) + w1 = rand(8) + w2 = rand(6) + w3 = rand(5) -r = ones(8, 1, 5) -@test wsum!(r, x, w2, 2; init=true) === r -@test r ≈ sum(x .* w2', dims = 2) + r = ones(1, 6, 5) + @test wsum!(r, x, w1, 1; init=true) === r + @test r ≈ sum(x .* w1, dims=1) -r = ones(8, 1, 5) -@test wsum!(r, x, w2, 2; init=false) === r -@test r ≈ sum(x .* w2', dims 
= 2) .+ 1.0 + r = ones(1, 6, 5) + @test wsum!(r, x, w1, 1; init=false) === r + @test r ≈ sum(x .* w1, dims=1) .+ 1.0 -r = ones(8, 6) -@test wsum!(r, x, w3, 3; init=true) === r -@test r ≈ sum(x .* reshape(w3, (1, 1, 5)), dims = 3) + r = ones(8, 1, 5) + @test wsum!(r, x, w2, 2; init=true) === r + @test r ≈ sum(x .* w2', dims=2) -r = ones(8, 6) -@test wsum!(r, x, w3, 3; init=false) === r -@test r ≈ sum(x .* reshape(w3, (1, 1, 5)), dims = 3) .+ 1.0 + r = ones(8, 1, 5) + @test wsum!(r, x, w2, 2; init=false) === r + @test r ≈ sum(x .* w2', dims=2) .+ 1.0 + + r = ones(8, 6) + @test wsum!(r, x, w3, 3; init=true) === r + @test r ≈ sum(x .* reshape(w3, (1, 1, 5)), dims=3) + + r = ones(8, 6) + @test wsum!(r, x, w3, 3; init=false) === r + @test r ≈ sum(x .* reshape(w3, (1, 1, 5)), dims=3) .+ 1.0 +end + +## sum, mean and quantile -## the sum and mean syntax a = reshape(1.0:27.0, 3, 3, 3) @testset "Sum $f" for f in weight_funcs @@ -232,9 +240,9 @@ a = reshape(1.0:27.0, 3, 3, 3) @test sum(1:3, f([1.0, 1.0, 0.5])) ≈ 4.5 for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), dims = 1) - @test sum(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), dims = 2) - @test sum(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), dims = 3) + @test sum(a, f(wt), dims=1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), dims=1) + @test sum(a, f(wt), dims=2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), dims=2) + @test sum(a, f(wt), dims=3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), dims=3) end end @@ -250,8 +258,6 @@ end end end - -# Quantile fweights @testset "Quantile fweights" begin data = ( [7, 1, 2, 4, 10], @@ -429,10 +435,9 @@ end v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.0 - @test quantile(data[1], f(w), 0.5) ≈ answer atol = 1e-5 + @test quantile(data[1], f(w), 0.5) ≈ answer atol = 1e-5 end - @testset "Median $f" for f in weight_funcs data = [4, 3, 2, 1] wt = [0, 0, 0, 0] @@ -470,9 +475,9 @@ end @test sum([1.0, 2.0, 
3.0], wt) ≈ 6.0 @test mean([1.0, 2.0, 3.0], wt) ≈ 2.0 - @test sum(a, wt, 1) ≈ sum(a, dims=1) - @test sum(a, wt, 2) ≈ sum(a, dims=2) - @test sum(a, wt, 3) ≈ sum(a, dims=3) + @test sum(a, wt, dims=1) ≈ sum(a, dims=1) + @test sum(a, wt, dims=2) ≈ sum(a, dims=2) + @test sum(a, wt, dims=3) ≈ sum(a, dims=3) @test wsum(a, wt, 1) ≈ sum(a, dims=1) @test wsum(a, wt, 2) ≈ sum(a, dims=2) @@ -483,7 +488,7 @@ end @test mean(a, wt, dims=3) ≈ mean(a, dims=3) @test_throws DimensionMismatch sum(a, wt) - @test_throws DimensionMismatch sum(a, wt, 4) + @test_throws DimensionMismatch sum(a, wt, dims=4) @test_throws DimensionMismatch wsum(a, wt, 4) @test_throws DimensionMismatch mean(a, wt, dims=4) From 752bdf8050afeca557df4f5c25fd01b87354822f Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Sat, 19 Oct 2019 10:13:19 -0400 Subject: [PATCH 030/105] add responsename (#528) --- docs/src/statmodels.md | 1 + src/StatsBase.jl | 1 + src/statmodels.jl | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/docs/src/statmodels.md b/docs/src/statmodels.md index de8bba8d..91be9083 100644 --- a/docs/src/statmodels.md +++ b/docs/src/statmodels.md @@ -40,6 +40,7 @@ leverage meanresponse modelmatrix response +responsename predict predict! residuals diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 8ed42363..03a3d1b8 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -192,6 +192,7 @@ export modelmatrix, mss, response, + responsename, nobs, nulldeviance, nullloglikelihood, diff --git a/src/statmodels.jl b/src/statmodels.jl index a0386120..c02bd88a 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -294,6 +294,13 @@ Return the model response (a.k.a. the dependent variable). """ response(obj::RegressionModel) = error("response is not defined for $(typeof(obj)).") +""" + responsename(obj::RegressionModel) + +Return the name of the model response (a.k.a. the dependent variable). 
+""" +responsename(obj::RegressionModel) = error("responsename is not defined for $(typeof(obj)).") + """ meanresponse(obj::RegressionModel) From 21abaeef2b254a12eded05c4858dd3669a3467b3 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Tue, 3 Dec 2019 10:00:52 +0100 Subject: [PATCH 031/105] Create CompatHelper.yml --- .github/workflows/CompatHelper.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/CompatHelper.yml diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 00000000..68dbe39c --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,24 @@ +name: CompatHelper + +on: + schedule: + - cron: '00 * * * *' + +jobs: + CompatHelper: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.2.0] + julia-arch: [x86] + os: [ubuntu-latest] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: julia -e 'using CompatHelper; CompatHelper.main()' From 93831ccc6b42c782695afa6e8de13ce36e670dee Mon Sep 17 00:00:00 2001 From: Josh Day Date: Tue, 3 Dec 2019 05:30:08 -0500 Subject: [PATCH 032/105] better NaN handling with ecdf (#537) * better NaN handling with ecdf * ecdf tests --- Project.toml | 6 +++--- src/empirical.jl | 3 ++- test/empirical.jl | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 521c3831..7d9033fd 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,9 @@ SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[compat] +julia = "1" + [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" @@ 
-21,6 +24,3 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Dates", "DelimitedFiles", "Test"] - -[compat] -julia = "1" diff --git a/src/empirical.jl b/src/empirical.jl index 9bd3a175..98ef7d91 100644 --- a/src/empirical.jl +++ b/src/empirical.jl @@ -1,6 +1,5 @@ # Empirical estimation of CDF and PDF - ## Empirical CDF struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}} @@ -9,6 +8,7 @@ struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}} end function (ecdf::ECDF)(x::Real) + isnan(x) && return NaN n = searchsortedlast(ecdf.sorted_values, x) evenweights = isempty(ecdf.weights) weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights) @@ -54,6 +54,7 @@ evaluate CDF values on other samples. function is inside the interval ``(0,1)``; the function is defined for the whole real line. """ function ecdf(X::RealVector; weights::AbstractVector{<:Real}=Weights(Float64[])) + any(isnan, X) && throw(ArgumentError("ecdf can not include NaN values")) isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," * "got $(length(X)) and $(length(weights))")) ord = sortperm(X) diff --git a/test/empirical.jl b/test/empirical.jl index a0a8f771..cb031746 100644 --- a/test/empirical.jl +++ b/test/empirical.jl @@ -12,6 +12,8 @@ using Test fnecdf = ecdf([0.5]) @test fnecdf([zeros(5000); ones(5000)]) == [zeros(5000); ones(5000)] @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 0.5) + @test isnan(ecdf([1,2,3])(NaN)) + @test_throws ArgumentError ecdf([1, NaN]) end @testset "Weighted ECDF" begin From 182cb82ec5526aec41603b50e2b05503cdceb799 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 4 Dec 2019 08:20:25 +0100 Subject: [PATCH 033/105] Stop using the deprecated values function (#543) --- src/counts.jl | 8 ++++---- src/cov.jl | 4 ++-- src/hist.jl | 2 +- src/moments.jl | 14 +++++++------- src/sampling.jl | 6 +++--- test/weights.jl | 14 
+++++++------- test/wsampling.jl | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 84fc8dcf..f0fa42c4 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -48,7 +48,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: m0 = levels[1] m1 = levels[end] b = m0 - 1 - w = values(wv) + w = convert(Vector, wv) @inbounds for i in 1 : length(x) xi = x[i] @@ -160,7 +160,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, bx = mx0 - 1 by = my0 - 1 - w = values(wv) + w = convert(Vector, wv) for i = 1:n xi = x[i] @@ -326,7 +326,7 @@ radixsort_safe(::Type) = false function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T last_sx = sx[1] tmpcount = get(cm, last_sx, 0) + 1 - + # now the data is sorted: can just run through and accumulate values before # adding into the Dict @inbounds for i in 2:length(sx) @@ -358,7 +358,7 @@ end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} n = length(x) length(wv) == n || throw(DimensionMismatch()) - w = values(wv) + w = convert(Vector, wv) z = zero(W) for i = 1 : n diff --git a/src/cov.jl b/src/cov.jl index 4251e96a..bb2f8f84 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -27,9 +27,9 @@ _unscaled_covzm(x::DenseMatrix, dims::Colon) = unscaled_covzm(x) _unscaled_covzm(x::DenseMatrix, dims::Integer) = unscaled_covzm(x, dims) _unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Colon) = - _symmetrize!(unscaled_covzm(x, _scalevars(x, values(wv)))) + _symmetrize!(unscaled_covzm(x, _scalevars(x, convert(Vector, wv)))) _unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Integer) = - _symmetrize!(unscaled_covzm(x, _scalevars(x, values(wv), dims), dims)) + _symmetrize!(unscaled_covzm(x, _scalevars(x, convert(Vector, wv), dims), dims)) """ scattermat(X, [wv::AbstractWeights]; mean=nothing, dims=1) diff --git a/src/hist.jl b/src/hist.jl index e8a348e4..40332c68 
100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -316,7 +316,7 @@ function append!(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::Ab end h end -append!(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights) where {T,N} = append!(h, vs, values(wv)) +append!(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights) where {T,N} = append!(h, vs, convert(Vector, wv)) # Turn kwargs nbins into a type-stable tuple of integers: diff --git a/src/moments.jl b/src/moments.jl index 2a3c3bff..1a17f8e4 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -54,7 +54,7 @@ end function varm!(R::AbstractArray, A::RealArray, w::AbstractWeights, M::RealArray, dim::Int; corrected::DepBool=nothing) corrected = depcheck(:varm!, corrected) - rmul!(_wsum_centralize!(R, abs2, A, values(w), M, dim, true), + rmul!(_wsum_centralize!(R, abs2, A, convert(Vector, w), M, dim, true), varcorrection(w, corrected)) end @@ -234,7 +234,7 @@ end function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) n = length(v) s = 0.0 - w = values(wv) + w = convert(Vector, wv) for i = 1:n @inbounds z = v[i] - m @inbounds s += (z * z) * w[i] @@ -256,7 +256,7 @@ end function _moment3(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 - w = values(wv) + w = convert(Vector, wv) for i = 1:n @inbounds z = v[i] - m @inbounds s += (z * z * z) * w[i] @@ -277,7 +277,7 @@ end function _moment4(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 - w = values(wv) + w = convert(Vector, wv) for i = 1:n @inbounds z = v[i] - m @inbounds s += abs2(z * z) * w[i] @@ -298,7 +298,7 @@ end function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 - w = values(wv) + w = convert(Vector, wv) for i = 1:n @inbounds z = v[i] - m @inbounds s += (z ^ k) * w[i] @@ -364,7 +364,7 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real) length(wv) == n || throw(DimensionMismatch("Inconsistent array 
lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) cm3 = 0.0 # empirical 3rd centered moment - w = values(wv) + w = convert(Vector, wv) @inbounds for i = 1:n x_i = v[i] @@ -411,7 +411,7 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) cm4 = 0.0 # empirical 4th centered moment - w = values(wv) + w = convert(Vector, wv) @inbounds for i = 1 : n x_i = v[i] diff --git a/src/sampling.jl b/src/sampling.jl index 1e0d4797..fcf25398 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -412,7 +412,7 @@ Optionally specify a random number generator `rng` as the first argument """ function sample(rng::AbstractRNG, wv::AbstractWeights) t = rand(rng) * sum(wv) - w = values(wv) + w = convert(Vector, wv) n = length(w) i = 1 cw = w[1] @@ -530,7 +530,7 @@ function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, # create alias table ap = Vector{Float64}(undef, n) alias = Vector{Int}(undef, n) - make_alias_table!(values(wv), sum(wv), ap, alias) + make_alias_table!(convert(Vector, wv), sum(wv), ap, alias) # sampling s = RangeGenerator(1:n) @@ -561,7 +561,7 @@ function naive_wsample_norep!(rng::AbstractRNG, a::AbstractArray, k = length(x) w = Vector{Float64}(undef, n) - copyto!(w, values(wv)) + copyto!(w, convert(Vector, wv)) wsum = sum(wv) for i = 1:k diff --git a/test/weights.jl b/test/weights.jl index 4391cb51..1e88798c 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -18,7 +18,7 @@ weight_funcs = (weights, aweights, fweights, pweights) wv = f(w) @test eltype(wv) === Float64 @test length(wv) === 3 - @test values(wv) == w + @test convert(Vector, wv) == w @test sum(wv) === 6.0 @test !isempty(wv) @@ -26,7 +26,7 @@ weight_funcs = (weights, aweights, fweights, pweights) bv = f(b) @test eltype(bv) === Bool @test length(bv) === 3 - @test values(bv) == b + @test convert(Vector, bv) == b @test sum(bv) === 3 
@test !isempty(bv) @@ -44,20 +44,20 @@ end # Check getindex & sum @test wv[1] === 1. @test sum(wv) === 6. - @test values(wv) == w + @test convert(Vector, wv) == w # Test setindex! success @test (wv[1] = 4) === 4 # setindex! returns original val @test wv[1] === 4. # value correctly converted and set @test sum(wv) === 9. # sum updated - @test values(wv) == [4., 2., 3.] # Test state of all values + @test convert(Vector, wv) == [4., 2., 3.] # Test state of all values # Test mulivalue setindex! wv[1:2] = [3., 5.] @test wv[1] === 3. @test wv[2] === 5. @test sum(wv) === 11. - @test values(wv) == [3., 5., 3.] # Test state of all values + @test convert(Vector, wv) == [3., 5., 3.] # Test state of all values # Test failed setindex! due to conversion error w = [1, 2, 3] @@ -66,7 +66,7 @@ end @test_throws InexactError wv[1] = 1.5 # Returns original value @test wv[1] === 1 # value not updated @test sum(wv) === 6 # sum not corrupted - @test values(wv) == [1, 2, 3] # Test state of all values + @test convert(Vector, wv) == [1, 2, 3] # Test state of all values end @testset "$f, isequal and ==" for f in weight_funcs @@ -106,7 +106,7 @@ end @test length(wv) === 3 @test size(wv) === (3,) @test sum(wv) === 3. 
- @test values(wv) == fill(1.0, 3) + @test convert(Vector, wv) == fill(1.0, 3) @test StatsBase.varcorrection(wv) == 1/3 @test !isequal(wv, fweights(fill(1.0, 3))) @test isequal(wv, uweights(3)) diff --git a/test/wsampling.jl b/test/wsampling.jl index fc9a10a3..f88d5788 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -10,7 +10,7 @@ function check_wsample_wrep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::R (vmin, vmax) = vrgn (amin, amax) = extrema(a) @test vmin <= amin <= amax <= vmax - p0 = values(wv) ./ sum(wv) + p0 = convert(Vector, wv) ./ sum(wv) if ordered @test issorted(a) if ptol > 0 @@ -68,7 +68,7 @@ function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol:: end if ptol > 0 - p0 = values(wv) ./ sum(wv) + p0 = convert(Vector, wv) ./ sum(wv) @test isapprox(proportions(a[1,:], vmin:vmax), p0, atol=ptol) end end From 0bb7740956aff807a2705872e8a36cbc98b40f55 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 4 Dec 2019 13:55:47 +0100 Subject: [PATCH 034/105] Remove mention of deviance in adjr2 docstring (#531) Since #400 we use the log-likelihood. --- src/statmodels.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/statmodels.jl b/src/statmodels.jl index c02bd88a..377c6c28 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -261,8 +261,7 @@ Adjusted pseudo-coefficient of determination (adjusted pseudo R-squared). For nonlinear models, one of the several pseudo R² definitions must be chosen via `variant`. The only currently supported variant is `:MacFadden`, defined as ``1 - (\\log (L) - k)/\\log (L0)``. In this formula, ``L`` is the likelihood of the model, ``L0`` that of the null model -(the model including only the intercept). These two quantities are taken to be minus half -`deviance` of the corresponding models. 
``k`` is the number of consumed degrees of freedom +(the model including only the intercept), and ``k`` is the number of consumed degrees of freedom of the model (as returned by [`dof`](@ref)). """ function adjr2(obj::StatisticalModel, variant::Symbol) From ee992463813b2036a297bc49c4c5ca53e8263692 Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Wed, 4 Dec 2019 07:57:00 -0500 Subject: [PATCH 035/105] add crossmodelmatrix (#529) This is a useful API for packages that compute robust standard errors. Different models may want to implement more efficient methods since X'X is typically computed during the estimation. --- docs/src/statmodels.md | 1 + src/StatsBase.jl | 1 + src/statmodels.jl | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/docs/src/statmodels.md b/docs/src/statmodels.md index 91be9083..a4d9c050 100644 --- a/docs/src/statmodels.md +++ b/docs/src/statmodels.md @@ -34,6 +34,7 @@ weights(::StatisticalModel) `RegressionModel` extends `StatisticalModel` by implementing the following additional methods. ```@docs +crossmodelmatrix dof_residual fitted leverage diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 03a3d1b8..a0abe9c9 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -177,6 +177,7 @@ export coefnames, coeftable, confint, + crossmodelmatrix, deviance, dof, dof_residual, diff --git a/src/statmodels.jl b/src/statmodels.jl index 377c6c28..36ef5221 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -314,6 +314,14 @@ Return the model matrix (a.k.a. the design matrix). """ modelmatrix(obj::RegressionModel) = error("modelmatrix is not defined for $(typeof(obj)).") +""" + crossmodelmatrix(obj::RegressionModel) + +Return `X'X` where `X` is the model matrix of `obj`. +This function will return a pre-computed matrix stored in `obj` if possible. 
+""" +crossmodelmatrix(obj::RegressionModel) = (x = modelmatrix(obj); Symmetric(x' * x)) + """ leverage(obj::RegressionModel) From da42557642046116097ddaca39fd5dc2c41402cc Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Thu, 5 Dec 2019 11:57:16 +0100 Subject: [PATCH 036/105] Add bounds on dependencies (#545) --- Project.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Project.toml b/Project.toml index 7d9033fd..985f0764 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,10 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] julia = "1" +DataAPI = "1" +DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17" +Missings = "0.3, 0.4" +SortingAlgorithms = "0.3" [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" From a80e960ee59e4a2745abcb8623cf6db620cf0093 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 14 Jan 2020 19:50:24 +0100 Subject: [PATCH 037/105] Remove unnecessary convert(Vector, wv) calls `AbstractWeights <: AbstractVector` for a long time now. `convert` has to allocate a vector when `UnitWeights` is passed. 
--- src/counts.jl | 9 +++------ src/cov.jl | 6 +++--- src/hist.jl | 2 -- src/moments.jl | 18 ++++++------------ src/sampling.jl | 11 +++++------ test/weights.jl | 12 ++++++------ test/wsampling.jl | 4 ++-- 7 files changed, 25 insertions(+), 37 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index f0fa42c4..2b017e4b 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -48,12 +48,11 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: m0 = levels[1] m1 = levels[end] b = m0 - 1 - w = convert(Vector, wv) @inbounds for i in 1 : length(x) xi = x[i] if m0 <= xi <= m1 - r[xi - b] += w[i] + r[xi - b] += wv[i] end end return r @@ -160,13 +159,12 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, bx = mx0 - 1 by = my0 - 1 - w = convert(Vector, wv) for i = 1:n xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) - r[xi - bx, yi - by] += w[i] + r[xi - bx, yi - by] += wv[i] end end return r @@ -358,12 +356,11 @@ end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} n = length(x) length(wv) == n || throw(DimensionMismatch()) - w = convert(Vector, wv) z = zero(W) for i = 1 : n @inbounds xi = x[i] - @inbounds wi = w[i] + @inbounds wi = wv[i] cm[xi] = get(cm, xi, z) + wi end return cm diff --git a/src/cov.jl b/src/cov.jl index bb2f8f84..a77cd508 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -15,7 +15,7 @@ function _symmetrize!(a::DenseMatrix) return a end -function _scalevars(x::DenseMatrix, s::DenseVector, dims::Int) +function _scalevars(x::DenseMatrix, s::AbstractWeights, dims::Int) dims == 1 ? Diagonal(s) * x : dims == 2 ? 
x * Diagonal(s) : error("dims should be either 1 or 2.") @@ -27,9 +27,9 @@ _unscaled_covzm(x::DenseMatrix, dims::Colon) = unscaled_covzm(x) _unscaled_covzm(x::DenseMatrix, dims::Integer) = unscaled_covzm(x, dims) _unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Colon) = - _symmetrize!(unscaled_covzm(x, _scalevars(x, convert(Vector, wv)))) + _symmetrize!(unscaled_covzm(x, _scalevars(x, wv))) _unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Integer) = - _symmetrize!(unscaled_covzm(x, _scalevars(x, convert(Vector, wv), dims), dims)) + _symmetrize!(unscaled_covzm(x, _scalevars(x, wv, dims), dims)) """ scattermat(X, [wv::AbstractWeights]; mean=nothing, dims=1) diff --git a/src/hist.jl b/src/hist.jl index 40332c68..459e82d3 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -316,8 +316,6 @@ function append!(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::Ab end h end -append!(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights) where {T,N} = append!(h, vs, convert(Vector, wv)) - # Turn kwargs nbins into a type-stable tuple of integers: function _nbins_tuple(vs::NTuple{N,AbstractVector}, nbins) where N diff --git a/src/moments.jl b/src/moments.jl index 1a17f8e4..c5d0ae5c 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -234,10 +234,9 @@ end function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) n = length(v) s = 0.0 - w = convert(Vector, wv) for i = 1:n @inbounds z = v[i] - m - @inbounds s += (z * z) * w[i] + @inbounds s += (z * z) * wv[i] end varcorrection(wv, corrected) * s @@ -256,10 +255,9 @@ end function _moment3(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 - w = convert(Vector, wv) for i = 1:n @inbounds z = v[i] - m - @inbounds s += (z * z * z) * w[i] + @inbounds s += (z * z * z) * wv[i] end s / sum(wv) end @@ -277,10 +275,9 @@ end function _moment4(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 - w = convert(Vector, wv) for i = 1:n @inbounds z = 
v[i] - m - @inbounds s += abs2(z * z) * w[i] + @inbounds s += abs2(z * z) * wv[i] end s / sum(wv) end @@ -298,10 +295,9 @@ end function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 - w = convert(Vector, wv) for i = 1:n @inbounds z = v[i] - m - @inbounds s += (z ^ k) * w[i] + @inbounds s += (z ^ k) * wv[i] end s / sum(wv) end @@ -364,11 +360,10 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) cm3 = 0.0 # empirical 3rd centered moment - w = convert(Vector, wv) @inbounds for i = 1:n x_i = v[i] - w_i = w[i] + w_i = wv[i] z = x_i - m z2w = z * z * w_i cm2 += z2w @@ -411,11 +406,10 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) cm4 = 0.0 # empirical 4th centered moment - w = convert(Vector, wv) @inbounds for i = 1 : n x_i = v[i] - w_i = w[i] + w_i = wv[i] z = x_i - m z2 = z * z z2w = z2 * w_i diff --git a/src/sampling.jl b/src/sampling.jl index fcf25398..bf648ba0 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -412,13 +412,12 @@ Optionally specify a random number generator `rng` as the first argument """ function sample(rng::AbstractRNG, wv::AbstractWeights) t = rand(rng) * sum(wv) - w = convert(Vector, wv) - n = length(w) + n = length(wv) i = 1 - cw = w[1] + cw = wv[1] while cw < t && i < n i += 1 - @inbounds cw += w[i] + @inbounds cw += wv[i] end return i end @@ -530,7 +529,7 @@ function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, # create alias table ap = Vector{Float64}(undef, n) alias = Vector{Int}(undef, n) - make_alias_table!(convert(Vector, wv), sum(wv), ap, alias) + make_alias_table!(wv, sum(wv), ap, alias) # sampling s = RangeGenerator(1:n) @@ -561,7 +560,7 @@ function naive_wsample_norep!(rng::AbstractRNG, 
a::AbstractArray, k = length(x) w = Vector{Float64}(undef, n) - copyto!(w, convert(Vector, wv)) + copyto!(w, wv) wsum = sum(wv) for i = 1:k diff --git a/test/weights.jl b/test/weights.jl index 1e88798c..9f071483 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -18,7 +18,7 @@ weight_funcs = (weights, aweights, fweights, pweights) wv = f(w) @test eltype(wv) === Float64 @test length(wv) === 3 - @test convert(Vector, wv) == w + @test wv == w @test sum(wv) === 6.0 @test !isempty(wv) @@ -44,20 +44,20 @@ end # Check getindex & sum @test wv[1] === 1. @test sum(wv) === 6. - @test convert(Vector, wv) == w + @test wv == w # Test setindex! success @test (wv[1] = 4) === 4 # setindex! returns original val @test wv[1] === 4. # value correctly converted and set @test sum(wv) === 9. # sum updated - @test convert(Vector, wv) == [4., 2., 3.] # Test state of all values + @test wv == [4., 2., 3.] # Test state of all values # Test mulivalue setindex! wv[1:2] = [3., 5.] @test wv[1] === 3. @test wv[2] === 5. @test sum(wv) === 11. - @test convert(Vector, wv) == [3., 5., 3.] # Test state of all values + @test wv == [3., 5., 3.] # Test state of all values # Test failed setindex! due to conversion error w = [1, 2, 3] @@ -66,7 +66,7 @@ end @test_throws InexactError wv[1] = 1.5 # Returns original value @test wv[1] === 1 # value not updated @test sum(wv) === 6 # sum not corrupted - @test convert(Vector, wv) == [1, 2, 3] # Test state of all values + @test wv == [1, 2, 3] # Test state of all values end @testset "$f, isequal and ==" for f in weight_funcs @@ -106,7 +106,7 @@ end @test length(wv) === 3 @test size(wv) === (3,) @test sum(wv) === 3. 
- @test convert(Vector, wv) == fill(1.0, 3) + @test wv == fill(1.0, 3) @test StatsBase.varcorrection(wv) == 1/3 @test !isequal(wv, fweights(fill(1.0, 3))) @test isequal(wv, uweights(3)) diff --git a/test/wsampling.jl b/test/wsampling.jl index f88d5788..37a3b9eb 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -10,7 +10,7 @@ function check_wsample_wrep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::R (vmin, vmax) = vrgn (amin, amax) = extrema(a) @test vmin <= amin <= amax <= vmax - p0 = convert(Vector, wv) ./ sum(wv) + p0 = wv ./ sum(wv) if ordered @test issorted(a) if ptol > 0 @@ -68,7 +68,7 @@ function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol:: end if ptol > 0 - p0 = convert(Vector, wv) ./ sum(wv) + p0 = wv ./ sum(wv) @test isapprox(proportions(a[1,:], vmin:vmax), p0, atol=ptol) end end From a052232c75cc570028cddecd4148437a1332e5dc Mon Sep 17 00:00:00 2001 From: Julia TagBot <50554310+JuliaTagBot@users.noreply.github.com> Date: Sat, 8 Feb 2020 20:44:03 +0700 Subject: [PATCH 038/105] Install TagBot as a GitHub Action --- .github/workflows/TagBot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/TagBot.yml diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 00000000..d77d3a0c --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,11 @@ +name: TagBot +on: + schedule: + - cron: 0 * * * * +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} From d5e6fbc4077160e95c7f46efa1bb69261784d9fb Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 16 Feb 2020 19:05:21 +0100 Subject: [PATCH 039/105] New version (0.32.1) and update docs (#554) * bump version to ~0.33.0~ 0.32.1 * build docs on 1.3; test on (pre-)release (1.3 and 1.4), drop 1.2 * fix doctests * release is non-breaking, using 0.24 documenter exactly --- .travis.yml | 5 +++-- Project.toml | 2 +- 
docs/Project.toml | 3 ++- src/hist.jl | 2 ++ src/misc.jl | 4 ++-- src/transformations.jl | 4 ++-- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 441c172b..a93b114c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,8 @@ os: - osx julia: - 1.0 - - 1.2 + - 1.3 + - 1.4 - nightly notifications: email: false @@ -20,7 +21,7 @@ after_success: jobs: include: - stage: "Documentation" - julia: 1.0 + julia: 1.3 os: linux script: - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); diff --git a/Project.toml b/Project.toml index 985f0764..17145e41 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.32.0" +version = "0.32.1" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" diff --git a/docs/Project.toml b/docs/Project.toml index cb418284..a37a076b 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,6 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] -Documenter = "~0.22" +Documenter = "0.24" diff --git a/src/hist.jl b/src/hist.jl index 459e82d3..36915213 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -134,6 +134,8 @@ Histograms can be fitted to data using the `fit` method. 
# Examples ## Example illustrating `closed` ```jldoctest +julia> using StatsBase + julia> fit(Histogram, [2.], 1:3, closed=:left) Histogram{Int64,1,Tuple{UnitRange{Int64}}} edges: diff --git a/src/misc.jl b/src/misc.jl index a450b32f..7c15ccc9 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -127,8 +127,8 @@ julia> using StatsBase julia> indicatormat([1 2 2], 2) 2×3 Array{Bool,2}: - true false false - false true true + 1 0 0 + 0 1 1 ``` """ function indicatormat(x::IntegerArray, k::Integer; sparse::Bool=false) diff --git a/src/transformations.jl b/src/transformations.jl index 3b581bb3..e4fdb9ac 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -98,7 +98,7 @@ julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] 0.0 1.0 2.0 julia> dt = fit(ZScoreTransform, X, dims=2) -ZScoreTransform{Float64}(2, [0.0, 1.0], [0.5, 1.0]) +ZScoreTransform{Float64}(2, 2, [0.0, 1.0], [0.5, 1.0]) julia> StatsBase.transform(dt, X) 2×3 Array{Float64,2}: @@ -257,7 +257,7 @@ julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] 0.0 1.0 2.0 julia> dt = fit(UnitRangeTransform, X, dims=2) -UnitRangeTransform{Float64}(2, true, [-0.5, 0.0], [1.0, 0.5]) +UnitRangeTransform{Float64}(2, 2, true, [-0.5, 0.0], [1.0, 0.5]) julia> StatsBase.transform(dt, X) 2×3 Array{Float64,2}: From 3762c78ba86a86671819ab8e5463a9f5e11a3988 Mon Sep 17 00:00:00 2001 From: Asaf Manela Date: Mon, 17 Feb 2020 03:07:53 -0600 Subject: [PATCH 040/105] Change trim and winsor to return iterators (#546) `trim` and `winsor` now return iterators rather than arrays to avoid unnecessary copies, and maintain original order of the data. 
--- src/robust.jl | 107 ++++++++++++++++++++++++++----------------------- test/robust.jl | 41 ++++++++++--------- 2 files changed, 78 insertions(+), 70 deletions(-) diff --git a/src/robust.jl b/src/robust.jl index c1d93264..a186be4b 100644 --- a/src/robust.jl +++ b/src/robust.jl @@ -7,96 +7,103 @@ ############################# # Trimmed set +"Return the upper and lower bound elements used by `trim` and `winsor`" +function uplo(x::AbstractVector; prop::Real=0.0, count::Integer=0) + n = length(x) + n > 0 || throw(ArgumentError("x can not be empty.")) + + if count == 0 + 0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5.")) + count = floor(Int, n * prop) + else + prop == 0 || throw(ArgumentError("prop and count can not both be > 0.")) + 0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2.")) + end + + # indices for lowest count values + x2 = copy(x) + lo = partialsort!(x2, 1:count+1)[end] + # indices for largest count values + up = partialsort!(x2, n-count:n)[1] + + up, lo +end + """ - trim(x; prop=0.0, count=0) + trim(x::AbstractVector; prop=0.0, count=0) + +Return an iterator of all elements of `x` that omits either `count` or proportion +`prop` of the highest and lowest elements. -Return a copy of `x` with either `count` or proportion `prop` of the highest -and lowest elements removed. To compute the trimmed mean of `x` use -`mean(trim(x))`; to compute the variance use `trimvar(x)` (see [`trimvar`](@ref)). +The number of trimmed elements could be smaller than specified if several +elements equal the lower or upper bound. + +To compute the trimmed mean of `x` use `mean(trim(x))`; +to compute the variance use `trimvar(x)` (see [`trimvar`](@ref)). 
# Example ```julia -julia> trim([1,2,3,4,5], prop=0.2) +julia> collect(trim([5,2,4,3,1], prop=0.2)) 3-element Array{Int64,1}: 2 - 3 4 + 3 ``` """ function trim(x::AbstractVector; prop::Real=0.0, count::Integer=0) - trim!(copy(x); prop=prop, count=count) + up, lo = uplo(x; prop=prop, count=count) + + (xi for xi in x if lo <= xi <= up) end """ - trim!(x; prop=0.0, count=0) + trim!(x::AbstractVector; prop=0.0, count=0) A variant of [`trim`](@ref) that modifies `x` in place. """ function trim!(x::AbstractVector; prop::Real=0.0, count::Integer=0) - n = length(x) - n > 0 || throw(ArgumentError("x can not be empty.")) - - if count == 0 - 0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5.")) - count = floor(Int, n * prop) - else - prop == 0 || throw(ArgumentError("prop and count can not both be > 0.")) - 0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2.")) - end - - partialsort!(x, (n-count+1):n) - partialsort!(x, 1:count) - deleteat!(x, (n-count+1):n) - deleteat!(x, 1:count) - + up, lo = uplo(x; prop=prop, count=count) + ix = (i for (i,xi) in enumerate(x) if lo > xi || xi > up) + deleteat!(x, ix) return x end """ - winsor(x; prop=0.0, count=0) + winsor(x::AbstractVector; prop=0.0, count=0) -Return a copy of `x` with either `count` or proportion `prop` of the lowest -elements of `x` replaced with the next-lowest, and an equal number of the -highest elements replaced with the previous-highest. To compute the Winsorized -mean of `x` use `mean(winsor(x))`. +Return an iterator of all elements of `x` that replaces either `count` or +proportion `prop` of the highest elements with the previous-highest element +and an equal number of the lowest elements with the next-lowest element. + +The number of replaced elements could be smaller than specified if several +elements equal the lower or upper bound. + +To compute the Winsorized mean of `x` use `mean(winsor(x))`. 
# Example ```julia -julia> winsor([1,2,3,4,5], prop=0.2) +julia> collect(winsor([5,2,3,4,1], prop=0.2)) 5-element Array{Int64,1}: - 2 + 4 2 3 4 - 4 + 2 ``` """ function winsor(x::AbstractVector; prop::Real=0.0, count::Integer=0) - winsor!(copy(x); prop=prop, count=count) + up, lo = uplo(x; prop=prop, count=count) + + (clamp(xi, lo, up) for xi in x) end """ - winsor!(x; prop=0.0, count=0) + winsor!(x::AbstractVector; prop=0.0, count=0) A variant of [`winsor`](@ref) that modifies vector `x` in place. """ function winsor!(x::AbstractVector; prop::Real=0.0, count::Integer=0) - n = length(x) - n > 0 || throw(ArgumentError("x can not be empty.")) - - if count == 0 - 0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5.")) - count = floor(Int, n * prop) - else - prop == 0 || throw(ArgumentError("prop and count can not both be > 0.")) - 0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2.")) - end - - partialsort!(x, (n-count+1):n) - partialsort!(x, 1:count) - x[1:count] .= x[count+1] - x[n-count+1:end] .= x[n-count] - + copyto!(x, winsor(x; prop=prop, count=count)) return x end diff --git a/test/robust.jl b/test/robust.jl index 5b42b568..9d35c9b7 100644 --- a/test/robust.jl +++ b/test/robust.jl @@ -3,39 +3,40 @@ using Test ### Trimming outliers -@test trim([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test trim([1,2,3,4,5,6,7,8], prop=0.2) == [2,3,4,5,6,7] -@test trim([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,5,6] -@test trim([1,2,3,4,5,6,7,8], count=1) == [2,3,4,5,6,7] -@test trim([1,2,3,4,5,6,7,8,9], count=3) == [4,5,6] +@test collect(trim([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1] +@test collect(trim([8,2,3,4,5,6,7,1], prop=0.2)) == [2,3,4,5,6,7] +@test collect(trim([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,5,6] +@test collect(trim([8,7,6,5,4,3,2,1], count=1)) == [7,6,5,4,3,2] +@test collect(trim([1,2,3,4,5,6,7,8,9], count=3)) == [4,5,6] @test_throws ArgumentError trim([]) @test_throws ArgumentError 
trim([1,2,3,4,5], prop=0.5) -@test trim!([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test trim!([1,2,3,4,5,6,7,8], prop=0.2) == [2,3,4,5,6,7] -@test trim!([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,5,6] -@test trim!([1,2,3,4,5,6,7,8], count=1) == [2,3,4,5,6,7] -@test trim!([1,2,3,4,5,6,7,8,9], count=3) == [4,5,6] +@test collect(trim!([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1] +@test collect(trim!([8,2,3,4,5,6,7,1], prop=0.2)) == [2,3,4,5,6,7] +@test collect(trim!([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,5,6] +@test collect(trim!([8,7,6,5,4,3,2,1], count=1)) == [7,6,5,4,3,2] +@test collect(trim!([1,2,3,4,5,6,7,8,9], count=3)) == [4,5,6] @test_throws ArgumentError trim!([]) @test_throws ArgumentError trim!([1,2,3,4,5], prop=0.5) -@test winsor([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test winsor([1,2,3,4,5,6,7,8], prop=0.2) == [2,2,3,4,5,6,7,7] -@test winsor([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,4,4,4,5,6,6,6,6] -@test winsor([1,2,3,4,5,6,7,8], count=1) == [2,2,3,4,5,6,7,7] -@test winsor([1,2,3,4,5,6,7,8,9], count=3) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1] +@test collect(winsor([8,2,3,4,5,6,7,1], prop=0.2)) == [7,2,3,4,5,6,7,2] +@test collect(winsor([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor([1,2,3,4,5,6,7,8], count=1)) == [2,2,3,4,5,6,7,7] +@test collect(winsor([8,7,6,5,4,3,2,1], count=1)) == [7,7,6,5,4,3,2,2] +@test collect(winsor([1,2,3,4,5,6,7,8,9], count=3)) == [4,4,4,4,5,6,6,6,6] @test_throws ArgumentError winsor([]) @test_throws ArgumentError winsor([1,2,3,4,5], prop=0.5) -@test winsor!([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test winsor!([1,2,3,4,5,6,7,8], prop=0.2) == [2,2,3,4,5,6,7,7] -@test winsor!([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,4,4,4,5,6,6,6,6] -@test winsor!([1,2,3,4,5,6,7,8], count=1) == [2,2,3,4,5,6,7,7] -@test winsor!([1,2,3,4,5,6,7,8,9], count=3) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor!([8,2,3,4,5,6,7,1], prop=0.1)) == 
[8,2,3,4,5,6,7,1] +@test collect(winsor!([8,2,3,4,5,6,7,1], prop=0.2)) == [7,2,3,4,5,6,7,2] +@test collect(winsor!([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor!([8,7,6,5,4,3,2,1], count=1)) == [7,7,6,5,4,3,2,2] +@test collect(winsor!([1,2,3,4,5,6,7,8,9], count=3)) == [4,4,4,4,5,6,6,6,6] @test_throws ArgumentError winsor!([]) @test_throws ArgumentError winsor!([1,2,3,4,5], prop=0.5) From 65351de819ca64941cb81c047e4b77157446f7c5 Mon Sep 17 00:00:00 2001 From: getzze Date: Mon, 2 Mar 2020 08:51:53 +0000 Subject: [PATCH 041/105] =?UTF-8?q?Add=20:devianceratio=20variant=20to=20r?= =?UTF-8?q?=C2=B2=20function=20(#550)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add another generalization for r2 to GLM that uses the deviance ratio. Change the last line of the docs to state that it corresponds to mss/tss for OLS. --- src/statmodels.jl | 55 +++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/src/statmodels.jl b/src/statmodels.jl index 36ef5221..4836a1eb 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -216,25 +216,35 @@ Supported variants are: - `:MacFadden` (a.k.a. likelihood ratio index), defined as ``1 - \\log (L)/\\log (L_0)``; - `:CoxSnell`, defined as ``1 - (L_0/L)^{2/n}``; - `:Nagelkerke`, defined as ``(1 - (L_0/L)^{2/n})/(1 - L_0^{2/n})``. +- `:devianceratio`, defined as ``1 - D/D_0``. In the above formulas, ``L`` is the likelihood of the model, ``L_0`` is the likelihood of the null model (the model with only an intercept), -``n`` is the number of observations, ``y_i`` are the responses, -``\\hat{y}_i`` are fitted values and ``\\bar{y}`` is the average response. +``D`` is the deviance of the model (from the saturated model), +``D_0`` is the deviance of the null model, +``n`` is the number of observations (given by [`nobs`](@ref)). -Cox and Snell's R² should match the classical R² for linear models. 
+The Cox-Snell and the deviance ratio variants both match the classical definition of R² +for linear models. """ function r2(obj::StatisticalModel, variant::Symbol) - ll = loglikelihood(obj) - ll0 = nullloglikelihood(obj) - if variant == :McFadden - 1 - ll/ll0 - elseif variant == :CoxSnell - 1 - exp(2 * (ll0 - ll) / nobs(obj)) - elseif variant == :Nagelkerke - (1 - exp(2 * (ll0 - ll) / nobs(obj))) / (1 - exp(2 * ll0 / nobs(obj))) + loglikbased = (:McFadden, :CoxSnell, :Nagelkerke) + if variant in loglikbased + ll = loglikelihood(obj) + ll0 = nullloglikelihood(obj) + if variant == :McFadden + 1 - ll/ll0 + elseif variant == :CoxSnell + 1 - exp(2 * (ll0 - ll) / nobs(obj)) + elseif variant == :Nagelkerke + (1 - exp(2 * (ll0 - ll) / nobs(obj))) / (1 - exp(2 * ll0 / nobs(obj))) + end + elseif variant == :devianceratio + dev = deviance(obj) + dev0 = nulldeviance(obj) + 1 - dev/dev0 else - error("variant must be one of :McFadden, :CoxSnell or :Nagelkerke") + error("variant must be one of $(join(loglikbased, ", ")) or :devianceratio") end end @@ -259,19 +269,26 @@ adjr2(obj::StatisticalModel) = error("adjr2 is not defined for $(typeof(obj)).") Adjusted pseudo-coefficient of determination (adjusted pseudo R-squared). For nonlinear models, one of the several pseudo R² definitions must be chosen via `variant`. -The only currently supported variant is `:MacFadden`, defined as ``1 - (\\log (L) - k)/\\log (L0)``. -In this formula, ``L`` is the likelihood of the model, ``L0`` that of the null model -(the model including only the intercept), and ``k`` is the number of consumed degrees of freedom -of the model (as returned by [`dof`](@ref)). +The only currently supported variants are `:MacFadden`, defined as ``1 - (\\log (L) - k)/\\log (L0)`` and +`:devianceratio`, defined as ``1 - (D/(n-k))/(D_0/(n-1))``. 
+In these formulas, ``L`` is the likelihood of the model, ``L0`` that of the null model +(the model including only the intercept), ``D`` is the deviance of the model, +``D_0`` is the deviance of the null model, ``n`` is the number of observations (given by [`nobs`](@ref)) and +``k`` is the number of consumed degrees of freedom of the model (as returned by [`dof`](@ref)). """ function adjr2(obj::StatisticalModel, variant::Symbol) - ll = loglikelihood(obj) - ll0 = nullloglikelihood(obj) k = dof(obj) if variant == :McFadden + ll = loglikelihood(obj) + ll0 = nullloglikelihood(obj) 1 - (ll - k)/ll0 + elseif variant == :devianceratio + n = nobs(obj) + dev = deviance(obj) + dev0 = nulldeviance(obj) + 1 - (dev*(n-1))/(dev0*(n-k)) else - error(":McFadden is the only currently supported variant") + error("variant must be one of :McFadden or :devianceratio") end end From f52b4697184002055a6b0f79db6a2c423e2892fb Mon Sep 17 00:00:00 2001 From: David Widmann Date: Sun, 22 Mar 2020 19:16:54 +0100 Subject: [PATCH 042/105] Remove fptype (#567) * Remove fptype * Add generic alternatives for _autodot and _crossdot --- src/common.jl | 7 -- src/signalcorr.jl | 163 +++++++++++++++++++++++++++++---------------- test/signalcorr.jl | 16 ++++- 3 files changed, 121 insertions(+), 65 deletions(-) diff --git a/src/common.jl b/src/common.jl index 2f5fb901..36c128da 100644 --- a/src/common.jl +++ b/src/common.jl @@ -20,13 +20,6 @@ const IntegerMatrix{T<:Integer} = AbstractArray{T,2} const RealFP = Union{Float32, Float64} -## conversion from real to fp types - -fptype(::Type{T}) where {T<:Union{Float32,Bool,Int8,UInt8,Int16,UInt16}} = Float32 -fptype(::Type{T}) where {T<:Union{Float64,Int32,UInt32,Int64,UInt64,Int128,UInt128}} = Float64 -fptype(::Type{Complex{Float32}}) = Complex{Float32} -fptype(::Type{Complex{Float64}}) = Complex{Float64} - # A convenient typealias for deprecating default corrected Bool const DepBool = Union{Bool, Nothing} diff --git a/src/signalcorr.jl b/src/signalcorr.jl 
index 5daa3524..9e8ce60f 100644 --- a/src/signalcorr.jl +++ b/src/signalcorr.jl @@ -14,7 +14,8 @@ default_laglen(lx::Int) = min(lx-1, round(Int,10*log10(lx))) check_lags(lx::Int, lags::AbstractVector) = (maximum(lags) < lx || error("lags must be less than the sample length.")) -function demean_col!(z::AbstractVector{T}, x::AbstractMatrix{T}, j::Int, demean::Bool) where T<:RealFP +function demean_col!(z::RealVector, x::RealMatrix, j::Int, demean::Bool) + T = eltype(z) m = size(x, 1) @assert m == length(z) b = m * (j-1) @@ -42,7 +43,8 @@ end default_autolags(lx::Int) = 0 : default_laglen(lx) -_autodot(x::AbstractVector{<:RealFP}, lx::Int, l::Int) = dot(x, 1:lx-l, x, 1+l:lx) +_autodot(x::AbstractVector{<:RealFP}, lx::Int, l::Int) = dot(x, 1:(lx-l), x, (1+l):lx) +_autodot(x::RealVector, lx::Int, l::Int) = dot(view(x, 1:(lx-l)), view(x, (1+l):lx)) ## autocov @@ -59,12 +61,13 @@ where each column in the result will correspond to a column in `x`. The output is not normalized. See [`autocor!`](@ref) for a method with normalization. """ -function autocov!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocov!(r::RealVector, x::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) length(r) == m || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z::Vector{T} = demean ? 
x .- mean(x) : x for k = 1 : m # foreach lag value r[k] = _autodot(z, lx, lags[k]) / lx @@ -72,13 +75,14 @@ function autocov!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; deme return r end -function autocov!(r::RealMatrix, x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocov!(r::RealMatrix, x::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) size(r) == (m, ns) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z = Vector{T}(undef, lx) for j = 1 : ns demean_col!(z, x, j, demean) @@ -106,15 +110,18 @@ When left unspecified, the lags used are the integers from 0 to The output is not normalized. See [`autocor`](@ref) for a function with normalization. """ -function autocov(x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocov!(Vector{fptype(T)}(undef, length(lags)), float(x), lags; demean=demean) +function autocov(x::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(eltype(x))}(undef, length(lags)) + autocov!(out, x, lags; demean=demean) end -function autocov(x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocov!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), lags; demean=demean) +function autocov(x::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(eltype(x))}(undef, length(lags), size(x,2)) + autocov!(out, x, lags; demean=demean) end -autocov(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = autocov(x, default_autolags(size(x,1)); demean=demean) +autocov(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = + autocov(x, default_autolags(size(x,1)); demean=demean) ## autocor @@ -132,12 +139,13 @@ where each column in the result will correspond to a column in `x`. The output is normalized by the variance of `x`, i.e. so that the lag 0 autocorrelation is 1. See [`autocov!`](@ref) for the unnormalized form. 
""" -function autocor!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocor!(r::RealVector, x::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) length(r) == m || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z::Vector{T} = demean ? x .- mean(x) : x zz = dot(z, z) for k = 1 : m # foreach lag value @@ -146,13 +154,14 @@ function autocor!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; deme return r end -function autocor!(r::RealMatrix, x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocor!(r::RealMatrix, x::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) size(r) == (m, ns) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z = Vector{T}(undef, lx) for j = 1 : ns demean_col!(z, x, j, demean) @@ -182,15 +191,18 @@ When left unspecified, the lags used are the integers from 0 to The output is normalized by the variance of `x`, i.e. so that the lag 0 autocorrelation is 1. See [`autocov`](@ref) for the unnormalized form. 
""" -function autocor(x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocor!(Vector{fptype(T)}(undef, length(lags)), float(x), lags; demean=demean) +function autocor(x::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(eltype(x))}(undef, length(lags)) + autocor!(out, x, lags; demean=demean) end -function autocor(x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocor!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), lags; demean=demean) +function autocor(x::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(eltype(x))}(undef, length(lags), size(x,2)) + autocor!(out, x, lags; demean=demean) end -autocor(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = autocor(x, default_autolags(size(x,1)); demean=demean) +autocor(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = + autocor(x, default_autolags(size(x,1)); demean=demean) ####################################### @@ -201,8 +213,20 @@ autocor(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = autocor(x, default_aut default_crosslags(lx::Int) = (l=default_laglen(lx); -l:l) -_crossdot(x::AbstractVector{T}, y::AbstractVector{T}, lx::Int, l::Int) where {T<:RealFP} = - (l >= 0 ? dot(x, 1:lx-l, y, 1+l:lx) : dot(x, 1-l:lx, y, 1:lx+l)) +function _crossdot(x::AbstractVector{T}, y::AbstractVector{T}, lx::Int, l::Int) where {T<:RealFP} + if l >= 0 + dot(x, 1:(lx-l), y, (1+l):lx) + else + dot(x, (1-l):lx, y, 1:(lx+l)) + end +end +function _crossdot(x::RealVector, y::RealVector, lx::Int, l::Int) + if l >= 0 + dot(view(x, 1:(lx-l)), view(y, (1+l):lx)) + else + dot(view(x, (1-l):lx), view(y, 1:(lx+l))) + end +end ## crosscov @@ -222,13 +246,15 @@ three-dimensional array of size `(length(lags), size(x, 2), size(y, 2))`. The output is not normalized. See [`crosscor!`](@ref) for a function with normalization. 
""" -function crosscov!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::RealVector, x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) (length(y) == lx && length(r) == m) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? x .- mean(x) : x + S = typeof(zero(eltype(y)) / 1) zy::Vector{T} = demean ? y .- mean(y) : y for k = 1 : m # foreach lag value r[k] = _crossdot(zx, zy, lx, lags[k]) / lx @@ -236,15 +262,17 @@ function crosscov!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, la return r end -function crosscov!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::RealMatrix, x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) (length(y) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx = Vector{T}(undef, lx) - zy::Vector{T} = demean ? y .- mean(y) : y + S = typeof(zero(eltype(y)) / 1) + zy::Vector{S} = demean ? y .- mean(y) : y for j = 1 : ns demean_col!(zx, x, j, demean) for k = 1 : m @@ -254,15 +282,17 @@ function crosscov!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, la return r end -function crosscov!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::RealMatrix, x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = length(x) ns = size(y, 2) m = length(lags) (size(y, 1) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? 
x .- mean(x) : x - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) for j = 1 : ns demean_col!(zy, y, j, demean) for k = 1 : m @@ -272,7 +302,7 @@ function crosscov!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, la return r end -function crosscov!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::AbstractArray{<:Real,3}, x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) nx = size(x, 2) ny = size(y, 2) @@ -281,6 +311,7 @@ function crosscov!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri check_lags(lx, lags) # cached (centered) columns of x + T = typeof(zero(eltype(x)) / 1) zxs = Vector{T}[] sizehint!(zxs, nx) for j = 1 : nx @@ -294,8 +325,8 @@ function crosscov!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri push!(zxs, xj) end - zx = Vector{T}(undef, lx) - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) for j = 1 : ny demean_col!(zy, y, j, demean) for i = 1 : nx @@ -325,23 +356,28 @@ When left unspecified, the lags used are the integers from The output is not normalized. See [`crosscor`](@ref) for a function with normalization. 
""" -function crosscov(x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Vector{fptype(T)}(undef, length(lags)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(Base.promote_eltype(x, y))}(undef, length(lags)) + crosscov!(out, x, y, lags; demean=demean) end -function crosscov(x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(x,2)) + crosscov!(out, x, y, lags; demean=demean) end -function crosscov(x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Matrix{fptype(T)}(undef, length(lags), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(y,2)) + crosscov!(out, x, y, lags; demean=demean) end -function crosscov(x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Array{fptype(T),3}(undef, length(lags), size(x,2), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Array{float(Base.promote_eltype(x, y)),3}(undef, length(lags), size(x,2), size(y,2)) + crosscov!(out, x, y, lags; demean=demean) end -crosscov(x::AbstractVecOrMat{T}, y::AbstractVecOrMat{T}; demean::Bool=true) where {T<:Real} = crosscov(x, y, default_crosslags(size(x,1)); demean=demean) +crosscov(x::AbstractVecOrMat{<:Real}, y::AbstractVecOrMat{<:Real}; 
demean::Bool=true) = + crosscov(x, y, default_crosslags(size(x,1)); demean=demean) ## crosscor @@ -361,13 +397,15 @@ three-dimensional array of size `(length(lags), size(x, 2), size(y, 2))`. The output is normalized by `sqrt(var(x)*var(y))`. See [`crosscov!`](@ref) for the unnormalized form. """ -function crosscor!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::RealVector, x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) (length(y) == lx && length(r) == m) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? x .- mean(x) : x + S = typeof(zero(eltype(y)) / 1) zy::Vector{T} = demean ? y .- mean(y) : y sc = sqrt(dot(zx, zx) * dot(zy, zy)) for k = 1 : m # foreach lag value @@ -376,15 +414,17 @@ function crosscor!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, la return r end -function crosscor!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::RealMatrix, x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) (length(y) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx = Vector{T}(undef, lx) - zy::Vector{T} = demean ? y .- mean(y) : y + S = typeof(zero(eltype(y)) / 1) + zy::Vector{S} = demean ? 
y .- mean(y) : y yy = dot(zy, zy) for j = 1 : ns demean_col!(zx, x, j, demean) @@ -396,15 +436,17 @@ function crosscor!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, la return r end -function crosscor!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::RealMatrix, x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = length(x) ns = size(y, 2) m = length(lags) (size(y, 1) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? x .- mean(x) : x - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) xx = dot(zx, zx) for j = 1 : ns demean_col!(zy, y, j, demean) @@ -416,7 +458,7 @@ function crosscor!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, la return r end -function crosscor!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::AbstractArray{<:Real,3}, x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) nx = size(x, 2) ny = size(y, 2) @@ -425,6 +467,7 @@ function crosscor!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri check_lags(lx, lags) # cached (centered) columns of x + T = typeof(zero(eltype(x)) / 1) zxs = Vector{T}[] sizehint!(zxs, nx) xxs = Vector{T}(undef, nx) @@ -441,8 +484,8 @@ function crosscor!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri xxs[j] = dot(xj, xj) end - zx = Vector{T}(undef, lx) - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) for j = 1 : ny demean_col!(zy, y, j, demean) yy = dot(zy, zy) @@ -474,23 +517,28 @@ When left unspecified, the lags used are the integers from The output is normalized by `sqrt(var(x)*var(y))`. See [`crosscov`](@ref) for the unnormalized form. 
""" -function crosscor(x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Vector{fptype(T)}(undef, length(lags)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(Base.promote_eltype(x, y))}(undef, length(lags)) + crosscor!(out, x, y, lags; demean=demean) end -function crosscor(x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(x,2)) + crosscor!(out, x, y, lags; demean=demean) end -function crosscor(x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Matrix{fptype(T)}(undef, length(lags), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(y,2)) + crosscor!(out, x, y, lags; demean=demean) end -function crosscor(x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Array{fptype(T),3}(undef, length(lags), size(x,2), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Array{float(Base.promote_eltype(x, y)),3}(undef, length(lags), size(x,2), size(y,2)) + crosscor!(out, x, y, lags; demean=demean) end -crosscor(x::AbstractVecOrMat{T}, y::AbstractVecOrMat{T}; demean::Bool=true) where {T<:Real} = crosscor(x, y, default_crosslags(size(x,1)); demean=demean) +crosscor(x::AbstractVecOrMat{<:Real}, y::AbstractVecOrMat{<:Real}; 
demean::Bool=true) = + crosscor(x, y, default_crosslags(size(x,1)); demean=demean) ####################################### @@ -501,9 +549,9 @@ crosscor(x::AbstractVecOrMat{T}, y::AbstractVecOrMat{T}; demean::Bool=true) wher # ####################################### -function pacf_regress!(r::RealMatrix, X::AbstractMatrix{T}, lags::IntegerVector, mk::Integer) where T<:RealFP +function pacf_regress!(r::RealMatrix, X::RealMatrix, lags::IntegerVector, mk::Integer) lx = size(X, 1) - tmpX = ones(T, lx, mk + 1) + tmpX = ones(eltype(X), lx, mk + 1) for j = 1 : size(X,2) for l = 1 : mk for i = 1+l:lx @@ -573,10 +621,11 @@ If `x` is a vector, return a vector of the same length as `lags`. If `x` is a matrix, return a matrix of size `(length(lags), size(x, 2))`, where each column in the result corresponds to a column in `x`. """ -function pacf(X::AbstractMatrix{T}, lags::IntegerVector; method::Symbol=:regression) where T<:Real - pacf!(Matrix{fptype(T)}(undef, length(lags), size(X,2)), float(X), lags; method=method) +function pacf(X::RealMatrix, lags::IntegerVector; method::Symbol=:regression) + out = Matrix{float(eltype(X))}(undef, length(lags), size(X,2)) + pacf!(out, float(X), lags; method=method) end -function pacf(x::AbstractVector{T}, lags::IntegerVector; method::Symbol=:regression) where T<:Real +function pacf(x::RealVector, lags::IntegerVector; method::Symbol=:regression) vec(pacf(reshape(x, length(x), 1), lags, method=method)) end diff --git a/test/signalcorr.jl b/test/signalcorr.jl index c8ee35dd..bce1c83a 100644 --- a/test/signalcorr.jl +++ b/test/signalcorr.jl @@ -22,6 +22,9 @@ x = [-2.133252557240862 -.7445937365828654; x1 = view(x, :, 1) x2 = view(x, :, 2) +realx = convert(AbstractMatrix{Real}, x) +realx1 = convert(AbstractVector{Real}, x1) +realx2 = convert(AbstractVector{Real}, x2) # autocov & autocorr @@ -40,7 +43,9 @@ racovx1 = [1.839214242630635709475, -0.088687020167434751916] @test autocov(x1) ≈ racovx1 +@test autocov(realx1) ≈ racovx1 @test autocov(x) ≈ 
[autocov(x1) autocov(x2)] +@test autocov(realx) ≈ [autocov(realx1) autocov(realx2)] racorx1 = [0.999999999999999888978, -0.221173011668873431557, @@ -54,7 +59,9 @@ racorx1 = [0.999999999999999888978, -0.048220059475281865091] @test autocor(x1) ≈ racorx1 +@test autocor(realx1) ≈ racorx1 @test autocor(x) ≈ [autocor(x1) autocor(x2)] +@test autocor(realx) ≈ [autocor(realx1) autocor(realx2)] # crosscov & crosscor @@ -76,10 +83,14 @@ c11 = crosscov(x1, x1) c12 = crosscov(x1, x2) c21 = crosscov(x2, x1) c22 = crosscov(x2, x2) +@test crosscov(realx1, realx2) ≈ c12 @test crosscov(x, x1) ≈ [c11 c21] +@test crosscov(realx, realx1) ≈ [c11 c21] @test crosscov(x1, x) ≈ [c11 c12] +@test crosscov(realx1, realx) ≈ [c11 c12] @test crosscov(x, x) ≈ cat([c11 c21], [c12 c22], dims=3) +@test crosscov(realx, realx) ≈ cat([c11 c21], [c12 c22], dims=3) rcor0 = [0.230940107675850, -0.230940107675850, @@ -98,10 +109,14 @@ c11 = crosscor(x1, x1) c12 = crosscor(x1, x2) c21 = crosscor(x2, x1) c22 = crosscor(x2, x2) +@test crosscor(realx1, realx2) ≈ c12 @test crosscor(x, x1) ≈ [c11 c21] +@test crosscor(realx, realx1) ≈ [c11 c21] @test crosscor(x1, x) ≈ [c11 c12] +@test crosscor(realx1, realx) ≈ [c11 c12] @test crosscor(x, x) ≈ cat([c11 c21], [c12 c22], dims=3) +@test crosscor(realx, realx) ≈ cat([c11 c21], [c12 c22], dims=3) ## pacf @@ -119,4 +134,3 @@ rpacfy = [-0.221173011668873, -0.175020669835420] @test pacf(x[:,1], 1:4, method=:yulewalker) ≈ rpacfy - From 4bdccf8938e2f393b28b59796ef76197530b295c Mon Sep 17 00:00:00 2001 From: mrrobot-2000 <60689620+mrrobot-2000@users.noreply.github.com> Date: Thu, 26 Mar 2020 17:01:26 +0530 Subject: [PATCH 043/105] Add random sampling algorithm D (#558) --- docs/src/sampling.md | 1 + perf/sampling.jl | 7 +++- src/sampling.jl | 88 +++++++++++++++++++++++++++++++++++++++++++- test/sampling.jl | 10 ++++- 4 files changed, 102 insertions(+), 4 deletions(-) diff --git a/docs/src/sampling.md b/docs/src/sampling.md index ffe3ffa8..4af68cbd 100644 --- 
a/docs/src/sampling.md +++ b/docs/src/sampling.md @@ -45,6 +45,7 @@ StatsBase.fisher_yates_sample! StatsBase.self_avoid_sample! StatsBase.seqsample_a! StatsBase.seqsample_c! +StatsBase.seqsample_d! ``` ### Weighted Sampling Algorithms diff --git a/perf/sampling.jl b/perf/sampling.jl index 494395b4..dc65ff7e 100644 --- a/perf/sampling.jl +++ b/perf/sampling.jl @@ -6,7 +6,7 @@ using StatsBase import StatsBase: direct_sample!, xmultinom_sample! import StatsBase: knuths_sample!, fisher_yates_sample!, self_avoid_sample! -import StatsBase: seqsample_a!, seqsample_c! +import StatsBase: seqsample_a!, seqsample_c!, seqsample_d! ### generic sampling benchmarking @@ -42,6 +42,9 @@ tsample!(s::Seq_A, a, x) = seqsample_a!(a, x) mutable struct Seq_C <: NoRep end tsample!(s::Seq_C, a, x) = seqsample_c!(a, x) +mutable struct Seq_D <: NoRep end +tsample!(s::Seq_D, a, x) = seqsample_d!(a, x) + mutable struct Sample_NoRep <: NoRep end tsample!(s::Sample_NoRep, a, x) = sample!(a, x; replace=false, ordered=false) @@ -87,6 +90,7 @@ const procs2 = Proc[ SampleProc{Knuths}(), SampleProc{Sample_NoRep}(), SampleProc{Seq_A}(), SampleProc{Seq_C}(), + SampleProc{Seq_D}(), SampleProc{Sample_NoRep_Ord}() ] const cfgs2 = (Int, Int)[] @@ -110,4 +114,3 @@ println("Sampling Without Replacement") println("===================================") show(rtable2; unit=:mps, cfghead="(n, k)") println() - diff --git a/src/sampling.jl b/src/sampling.jl index bf648ba0..8af72092 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -286,7 +286,93 @@ function seqsample_c!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) end seqsample_c!(a::AbstractArray, x::AbstractArray) = seqsample_c!(Random.GLOBAL_RNG, a, x) -## TODO: implement Algorithm D (page 716 - 717) +""" + seqsample_d!([rng], a::AbstractArray, x::AbstractArray) + +Random subsequence sampling using algorithm D described in the following paper (page 716-17): +Jeffrey Scott Vitter. "Faster Methods for Random Sampling". 
Communications of the ACM, +27 (7), July 1984. + +This algorithm consumes ``O(k)`` random numbers, with `k=length(x)`. +The outputs are ordered. +""" +function seqsample_d!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) + N = length(a) + n = length(x) + n <= N || error("length(x) should not exceed length(a)") + + i = 0 + j = 0 + + vprime = exp(-randexp(rng)/n) + q1 = N - n + 1 + q2 = q1 / N + alpha = 1 / 13 # choose alpha value + threshold = alpha * n + + while n > 1 && threshold < N + while true + local X + while true + X = N * (1 - vprime) + s = trunc(Int, X) + if s < q1 + break + end + vprime = exp(-randexp(rng)/n) + end + + y = rand(rng) / q2 + lhs = exp(log(y) / (n - 1)) + rhs = ((q1 - s) / q1) * (N / (N - X)) + + if lhs <= rhs + vprime = lhs / rhs + break + end + + if n - 1 > s + bottom = N - n + limit = N - s + else + bottom = N - s - 1 + limit = q1 + end + + top = N - 1 + + while top >= limit + y = y * top / bottom + bottom -= 1 + top -= 1 + end + + if log(y) < (n - 1)*(log(N) - log(N - X)) + vprime = exp(-randexp(rng) / (n-1)) + break + end + vprime = exp(-randexp(rng)/n) + end + + j += 1 + i += s+1 + @inbounds x[j] = a[i] + N = N - s - 1 + n -= 1 + q1 -= s + q2 = q1 / N + threshold -= alpha + end + + if n > 1 + seqsample_a!(rng, a[i+1:end], @view x[j+1:end]) + else + s = trunc(Int, N * vprime) + @inbounds x[j+=1] = a[i+=s+1] + end +end + +seqsample_d!(a::AbstractArray, x::AbstractArray) = seqsample_d!(Random.GLOBAL_RNG, a, x) ### Interface functions (poly-algorithms) diff --git a/test/sampling.jl b/test/sampling.jl index c3baa015..1f9c1d43 100644 --- a/test/sampling.jl +++ b/test/sampling.jl @@ -122,7 +122,7 @@ function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=fa end import StatsBase: knuths_sample!, fisher_yates_sample!, self_avoid_sample! -import StatsBase: seqsample_a!, seqsample_c! +import StatsBase: seqsample_a!, seqsample_c!, seqsample_d! 
a = zeros(Int, 5, n) for j = 1:size(a,2) @@ -164,6 +164,14 @@ check_sample_norep(a, (3, 12), 5.0e-3; ordered=true) test_rng_use(seqsample_c!, 1:10, zeros(Int, 6)) +a = zeros(Int, 5, n) +for j = 1:size(a,2) + seqsample_d!(3:12, view(a,:,j)) +end +check_sample_norep(a, (3, 12), 5.0e-3; ordered=true) + +test_rng_use(seqsample_d!, 1:10, zeros(Int, 6)) + a = sample(3:12, 5; replace=false) check_sample_norep(a, (3, 12), 0; ordered=false) From 0ea8e798c3d19609ed33b11311de5a2bd6ee9fd0 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Thu, 26 Mar 2020 11:46:16 -0700 Subject: [PATCH 044/105] Bump version to 0.33.0 (#571) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 17145e41..79d22069 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.32.1" +version = "0.33.0" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From 6d567f0f0f17957ef06ce654d2f824ded540e234 Mon Sep 17 00:00:00 2001 From: Rafael Fourquet Date: Wed, 20 May 2020 10:37:19 +0200 Subject: [PATCH 045/105] fix legacy usage of RangeGenerator (#576) --- src/sampling.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sampling.jl b/src/sampling.jl index 8af72092..1417596f 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -5,12 +5,12 @@ # ########################################################### -using Random: RangeGenerator, Random.GLOBAL_RNG +using Random: Sampler, Random.GLOBAL_RNG ### Algorithms for sampling with replacement function direct_sample!(rng::AbstractRNG, a::UnitRange, x::AbstractArray) - s = RangeGenerator(1:length(a)) + s = Sampler(rng, 1:length(a)) b = a[1] - 1 if b == 0 for i = 1:length(x) @@ -34,7 +34,7 @@ and set `x[j] = a[i]`, with `n=length(a)` and `k=length(x)`. This algorithm consumes `k` random numbers. 
""" function direct_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) - s = RangeGenerator(1:length(a)) + s = Sampler(rng, 1:length(a)) for i = 1:length(x) @inbounds x[i] = a[rand(rng, s)] end @@ -107,7 +107,7 @@ function knuths_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray; end # scan remaining - s = RangeGenerator(1:k) + s = Sampler(rng, 1:k) for i = k+1:n if rand(rng) * i < k # keep it with probability k / i @inbounds x[rand(rng, s)] = a[i] @@ -185,7 +185,7 @@ function self_avoid_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray s = Set{Int}() sizehint!(s, k) - rgen = RangeGenerator(1:n) + rgen = Sampler(rng, 1:n) # first one idx = rand(rng, rgen) @@ -618,7 +618,7 @@ function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, make_alias_table!(wv, sum(wv), ap, alias) # sampling - s = RangeGenerator(1:n) + s = Sampler(rng, 1:n) for i = 1:length(x) j = rand(rng, s) x[i] = rand(rng) < ap[j] ? a[j] : a[alias[j]] From 54cc6fe2e2f709623b9c75275884f9b295fdbd35 Mon Sep 17 00:00:00 2001 From: Fabian Zierler Date: Sat, 20 Jun 2020 17:46:06 +0200 Subject: [PATCH 046/105] Correct docstrings of autocor and autocov (#584) If `x` is a vector then `autocov` and `autocor` as well as their mutating versions return a vector of length `length(lags)` and not a vector of the same length as `x`. --- src/signalcorr.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/signalcorr.jl b/src/signalcorr.jl index 9e8ce60f..1adf9791 100644 --- a/src/signalcorr.jl +++ b/src/signalcorr.jl @@ -55,7 +55,7 @@ Compute the autocovariance of a vector or matrix `x` at `lags` and store the res in `r`. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the autocovariance. -If `x` is a vector, `r` must be a vector of the same length as `x`. +If `x` is a vector, `r` must be a vector of the same length as `lags`. 
If `x` is a matrix, `r` must be a matrix of size `(length(lags), size(x,2))`, and where each column in the result will correspond to a column in `x`. @@ -101,7 +101,7 @@ Compute the autocovariance of a vector or matrix `x`, optionally specifying the `lags` at which to compute the autocovariance. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the autocovariance. -If `x` is a vector, return a vector of the same length as `x`. +If `x` is a vector, return a vector of the same length as `lags`. If `x` is a matrix, return a matrix of size `(length(lags), size(x,2))`, where each column in the result corresponds to a column in `x`. @@ -132,7 +132,7 @@ Compute the autocorrelation function (ACF) of a vector or matrix `x` at `lags` and store the result in `r`. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the ACF. -If `x` is a vector, `r` must be a vector of the same length as `x`. +If `x` is a vector, `r` must be a vector of the same length as `lags`. If `x` is a matrix, `r` must be a matrix of size `(length(lags), size(x,2))`, and where each column in the result will correspond to a column in `x`. @@ -181,7 +181,7 @@ Compute the autocorrelation function (ACF) of a vector or matrix `x`, optionally specifying the `lags`. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the ACF. -If `x` is a vector, return a vector of the same length as `x`. +If `x` is a vector, return a vector of the same length as `lags`. If `x` is a matrix, return a matrix of size `(length(lags), size(x,2))`, where each column in the result corresponds to a column in `x`. 
From 3e60f4e4e533c5d3aea03184d2b4d15dff34467f Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 19 Aug 2020 13:48:09 +0200 Subject: [PATCH 047/105] Test on current release and fix test failures (#592) * Test on current release version * Add StableRNGs as a test dependency to ensure same test results across Julia versions. Increase n when testing weighted sampling with replacement to avoid test failure on Julia >= 1.5 * Use Travis for Windows testing for simplicity * Try testing on 32 bit as well * Remove AppVeyor badge --- .travis.yml | 13 +++++++++++-- Project.toml | 5 +++-- README.md | 1 - appveyor.yml | 37 ------------------------------------- test/sampling.jl | 10 +++++----- test/wsampling.jl | 2 +- 6 files changed, 20 insertions(+), 48 deletions(-) delete mode 100644 appveyor.yml diff --git a/.travis.yml b/.travis.yml index a93b114c..9f59d6f5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,22 +2,31 @@ language: julia os: - linux - osx + - windows + julia: - 1.0 - - 1.3 - - 1.4 + - 1 - nightly + +arch: + - x64 + - x86 + notifications: email: false # Work around a Travis bug + git: depth: 999999 # Uncomment the following lines to override the default test script #script: # - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi # - julia -e 'Pkg.clone(pwd()); Pkg.build("StatsBase"); Pkg.test("StatsBase"; coverage=true)' + after_success: - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder()); Codecov.submit(process_folder())'; + jobs: include: - stage: "Documentation" diff --git a/Project.toml b/Project.toml index 79d22069..0d67db43 100644 --- a/Project.toml +++ b/Project.toml @@ -15,16 +15,17 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] -julia = "1" DataAPI = "1" DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17" Missings = "0.3, 0.4" SortingAlgorithms = "0.3" +julia = "1" [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "DelimitedFiles", "Test"] +test = ["Dates", "DelimitedFiles", "StableRNGs", "Test"] diff --git a/README.md b/README.md index 7b0710ee..7d4c2ba3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ - **Build & Testing Status:** [![Build Status](https://travis-ci.org/JuliaStats/StatsBase.jl.svg?branch=master)](https://travis-ci.org/JuliaStats/StatsBase.jl) - [![Build status](https://ci.appveyor.com/api/projects/status/fsut3j3onulvws1w?svg=true)](https://ci.appveyor.com/project/nalimilan/statsbase-jl) [![Coverage Status](https://coveralls.io/repos/JuliaStats/StatsBase.jl/badge.svg?branch=master)](https://coveralls.io/r/JuliaStats/StatsBase.jl?branch=master) [![Coverage Status](http://codecov.io/github/JuliaStats/StatsBase.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaStats/StatsBase.jl?branch=master) diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index c9bc717e..00000000 --- a/appveyor.yml +++ /dev/null @@ -1,37 +0,0 @@ -environment: - matrix: - - julia_version: 1.0 - - julia_version: 1.2 - - julia_version: nightly - -platform: - - x86 # 32-bit - - x64 # 64-bit - -# Uncomment the following lines to allow failures on nightly julia -# (tests will run but not make your overall status red) -matrix: - allow_failures: - - julia_version: nightly - -branches: - only: - - master - - /release-.*/ - -notifications: - - provider: Email - on_build_success: false - on_build_failure: false - on_build_status_changed: false - -install: - - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1")) - -build_script: - - echo "%JL_BUILD_SCRIPT%" - - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%" - -test_script: - - echo "%JL_TEST_SCRIPT%" - - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%" diff --git 
a/test/sampling.jl b/test/sampling.jl index 1f9c1d43..91623302 100644 --- a/test/sampling.jl +++ b/test/sampling.jl @@ -1,5 +1,5 @@ using StatsBase -using Test, Random +using Test, Random, StableRNGs Random.seed!(1234) @@ -78,12 +78,12 @@ test_rng_use(sample, 1:10, 10) @testset "sampling pairs" begin - rng = Random.MersenneTwister(1) + rng = StableRNG(1) - @test samplepair(rng, 2) === (1, 2) - @test samplepair(rng, 10) === (8, 2) + @test samplepair(rng, 2) === (2, 1) + @test samplepair(rng, 10) === (5, 6) - @test samplepair(rng, [3, 4, 2, 6, 8]) === (2, 6) + @test samplepair(rng, [3, 4, 2, 6, 8]) === (3, 8) @test samplepair(rng, [1, 2]) === (1, 2) end diff --git a/test/wsampling.jl b/test/wsampling.jl index 37a3b9eb..fd9e6cec 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -32,7 +32,7 @@ end import StatsBase: direct_sample!, alias_sample! -n = 10^5 +n = 10^6 wv = weights([0.2, 0.8, 0.4, 0.6]) a = direct_sample!(4:7, wv, zeros(Int, n, 3)) From c1ca0ae96675e06e5baf93336885e1cd8e426962 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 19 Aug 2020 14:35:41 +0200 Subject: [PATCH 048/105] Allow DataStructures 0.18 (#593) Supersedes #591 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 0d67db43..00c3c3a2 100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] DataAPI = "1" -DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17" +DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17, 0.18" Missings = "0.3, 0.4" SortingAlgorithms = "0.3" julia = "1" From 512d6aee5f4e63f8c70789491310a6d2d1efa468 Mon Sep 17 00:00:00 2001 From: Dhairya Gandhi Date: Fri, 28 Aug 2020 03:55:08 +0530 Subject: [PATCH 049/105] bump patch (#596) Co-authored-by: Dhairya Gandhi --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 00c3c3a2..bceb331e 100644 --- a/Project.toml +++ b/Project.toml 
@@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.0" +version = "0.33.1" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From d1017b067a62735e193062195bcc6b402203bde3 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 28 Aug 2020 19:24:07 +0200 Subject: [PATCH 050/105] Fix wrong argument name in informationmatrix (#579) Also specify "of the model" for consistency with other functions Rename obj to model everywhere. --- src/statmodels.jl | 215 +++++++++++++++++++++++----------------------- 1 file changed, 109 insertions(+), 106 deletions(-) diff --git a/src/statmodels.jl b/src/statmodels.jl index 4836a1eb..3536e3c9 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -3,211 +3,214 @@ abstract type StatisticalModel end """ - coef(obj::StatisticalModel) + coef(model::StatisticalModel) Return the coefficients of the model. """ -coef(obj::StatisticalModel) = error("coef is not defined for $(typeof(obj)).") +coef(model::StatisticalModel) = error("coef is not defined for $(typeof(model)).") """ - coefnames(obj::StatisticalModel) + coefnames(model::StatisticalModel) Return the names of the coefficients. """ -coefnames(obj::StatisticalModel) = error("coefnames is not defined for $(typeof(obj)).") +coefnames(model::StatisticalModel) = error("coefnames is not defined for $(typeof(model)).") """ - coeftable(obj::StatisticalModel; level::Real=0.95) + coeftable(model::StatisticalModel; level::Real=0.95) Return a table of class `CoefTable` with coefficients and related statistics. `level` determines the level for confidence intervals (by default, 95%). 
""" -coeftable(obj::StatisticalModel) = error("coeftable is not defined for $(typeof(obj)).") +coeftable(model::StatisticalModel) = error("coeftable is not defined for $(typeof(model)).") """ - confint(obj::StatisticalModel; level::Real=0.95) + confint(model::StatisticalModel; level::Real=0.95) Compute confidence intervals for coefficients, with confidence level `level` (by default 95%). """ -confint(obj::StatisticalModel) = error("confint is not defined for $(typeof(obj)).") +confint(model::StatisticalModel) = error("confint is not defined for $(typeof(model)).") """ - deviance(obj::StatisticalModel) + deviance(model::StatisticalModel) Return the deviance of the model relative to a reference, which is usually when applicable the saturated model. It is equal, *up to a constant*, to ``-2 \\log L``, with ``L`` the likelihood of the model. """ -deviance(obj::StatisticalModel) = error("deviance is not defined for $(typeof(obj)).") +deviance(model::StatisticalModel) = error("deviance is not defined for $(typeof(model)).") """ - islinear(obj::StatisticalModel) + islinear(model::StatisticalModel) Indicate whether the model is linear. """ -islinear(obj::StatisticalModel) = error("islinear is not defined for $(typeof(obj)).") +islinear(model::StatisticalModel) = error("islinear is not defined for $(typeof(model)).") """ - nulldeviance(obj::StatisticalModel) + nulldeviance(model::StatisticalModel) Return the deviance of the null model, that is the one including only the intercept. """ -nulldeviance(obj::StatisticalModel) = error("nulldeviance is not defined for $(typeof(obj)).") +nulldeviance(model::StatisticalModel) = + error("nulldeviance is not defined for $(typeof(model)).") """ - loglikelihood(obj::StatisticalModel) + loglikelihood(model::StatisticalModel) Return the log-likelihood of the model. 
""" -loglikelihood(obj::StatisticalModel) = error("loglikelihood is not defined for $(typeof(obj)).") +loglikelihood(model::StatisticalModel) = + error("loglikelihood is not defined for $(typeof(model)).") """ - loglikelihood(obj::StatisticalModel) + nullloglikelihood(model::StatisticalModel) -Return the log-likelihood of the null model corresponding to model `obj`. +Return the log-likelihood of the null model corresponding to `model`. This is usually the model containing only the intercept. """ -nullloglikelihood(obj::StatisticalModel) = error("nullloglikelihood is not defined for $(typeof(obj)).") +nullloglikelihood(model::StatisticalModel) = + error("nullloglikelihood is not defined for $(typeof(model)).") """ - score(obj::StatisticalModel) + score(model::StatisticalModel) -Return the score of the statistical model. The score is the gradient of the +Return the score of the model, that is the gradient of the log-likelihood with respect to the coefficients. """ -score(obj::StatisticalModel) = error("score is not defined for $(typeof(obj)).") +score(model::StatisticalModel) = error("score is not defined for $(typeof(model)).") """ - nobs(obj::StatisticalModel) + nobs(model::StatisticalModel) Return the number of independent observations on which the model was fitted. Be careful when using this information, as the definition of an independent observation may vary depending on the model, on the format used to pass the data, on the sampling plan (if specified), etc. """ -nobs(obj::StatisticalModel) = error("nobs is not defined for $(typeof(obj)).") +nobs(model::StatisticalModel) = error("nobs is not defined for $(typeof(model)).") """ - dof(obj::StatisticalModel) + dof(model::StatisticalModel) Return the number of degrees of freedom consumed in the model, including when applicable the intercept and the distribution's dispersion parameter.
""" -dof(obj::StatisticalModel) = error("dof is not defined for $(typeof(obj)).") +dof(model::StatisticalModel) = error("dof is not defined for $(typeof(model)).") """ - mss(obj::StatisticalModel) + mss(model::StatisticalModel) Return the model sum of squares. """ -mss(obj::StatisticalModel) = error("mss is not defined for $(typeof(obj)).") +mss(model::StatisticalModel) = error("mss is not defined for $(typeof(model)).") """ - rss(obj::StatisticalModel) + rss(model::StatisticalModel) -Return the residual sum of squares. +Return the residual sum of squares of the model. """ -rss(obj::StatisticalModel) = error("rss is not defined for $(typeof(obj)).") +rss(model::StatisticalModel) = error("rss is not defined for $(typeof(model)).") """ informationmatrix(model::StatisticalModel; expected::Bool = true) -Return the information matrix. By default the Fisher information matrix is returned, -while the observed information matrix can be requested with `expected = false`. +Return the information matrix of the model. By default the Fisher information matrix +is returned, while the observed information matrix can be requested with `expected = false`. """ informationmatrix(model::StatisticalModel; expected::Bool = true) = - error("informationmatrix is not defined for $(typeof(obj)).") + error("informationmatrix is not defined for $(typeof(model)).") """ - stderror(obj::StatisticalModel) + stderror(model::StatisticalModel) Return the standard errors for the coefficients of the model. """ -stderror(obj::StatisticalModel) = sqrt.(diag(vcov(obj))) +stderror(model::StatisticalModel) = sqrt.(diag(vcov(model))) """ - vcov(obj::StatisticalModel) + vcov(model::StatisticalModel) Return the variance-covariance matrix for the coefficients of the model. 
""" -vcov(obj::StatisticalModel) = error("vcov is not defined for $(typeof(obj)).") +vcov(model::StatisticalModel) = error("vcov is not defined for $(typeof(model)).") """ - weights(obj::StatisticalModel) + weights(model::StatisticalModel) Return the weights used in the model. """ -weights(obj::StatisticalModel) = error("weights is not defined for $(typeof(obj)).") +weights(model::StatisticalModel) = error("weights is not defined for $(typeof(model)).") """ - isfitted(obj::StatisticalModel) + isfitted(model::StatisticalModel) Indicate whether the model has been fitted. """ -isfitted(obj::StatisticalModel) = error("isfitted is not defined for $(typeof(obj)).") +isfitted(model::StatisticalModel) = error("isfitted is not defined for $(typeof(model)).") """ Fit a statistical model. """ -fit(obj::StatisticalModel, args...) = error("fit is not defined for $(typeof(obj)).") +fit(model::StatisticalModel, args...) = error("fit is not defined for $(typeof(model)).") """ Fit a statistical model in-place. """ -fit!(obj::StatisticalModel, args...) = error("fit! is not defined for $(typeof(obj)).") +fit!(model::StatisticalModel, args...) = error("fit! is not defined for $(typeof(model)).") """ - aic(obj::StatisticalModel) + aic(model::StatisticalModel) Akaike's Information Criterion, defined as ``-2 \\log L + 2k``, with ``L`` the likelihood of the model, and `k` its number of consumed degrees of freedom (as returned by [`dof`](@ref)). """ -aic(obj::StatisticalModel) = -2loglikelihood(obj) + 2dof(obj) +aic(model::StatisticalModel) = -2loglikelihood(model) + 2dof(model) """ - aicc(obj::StatisticalModel) + aicc(model::StatisticalModel) Corrected Akaike's Information Criterion for small sample sizes (Hurvich and Tsai 1989), defined as ``-2 \\log L + 2k + 2k(k-1)/(n-k-1)``, with ``L`` the likelihood of the model, ``k`` its number of consumed degrees of freedom (as returned by [`dof`](@ref)), and ``n`` the number of observations (as returned by [`nobs`](@ref)). 
""" -function aicc(obj::StatisticalModel) - k = dof(obj) - n = nobs(obj) - -2loglikelihood(obj) + 2k + 2k*(k+1)/(n-k-1) +function aicc(model::StatisticalModel) + k = dof(model) + n = nobs(model) + -2loglikelihood(model) + 2k + 2k*(k+1)/(n-k-1) end """ - bic(obj::StatisticalModel) + bic(model::StatisticalModel) Bayesian Information Criterion, defined as ``-2 \\log L + k \\log n``, with ``L`` the likelihood of the model, ``k`` its number of consumed degrees of freedom (as returned by [`dof`](@ref)), and ``n`` the number of observations (as returned by [`nobs`](@ref)). """ -bic(obj::StatisticalModel) = -2loglikelihood(obj) + dof(obj)*log(nobs(obj)) +bic(model::StatisticalModel) = -2loglikelihood(model) + dof(model)*log(nobs(model)) """ - r2(obj::StatisticalModel) - r²(obj::StatisticalModel) + r2(model::StatisticalModel) + r²(model::StatisticalModel) Coefficient of determination (R-squared). For a linear model, the R² is defined as ``ESS/TSS``, with ``ESS`` the explained sum of squares and ``TSS`` the total sum of squares. """ -function r2(obj::StatisticalModel) +function r2(model::StatisticalModel) Base.depwarn("The default r² method for linear models is deprecated. " * "Packages should define their own methods.", :r2) - mss(obj) / deviance(obj) + mss(model) / deviance(model) end """ - r2(obj::StatisticalModel, variant::Symbol) - r²(obj::StatisticalModel, variant::Symbol) + r2(model::StatisticalModel, variant::Symbol) + r²(model::StatisticalModel, variant::Symbol) Pseudo-coefficient of determination (pseudo R-squared). @@ -227,21 +230,21 @@ In the above formulas, ``L`` is the likelihood of the model, The Cox-Snell and the deviance ratio variants both match the classical definition of R² for linear models. 
""" -function r2(obj::StatisticalModel, variant::Symbol) +function r2(model::StatisticalModel, variant::Symbol) loglikbased = (:McFadden, :CoxSnell, :Nagelkerke) if variant in loglikbased - ll = loglikelihood(obj) - ll0 = nullloglikelihood(obj) + ll = loglikelihood(model) + ll0 = nullloglikelihood(model) if variant == :McFadden 1 - ll/ll0 elseif variant == :CoxSnell - 1 - exp(2 * (ll0 - ll) / nobs(obj)) + 1 - exp(2 * (ll0 - ll) / nobs(model)) elseif variant == :Nagelkerke - (1 - exp(2 * (ll0 - ll) / nobs(obj))) / (1 - exp(2 * ll0 / nobs(obj))) + (1 - exp(2 * (ll0 - ll) / nobs(model))) / (1 - exp(2 * ll0 / nobs(model))) end elseif variant == :devianceratio - dev = deviance(obj) - dev0 = nulldeviance(obj) + dev = deviance(model) + dev0 = nulldeviance(model) 1 - dev/dev0 else error("variant must be one of $(join(loglikbased, ", ")) or :devianceratio") @@ -251,8 +254,8 @@ end const r² = r2 """ - adjr2(obj::StatisticalModel) - adjr²(obj::StatisticalModel) + adjr2(model::StatisticalModel) + adjr²(model::StatisticalModel) Adjusted coefficient of determination (adjusted R-squared). @@ -260,11 +263,11 @@ For linear models, the adjusted R² is defined as ``1 - (1 - (1-R^2)(n-1)/(n-p)) the coefficient of determination, ``n`` the number of observations, and ``p`` the number of coefficients (including the intercept). This definition is generally known as the Wherry Formula I. """ -adjr2(obj::StatisticalModel) = error("adjr2 is not defined for $(typeof(obj)).") +adjr2(model::StatisticalModel) = error("adjr2 is not defined for $(typeof(model)).") """ - adjr2(obj::StatisticalModel, variant::Symbol) - adjr²(obj::StatisticalModel, variant::Symbol) + adjr2(model::StatisticalModel, variant::Symbol) + adjr²(model::StatisticalModel, variant::Symbol) Adjusted pseudo-coefficient of determination (adjusted pseudo R-squared). 
@@ -276,16 +279,16 @@ In these formulas, ``L`` is the likelihood of the model, ``L0`` that of the null ``D_0`` is the deviance of the null model, ``n`` is the number of observations (given by [`nobs`](@ref)) and ``k`` is the number of consumed degrees of freedom of the model (as returned by [`dof`](@ref)). """ -function adjr2(obj::StatisticalModel, variant::Symbol) - k = dof(obj) +function adjr2(model::StatisticalModel, variant::Symbol) + k = dof(model) if variant == :McFadden - ll = loglikelihood(obj) - ll0 = nullloglikelihood(obj) + ll = loglikelihood(model) + ll0 = nullloglikelihood(model) 1 - (ll - k)/ll0 elseif variant == :devianceratio - n = nobs(obj) - dev = deviance(obj) - dev0 = nulldeviance(obj) + n = nobs(model) + dev = deviance(model) + dev0 = nulldeviance(model) 1 - (dev*(n-1))/(dev0*(n-k)) else error("variant must be one of :McFadden or :devianceratio") @@ -297,72 +300,72 @@ const adjr² = adjr2 abstract type RegressionModel <: StatisticalModel end """ - fitted(obj::RegressionModel) + fitted(model::RegressionModel) Return the fitted values of the model. """ -fitted(obj::RegressionModel) = error("fitted is not defined for $(typeof(obj)).") +fitted(model::RegressionModel) = error("fitted is not defined for $(typeof(model)).") """ - response(obj::RegressionModel) + response(model::RegressionModel) Return the model response (a.k.a. the dependent variable). """ -response(obj::RegressionModel) = error("response is not defined for $(typeof(obj)).") +response(model::RegressionModel) = error("response is not defined for $(typeof(model)).") """ - responsename(obj::RegressionModel) + responsename(model::RegressionModel) Return the name of the model response (a.k.a. the dependent variable). 
""" -responsename(obj::RegressionModel) = error("responsename is not defined for $(typeof(obj)).") +responsename(model::RegressionModel) = error("responsename is not defined for $(typeof(model)).") """ - meanresponse(obj::RegressionModel) + meanresponse(model::RegressionModel) Return the mean of the response. """ -meanresponse(obj::RegressionModel) = error("meanresponse is not defined for $(typeof(obj)).") +meanresponse(model::RegressionModel) = error("meanresponse is not defined for $(typeof(model)).") """ - modelmatrix(obj::RegressionModel) + modelmatrix(model::RegressionModel) Return the model matrix (a.k.a. the design matrix). """ -modelmatrix(obj::RegressionModel) = error("modelmatrix is not defined for $(typeof(obj)).") +modelmatrix(model::RegressionModel) = error("modelmatrix is not defined for $(typeof(model)).") """ - crossmodelmatrix(obj::RegressionModel) + crossmodelmatrix(model::RegressionModel) -Return `X'X` where `X` is the model matrix of `obj`. -This function will return a pre-computed matrix stored in `obj` if possible. +Return `X'X` where `X` is the model matrix of `model`. +This function will return a pre-computed matrix stored in `model` if possible. """ -crossmodelmatrix(obj::RegressionModel) = (x = modelmatrix(obj); Symmetric(x' * x)) +crossmodelmatrix(model::RegressionModel) = (x = modelmatrix(model); Symmetric(x' * x)) """ - leverage(obj::RegressionModel) + leverage(model::RegressionModel) -Return the diagonal of the projection matrix. +Return the diagonal of the projection matrix of the model. """ -leverage(obj::RegressionModel) = error("leverage is not defined for $(typeof(obj)).") +leverage(model::RegressionModel) = error("leverage is not defined for $(typeof(model)).") """ - residuals(obj::RegressionModel) + residuals(model::RegressionModel) Return the residuals of the model. 
""" -residuals(obj::RegressionModel) = error("residuals is not defined for $(typeof(obj)).") +residuals(model::RegressionModel) = error("residuals is not defined for $(typeof(model)).") """ - predict(obj::RegressionModel, [newX]) + predict(model::RegressionModel, [newX]) -Form the predicted response of model `obj`. An object with new covariate values `newX` can be supplied, -which should have the same type and structure as that used to fit `obj`; e.g. for a GLM +Form the predicted response of `model`. An object with new covariate values `newX` can be supplied, +which should have the same type and structure as that used to fit `model`; e.g. for a GLM it would generally be a `DataFrame` with the same variable names as the original predictors. """ function predict end -predict(obj::RegressionModel) = error("predict is not defined for $(typeof(obj)).") +predict(model::RegressionModel) = error("predict is not defined for $(typeof(model)).") """ predict! @@ -371,21 +374,21 @@ In-place version of [`predict`](@ref). """ function predict! end -predict!(obj::RegressionModel) = error("predict! is not defined for $(typeof(obj)).") +predict!(model::RegressionModel) = error("predict! is not defined for $(typeof(model)).") """ - dof_residual(obj::RegressionModel) + dof_residual(model::RegressionModel) Return the residual degrees of freedom of the model. """ -dof_residual(obj::RegressionModel) = error("dof_residual is not defined for $(typeof(obj)).") +dof_residual(model::RegressionModel) = error("dof_residual is not defined for $(typeof(model)).") """ - params(obj) + params(model) Return all parameters of a model. """ -params(obj) = error("params is not defined for $(typeof(obj))") +params(model) = error("params is not defined for $(typeof(model))") function params! 
end ## coefficient tables with specialized show method From ec33334161a8702bbc757b65da986e839ffd2cee Mon Sep 17 00:00:00 2001 From: Art Date: Sat, 29 Aug 2020 06:49:32 -0400 Subject: [PATCH 051/105] Fix transformation `fit` call with default variables (#595) --- src/transformations.jl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformations.jl b/src/transformations.jl index e4fdb9ac..aec33ae6 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -108,6 +108,10 @@ julia> StatsBase.transform(dt, X) """ function fit(::Type{ZScoreTransform}, X::AbstractMatrix{<:Real}; dims::Union{Integer,Nothing}=nothing, center::Bool=true, scale::Bool=true) + if dims === nothing + Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) + dims = 2 + end if dims == 1 n, l = size(X) n >= 2 || error("X must contain at least two rows.") @@ -116,9 +120,6 @@ function fit(::Type{ZScoreTransform}, X::AbstractMatrix{<:Real}; l, n = size(X) n >= 2 || error("X must contain at least two columns.") m, s = mean_and_std(X, 2) - elseif dims === nothing - Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) - m, s = mean_and_std(X, 2) else throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) end @@ -128,10 +129,8 @@ function fit(::Type{ZScoreTransform}, X::AbstractMatrix{<:Real}; end function fit(::Type{ZScoreTransform}, X::AbstractVector{<:Real}; - dims::Union{Integer,Nothing}=nothing, center::Bool=true, scale::Bool=true) - if dims == nothing - Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) - elseif dims != 1 + dims::Integer=1, center::Bool=true, scale::Bool=true) + if dims != 1 throw(DomainError(dims, "fit only accepts dims=1 over a vector. 
Try fit(t, x, dims=1).")) end @@ -267,13 +266,14 @@ julia> StatsBase.transform(dt, X) """ function fit(::Type{UnitRangeTransform}, X::AbstractMatrix{<:Real}; dims::Union{Integer,Nothing}=nothing, unit::Bool=true) + if dims === nothing + Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) + dims = 2 + end if dims == 1 l, tmin, tmax = _compute_extrema(X) elseif dims == 2 l, tmin, tmax = _compute_extrema(X') - elseif dims == nothing - Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) - l, tmin, tmax = _compute_extrema(X') else throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) end @@ -301,7 +301,7 @@ function _compute_extrema(X::AbstractMatrix{<:Real}) end function fit(::Type{UnitRangeTransform}, X::AbstractVector{<:Real}; - dims::Union{Integer,Nothing}=nothing, unit::Bool=true) + dims::Integer=1, unit::Bool=true) if dims != 1 throw(DomainError(dims, "fit only accept dims=1 over a vector. Try fit(t, x, dims=1).")) end From fbf9e6660fcd4c067ce24c4955be38f6392bf2af Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Tue, 29 Sep 2020 05:26:01 -0700 Subject: [PATCH 052/105] Fix UnitWeights slow iteration (#602) Fixes https://github.com/JuliaStats/StatsBase.jl/issues/600 --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index e5df6b73..bc8f732b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -275,7 +275,7 @@ All weight elements are identically one. 
sum(wv::UnitWeights{T}) where T = convert(T, length(wv)) isempty(wv::UnitWeights) = iszero(wv.len) length(wv::UnitWeights) = wv.len -size(wv::UnitWeights) = Tuple(length(wv)) +size(wv::UnitWeights) = tuple(length(wv)) Base.convert(::Type{Vector}, wv::UnitWeights{T}) where {T} = ones(T, length(wv)) From 983bf021be7521c399dfe4d25d174667344bc8fb Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Tue, 29 Sep 2020 06:54:33 -0700 Subject: [PATCH 053/105] Fix indexing UnitWeights with a Boolean vector (#603) --- src/weights.jl | 5 +++++ test/weights.jl | 1 + 2 files changed, 6 insertions(+) diff --git a/src/weights.jl b/src/weights.jl index bc8f732b..86f804a9 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -289,6 +289,11 @@ end UnitWeights{T}(length(i)) end +function Base.getindex(wv::UnitWeights{T}, i::AbstractArray{Bool}) where T + length(wv) == length(i) || throw(DimensionMismatch()) + UnitWeights{T}(count(i)) +end + Base.getindex(wv::UnitWeights{T}, ::Colon) where {T} = UnitWeights{T}(wv.len) """ diff --git a/test/weights.jl b/test/weights.jl index 9f071483..7735e04f 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -112,6 +112,7 @@ end @test isequal(wv, uweights(3)) @test wv != fweights(fill(1.0, 3)) @test wv == uweights(3) + @test wv[[true, false, false]] == uweights(Float64, 1) end ## wsum From 0ffe2e20998d98eb514fa0b08090cf05a18fa7b5 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 4 Oct 2020 19:49:18 +0200 Subject: [PATCH 054/105] Fix tests on Julia 1.6 (#606) `@printf(stdout,"<1e%2.2d", -4)` now correctly gives `"<1e-04"`, which allows the values to always be aligned (the column always has a 6-character width).
--- test/statmodels.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/statmodels.jl b/test/statmodels.jl index 80acafe1..5d0e93de 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -30,7 +30,11 @@ m = rand(3,4) @test sprint(show, StatsBase.PValue(1.0)) == "1.0000" @test sprint(show, StatsBase.PValue(1e-1)) == "0.1000" -@test sprint(show, StatsBase.PValue(1e-5)) == "<1e-4" +if VERSION > v"1.6.0-DEV" + @test sprint(show, StatsBase.PValue(1e-5)) == "<1e-04" +else + @test sprint(show, StatsBase.PValue(1e-5)) == "<1e-4" +end @test sprint(show, StatsBase.PValue(NaN)) == "NaN" @test_throws ErrorException StatsBase.PValue(-0.1) @test_throws ErrorException StatsBase.PValue(1.1) From 591d0455b15e566a6c0d85d70f1c4fa5134bce7b Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Tue, 6 Oct 2020 13:17:03 -0400 Subject: [PATCH 055/105] Make countmap support iterators (#605) --- src/counts.jl | 34 +++++++++++++++++++++++++++------- test/counts.jl | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 2b017e4b..3278fce9 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -255,7 +255,9 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. 
""" -function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T +addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) + +function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM if radixsort_safe(T) && (alg == :auto || alg == :radixsort) @@ -269,7 +271,7 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T end """Dict-based addcounts method""" -function addcounts_dict!(cm::Dict{T}, x::AbstractArray{T}) where T +function addcounts_dict!(cm::Dict{T}, x) where T for v in x index = ht_keyindex2!(cm, v) if index > 0 @@ -286,14 +288,27 @@ end # faster results and less memory usage. However we still wish to enable others # to write generic algorithms, therefore the methods below still accept the # `alg` argument but it is ignored. -function addcounts!(cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) sumx = sum(x) cm[true] = get(cm, true, 0) + sumx cm[false] = get(cm, false, 0) + length(x) - sumx cm end -function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} +# specialized for `Bool` iterator +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) + sumx = 0 + len = 0 + for i in x + sumx += i + len += 1 + end + cm[true] = get(cm, true, 0) + sumx + cm[false] = get(cm, false, 0) + len - sumx + cm +end + +function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} counts = zeros(Int, 2^(8sizeof(T))) @inbounds for xi in x @@ -318,8 +333,7 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128, Float32, Float64} "Can the type be safely sorted by radixsort" -radixsort_safe(::Type{T}) where {T<:BaseRadixSortSafeTypes} = true -radixsort_safe(::Type) = false 
+radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T last_sx = sx[1] @@ -353,6 +367,12 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T return _addcounts_radix_sort_loop!(cm, sx) end +# fall-back for `x` an iterator +function addcounts_radixsort!(cm::Dict{T}, x) where T + sx = sort!(collect(x), alg = RadixSort) + return _addcounts_radix_sort_loop!(cm, sx) +end + function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} n = length(x) length(wv) == n || throw(DimensionMismatch()) @@ -386,7 +406,7 @@ of occurrences. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) +countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) diff --git a/test/counts.jl b/test/counts.jl index 2fd50832..9f684df8 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -80,6 +80,14 @@ cm = countmap(x) @test cm["a"] == 3 @test cm["b"] == 2 @test cm["c"] == 1 + +# iterator, non-radixsort +cm_missing = countmap(skipmissing(x)) +cm_any_itr = countmap((i for i in x)) +@test cm_missing == cm_any_itr == cm +@test cm_missing isa Dict{String, Int} +@test cm_any_itr isa Dict{Any, Int} + pm = proportionmap(x) @test pm["a"] ≈ (1/2) @test pm["b"] ≈ (1/3) @@ -91,6 +99,15 @@ xx = repeat([6, 1, 3, 1], outer=100_000) cm = countmap(xx) @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +# with iterator +cm_missing = countmap(skipmissing(xx)) +@test cm_missing isa Dict{Int, Int} +@test cm_missing == cm + +cm_any_itr = countmap((i for i in xx)) +@test cm_any_itr isa Dict{Any,Int} # no knowledge about type +@test cm_missing == cm + # testing the radixsort-based addcounts xx = 
repeat([6, 1, 3, 1], outer=100_000) cm = Dict{Int, Int}() @@ -99,11 +116,20 @@ StatsBase.addcounts_radixsort!(cm,xx) xx2 = repeat([7, 1, 3, 1], outer=100_000) StatsBase.addcounts_radixsort!(cm,xx2) @test cm == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) +# with iterator +cm_missing = Dict{Int, Int}() +StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx)) +@test cm_missing == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx2)) +@test cm_missing == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) # testing the Dict-based addcounts cm = Dict{Int, Int}() +cm_itr = Dict{Int, Int}() StatsBase.addcounts_dict!(cm,xx) -@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) +@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +@test cm_itr isa Dict{Int, Int} cm = countmap(x, weights(w)) @test cm["a"] == 5.5 @@ -119,11 +145,16 @@ pm = proportionmap(x, weights(w)) # testing small bits type bx = [true, false, true, true, false] -@test countmap(bx) == Dict(true => 3, false => 2) +cm_bx_missing = countmap(skipmissing(bx)) +@test cm_bx_missing == countmap(bx) == Dict(true => 3, false => 2) +@test cm_bx_missing isa Dict{Bool, Int} for T in [UInt8, UInt16, Int8, Int16] tx = T[typemin(T), 8, typemax(T), 19, 8] - @test countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) + cm_tx_missing = countmap(tx_missing) + @test cm_tx_missing == countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + @test cm_tx_missing isa Dict{T, Int} end @testset "views" begin From 08d4b77a4b42ef8cadb67b98da65e7fbd9959e0b Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 7 Oct 2020 15:52:20 +0200 Subject: [PATCH 056/105] Release 0.33.2 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml 
index bceb331e..c16ed2a4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.1" +version = "0.33.2" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From c5328b186f721f9e320928356fb69cf579035240 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 7 Oct 2020 21:43:21 +0200 Subject: [PATCH 057/105] Update Travis links in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7d4c2ba3..3c5a2225 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ *StatsBase.jl* is a Julia package that provides basic support for statistics. Particularly, it implements a variety of statistics-related functions, such as scalar statistics, high-order moment computation, counting, ranking, covariances, sampling, and empirical density estimation. - **Build & Testing Status:** - [![Build Status](https://travis-ci.org/JuliaStats/StatsBase.jl.svg?branch=master)](https://travis-ci.org/JuliaStats/StatsBase.jl) + [![Build Status](https://travis-ci.com/JuliaStats/StatsBase.jl.svg?branch=master)](https://travis-ci.com/JuliaStats/StatsBase.jl) [![Coverage Status](https://coveralls.io/repos/JuliaStats/StatsBase.jl/badge.svg?branch=master)](https://coveralls.io/r/JuliaStats/StatsBase.jl?branch=master) [![Coverage Status](http://codecov.io/github/JuliaStats/StatsBase.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaStats/StatsBase.jl?branch=master) From 1c3f3d7ae4fd8afae126443d15ddaf8029316618 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Fri, 23 Oct 2020 12:55:20 -0700 Subject: [PATCH 058/105] Add a no-op PValue(::PValue) constructor (#612) `show(::CoefTable)` calls `PValue` on all values in the p-value column (if present). If things are constructed manually and you happen to already have your p-values as `PValue`s, printing the table will fail. 
It seems sort of silly to have this kind of definition, but it's nice to not get errors when trying to look at your `CoefTable` in the REPL. --- src/statmodels.jl | 1 + test/statmodels.jl | 16 +++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/statmodels.jl b/src/statmodels.jl index 3536e3c9..08ce2473 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -431,6 +431,7 @@ struct PValue new(v) end end +PValue(p::PValue) = p function show(io::IO, pv::PValue) v = pv.v diff --git a/test/statmodels.jl b/test/statmodels.jl index 5d0e93de..632d861a 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -1,4 +1,5 @@ using StatsBase +using StatsBase: PValue using Test, Random v1 = [1.45666, -23.14, 1.56734e-13] @@ -28,16 +29,17 @@ m = rand(3,4) [3] 0.344454 0.179574 0.242208 0.4531 ──────────────────────────────────────────""" -@test sprint(show, StatsBase.PValue(1.0)) == "1.0000" -@test sprint(show, StatsBase.PValue(1e-1)) == "0.1000" +@test sprint(show, PValue(1.0)) == "1.0000" +@test sprint(show, PValue(1e-1)) == "0.1000" if VERSION > v"1.6.0-DEV" - @test sprint(show, StatsBase.PValue(1e-5)) == "<1e-04" + @test sprint(show, PValue(1e-5)) == "<1e-04" else - @test sprint(show, StatsBase.PValue(1e-5)) == "<1e-4" + @test sprint(show, PValue(1e-5)) == "<1e-4" end -@test sprint(show, StatsBase.PValue(NaN)) == "NaN" -@test_throws ErrorException StatsBase.PValue(-0.1) -@test_throws ErrorException StatsBase.PValue(1.1) +@test sprint(show, PValue(NaN)) == "NaN" +@test_throws ErrorException PValue(-0.1) +@test_throws ErrorException PValue(1.1) +@test PValue(PValue(0.05)) === PValue(0.05) @test sprint(showerror, ConvergenceException(10)) == "failure to converge after 10 iterations." 
From 0ba8a95576cfc7c070fc608c16f469a2d3404bc0 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 4 Dec 2020 10:59:31 +0100 Subject: [PATCH 059/105] Move to GitHub Actions for CI (#621) --- .github/workflows/ci.yml | 66 ++++++++++++++++++++++++++++++++++++++++ .travis.yml | 39 ------------------------ README.md | 2 +- 3 files changed, 67 insertions(+), 40 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..e2bee76a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,66 @@ +name: CI +on: + push: + branches: [master] + tags: ["*"] + pull_request: +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.0' + - '1' # automatically expands to the latest stable 1.x release of Julia + - 'nightly' + os: + - ubuntu-latest + - macOS-latest + - windows-latest + arch: + - x64 + - x86 + exclude: + - os: macOS-latest + arch: x86 + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: actions/cache@v1 + env: + cache-name: cache-artifacts + with: + path: ~/.julia/artifacts + key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} + restore-keys: | + ${{ runner.os }}-test-${{ env.cache-name }}- + ${{ runner.os }}-test- + ${{ runner.os }}- + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v1 + with: + file: lcov.info + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@v1 + with: + version: '1' + - run: | + julia --project=docs -e ' + using Pkg + 
Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate()' + - run: julia --project=docs docs/make.jl + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9f59d6f5..00000000 --- a/.travis.yml +++ /dev/null @@ -1,39 +0,0 @@ -language: julia -os: - - linux - - osx - - windows - -julia: - - 1.0 - - 1 - - nightly - -arch: - - x64 - - x86 - -notifications: - email: false -# Work around a Travis bug - -git: - depth: 999999 -# Uncomment the following lines to override the default test script -#script: -# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi -# - julia -e 'Pkg.clone(pwd()); Pkg.build("StatsBase"); Pkg.test("StatsBase"; coverage=true)' - -after_success: - - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder()); Codecov.submit(process_folder())'; - -jobs: - include: - - stage: "Documentation" - julia: 1.3 - os: linux - script: - - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); - Pkg.instantiate()' - - julia --project=docs/ docs/make.jl - after_success: skip diff --git a/README.md b/README.md index 3c5a2225..ad16a64c 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ *StatsBase.jl* is a Julia package that provides basic support for statistics. Particularly, it implements a variety of statistics-related functions, such as scalar statistics, high-order moment computation, counting, ranking, covariances, sampling, and empirical density estimation. 
- **Build & Testing Status:** - [![Build Status](https://travis-ci.com/JuliaStats/StatsBase.jl.svg?branch=master)](https://travis-ci.com/JuliaStats/StatsBase.jl) + [![Build status](https://github.com/JuliaStats/StatsBase.jl/workflows/Run%20tests/badge.svg)]((https://github.com/JuliaStats/StatsBase.jl/actions?query=workflow%3ACI+branch%3Amaster)) [![Coverage Status](https://coveralls.io/repos/JuliaStats/StatsBase.jl/badge.svg?branch=master)](https://coveralls.io/r/JuliaStats/StatsBase.jl?branch=master) [![Coverage Status](http://codecov.io/github/JuliaStats/StatsBase.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaStats/StatsBase.jl?branch=master) From 217da39c5a5a6193af75bc03bc3b9f4af0cf212d Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 4 Dec 2020 11:10:29 +0100 Subject: [PATCH 060/105] Document that addcounts supports weights (#623) The method exists but it wasn't mentioned. --- src/counts.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 3278fce9..21793a5f 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -389,9 +389,10 @@ end """ countmap(x; alg = :auto) + countmap(x::AbstractVector, w::AbstractVector{<:Real}; alg = :auto) Return a dictionary mapping each unique value in `x` to its number -of occurrences. +of occurrences. A vector of weights `w` can be provided when `x` is a vector. - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. From da4ee2e894619343e0467a2e30295a448d5a22ba Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 4 Dec 2020 22:02:51 +0100 Subject: [PATCH 061/105] Tighten signature of weights constructor (#620) This ensures the error is thrown earlier for clarity, instead of doing it from the two-argument constructor. 
--- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index 86f804a9..a6bafedc 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -13,7 +13,7 @@ macro weights(name) values::V sum::S end - $(esc(name))(vs) = $(esc(name))(vs, sum(vs)) + $(esc(name))(values::AbstractVector{<:Real}) = $(esc(name))(values, sum(values)) end end From 8075e284d1ae62aa1e1c9ecf62efed5a0b5c14da Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 5 Dec 2020 12:03:22 +0100 Subject: [PATCH 062/105] Avoid random failures in test (#624) Negative weights can give errors. --- test/moments.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/moments.jl b/test/moments.jl index cc71a2fc..97fda44a 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -104,8 +104,8 @@ expected_std = sqrt.(expected_var) end x = rand(5, 6) -w1 = rand(5) -w2 = rand(6) +w1 = [0.57, 5.10, 0.91, 1.72, 0.0] +w2 = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] @testset "Uncorrected with $f" for f in weight_funcs wv1 = f(w1) From 854a5415f09834a4c2ad02bba96f51fdae28dc67 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Sat, 5 Dec 2020 05:32:36 -0600 Subject: [PATCH 063/105] Adds a weighted mode call (#611) * Adds a weighted mode call. Use `sort` to make `modes` result consistent on different test architectures. Mention weighted option in mode docstrings. * Apply suggestions from code review. Co-authored-by: Milan Bouchet-Valat --- src/scalarstats.jl | 49 ++++++++++++++++++++++++++++++++++++++++++--- test/scalarstats.jl | 14 +++++++++++++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index 41486c41..c5e0bfee 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -47,10 +47,11 @@ end # compute mode, given the range of integer values """ mode(a, [r]) + mode(a::AbstractArray, wv::AbstractWeights) Return the mode (most common number) of an array, optionally -over a specified range `r`. 
If several modes exist, the first -one (in order of appearance) is returned. +over a specified range `r` or weighted via a vector `wv`. +If several modes exist, the first one (in order of appearance) is returned. """ function mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) @@ -75,9 +76,10 @@ end """ modes(a, [r])::Vector + mode(a::AbstractArray, wv::AbstractWeights)::Vector Return all modes (most common numbers) of an array, optionally over a -specified range `r`. +specified range `r` or weighted via vector `wv`. """ function modes(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer r0 = r[1] @@ -158,6 +160,47 @@ function modes(a) return [x for (x, c) in cnts if c == mc] end +# Weighted mode of arbitrary vectors of values +function mode(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) + length(a) == length(wv) || + throw(ArgumentError("data and weight vectors must be the same size, got $(length(a)) and $(length(wv))")) + + # Iterate through the data + mv = first(a) + mw = first(wv) + weights = Dict{eltype(a), T}() + for (x, w) in zip(a, wv) + _w = get!(weights, x, zero(T)) + w + if _w > mw + mv = x + mw = _w + end + weights[x] = _w + end + + return mv +end + +function modes(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) + length(a) == length(wv) || + throw(ArgumentError("data and weight vectors must be the same size, got $(length(a)) and $(length(wv))")) + + # Iterate through the data + mw = first(wv) + weights = Dict{eltype(a), T}() + for (x, w) in zip(a, wv) + _w = get!(weights, x, zero(T)) + w + if _w > mw + mw = _w + end + weights[x] = _w + end + + # find values corresponding to maximum counts + return [x for (x, w) in weights if w == mw] +end ############################# # diff --git 
a/test/scalarstats.jl b/test/scalarstats.jl index f163988f..22024636 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -44,10 +44,24 @@ using Statistics @test modes(skipmissing([1, missing, missing, 3, 2, 2, missing])) == [2] @test sort(modes(skipmissing([1, missing, 3, 3, 2, 2, missing]))) == [2, 3] +d1 = [1, 2, 3, 3, 4, 5, 5, 3] +d2 = ['a', 'b', 'c', 'c', 'd', 'e', 'e', 'c'] +wv = weights([0.1:0.1:0.7; 0.1]) +@test mode(d1) == 3 +@test mode(d2) == 'c' +@test mode(d1, wv) == 5 +@test mode(d2, wv) == 'e' +@test sort(modes(d1[1:end-1], weights(ones(7)))) == [3, 5] +@test sort(modes(d1, weights([.9, .1, .1, .1, .9, .1, .1, .1]))) == [1, 4] + @test_throws ArgumentError mode(Int[]) @test_throws ArgumentError modes(Int[]) @test_throws ArgumentError mode(Any[]) @test_throws ArgumentError modes(Any[]) +@test_throws ArgumentError mode([], weights(Float64[])) +@test_throws ArgumentError modes([], weights(Float64[])) +@test_throws ArgumentError mode([1, 2, 3], weights([0.1, 0.3])) +@test_throws ArgumentError modes([1, 2, 3], weights([0.1, 0.3])) ## zscores From 83ccc37d4a8deae0df9320740fda4a42ea9dee8b Mon Sep 17 00:00:00 2001 From: sdewaele <14310676+sdewaele@users.noreply.github.com> Date: Sat, 19 Dec 2020 16:09:22 -0500 Subject: [PATCH 064/105] more generic ZScoreTransform, UnitRangeTransform to support CuArrays (#622) --- src/transformations.jl | 71 ++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 44 deletions(-) diff --git a/src/transformations.jl b/src/transformations.jl index aec33ae6..4406e3d9 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -49,18 +49,18 @@ reconstruct(t::AbstractDataTransform, y::AbstractVector{<:Real}) = """ Standardization (Z-score transformation) """ -struct ZScoreTransform{T<:Real} <: AbstractDataTransform +struct ZScoreTransform{T<:Real, U<:AbstractVector{T}} <: AbstractDataTransform len::Int dims::Int - mean::Vector{T} - scale::Vector{T} + mean::U + scale::U - function
ZScoreTransform(l::Int, dims::Int, m::Vector{T}, s::Vector{T}) where T + function ZScoreTransform(l::Int, dims::Int, m::U, s::U) where {T<:Real, U<:AbstractVector{T}} lenm = length(m) lens = length(s) lenm == l || lenm == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) lens == l || lens == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - new{T}(l, dims, m, s) + new{T, U}(l, dims, m, s) end end @@ -123,9 +123,8 @@ function fit(::Type{ZScoreTransform}, X::AbstractMatrix{<:Real}; else throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) end - T = eltype(X) - return ZScoreTransform(l, dims, (center ? vec(m) : zeros(T, 0)), - (scale ? vec(s) : zeros(T, 0))) + return ZScoreTransform(l, dims, (center ? vec(m) : similar(m, 0)), + (scale ? vec(s) : similar(s, 0))) end function fit(::Type{ZScoreTransform}, X::AbstractVector{<:Real}; @@ -134,10 +133,7 @@ function fit(::Type{ZScoreTransform}, X::AbstractVector{<:Real}; throw(DomainError(dims, "fit only accepts dims=1 over a vector. Try fit(t, x, dims=1).")) end - T = eltype(X) - m, s = mean_and_std(X) - return ZScoreTransform(1, dims, (center ? [m] : zeros(T, 0)), - (scale ? 
[s] : zeros(T, 0))) + return fit(ZScoreTransform, reshape(X, :, 1); dims=dims, center=center, scale=scale) end function transform!(y::AbstractMatrix{<:Real}, t::ZScoreTransform, x::AbstractMatrix{<:Real}) @@ -207,19 +203,19 @@ end """ Unit range normalization """ -struct UnitRangeTransform{T<:Real} <: AbstractDataTransform +struct UnitRangeTransform{T<:Real, U<:AbstractVector} <: AbstractDataTransform len::Int dims::Int unit::Bool - min::Vector{T} - scale::Vector{T} + min::U + scale::U - function UnitRangeTransform(l::Int, dims::Int, unit::Bool, min::Vector{T}, max::Vector{T}) where {T} + function UnitRangeTransform(l::Int, dims::Int, unit::Bool, min::U, max::U) where {T, U<:AbstractVector{T}} lenmin = length(min) lenmax = length(max) lenmin == l || lenmin == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) lenmax == l || lenmax == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - new{T}(l, dims, unit, min, max) + new{T, U}(l, dims, unit, min, max) end end @@ -270,34 +266,22 @@ function fit(::Type{UnitRangeTransform}, X::AbstractMatrix{<:Real}; Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) dims = 2 end - if dims == 1 - l, tmin, tmax = _compute_extrema(X) - elseif dims == 2 - l, tmin, tmax = _compute_extrema(X') - else - throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) - end - - for i = 1:l - @inbounds tmax[i] = 1 / (tmax[i] - tmin[i]) - end + dims ∈ (1, 2) || throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) + tmin, tmax = _compute_extrema(X, dims) + @. 
tmax = 1 / (tmax - tmin) + l = length(tmin) return UnitRangeTransform(l, dims, unit, tmin, tmax) end -function _compute_extrema(X::AbstractMatrix{<:Real}) - n, l = size(X) - tmin = X[1, :] - tmax = X[1, :] - for j = 1:l - @inbounds for i = 2:n - if X[i, j] < tmin[j] - tmin[j] = X[i, j] - elseif X[i, j] > tmax[j] - tmax[j] = X[i, j] - end - end +function _compute_extrema(X::AbstractMatrix, dims::Integer) + dims == 2 && return _compute_extrema(X', 1) + l = size(X, 2) + tmin = similar(X, l) + tmax = similar(X, l) + for i in 1:l + @inbounds tmin[i], tmax[i] = extrema(@view(X[:, i])) end - return l, tmin, tmax + return tmin, tmax end function fit(::Type{UnitRangeTransform}, X::AbstractVector{<:Real}; @@ -305,10 +289,9 @@ function fit(::Type{UnitRangeTransform}, X::AbstractVector{<:Real}; if dims != 1 throw(DomainError(dims, "fit only accept dims=1 over a vector. Try fit(t, x, dims=1).")) end - - l, tmin, tmax = _compute_extrema(reshape(X, :, 1)) + tmin, tmax = extrema(X) tmax = 1 / (tmax - tmin) - return UnitRangeTransform(1, dims, unit, vec(tmin), vec(tmax)) + return UnitRangeTransform(1, dims, unit, [tmin], [tmax]) end function transform!(y::AbstractMatrix{<:Real}, t::UnitRangeTransform, x::AbstractMatrix{<:Real}) From 11d91abd264f17f14d4e72ed98a03666d3792a74 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 4 Jan 2021 11:03:49 +0100 Subject: [PATCH 065/105] Fix CI badge (#628) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ad16a64c..92c06f6e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ *StatsBase.jl* is a Julia package that provides basic support for statistics. Particularly, it implements a variety of statistics-related functions, such as scalar statistics, high-order moment computation, counting, ranking, covariances, sampling, and empirical density estimation. 
- **Build & Testing Status:** - [![Build status](https://github.com/JuliaStats/StatsBase.jl/workflows/Run%20tests/badge.svg)]((https://github.com/JuliaStats/StatsBase.jl/actions?query=workflow%3ACI+branch%3Amaster)) + [![Build status](https://github.com/JuliaStats/StatsBase.jl/workflows/CI/badge.svg)]((https://github.com/JuliaStats/StatsBase.jl/actions?query=workflow%3ACI+branch%3Amaster)) [![Coverage Status](https://coveralls.io/repos/JuliaStats/StatsBase.jl/badge.svg?branch=master)](https://coveralls.io/r/JuliaStats/StatsBase.jl?branch=master) [![Coverage Status](http://codecov.io/github/JuliaStats/StatsBase.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaStats/StatsBase.jl?branch=master) From 41669cd8dfeadce35db0c4e07ac7afe5d10fb957 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 4 Jan 2021 11:56:17 +0100 Subject: [PATCH 066/105] Stop using LinearAlgebra.copy_oftype (#626) This undocumented function does not actually ensure that the result is mutable. Use the same approach as `Base.copymutable`, which relies only on public API. --- src/scalarstats.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index c5e0bfee..d00b9168 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -317,7 +317,7 @@ function mad(x; center=nothing, normalize::Union{Bool, Nothing}=nothing, constan # Knowing the eltype allows allocating a single array able to hold both original values # and differences from the center, instead of two arrays S = isconcretetype(T) ? promote_type(T, typeof(middle(zero(T)))) : T - x2 = x isa AbstractArray ? LinearAlgebra.copy_oftype(x, S) : collect(S, x) + x2 = x isa AbstractArray ? copyto!(similar(x, S), x) : collect(S, x) c = center === nothing ? 
median!(x2) : center if isconcretetype(T) x2 .= abs.(x2 .- c) From 320411b96c0ef0c5b0006169768b13f2554afb00 Mon Sep 17 00:00:00 2001 From: Kirill Ignatiev Date: Thu, 21 Jan 2021 09:27:27 +0000 Subject: [PATCH 067/105] Fix #631, summarystats returned NaN quantiles for arrays with mean 0 (#632) --- src/scalarstats.jl | 2 +- test/scalarstats.jl | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/scalarstats.jl b/src/scalarstats.jl index d00b9168..210703f6 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -660,7 +660,7 @@ function summarystats(a::AbstractArray{T}) where T<:Union{Real,Missing} R = typeof(m) n = length(a) ns = length(s) - qs = if m == 0 || n == 0 + qs = if ns == 0 R[NaN, NaN, NaN, NaN, NaN] elseif T >: Missing quantile!(s, [0.00, 0.25, 0.50, 0.75, 1.00]) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 22024636..db2178cf 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -217,3 +217,31 @@ s = summarystats(1:5) @test s.median ≈ 3.0 @test s.q25 ≈ 2.0 @test s.q75 ≈ 4.0 + +# Issue #631 +s = summarystats([-2, -1, 0, 1, 2, missing]) +@test isa(s, StatsBase.SummaryStats) +@test s.min == -2.0 +@test s.max == 2.0 +@test s.mean ≈ 0.0 +@test s.median ≈ 0.0 +@test s.q25 ≈ -1.0 +@test s.q75 ≈ +1.0 + +# Issue #631 +s = summarystats(zeros(10)) +@test isa(s, StatsBase.SummaryStats) +@test s.min == 0.0 +@test s.max == 0.0 +@test s.mean ≈ 0.0 +@test s.median ≈ 0.0 +@test s.q25 ≈ 0.0 +@test s.q75 ≈ 0.0 + +# Issue #631 +s = summarystats(Union{Float64,Missing}[missing, missing]) +@test isa(s, StatsBase.SummaryStats) +@test s.nobs == 2 +@test s.nmiss == 2 +@test isnan(s.mean) +@test isnan(s.median) From ed3b86edf57d0b95c6b793496073e7ead09b302e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 1 Feb 2021 09:44:32 +0100 Subject: [PATCH 068/105] Make CoefTable implement the Tables.jl interface (#629) This allows retrieving the contents of a `CoefTable` object in a convenient form, 
notably a `DataFrame`. To avoid introducing a dependency on Tables.jl, `CoefTable` has to iterate `NamedTuple`s, so that it implements the row-table interface implicitly. This is inefficient since `CoefTable` uses a column-based storage, but given the typical size of such tables it should not matter. --- src/statmodels.jl | 29 ++++++++++++++++++++++++++++- test/statmodels.jl | 31 +++++++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/src/statmodels.jl b/src/statmodels.jl index 08ce2473..b0644fac 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -19,8 +19,12 @@ coefnames(model::StatisticalModel) = error("coefnames is not defined for $(typeo """ coeftable(model::StatisticalModel; level::Real=0.95) -Return a table of class `CoefTable` with coefficients and related statistics. +Return a table with coefficients and related statistics of the model. `level` determines the level for confidence intervals (by default, 95%). + +The returned `CoefTable` object implements the +[Tables.jl](https://github.com/JuliaData/Tables.jl/) interface, and can be +converted e.g. to a `DataFrame` via `using DataFrames; DataFrame(coeftable(model))`. """ coeftable(model::StatisticalModel) = error("coeftable is not defined for $(typeof(model)).") @@ -420,6 +424,29 @@ mutable struct CoefTable end end +Base.length(ct::CoefTable) = length(ct.cols[1]) +function Base.eltype(ct::CoefTable) + names = isempty(ct.rownms) ? + tuple(Symbol.(ct.colnms)...) : + tuple(Symbol("Name"), Symbol.(ct.colnms)...) + types = isempty(ct.rownms) ? + Tuple{eltype.(ct.cols)...} : + Tuple{eltype(ct.rownms), eltype.(ct.cols)...} + NamedTuple{names, types} +end + +function Base.iterate(ct::CoefTable, i::Integer=1) + if i in 1:length(ct) + cols = getindex.(ct.cols, Ref(i)) + nt = isempty(ct.rownms) ? 
+ eltype(ct)(tuple(cols...)) : + eltype(ct)(tuple(ct.rownms[i], cols...)) + (nt, i+1) + else + nothing + end +end + """ Show a p-value using 6 characters, either using the standard 0.XXXX representation or as Date: Mon, 1 Feb 2021 09:59:36 +0100 Subject: [PATCH 069/105] Enhance ranking code (#589) * ordinalrank!(): use eachindex() * ranking: use _rank() helper, @inbounds _rank() helper provides: 1) correct support for n-dim, n>1, input arrays 2) minimizes code duplication 3) passthrough of sortperm() args 4) macro-less support for missing values * replace while-loops with for-loops * rankings: expand sortkwargs in docstring * rankings: cleanup docstring --- src/ranking.jl | 138 +++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 79 deletions(-) diff --git a/src/ranking.jl b/src/ranking.jl index 90ed8d39..05a5b465 100644 --- a/src/ranking.jl +++ b/src/ranking.jl @@ -6,65 +6,70 @@ # The implementations here follow this wikipedia page. # - function _check_randparams(rks, x, p) n = length(rks) length(x) == length(p) == n || raise_dimerror() return n end +# ranking helper function: calls sortperm(x) and then ranking method f! +function _rank(f!, x::AbstractArray, R::Type=Int; sortkwargs...) + rks = similar(x, R) + ord = reshape(sortperm(vec(x); sortkwargs...), size(x)) + return f!(rks, x, ord) +end +# ranking helper function for arrays with missing values +function _rank(f!, x::AbstractArray{>: Missing}, R::Type=Int; sortkwargs...) + inds = findall(!ismissing, vec(x)) + isempty(inds) && return missings(R, size(x)) + xv = disallowmissing(view(vec(x), inds)) + ordv = sortperm(xv; sortkwargs...) 
+ rks = missings(R, size(x)) + f!(view(rks, inds), xv, ordv) + return rks +end # Ordinal ranking ("1234 ranking") -- use the literal order resulted from sort -function ordinalrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) - n = _check_randparams(rks, x, p) - - if n > 0 - i = 1 - while i <= n - rks[p[i]] = i - i += 1 - end +function _ordinalrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) + _check_randparams(rks, x, p) + @inbounds for i in eachindex(p) + rks[p[i]] = i end - return rks end """ - ordinalrank(x; lt = isless, rev::Bool = false) + ordinalrank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [ordinal ranking](https://en.wikipedia.org/wiki/Ranking#Ordinal_ranking_.28.221234.22_ranking.29) -("1234" ranking) of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. -All items in `x` are given distinct, successive ranks based on their -position in `sort(x; lt = lt, rev = rev)`. +("1234" ranking) of an array. Supports the same keyword arguments as the `sort` function. +All items in `x` are given distinct, successive ranks based on their position +in the sorted vector. Missing values are assigned rank `missing`. """ -ordinalrank(x::AbstractArray; lt = isless, rev::Bool = false) = - ordinalrank!(Array{Int}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) +ordinalrank(x::AbstractArray; sortkwargs...) = + _rank(_ordinalrank!, x; sortkwargs...) 
# Competition ranking ("1224" ranking) -- resolve tied ranks using min -function competerank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) +function _competerank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) n = _check_randparams(rks, x, p) - if n > 0 + @inbounds if n > 0 p1 = p[1] v = x[p1] rks[p1] = k = 1 - i = 2 - while i <= n + for i in 2:n pi = p[i] xi = x[pi] - if xi == v - rks[pi] = k - else - rks[pi] = k = i + if xi != v v = xi + k = i end - i += 1 + rks[pi] = k end end @@ -73,39 +78,35 @@ end """ - competerank(x; lt = isless, rev::Bool = false) + competerank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [standard competition ranking](http://en.wikipedia.org/wiki/Ranking#Standard_competition_ranking_.28.221224.22_ranking.29) -("1224" ranking) of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. -Items that compare equal are given the same rank, then a gap is left -in the rankings the size of the number of tied items - 1. +("1224" ranking) of an array. Supports the same keyword arguments as the `sort` function. +Equal (*"tied"*) items are given the same rank, and the next rank comes after a gap +that is equal to the number of tied items - 1. Missing values are assigned rank `missing`. """ -competerank(x::AbstractArray; lt = isless, rev::Bool = false) = - competerank!(Array{Int}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) +competerank(x::AbstractArray; sortkwargs...) = + _rank(_competerank!, x; sortkwargs...) 
# Dense ranking ("1223" ranking) -- resolve tied ranks using min -function denserank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) +function _denserank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) n = _check_randparams(rks, x, p) - if n > 0 + @inbounds if n > 0 p1 = p[1] v = x[p1] rks[p1] = k = 1 - i = 2 - while i <= n + for i in 2:n pi = p[i] xi = x[pi] - if xi == v - rks[pi] = k - else - rks[pi] = (k += 1) + if xi != v v = xi + k += 1 end - i += 1 + rks[pi] = k end end @@ -114,29 +115,27 @@ end """ - denserank(x) + denserank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [dense ranking](http://en.wikipedia.org/wiki/Ranking#Dense_ranking_.28.221223.22_ranking.29) -("1223" ranking) of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. Items that -compare equal receive the same ranking, and the next subsequent rank is +("1223" ranking) of an array. Supports the same keyword arguments as the `sort` function. +Equal items receive the same rank, and the next subsequent rank is assigned with no gap. Missing values are assigned rank `missing`. """ -denserank(x::AbstractArray; lt = isless, rev::Bool = false) = - denserank!(Array{Int}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) +denserank(x::AbstractArray; sortkwargs...) = + _rank(_denserank!, x; sortkwargs...) 
# Tied ranking ("1 2.5 2.5 4" ranking) -- resolve tied ranks using average -function tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) +function _tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) n = _check_randparams(rks, x, p) - if n > 0 + @inbounds if n > 0 v = x[p[1]] s = 1 # starting index of current range - e = 2 # pass-by-end index of current range - while e <= n + for e in 2:n # e is pass-by-end index of current range cx = x[p[e]] if cx != v # fill average rank to s : e-1 @@ -148,10 +147,9 @@ function tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) s = e v = cx end - e += 1 end - # the last range (e == n+1) + # the last range ar = (s + n) / 2 for i = s : n rks[p[i]] = ar @@ -161,33 +159,15 @@ function tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) return rks end -# order (aka. rank), resolving ties using the mean rank """ - tiedrank(x) + tiedrank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [tied ranking](http://en.wikipedia.org/wiki/Ranking#Fractional_ranking_.28.221_2.5_2.5_4.22_ranking.29), also called fractional or "1 2.5 2.5 4" ranking, -of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. -Items that compare equal receive the mean of the -rankings they would have been assigned under ordinal ranking. +of an array. Supports the same keyword arguments as the `sort` function. +Equal (*"tied"*) items receive the mean of the ranks they would +have been assigned under the ordinal ranking (see [`ordinalrank`](@ref)). Missing values are assigned rank `missing`. 
""" -tiedrank(x::AbstractArray; lt = isless, rev::Bool = false) = - tiedrank!(Array{Float64}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) - -for (f, f!, S) in zip([:ordinalrank, :competerank, :denserank, :tiedrank], - [:ordinalrank!, :competerank!, :denserank!, :tiedrank!], - [Int, Int, Int, Float64]) - @eval begin - function $f(x::AbstractArray{>: Missing}; lt = isless, rev::Bool = false) - inds = findall(!ismissing, x) - isempty(inds) && return missings($S, size(x)) - xv = disallowmissing(view(x, inds)) - sp = sortperm(xv; lt = lt, rev = rev) - rks = missings($S, length(x)) - $(f!)(view(rks, inds), xv, sp) - rks - end - end -end \ No newline at end of file +tiedrank(x::AbstractArray; sortkwargs...) = + _rank(_tiedrank!, x, Float64; sortkwargs...) From 2103509b41894d026ccfd7199510fafe802da8d3 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 3 Feb 2021 13:57:34 +0100 Subject: [PATCH 070/105] Fix link to CI badge (#651) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 92c06f6e..9abe8ce9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ *StatsBase.jl* is a Julia package that provides basic support for statistics. Particularly, it implements a variety of statistics-related functions, such as scalar statistics, high-order moment computation, counting, ranking, covariances, sampling, and empirical density estimation. 
- **Build & Testing Status:** - [![Build status](https://github.com/JuliaStats/StatsBase.jl/workflows/CI/badge.svg)]((https://github.com/JuliaStats/StatsBase.jl/actions?query=workflow%3ACI+branch%3Amaster)) + [![Build status](https://github.com/JuliaStats/StatsBase.jl/workflows/CI/badge.svg)](https://github.com/JuliaStats/StatsBase.jl/actions?query=workflow%3ACI+branch%3Amaster) [![Coverage Status](https://coveralls.io/repos/JuliaStats/StatsBase.jl/badge.svg?branch=master)](https://coveralls.io/r/JuliaStats/StatsBase.jl?branch=master) [![Coverage Status](http://codecov.io/github/JuliaStats/StatsBase.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaStats/StatsBase.jl?branch=master) From 3b0b2da5179af9696928e5ecc72794b098c4cb53 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 3 Feb 2021 09:13:32 -0500 Subject: [PATCH 071/105] use docdeploy instead of manual doc script (#653) --- .github/workflows/ci.yml | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e2bee76a..aaeda107 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,15 +52,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 - with: - version: '1' - - run: | - julia --project=docs -e ' - using Pkg - Pkg.develop(PackageSpec(path=pwd())) - Pkg.instantiate()' - - run: julia --project=docs docs/make.jl + - uses: julia-actions/julia-buildpkg@latest + - uses: julia-actions/julia-docdeploy@latest env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} From 11ac5b596405367b3217d3d962e22523fef9bb0d Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Mon, 8 Feb 2021 16:21:48 +0000 Subject: [PATCH 072/105] Rewrote corkendall (issue #634) (#647) New version of corkendall is approx 4 times faster if both arguments are vectors and 7 times faster if at least one 
is a matrix. See issue #634 for details. --- src/rankcorr.jl | 242 +++++++++++++++++++++++++++++++---------------- test/rankcorr.jl | 80 ++++++++++++++-- 2 files changed, 235 insertions(+), 87 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index ea804105..0592530b 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -33,66 +33,50 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) # ####################################### -# Knight JASA (1966) - -function corkendall!(x::RealVector, y::RealVector) +# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” +# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. +# JSTOR, www.jstor.org/stable/2282833. +function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integer}=sortperm(x)) if any(isnan, x) || any(isnan, y) return NaN end n = length(x) if n != length(y) error("Vectors must have same length") end # Initial sorting - pm = sortperm(y) - x[:] = x[pm] - y[:] = y[pm] - pm[:] = sortperm(x) - x[:] = x[pm] - - # Counting ties in x and y - iT = 1 - nT = 0 - iU = 1 - nU = 0 - for i = 2:n - if x[i] == x[i-1] - iT += 1 - else - nT += iT*(iT - 1) - iT = 1 - end - if y[i] == y[i-1] - iU += 1 - else - nU += iU*(iU - 1) - iU = 1 + permute!(x, permx) + permute!(y, permx) + + # Use widen to avoid overflows on both 32bit and 64bit + npairs = div(widen(n) * (n - 1), 2) + ntiesx = ndoubleties = nswaps = widen(0) + k = 0 + + @inbounds for i = 2:n + if x[i - 1] == x[i] + k += 1 + elseif k > 0 + # Sort the corresponding chunk of y, so the rows of hcat(x,y) are + # sorted first on x, then (where x values are tied) on y. Hence + # double ties can be counted by calling countties. 
+ sort!(view(y, (i - k - 1):(i - 1))) + ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here + ndoubleties += countties(y, i - k - 1, i - 1) + k = 0 end end - if iT > 1 nT += iT*(iT - 1) end - nT = div(nT,2) - if iU > 1 nU += iU*(iU - 1) end - nU = div(nU,2) - - # Sort y after x - y[:] = y[pm] - - # Calculate double ties - iV = 1 - nV = 0 - jV = 1 - for i = 2:n - if x[i] == x[i-1] && y[i] == y[i-1] - iV += 1 - else - nV += iV*(iV - 1) - iV = 1 - end + if k > 0 + sort!(view(y, (n - k):n)) + ntiesx += div(widen(k) * (k + 1), 2) + ndoubleties += countties(y, n - k, n) end - if iV > 1 nV += iV*(iV - 1) end - nV = div(nV,2) - nD = div(n*(n - 1),2) - return (nD - nT - nU + nV - 2swaps!(y)) / (sqrt(nD - nT) * sqrt(nD - nU)) -end + nswaps = merge_sort!(y, 1, n) + ntiesy = countties(y, 1, n) + # Calls to float below prevent possible overflow errors when + # length(x) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) + (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / + sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) +end """ corkendall(x, y=x) @@ -100,21 +84,40 @@ end Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either matrices or vectors. 
""" -corkendall(x::RealVector, y::RealVector) = corkendall!(float(copy(x)), float(copy(y))) +corkendall(x::RealVector, y::RealVector) = corkendall!(copy(x), copy(y)) -corkendall(X::RealMatrix, y::RealVector) = Float64[corkendall!(float(X[:,i]), float(copy(y))) for i in 1:size(X, 2)] - -corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y,2); reshape(Float64[corkendall!(float(copy(x)), float(Y[:,i])) for i in 1:n], 1, n)) +function corkendall(X::RealMatrix, y::RealVector) + permy = sortperm(y) + return([corkendall!(copy(y), X[:,i], permy) for i in 1:size(X, 2)]) +end -corkendall(X::RealMatrix, Y::RealMatrix) = Float64[corkendall!(float(X[:,i]), float(Y[:,j])) for i in 1:size(X, 2), j in 1:size(Y, 2)] +function corkendall(x::RealVector, Y::RealMatrix) + n = size(Y, 2) + permx = sortperm(x) + return(reshape([corkendall!(copy(x), Y[:,i], permx) for i in 1:n], 1, n)) +end function corkendall(X::RealMatrix) n = size(X, 2) - C = Matrix{eltype(X)}(I, n, n) + C = Matrix{Float64}(I, n, n) for j = 2:n - for i = 1:j-1 - C[i,j] = corkendall!(X[:,i],X[:,j]) - C[j,i] = C[i,j] + permx = sortperm(X[:,j]) + for i = 1:j - 1 + C[j,i] = corkendall!(X[:,j], X[:,i], permx) + C[i,j] = C[j,i] + end + end + return C +end + +function corkendall(X::RealMatrix, Y::RealMatrix) + nr = size(X, 2) + nc = size(Y, 2) + C = Matrix{Float64}(undef, nr, nc) + for j = 1:nr + permx = sortperm(X[:,j]) + for i = 1:nc + C[j,i] = corkendall!(X[:,j], Y[:,i], permx) end end return C @@ -122,32 +125,111 @@ end # Auxilliary functions for Kendall's rank correlation -function swaps!(x::RealVector) - n = length(x) - if n == 1 return 0 end - n2 = div(n, 2) - xl = view(x, 1:n2) - xr = view(x, n2+1:n) - nsl = swaps!(xl) - nsr = swaps!(xr) - sort!(xl) - sort!(xr) - return nsl + nsr + mswaps(xl,xr) +""" + countties(x::RealVector, lo::Integer, hi::Integer) + +Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. 
+""" +function countties(x::AbstractVector, lo::Integer, hi::Integer) + # Use of widen below prevents possible overflow errors when + # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) + thistiecount = result = widen(0) + checkbounds(x, lo:hi) + @inbounds for i = (lo + 1):hi + if x[i] == x[i - 1] + thistiecount += 1 + elseif thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + thistiecount = widen(0) + end + end + + if thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + end + result end -function mswaps(x::RealVector, y::RealVector) - i = 1 - j = 1 - nSwaps = 0 - n = length(x) - while i <= n && j <= length(y) - if y[j] < x[i] - nSwaps += n - i + 1 +# Tests appear to show that a value of 64 is optimal, +# but note that the equivalent constant in base/sort.jl is 20. +const SMALL_THRESHOLD = 64 + +# merge_sort! copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +""" + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) + +Mutates `v` by sorting elements `v[lo:hi]` using the merge sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. 
+""" +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) + # Use of widen below prevents possible overflow errors when + # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) + nswaps = widen(0) + @inbounds if lo < hi + hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi) + + m = midpoint(lo, hi) + (length(t) < m - lo + 1) && resize!(t, m - lo + 1) + + nswaps = merge_sort!(v, lo, m, t) + nswaps += merge_sort!(v, m + 1, hi, t) + + i, j = 1, lo + while j <= m + t[i] = v[j] + i += 1 j += 1 - else + end + + i, k = 1, lo + while k < j <= hi + if v[j] < t[i] + v[k] = v[j] + j += 1 + nswaps += m - lo + 1 - (i - 1) + else + v[k] = t[i] + i += 1 + end + k += 1 + end + while k < j + v[k] = t[i] + k += 1 i += 1 end end - return nSwaps + return nswaps end +# insertion_sort! and midpoint copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01) +midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) + +""" + insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + +Mutates `v` by sorting elements `v[lo:hi]` using the insertion sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. 
+""" +function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + if lo == hi return widen(0) end + nswaps = widen(0) + @inbounds for i = lo + 1:hi + j = i + x = v[i] + while j > lo + if x < v[j - 1] + nswaps += 1 + v[j] = v[j - 1] + j -= 1 + continue + end + break + end + v[j] = x + end + return nswaps +end diff --git a/test/rankcorr.jl b/test/rankcorr.jl index e5505c2a..6110d4c4 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -23,20 +23,86 @@ c22 = corspearman(x2, x2) @test corspearman(X, X) ≈ [c11 c12; c12 c22] @test corspearman(X) ≈ [c11 c12; c12 c22] - # corkendall -@test corkendall(x1, y) ≈ -0.105409255338946 -@test corkendall(x2, y) ≈ -0.117851130197758 +# Check error, handling of NaN, Inf etc +@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3]) +@test isnan(corkendall([1,2], [3,NaN])) +@test isnan(corkendall([1,1,1], [1,2,3])) +@test corkendall([-Inf,-0.0,Inf],[1,2,3]) == 1.0 + +# Test, with exact equality, some known results. +# RealVector, RealVector +@test corkendall(x1, y) == -1/sqrt(90) +@test corkendall(x2, y) == -1/sqrt(72) +# RealMatrix, RealVector +@test corkendall(X, y) == [-1/sqrt(90), -1/sqrt(72)] +# RealVector, RealMatrix +@test corkendall(y, X) == [-1/sqrt(90) -1/sqrt(72)] + +# n = 78_000 tests for overflow errors on 32 bit +# Testing for overflow errors on 64bit would require n be too large for practicality +# This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD. 
+n = 78_000 +# Test with many repeats +@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90) +@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72) +@test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1/sqrt(90), -1/sqrt(72)] +@test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1/sqrt(90) -1/sqrt(72)] +@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 + +# Test with no repeats, note testing for exact equality +@test corkendall(collect(1:n), collect(1:n)) == 1.0 +@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 -@test corkendall(X, y) ≈ [-0.105409255338946, -0.117851130197758] -@test corkendall(y, X) ≈ [-0.105409255338946 -0.117851130197758] +# All elements identical should yield NaN +@test isnan(corkendall(repeat([1], n), collect(1:n))) c11 = corkendall(x1, x1) c12 = corkendall(x1, x2) c22 = corkendall(x2, x2) -@test c11 ≈ 1.0 -@test c22 ≈ 1.0 +# RealMatrix, RealMatrix @test corkendall(X, X) ≈ [c11 c12; c12 c22] +# RealMatrix @test corkendall(X) ≈ [c11 c12; c12 c22] + +@test c11 == 1.0 +@test c22 == 1.0 +@test c12 == 3/sqrt(20) + +# Finished testing for overflow, so redefine n for speedier tests +n = 100 + +@test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] +@test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] + +# All eight three-element permutations +z = [1 1 1; + 1 1 2; + 1 2 2; + 1 2 2; + 1 2 1; + 2 1 2; + 1 1 2; + 2 2 2] + +@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z[:,1], z) == [1 0 1/3] +@test corkendall(z, z[:,1]) == [1; 0; 1/3] + +z = float(z) +@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z[:,1], z) == [1 0 1/3] +@test corkendall(z, z[:,1]) == [1; 0; 1/3] + +w = repeat(z, n) +@test corkendall(w) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(w, w) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(w[:,1], w) == [1 0 1/3] +@test corkendall(w, w[:,1]) == [1; 0; 
1/3] + +StatsBase.midpoint(1,10) == 5 +StatsBase.midpoint(1,widen(10)) == 5 From a89554bcc128df1efc18077692f6680b1e1bbab9 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 10 Feb 2021 22:06:55 +0100 Subject: [PATCH 073/105] Improve manual for data transformations (#648) `standardize` allows both Z-score normalization (a.k.a. standardization) and unit range normalization. This can be confusing, so avoid saying "standardization" without more explicit terms. --- docs/src/transformations.md | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/docs/src/transformations.md b/docs/src/transformations.md index f5ec2275..381660d0 100644 --- a/docs/src/transformations.md +++ b/docs/src/transformations.md @@ -3,37 +3,44 @@ In general, data transformations change raw feature vectors into a representation that is more suitable for various estimators. -## Standardization +## Standardization a.k.a Z-score Normalization -**Standardization** of dataset is a common requirement for many machine -learning techniques. These techniques might perform poorly if the individual -features do not more or less look like standard normally distributed data. +**Standardization**, also known as Z-score normalization, is a common requirement +for many machine learning techniques. These techniques might perform poorly +if the individual features do not more or less look like standard normally +distributed data. Standardization transforms data points into corresponding standard scores -by removing mean and scaling to unit variance. +by subtracting mean and scaling to unit variance. -The **standard score** is the signed number of standard deviations by which -the value of an observation or data point is above the mean value of what -is being observed or measured.
+The **standard score**, also known as Z-score, is the signed number of +standard deviations by which the value of an observation or data point +is above the mean value of what is being observed or measured. -Standardization can be performed using `fit(ZScoreTransform, ...)`. +Standardization can be performed using `t = fit(ZScoreTransform, ...)` +followed by `StatsBase.transform(t, ...)` or `StatsBase.transform!(t, ...)`. +`standardize(ZScoreTransform, ...)` is a shorthand to perform both operations +in a single call. ```@docs fit(::Type{ZScoreTransform}, X::AbstractArray{<:Real,2}; center::Bool=true, scale::Bool=true) ``` -## Unit range normalization +## Unit Range Normalization -**Unit range normalization** is an alternative data transformation which scales features -to lie in the interval `[0; 1]`. +**Unit range normalization**, also known as min-max scaling, is an alternative +data transformation which scales features to lie in the interval `[0; 1]`. -Unit range normalization can be performed using `fit(UnitRangeTransform, ...)`. +Unit range normalization can be performed using `t = fit(UnitRangeTransform, ...)` +followed by `StatsBase.transform(t, ...)` or `StatsBase.transform!(t, ...)`. +`standardize(UnitRangeTransform, ...)` is a shorthand to perform both operations +in a single call. ```@docs fit(::Type{UnitRangeTransform}, X::AbstractArray{<:Real,2}; unit::Bool=true) ``` -## Additional methods +## Additional Methods ```@docs StatsBase.transform StatsBase.transform! From 5a85a309747b332e6c31743dce97f2910a5d2d87 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 15 Feb 2021 10:52:24 +0100 Subject: [PATCH 074/105] Fix histogram tests on Julia nightlies (#656) Generators are no longer allowed in generated functions as they make them impure.
--- src/hist.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hist.jl b/src/hist.jl index 36915213..3607b2dd 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -461,7 +461,7 @@ arrays appropriately. See description of `normalize` for details. Returns `h`. if mode == :pdf || mode == :density # Divide weights by bin volume, for :pdf also divide by sum of weights SumT = norm_type(h) - vs_0 = (mode == :pdf) ? sum(SumT(x) for x in weights) : one(SumT) + vs_0 = (mode == :pdf) ? sum(SumT, weights) : one(SumT) @inbounds @nloops $N i weights d->(vs_{$N-d+1} = vs_{$N-d} * _edge_binvolume(SumT, edges[d], i_d)) begin (@nref $N weights i) /= $(Symbol("vs_$N")) for A in aux_weights From 027ca076fcd2b23b9b3ea33686c7d64f13c35bb4 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 15 Feb 2021 10:52:41 +0100 Subject: [PATCH 075/105] Release 0.33.3 (#655) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index c16ed2a4..71157984 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.2" +version = "0.33.3" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From 5bf2b8f5d99a06c5b3c8089513f1f0773304e2e0 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 25 Feb 2021 22:35:07 +0100 Subject: [PATCH 076/105] Stop exporting nonexistent findat and wmean! (#663) These were deprecated and then removed long ago. 
--- src/StatsBase.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index a0abe9c9..bd4df36b 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -39,10 +39,6 @@ export uweights, # construct an UnitWeights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage - wmean, # weighted mean - wmean!, # weighted mean across dimensions with provided storage - wmedian, # weighted median - wquantile, # weighted quantile ## moments skewness, # (standardized) skewness @@ -160,7 +156,6 @@ export inverse_rle, # inverse run-length encoding indexmap, # construct a map from element to index levelsmap, # construct a map from n unique elements to [1, ..., n] - findat, # find the position within a for elements in b indicatormat, # construct indicator matrix # statistical models From 870712dd6834f3a1670e9c5465eef98d6b91c117 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 7 Mar 2021 17:58:54 +0100 Subject: [PATCH 077/105] Make `corspearman` return `NaN` when there are `NaN`s in the input (#659) Make `corspearman` return `NaN` when there are `NaN`s in the input, instead of silently sorting them at the end. This is consistent with what `corkendall` and `cor` do. --- src/rankcorr.jl | 103 +++++++++++++++++++++++++++++++++++++++++++---- test/rankcorr.jl | 55 ++++++++++++++++++++++++- 2 files changed, 150 insertions(+), 8 deletions(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 0592530b..714548d5 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -17,14 +17,103 @@ Compute Spearman's rank correlation coefficient. If `x` and `y` are vectors, the output is a float, otherwise it's a matrix corresponding to the pairwise correlations of the columns of `x` and `y`. 
""" -corspearman(x::RealVector, y::RealVector) = cor(tiedrank(x), tiedrank(y)) +function corspearman(x::RealVector, y::RealVector) + n = length(x) + n == length(y) || throw(DimensionMismatch("vectors must have same length")) + (any(isnan, x) || any(isnan, y)) && return NaN + return cor(tiedrank(x), tiedrank(y)) +end + +function corspearman(X::RealMatrix, y::RealVector) + size(X, 1) == length(y) || + throw(DimensionMismatch("X and y have inconsistent dimensions")) + n = size(X, 2) + C = Matrix{Float64}(I, n, 1) + any(isnan, y) && return fill!(C, NaN) + yrank = tiedrank(y) + for j = 1:n + Xj = view(X, :, j) + if any(isnan, Xj) + C[j,1] = NaN + else + Xjrank = tiedrank(Xj) + C[j,1] = cor(Xjrank, yrank) + end + end + return C +end + +function corspearman(x::RealVector, Y::RealMatrix) + size(Y, 1) == length(x) || + throw(DimensionMismatch("x and Y have inconsistent dimensions")) + n = size(Y, 2) + C = Matrix{Float64}(I, 1, n) + any(isnan, x) && return fill!(C, NaN) + xrank = tiedrank(x) + for j = 1:n + Yj = view(Y, :, j) + if any(isnan, Yj) + C[1,j] = NaN + else + Yjrank = tiedrank(Yj) + C[1,j] = cor(xrank, Yjrank) + end + end + return C +end -corspearman(X::RealMatrix, Y::RealMatrix) = - cor(mapslices(tiedrank, X, dims=1), mapslices(tiedrank, Y, dims=1)) -corspearman(X::RealMatrix, y::RealVector) = cor(mapslices(tiedrank, X, dims=1), tiedrank(y)) -corspearman(x::RealVector, Y::RealMatrix) = cor(tiedrank(x), mapslices(tiedrank, Y, dims=1)) +function corspearman(X::RealMatrix) + n = size(X, 2) + C = Matrix{Float64}(I, n, n) + anynan = Vector{Bool}(undef, n) + for j = 1:n + Xj = view(X, :, j) + anynan[j] = any(isnan, Xj) + if anynan[j] + C[:,j] .= NaN + C[j,:] .= NaN + C[j,j] = 1 + continue + end + Xjrank = tiedrank(Xj) + for i = 1:(j-1) + Xi = view(X, :, i) + if anynan[i] + C[i,j] = C[j,i] = NaN + else + Xirank = tiedrank(Xi) + C[i,j] = C[j,i] = cor(Xjrank, Xirank) + end + end + end + return C +end -corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, 
Z)) +function corspearman(X::RealMatrix, Y::RealMatrix) + size(X, 1) == size(Y, 1) || + throw(ArgumentError("number of columns in each array must match")) + nr = size(X, 2) + nc = size(Y, 2) + C = Matrix{Float64}(undef, nr, nc) + for j = 1:nr + Xj = view(X, :, j) + if any(isnan, Xj) + C[j,:] .= NaN + continue + end + Xjrank = tiedrank(Xj) + for i = 1:nc + Yi = view(Y, :, i) + if any(isnan, Yi) + C[j,i] = NaN + else + Yirank = tiedrank(Yi) + C[j,i] = cor(Xjrank, Yirank) + end + end + end + return C +end ####################################### @@ -44,7 +133,7 @@ function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integ # Initial sorting permute!(x, permx) permute!(y, permx) - + # Use widen to avoid overflows on both 32bit and 64bit npairs = div(widen(n) * (n - 1), 2) ntiesx = ndoubleties = nswaps = widen(0) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 6110d4c4..93b64449 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -2,10 +2,11 @@ using StatsBase using Test X = Float64[1 0; 2 1; 3 0; 4 1; 5 10] +Y = Float64[5 5 6; 3 4 1; 4 0 4; 2 6 1; 5 7 10] x1 = X[:,1] x2 = X[:,2] -y = [5, 3, 4, 2, 5] +y = Y[:,1] # corspearman @@ -23,6 +24,9 @@ c22 = corspearman(x2, x2) @test corspearman(X, X) ≈ [c11 c12; c12 c22] @test corspearman(X) ≈ [c11 c12; c12 c22] +@test corspearman(X, Y) == + [corspearman(X[:,i], Y[:,j]) for i in axes(X, 2), j in axes(Y, 2)] + # corkendall # Check error, handling of NaN, Inf etc @@ -106,3 +110,52 @@ w = repeat(z, n) StatsBase.midpoint(1,10) == 5 StatsBase.midpoint(1,widen(10)) == 5 + + +# NaN handling + +Xnan = copy(X) +Xnan[1,1] = NaN +Ynan = copy(Y) +Ynan[2,1] = NaN + +for f in (corspearman, corkendall) + @test isnan(f([1.0, NaN, 2.0], [2.0, 1.0, 3.4])) + @test all(isnan, f([1.0, NaN], [1 2; 3 4])) + @test all(isnan, f([1 2; 3 4], [1.0, NaN])) + @test isequal(f([1 NaN; NaN 4]), [1 NaN; NaN 1]) + @test all(isnan, f([1 NaN; NaN 4], [1 NaN; NaN 4])) + @test all(isnan, f([1 NaN; NaN 4], [NaN 1; NaN 4])) + + @test 
isequal(f(Xnan, Ynan), + [f(Xnan[:,i], Ynan[:,j]) for i in axes(Xnan, 2), j in axes(Ynan, 2)]) + @test isequal(f(Xnan), + [i == j ? 1.0 : f(Xnan[:,i], Xnan[:,j]) + for i in axes(Xnan, 2), j in axes(Xnan, 2)]) + for k in 1:2 + @test isequal(f(Xnan[:,k], Ynan), + [f(Xnan[:,k], Ynan[:,j]) for i in 1:1, j in axes(Ynan, 2)]) + # TODO: fix corkendall (PR#659) + if f === corspearman + @test isequal(f(Xnan, Ynan[:,k]), + [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2), j in 1:1]) + else + @test isequal(f(Xnan, Ynan[:,k]), + [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2)]) + end + end +end + + +# Wrong dimensions + +@test_throws DimensionMismatch corspearman([1], [1, 2]) +@test_throws DimensionMismatch corspearman([1], [1 2; 3 4]) +@test_throws DimensionMismatch corspearman([1 2; 3 4], [1]) +@test_throws ArgumentError corspearman([1 2; 3 4: 4 6], [1 2; 3 4]) + +# TODO: fix corkendall to match corspearman (PR#659) +@test_throws ErrorException corkendall([1], [1, 2]) +@test_throws ErrorException corkendall([1], [1 2; 3 4]) +@test_throws ErrorException corkendall([1 2; 3 4], [1]) +@test_throws ArgumentError corkendall([1 2; 3 4: 4 6], [1 2; 3 4]) \ No newline at end of file From 20411f479bfc0e19078d606baec79534cb0b8729 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 13 Mar 2021 00:22:24 +0000 Subject: [PATCH 078/105] Make PValue and TestStat behave like Reals (#668) * make PValue and TestStat behave like numbers * add tests for TestStat show methods * make PValue real * fix CoefTable show method for PValue <: Real * add TestStat to imports * more operations Co-Authored-By: Milan Bouchet-Valat Co-authored-by: Alex Arslan * unify definitions, pass kwargs for isapprox, explicit isequal * approximately right for the last time * NaN NaN NaN, Batman! 
Co-authored-by: Milan Bouchet-Valat Co-authored-by: Alex Arslan --- src/statmodels.jl | 31 ++++++++++++++++++--- test/statmodels.jl | 69 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 5 deletions(-) diff --git a/src/statmodels.jl b/src/statmodels.jl index b0644fac..960c6fd8 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -277,7 +277,7 @@ Adjusted pseudo-coefficient of determination (adjusted pseudo R-squared). For nonlinear models, one of the several pseudo R² definitions must be chosen via `variant`. The only currently supported variants are `:MacFadden`, defined as ``1 - (\\log (L) - k)/\\log (L0)`` and -`:devianceratio`, defined as ``1 - (D/(n-k))/(D_0/(n-1))``. +`:devianceratio`, defined as ``1 - (D/(n-k))/(D_0/(n-1))``. In these formulas, ``L`` is the likelihood of the model, ``L0`` that of the null model (the model including only the intercept), ``D`` is the deviance of the model, ``D_0`` is the deviance of the null model, ``n`` is the number of observations (given by [`nobs`](@ref)) and @@ -319,7 +319,7 @@ response(model::RegressionModel) = error("response is not defined for $(typeof(m """ responsename(model::RegressionModel) - + Return the name of the model response (a.k.a. the dependent variable). 
""" responsename(model::RegressionModel) = error("responsename is not defined for $(typeof(model)).") @@ -451,7 +451,7 @@ end Show a p-value using 6 characters, either using the standard 0.XXXX representation or as , :≥, :(isless), :(isequal)] # isless and < to place nice with NaN + @eval begin + Base.$op(x::Union{TestStat, PValue}, y::Real) = $op(x.v, y) + Base.$op(y::Real, x::Union{TestStat, PValue}) = $op(y, x.v) + Base.$op(x1::Union{TestStat, PValue}, x2::Union{TestStat, PValue}) = $op(x1.v, x2.v) + end +end + +# necessary to avoid a method ambiguity with isless(::TestStat, NaN) +Base.isless(x::Union{TestStat, PValue}, y::AbstractFloat) = isless(x.v, y) +Base.isless(y::AbstractFloat, x::Union{TestStat, PValue},) = isless(y, x.v) +Base.isequal(y::AbstractFloat, x::Union{TestStat, PValue}) = isequal(y, x.v) +Base.isequal(x::Union{TestStat, PValue}, y::AbstractFloat) = isequal(x.v, y) + + +Base.isapprox(x::Union{TestStat, PValue}, y::Real; kwargs...) = isapprox(x.v, y; kwargs...) +Base.isapprox(y::Real, x::Union{TestStat, PValue}; kwargs...) = isapprox(y, x.v; kwargs...) +Base.isapprox(x1::Union{TestStat, PValue}, x2::Union{TestStat, PValue}; kwargs...) = isapprox(x1.v, x2.v; kwargs...) + """Wrap a string so that show omits quotes""" struct NoQuote @@ -493,7 +516,7 @@ function show(io::IO, ct::CoefTable) rownms = [lpad("[$i]",floor(Integer, log10(nr))+3) for i in 1:nr] end mat = [j == 1 ? NoQuote(rownms[i]) : - j-1 == ct.pvalcol ? PValue(cols[j-1][i]) : + j-1 == ct.pvalcol ? NoQuote(sprint(show, PValue(cols[j-1][i]))) : j-1 in ct.teststatcol ? TestStat(cols[j-1][i]) : cols[j-1][i] isa AbstractString ? 
NoQuote(cols[j-1][i]) : cols[j-1][i] for i in 1:nr, j in 1:nc+1] diff --git a/test/statmodels.jl b/test/statmodels.jl index 918d89f7..82234d9b 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -1,5 +1,5 @@ using StatsBase -using StatsBase: PValue +using StatsBase: PValue, TestStat using Test, Random v1 = [1.45666, -23.14, 1.56734e-13] @@ -63,6 +63,73 @@ end @test_throws ErrorException PValue(-0.1) @test_throws ErrorException PValue(1.1) @test PValue(PValue(0.05)) === PValue(0.05) +@test isless(PValue(0.01), 0.05) +@test isless(PValue(0.01), NaN) == isless(0.01, NaN) +@test (PValue(0.01) < NaN) == (0.01 < NaN) +@test isless(NaN, PValue(0.01)) == isless(NaN, 0.01) +@test (NaN < PValue(0.01)) == (NaN < 0.01) +@test isequal(NaN, PValue(0.01)) == isequal(NaN, 0.01) +@test (NaN == PValue(0.01)) == (NaN == 0.01) +@test isequal(PValue(0.01), NaN) == isequal(0.01, NaN) +@test (PValue(0.01) == NaN) == (0.01 == NaN) +@test isequal(PValue(0.05), 0.05) +@test isapprox(PValue(0.05), 0.05) +@test PValue(0.05) <= 0.05 +@test PValue(0.1) > 0.05 +@test PValue(0.1) >= PValue(0.05) +@test PValue(0.05) <= PValue(0.05) +@test PValue(0.1) > PValue(0.05) +@test PValue(0.1) >= PValue(0.05) +@test 0.1 >= PValue(0.05) +@test 0.05 <= PValue(0.05) +@test 0.1 > PValue(0.05) +@test 0.1 >= PValue(0.05) +# exact equality should hold here since it's the exact same atomic operations +@test float(PValue(Rational(1,3))) == float(1/3) +@test PValue(Rational(1,3)) == Rational(1,3) +@test PValue(Rational(1,3)) ≈ 1/3 +@test PValue(Rational(1,3)) == PValue(Rational(1,3)) +@test PValue(Rational(1,3)) ≈ PValue(1/3) +@test Rational(1,3) == PValue(Rational(1,3)) +@test Rational(1,3) ≈ PValue(1/3) atol=0.01 +@test PValue(Rational(1,3)) isa Real + +@test sprint(show, TestStat(1e-1)) == "0.10" +@test sprint(show, TestStat(1e-5)) == "0.00" +@test sprint(show, TestStat(π)) == "3.14" +@test TestStat(TestStat(0.05)) === TestStat(0.05) +@test isless(TestStat(0.01), 0.05) +@test isless(TestStat(0.01), NaN) == 
isless(0.01, NaN) +@test (TestStat(0.01) < NaN) == (0.01 < NaN) +@test isless(NaN, TestStat(0.01)) == isless(NaN, 0.01) +@test (NaN < TestStat(0.01)) == (NaN < 0.01) +@test isequal(TestStat(0.01), NaN) == isequal(0.01, NaN) +@test (TestStat(0.01) == NaN) == (0.01 == NaN) +@test isequal(NaN, TestStat(0.01)) == isequal(NaN, 0.01) +@test (NaN == TestStat(0.01)) == (NaN == 0.01) +@test isequal(TestStat(0.05), 0.05) + +@test isapprox(TestStat(0.05), 0.05) +@test TestStat(0.05) <= 0.05 +@test TestStat(0.1) > 0.05 +@test TestStat(0.1) >= TestStat(0.05) +@test TestStat(0.05) <= TestStat(0.05) +@test TestStat(0.1) > TestStat(0.05) +@test TestStat(0.1) >= TestStat(0.05) +@test 0.1 >= TestStat(0.05) +@test 0.05 <= TestStat(0.05) +@test 0.1 > TestStat(0.05) +@test 0.1 >= TestStat(0.05) +# exact equality should hold here since it's the exact same atomic operations +@test float(TestStat(Rational(1,3))) == float(1/3) +@test TestStat(Rational(1,3)) == Rational(1,3) +@test TestStat(Rational(1,3)) ≈ 1/3 +@test TestStat(Rational(1,3)) == TestStat(Rational(1,3)) +@test TestStat(Rational(1,3)) ≈ TestStat(1/3) +@test Rational(1,3) == TestStat(Rational(1,3)) +@test Rational(1,3) ≈ TestStat(1/3) +@test TestStat(Rational(1,3)) isa Real +@test TestStat(π) ≈ 3.14 atol=0.01 @test sprint(showerror, ConvergenceException(10)) == "failure to converge after 10 iterations." 
From 7089961ffd95bfe62a0b57896530edd778ba05d4 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 14 Mar 2021 21:21:59 +0000 Subject: [PATCH 079/105] NaN test for Real-ness for PValue and TestStat (#669) * NaN test for Real-ness for PValue and TestStat * isless * this is why we have loops --- test/statmodels.jl | 84 ++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 63 deletions(-) diff --git a/test/statmodels.jl b/test/statmodels.jl index 82234d9b..506d2630 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -62,74 +62,32 @@ end @test sprint(show, PValue(NaN)) == "NaN" @test_throws ErrorException PValue(-0.1) @test_throws ErrorException PValue(1.1) -@test PValue(PValue(0.05)) === PValue(0.05) -@test isless(PValue(0.01), 0.05) -@test isless(PValue(0.01), NaN) == isless(0.01, NaN) -@test (PValue(0.01) < NaN) == (0.01 < NaN) -@test isless(NaN, PValue(0.01)) == isless(NaN, 0.01) -@test (NaN < PValue(0.01)) == (NaN < 0.01) -@test isequal(NaN, PValue(0.01)) == isequal(NaN, 0.01) -@test (NaN == PValue(0.01)) == (NaN == 0.01) -@test isequal(PValue(0.01), NaN) == isequal(0.01, NaN) -@test (PValue(0.01) == NaN) == (0.01 == NaN) -@test isequal(PValue(0.05), 0.05) -@test isapprox(PValue(0.05), 0.05) -@test PValue(0.05) <= 0.05 -@test PValue(0.1) > 0.05 -@test PValue(0.1) >= PValue(0.05) -@test PValue(0.05) <= PValue(0.05) -@test PValue(0.1) > PValue(0.05) -@test PValue(0.1) >= PValue(0.05) -@test 0.1 >= PValue(0.05) -@test 0.05 <= PValue(0.05) -@test 0.1 > PValue(0.05) -@test 0.1 >= PValue(0.05) -# exact equality should hold here since it's the exact same atomic operations -@test float(PValue(Rational(1,3))) == float(1/3) -@test PValue(Rational(1,3)) == Rational(1,3) -@test PValue(Rational(1,3)) ≈ 1/3 -@test PValue(Rational(1,3)) == PValue(Rational(1,3)) -@test PValue(Rational(1,3)) ≈ PValue(1/3) -@test Rational(1,3) == PValue(Rational(1,3)) -@test Rational(1,3) ≈ PValue(1/3) atol=0.01 -@test PValue(Rational(1,3)) isa Real +@test 
sprint(show, TestStat(NaN)) == "NaN" @test sprint(show, TestStat(1e-1)) == "0.10" @test sprint(show, TestStat(1e-5)) == "0.00" @test sprint(show, TestStat(π)) == "3.14" -@test TestStat(TestStat(0.05)) === TestStat(0.05) -@test isless(TestStat(0.01), 0.05) -@test isless(TestStat(0.01), NaN) == isless(0.01, NaN) -@test (TestStat(0.01) < NaN) == (0.01 < NaN) -@test isless(NaN, TestStat(0.01)) == isless(NaN, 0.01) -@test (NaN < TestStat(0.01)) == (NaN < 0.01) -@test isequal(TestStat(0.01), NaN) == isequal(0.01, NaN) -@test (TestStat(0.01) == NaN) == (0.01 == NaN) -@test isequal(NaN, TestStat(0.01)) == isequal(NaN, 0.01) -@test (NaN == TestStat(0.01)) == (NaN == 0.01) -@test isequal(TestStat(0.05), 0.05) -@test isapprox(TestStat(0.05), 0.05) -@test TestStat(0.05) <= 0.05 -@test TestStat(0.1) > 0.05 -@test TestStat(0.1) >= TestStat(0.05) -@test TestStat(0.05) <= TestStat(0.05) -@test TestStat(0.1) > TestStat(0.05) -@test TestStat(0.1) >= TestStat(0.05) -@test 0.1 >= TestStat(0.05) -@test 0.05 <= TestStat(0.05) -@test 0.1 > TestStat(0.05) -@test 0.1 >= TestStat(0.05) -# exact equality should hold here since it's the exact same atomic operations -@test float(TestStat(Rational(1,3))) == float(1/3) -@test TestStat(Rational(1,3)) == Rational(1,3) -@test TestStat(Rational(1,3)) ≈ 1/3 -@test TestStat(Rational(1,3)) == TestStat(Rational(1,3)) -@test TestStat(Rational(1,3)) ≈ TestStat(1/3) -@test Rational(1,3) == TestStat(Rational(1,3)) -@test Rational(1,3) ≈ TestStat(1/3) -@test TestStat(Rational(1,3)) isa Real -@test TestStat(π) ≈ 3.14 atol=0.01 +@testset "Union{PValue, TestStat} is Real" begin + vals = [0.0, Rational(1,3), NaN] + for T in [PValue, TestStat], + f in (==, <, ≤, >, ≥, isless, isequal), + lhs in vals, rhs in vals + # make sure that T behaves like a Real, + # regardless of whether it's on the LHS, RHS or both + @test f(T(lhs), T(rhs)) == f(lhs, rhs) + @test f(lhs, T(rhs)) == f(lhs, rhs) + @test f(T(lhs), rhs) == f(lhs, rhs) + end + + # the (approximate) equality 
operators get a bit more attention + for T in [PValue, TestStat] + @test T(Rational(1,3)) ≈ T(1/3) + @test Rational(1,3) ≈ T(1/3) atol=0.01 + @test T(Rational(1,3)) isa Real + @test T(T(0.05)) === T(0.05) + end +end @test sprint(showerror, ConvergenceException(10)) == "failure to converge after 10 iterations." From f32d0f1f6b6fc2f822424f34e5e7e550f85897bd Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Mon, 15 Mar 2021 14:23:03 +0000 Subject: [PATCH 080/105] Base.hash for test statistics (#670) * Base.hash for test statistics * woops, simple equality --- src/statmodels.jl | 3 ++- test/statmodels.jl | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/statmodels.jl b/src/statmodels.jl index 960c6fd8..8641b7eb 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -489,13 +489,14 @@ for op in [:(==), :<, :≤, :>, :≥, :(isless), :(isequal)] # isless and < to p end end +Base.hash(x::Union{TestStat, PValue}, h::UInt) = hash(x.v, h) + # necessary to avoid a method ambiguity with isless(::TestStat, NaN) Base.isless(x::Union{TestStat, PValue}, y::AbstractFloat) = isless(x.v, y) Base.isless(y::AbstractFloat, x::Union{TestStat, PValue},) = isless(y, x.v) Base.isequal(y::AbstractFloat, x::Union{TestStat, PValue}) = isequal(y, x.v) Base.isequal(x::Union{TestStat, PValue}, y::AbstractFloat) = isequal(x.v, y) - Base.isapprox(x::Union{TestStat, PValue}, y::Real; kwargs...) = isapprox(x.v, y; kwargs...) Base.isapprox(y::Real, x::Union{TestStat, PValue}; kwargs...) = isapprox(y, x.v; kwargs...) Base.isapprox(x1::Union{TestStat, PValue}, x2::Union{TestStat, PValue}; kwargs...) = isapprox(x1.v, x2.v; kwargs...) 
diff --git a/test/statmodels.jl b/test/statmodels.jl index 506d2630..581c5ea1 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -86,6 +86,8 @@ end @test Rational(1,3) ≈ T(1/3) atol=0.01 @test T(Rational(1,3)) isa Real @test T(T(0.05)) === T(0.05) + @test hash(T(0.05)) == hash(0.05) + @test hash(T(0.05), UInt(42)) == hash(0.05, UInt(42)) end end From 3f9030e72b21541df24bbb1e3ef823ef00d312e7 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Mon, 15 Mar 2021 21:48:38 +0000 Subject: [PATCH 081/105] Markdown show method for CoefTable (#664) * md show method for CoefTable * Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat * be less clever * default IO for MD show * change markdown coltitle padding * update md show for p being Real * no show for no IO * test case without rownames * patch bump Co-authored-by: Milan Bouchet-Valat --- Project.toml | 2 +- src/statmodels.jl | 48 ++++++++++++++++++++++++++++++++++++++++++++++ test/statmodels.jl | 28 +++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 71157984..f571bba4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.3" +version = "0.33.4" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" diff --git a/src/statmodels.jl b/src/statmodels.jl index 8641b7eb..cc1db8e7 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -543,6 +543,54 @@ function show(io::IO, ct::CoefTable) nothing end +function show(io::IO, ::MIME"text/markdown", ct::CoefTable) + cols = ct.cols; rownms = ct.rownms; colnms = ct.colnms; + nc = length(cols) + nr = length(cols[1]) + if length(rownms) == 0 + rownms = [lpad("[$i]",floor(Integer, log10(nr))+3) for i in 1:nr] + end + mat = [j == 1 ? NoQuote(rownms[i]) : + j-1 == ct.pvalcol ? NoQuote(sprint(show, PValue(cols[j-1][i]))) : + j-1 in ct.teststatcol ? 
TestStat(cols[j-1][i]) : + cols[j-1][i] isa AbstractString ? NoQuote(cols[j-1][i]) : cols[j-1][i] + for i in 1:nr, j in 1:nc+1] + # Code inspired by print_matrix in Base + io = IOContext(io, :compact=>true, :limit=>false) + A = Base.alignment(io, mat, 1:size(mat, 1), 1:size(mat, 2), + typemax(Int), typemax(Int), 3) + nmswidths = pushfirst!(length.(colnms), 0) + A = [nmswidths[i] > sum(A[i]) ? (A[i][1]+nmswidths[i]-sum(A[i]), A[i][2]) : A[i] + for i in 1:length(A)] + totwidth = sum(sum.(A)) + 2 * (length(A) - 1) + + # not using Markdown stdlib here because that won't give us nice decimal + # alignment (even if that is lost when rendering to HTML, it's still nice + # when looking at the markdown itself) + + print(io, '|', ' '^(sum(A[1])+1)) + for j in 1:length(colnms) + print(io, " | ", lpad(colnms[j], sum(A[j+1]))) + end + + println(io, " |") + print(io, '|', rpad(':', sum(A[1])+2, '-')) + for j in 1:length(colnms) + _pad = j-1 in [ct.teststatcol; ct.pvalcol] ? rpad : lpad + print(io, '|', _pad(':', sum(A[j+1])+2, '-')) + end + println(io, '|') + + for i in 1:size(mat, 1) + print(io, "| ") + Base.print_matrix_row(io, mat, A, i, 1:size(mat, 2), " | ") + print(io, " |") + i != size(mat, 1) && println(io) + end + + nothing +end + """ ConvergenceException(iters::Int, lastchange::Real=NaN, tol::Real=NaN) diff --git a/test/statmodels.jl b/test/statmodels.jl index 581c5ea1..c72da568 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -10,6 +10,10 @@ v5 = [0.12, 0.3467, 1.345e-16] ct = CoefTable(Any[v1, v2, v3, v4, v5], ["Estimate", "Comments", "df", "t", "p"], ["x1", "x2", "x3"], 5, 4) +ct_noname = CoefTable(Any[v1, v2, v3, v4, v5], + ["Estimate", "Comments", "df", "t", "p"], + [], 5, 4) + @test sprint(show, ct) == """ ─────────────────────────────────────────────── Estimate Comments df t p @@ -18,6 +22,30 @@ x1 1.45666 Good 1 -12.56 0.1200 x2 -23.14 Great 56 0.13 0.3467 x3 1.56734e-13 Bad 2 0.00 <1e-15 ───────────────────────────────────────────────""" + +@test 
sprint(show, ct_noname) == """ +──────────────────────────────────────────────── + Estimate Comments df t p +──────────────────────────────────────────────── +[1] 1.45666 Good 1 -12.56 0.1200 +[2] -23.14 Great 56 0.13 0.3467 +[3] 1.56734e-13 Bad 2 0.00 <1e-15 +────────────────────────────────────────────────""" + +@test sprint(show, MIME"text/markdown"(), ct) == """ +| | Estimate | Comments | df | t | p | +|:---|--------------:|---------:|---:|-------:|:-------| +| x1 | 1.45666 | Good | 1 | -12.56 | 0.1200 | +| x2 | -23.14 | Great | 56 | 0.13 | 0.3467 | +| x3 | 1.56734e-13 | Bad | 2 | 0.00 | <1e-15 |""" + +@test sprint(show, MIME"text/markdown"(), ct_noname) == """ +| | Estimate | Comments | df | t | p | +|:----|--------------:|---------:|---:|-------:|:-------| +| [1] | 1.45666 | Good | 1 | -12.56 | 0.1200 | +| [2] | -23.14 | Great | 56 | 0.13 | 0.3467 | +| [3] | 1.56734e-13 | Bad | 2 | 0.00 | <1e-15 |""" + @test length(ct) === 3 @test eltype(ct) == NamedTuple{(:Name, :Estimate, :Comments, :df, :t, :p), From 6ec45793591cb816187169ebd03c5e9edb214663 Mon Sep 17 00:00:00 2001 From: Fred Callaway Date: Mon, 29 Mar 2021 13:34:22 -0700 Subject: [PATCH 082/105] add replace and ordered keywords for sample! 
method (#633) (#636) --- src/sampling.jl | 5 +++-- test/sampling.jl | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/sampling.jl b/src/sampling.jl index 1417596f..b5314919 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -864,8 +864,9 @@ function sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::Abs end return x end -sample!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = - sample!(Random.GLOBAL_RNG, a, wv, x) +sample!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray; + replace::Bool=true, ordered::Bool=false) = + sample!(Random.GLOBAL_RNG, a, wv, x; replace=replace, ordered=ordered) sample(rng::AbstractRNG, a::AbstractArray{T}, wv::AbstractWeights, n::Integer; replace::Bool=true, ordered::Bool=false) where {T} = diff --git a/test/sampling.jl b/test/sampling.jl index 91623302..714ebdb1 100644 --- a/test/sampling.jl +++ b/test/sampling.jl @@ -215,3 +215,23 @@ Random.seed!(1); @test sample([1, 2], Weights([1, 1]), (2,2)) == ones(2,2) @test sample([1, 2], Weights([0, 1]), (2,2)) == [2 2 ; 2 2] @test sample(collect(1:4), Weights(1:4), (2,2), replace=false) == [4 1; 3 2] + + +#### check that sample and sample! do the same thing +function test_same(;kws...) + wv = Weights(rand(20)) + Random.seed!(1) + x1 = sample(1:20, wv, 10; kws...) + Random.seed!(1) + x2 = zeros(Int, 10) + sample!(1:20, wv, x2; kws...) + @test x1 == x2 +end + +test_same() +test_same(replace=true) +test_same(replace=false) +test_same(replace=true, ordered=true) +test_same(replace=false, ordered=true) +test_same(replace=true, ordered=false) +test_same(replace=false, ordered=false) From bc20566cb784d314a097127152c48cd6d7438c0c Mon Sep 17 00:00:00 2001 From: "Steven G. 
Johnson" Date: Wed, 7 Apr 2021 16:04:23 -0400 Subject: [PATCH 083/105] fix sequential sampling with replacement (#677) --- src/sampling.jl | 107 +++++++++++++++++++++++++++++++++++----------- test/sampling.jl | 27 ++++++++---- test/wsampling.jl | 32 ++++++++++---- 3 files changed, 124 insertions(+), 42 deletions(-) diff --git a/src/sampling.jl b/src/sampling.jl index b5314919..d12fd56e 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -42,6 +42,49 @@ function direct_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) end direct_sample!(a::AbstractArray, x::AbstractArray) = direct_sample!(Random.GLOBAL_RNG, a, x) +# check whether we can use T to store indices 1:n exactly, and +# use some heuristics to decide whether it is beneficial for k samples +# (true for a subset of hardware-supported numeric types) +_storeindices(n, k, ::Type{T}) where {T<:Integer} = n ≤ typemax(T) +_storeindices(n, k, ::Type{T}) where {T<:Union{Float32,Float64}} = k < 22 && n ≤ maxintfloat(T) +_storeindices(n, k, ::Type{Complex{T}}) where {T} = _storeindices(n, k, T) +_storeindices(n, k, ::Type{Rational{T}}) where {T} = k < 16 && _storeindices(n, k, T) +_storeindices(n, k, T) = false +storeindices(n, k, ::Type{T}) where {T<:Base.HWNumber} = _storeindices(n, k, T) +storeindices(n, k, T) = false + +# order results of a sampler that does not order automatically +function sample_ordered!(sampler!, rng::AbstractRNG, a::AbstractArray, x::AbstractArray) + n, k = length(a), length(x) + # todo: if eltype(x) <: Real && eltype(a) <: Real, + # in some cases it might be faster to check + # issorted(a) to see if we can just sort x + if storeindices(n, k, eltype(x)) + sort!(sampler!(rng, Base.OneTo(n), x), by=real, lt=<) + @inbounds for i = 1:k + x[i] = a[Int(x[i])] + end + else + indices = Array{Int}(undef, k) + sort!(sampler!(rng, Base.OneTo(n), indices)) + @inbounds for i = 1:k + x[i] = a[indices[i]] + end + end + return x +end + +# special case of a range can be done more efficiently 
+sample_ordered!(sampler!, rng::AbstractRNG, a::AbstractRange, x::AbstractArray) = + sort!(sampler!(rng, a, x), rev=step(a)<0) + +# weighted case: +sample_ordered!(sampler!, rng::AbstractRNG, a::AbstractArray, + wv::AbstractWeights, x::AbstractArray) = + sample_ordered!(rng, a, x) do rng, a, x + sampler!(rng, a, wv, x) + end + ### draw a pair of distinct integers in [1:n] """ @@ -396,21 +439,24 @@ Draw a random sample of `length(x)` elements from an array `a` and store the result in `x`. A polyalgorithm is used for sampling. Sampling probabilities are proportional to the weights given in `wv`, if provided. `replace` dictates whether sampling is performed with -replacement and `order` dictates whether an ordered sample, also called -a sequential sample, should be taken. +replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). """ function sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray; replace::Bool=true, ordered::Bool=false) + 1 == firstindex(a) == firstindex(x) || + throw(ArgumentError("non 1-based arrays are not supported")) n = length(a) k = length(x) k == 0 && return x if replace # with replacement if ordered - sort!(direct_sample!(rng, a, x)) + sample_ordered!(direct_sample!, rng, a, x) else direct_sample!(rng, a, x) end @@ -448,8 +494,9 @@ sample!(a::AbstractArray, x::AbstractArray; replace::Bool=true, ordered::Bool=fa Select a random, optionally weighted sample of size `n` from an array `a` using a polyalgorithm. Sampling probabilities are proportional to the weights given in `wv`, if provided. `replace` dictates whether sampling is performed -with replacement and `order` dictates whether an ordered sample, also called -a sequential sample, should be taken. +with replacement. 
`ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). @@ -468,8 +515,9 @@ sample(a::AbstractArray, n::Integer; replace::Bool=true, ordered::Bool=false) = Select a random, optionally weighted sample from an array `a` specifying the dimensions `dims` of the output array. Sampling probabilities are proportional to the weights given in `wv`, if provided. `replace` dictates -whether sampling is performed with replacement and `order` dictates whether -an ordered sample, also called a sequential sample, should be taken. +whether sampling is performed with replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). @@ -781,7 +829,8 @@ Noting `k=length(x)` and `n=length(a)`, this algorithm takes ``O(k \\log(k) \\lo processing time to draw ``k`` elements. It consumes ``O(k \\log(n / k))`` random numbers. 
""" function efraimidis_aexpj_wsample_norep!(rng::AbstractRNG, a::AbstractArray, - wv::AbstractWeights, x::AbstractArray) + wv::AbstractWeights, x::AbstractArray; + ordered::Bool=false) n = length(a) length(wv) == n || throw(DimensionMismatch("a and wv must be of same length (got $n and $(length(wv))).")) k = length(x) @@ -824,24 +873,36 @@ function efraimidis_aexpj_wsample_norep!(rng::AbstractRNG, a::AbstractArray, threshold = pq[1].first X = threshold * randexp(rng) end - - # fill output array with items in descending order - @inbounds for i in k:-1:1 - x[i] = a[heappop!(pq).second] + if ordered + # fill output array with items sorted as in a + sort!(pq, by=last) + @inbounds for i in 1:k + x[i] = a[pq[i].second] + end + else + # fill output array with items in descending order + @inbounds for i in k:-1:1 + x[i] = a[heappop!(pq).second] + end end return x end -efraimidis_aexpj_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = - efraimidis_aexpj_wsample_norep!(Random.GLOBAL_RNG, a, wv, x) +efraimidis_aexpj_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray; + ordered::Bool=false) = + efraimidis_aexpj_wsample_norep!(Random.GLOBAL_RNG, a, wv, x; ordered=ordered) function sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::AbstractArray; replace::Bool=true, ordered::Bool=false) + 1 == firstindex(a) == firstindex(wv) == firstindex(x) || + throw(ArgumentError("non 1-based arrays are not supported")) n = length(a) k = length(x) if replace if ordered - sort!(direct_sample!(rng, a, wv, x)) + sample_ordered!(rng, a, wv, x) do rng, a, wv, x + sample!(rng, a, wv, x; replace=true, ordered=false) + end else if n < 40 direct_sample!(rng, a, wv, x) @@ -856,11 +917,7 @@ function sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::Abs end else k <= n || error("Cannot draw $n samples from $k samples without replacement.") - - efraimidis_aexpj_wsample_norep!(rng, a, wv, x) - if ordered - sort!(x) - end + 
efraimidis_aexpj_wsample_norep!(rng, a, wv, x; ordered=ordered) end return x end @@ -889,8 +946,9 @@ sample(a::AbstractArray, wv::AbstractWeights, dims::Dims; Select a weighted sample from an array `a` and store the result in `x`. Sampling probabilities are proportional to the weights given in `w`. `replace` dictates -whether sampling is performed with replacement and `order` dictates whether an -ordered sample, also called a sequential sample, should be taken. +whether sampling is performed with replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). @@ -923,8 +981,9 @@ wsample(a::AbstractArray, w::RealVector) = wsample(Random.GLOBAL_RNG, a, w) Select a weighted random sample of size `n` from `a` with probabilities proportional to the weights given in `w` if `a` is present, otherwise select a random sample of size `n` of the weights given in `w`. `replace` dictates whether sampling is performed with -replacement and `order` dictates whether an ordered sample, also called a sequential -sample, should be taken. +replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). 
diff --git a/test/sampling.jl b/test/sampling.jl index 714ebdb1..66bb0231 100644 --- a/test/sampling.jl +++ b/test/sampling.jl @@ -27,19 +27,19 @@ end #### sample with replacement -function check_sample_wrep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false) +function check_sample_wrep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false, rev::Bool=false) vmin, vmax = vrgn (amin, amax) = extrema(a) @test vmin <= amin <= amax <= vmax n = vmax - vmin + 1 p0 = fill(1/n, n) if ordered - @test issorted(a) + @test issorted(a; rev=rev) if ptol > 0 @test isapprox(proportions(a, vmin:vmax), p0, atol=ptol) end else - @test !issorted(a) + @test !issorted(a; rev=rev) ncols = size(a,2) if ncols == 1 @test isapprox(proportions(a, vmin:vmax), p0, atol=ptol) @@ -68,11 +68,17 @@ test_rng_use(direct_sample!, 1:10, zeros(Int, 6)) a = sample(3:12, n) check_sample_wrep(a, (3, 12), 5.0e-3; ordered=false) -a = sample(3:12, n; ordered=true) -check_sample_wrep(a, (3, 12), 5.0e-3; ordered=true) +for rev in (true, false), T in (Int, Int16, Float64, Float16, BigInt, ComplexF64, Rational{Int}) + r = rev ? reverse(3:12) : (3:12) + r = T===Int ? 
r : T.(r) + aa = Int.(sample(r, n; ordered=true)) + check_sample_wrep(aa, (3, 12), 5.0e-3; ordered=true, rev=rev) -a = sample(3:12, 10; ordered=true) -check_sample_wrep(a, (3, 12), 0; ordered=true) + aa = Int.(sample(r, 10; ordered=true)) + check_sample_wrep(aa, (3, 12), 0; ordered=true, rev=rev) +end + +@test StatsBase._storeindices(1, 1, BigFloat) == StatsBase._storeindices(1, 1, BigFloat) == false test_rng_use(sample, 1:10, 10) @@ -91,7 +97,7 @@ test_rng_use(samplepair, 1000) #### sample without replacement -function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false) +function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false, rev::Bool=false) # each column of a for one run vmin, vmax = vrgn @@ -103,7 +109,7 @@ function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=fa aj = view(a,:,j) @assert allunique(aj) if ordered - @assert issorted(aj) + @assert issorted(aj, rev=rev) end end @@ -178,6 +184,9 @@ check_sample_norep(a, (3, 12), 0; ordered=false) a = sample(3:12, 5; replace=false, ordered=true) check_sample_norep(a, (3, 12), 0; ordered=true) +a = sample(reverse(3:12), 5; replace=false, ordered=true) +check_sample_norep(a, (3, 12), 0; ordered=true, rev=true) + # tests of multidimensional sampling a = sample(3:12, (2, 2); replace=false) diff --git a/test/wsampling.jl b/test/wsampling.jl index fd9e6cec..5ff725f7 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -5,19 +5,21 @@ Random.seed!(1234) #### weighted sample with replacement -function check_wsample_wrep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; ordered::Bool=false) +function check_wsample_wrep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; + ordered::Bool=false, rev::Bool=false) K = length(wv) (vmin, vmax) = vrgn (amin, amax) = extrema(a) @test vmin <= amin <= amax <= vmax p0 = wv ./ sum(wv) + rev && reverse!(p0) if ordered - @test issorted(a) + @test issorted(a; rev=rev) if ptol > 0 @test 
isapprox(proportions(a, vmin:vmax), p0, atol=ptol) end else - @test !issorted(a) + @test !issorted(a; rev=rev) ncols = size(a,2) if ncols == 1 @test isapprox(proportions(a, vmin:vmax), p0, atol=ptol) @@ -45,13 +47,20 @@ check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) a = sample(4:7, wv, n; ordered=false) check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) -a = sample(4:7, wv, n; ordered=true) -check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=true) +for rev in (true, false), T in (Int, Int16, Float64, Float16, BigInt, ComplexF64, Rational{Int}) + r = rev ? reverse(4:7) : (4:7) + r = T===Int ? r : T.(r) + aa = Int.(sample(r, wv, n; ordered=true)) + check_wsample_wrep(aa, (4, 7), wv, 5.0e-3; ordered=true, rev=rev) + aa = Int.(sample(r, wv, 10; ordered=true)) + check_wsample_wrep(aa, (4, 7), wv, -1; ordered=true, rev=rev) +end #### weighted sampling without replacement -function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; ordered::Bool=false) +function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; + ordered::Bool=false, rev::Bool=false) # each column of a for one run vmin, vmax = vrgn @@ -63,12 +72,13 @@ function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol:: aj = view(a,:,j) @assert allunique(aj) if ordered - @assert issorted(aj) + @assert issorted(aj; rev=rev) end end if ptol > 0 p0 = wv ./ sum(wv) + rev && reverse!(p0) @test isapprox(proportions(a[1,:], vmin:vmax), p0, atol=ptol) end end @@ -110,5 +120,9 @@ test_rng_use(efraimidis_aexpj_wsample_norep!, 4:7, wv, zeros(Int, 2)) a = sample(4:7, wv, 3; replace=false, ordered=false) check_wsample_norep(a, (4, 7), wv, -1; ordered=false) -a = sample(4:7, wv, 3; replace=false, ordered=true) -check_wsample_norep(a, (4, 7), wv, -1; ordered=true) +for rev in (true, false), T in (Int, Int16, Float64, Float16, BigInt, ComplexF64, Rational{Int}) + r = rev ? reverse(4:7) : (4:7) + r = T===Int ? 
r : T.(r) + aa = Int.(sample(r, wv, 3; replace=false, ordered=true)) + check_wsample_norep(aa, (4, 7), wv, -1; ordered=true, rev=rev) +end From f2a067a8516aaf3f8ed80ad00d6d0cbe9029d953 Mon Sep 17 00:00:00 2001 From: ericqu Date: Wed, 7 Apr 2021 22:47:44 +0200 Subject: [PATCH 084/105] add cooksdistance (#679) Originate from the addition of the cooksdistance in GLM.jl for LinearModel. Discussed in https://github.com/JuliaStats/GLM.jl/pull/415 --- Project.toml | 2 +- docs/src/statmodels.md | 1 + src/statmodels.jl | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f571bba4..9df1dc37 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.4" +version = "0.33.5" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" diff --git a/docs/src/statmodels.md b/docs/src/statmodels.md index a4d9c050..473d9766 100644 --- a/docs/src/statmodels.md +++ b/docs/src/statmodels.md @@ -38,6 +38,7 @@ crossmodelmatrix dof_residual fitted leverage +cooksdistance meanresponse modelmatrix response diff --git a/src/statmodels.jl b/src/statmodels.jl index cc1db8e7..708a6c8f 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -353,6 +353,15 @@ Return the diagonal of the projection matrix of the model. """ leverage(model::RegressionModel) = error("leverage is not defined for $(typeof(model)).") +""" + cooksdistance(model::RegressionModel) + +Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) +for each observation in linear model `model`, giving an estimate of the influence +of each data point. 
+""" +cooksdistance(model::RegressionModel) = error("cooksdistance is not defined for $(typeof(model)).") + """ residuals(model::RegressionModel) From 9865dc03401b56941d597d2128fdb51c380b8eb7 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 11 Apr 2021 17:02:55 +0200 Subject: [PATCH 085/105] Export cooksdistance (#681) f2a067a forgot that step. --- src/StatsBase.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index bd4df36b..70c812c0 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -172,6 +172,7 @@ export coefnames, coeftable, confint, + cooksdistance, crossmodelmatrix, deviance, dof, From b1f4c7e9fe049a5c81257a556474e2b4226f0fd2 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Thu, 15 Apr 2021 21:40:49 +0100 Subject: [PATCH 086/105] corrected error message in corrspearman (#682) --- src/rankcorr.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 714548d5..3751b067 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -91,7 +91,7 @@ end function corspearman(X::RealMatrix, Y::RealMatrix) size(X, 1) == size(Y, 1) || - throw(ArgumentError("number of columns in each array must match")) + throw(ArgumentError("number of rows in each array must match")) nr = size(X, 2) nc = size(Y, 2) C = Matrix{Float64}(undef, nr, nc) From f11bafa6fb3dd870704f12ca695795812a3e7daf Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 22 Apr 2021 19:16:54 +0000 Subject: [PATCH 087/105] compathelper privelege (#686) --- .github/workflows/CompatHelper.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 68dbe39c..15d97fda 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -21,4 +21,5 @@ jobs: - name: CompatHelper.main() env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ 
secrets.DOCUMENTER_KEY }} run: julia -e 'using CompatHelper; CompatHelper.main()' From 2993089b4f807e86f6574b3436f80fe94d3a9d57 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 22 Apr 2021 19:50:14 +0000 Subject: [PATCH 088/105] CompatHelper: bump compat for "Missings" to "1.0" (#684) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Phillip Alday --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 9df1dc37..25a726b1 100644 --- a/Project.toml +++ b/Project.toml @@ -17,7 +17,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] DataAPI = "1" DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17, 0.18" -Missings = "0.3, 0.4" +Missings = "0.3, 0.4, 1.0" SortingAlgorithms = "0.3" julia = "1" From e6a729a62f9d2d68478c15cb534e52b9ff63a5ae Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 23 Apr 2021 23:35:33 +0200 Subject: [PATCH 089/105] Bump version to 0.33.6 (#689) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 25a726b1..883052c1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.5" +version = "0.33.6" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From baf223c0fc76c62d3db538d14751aa3397cb9245 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Mon, 26 Apr 2021 20:01:34 +0000 Subject: [PATCH 090/105] pointwise loglikelihood (#685) * pointwiseloglikelihood * multiple dispatch instead of names * remove export * Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat Co-authored-by: Milan Bouchet-Valat --- src/statmodels.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/statmodels.jl b/src/statmodels.jl index 
708a6c8f..0e2b4af2 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -76,6 +76,25 @@ This is usually the model containing only the intercept. nullloglikelihood(model::StatisticalModel) = error("nullloglikelihood is not defined for $(typeof(model)).") +""" + loglikelihood(model::StatisticalModel, ::Colon) + +Return a vector of each observation's contribution to the log-likelihood of the model. +In other words, this is the vector of the pointwise log-likelihood contributions. + +In general, `sum(loglikelihood(model, :)) == loglikelihood(model)`. +""" +loglikelihood(model::StatisticalModel, ::Colon) = + error("loglikelihood(model::StatisticalModel, ::Colon) is not defined for $(typeof(model)).") + +""" + loglikelihood(model::StatisticalModel, observation) + +Return the contribution of `observation` to the log-likelihood of `model`. +""" +loglikelihood(model::StatisticalModel, observation) = + error("loglikelihood(model::StatisticalModel, observation) is not defined for $(typeof(model)).") + """ score(model::StatisticalModel) From 9facdd268b4356eb2f1d85470cc527c86f1e1cf4 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Sat, 1 May 2021 13:24:32 +0200 Subject: [PATCH 091/105] Update CompatHelper.yml --- .github/workflows/CompatHelper.yml | 33 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 15d97fda..7784f241 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -1,25 +1,26 @@ name: CompatHelper - on: schedule: - - cron: '00 * * * *' - + - cron: 0 0 * * * + workflow_dispatch: jobs: CompatHelper: - runs-on: ${{ matrix.os }} - strategy: - matrix: - julia-version: [1.2.0] - julia-arch: [x86] - os: [ubuntu-latest] + runs-on: ubuntu-latest steps: - - uses: julia-actions/setup-julia@latest - with: - version: ${{ matrix.julia-version }} - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; 
Pkg.add("CompatHelper")' - - name: CompatHelper.main() + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "2" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} From c92ada988f0849353dc00a4b87574bc213897ff6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 1 May 2021 13:25:41 +0200 Subject: [PATCH 092/105] CompatHelper: bump compat for "SortingAlgorithms" to "1.0" (#690) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 883052c1..7d3eb9fa 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" DataAPI = "1" DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17, 0.18" Missings = "0.3, 0.4, 1.0" -SortingAlgorithms = "0.3" +SortingAlgorithms = "0.3, 1.0" julia = "1" [extras] From 45d65ec8fd64fb2601c1bf3570c47a107e709649 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Sat, 1 May 2021 13:32:14 +0200 Subject: [PATCH 093/105] Release 0.33.7 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 7d3eb9fa..f1a4b3a6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.6" +version = "0.33.7" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From d18762ce03fc92028f9ec9cd51d5ef026eda5e06 Mon Sep 17 
00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 2 May 2021 16:37:13 +0200 Subject: [PATCH 094/105] Add pairwise (#627) This generic method takes iterators of vectors and supports skipping missing values. It is a more general version of `pairwise` in Distances.jl. Since methods are compatible, both packages can override a common empty function defined in StatsAPI. --- Project.toml | 2 + docs/src/misc.md | 2 + src/StatsBase.jl | 4 + src/pairwise.jl | 313 +++++++++++++++++++++++++++++++++++++++++++++++ test/pairwise.jl | 261 +++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 6 files changed, 583 insertions(+) create mode 100644 src/pairwise.jl create mode 100644 test/pairwise.jl diff --git a/Project.toml b/Project.toml index f1a4b3a6..8b7b3137 100644 --- a/Project.toml +++ b/Project.toml @@ -13,12 +13,14 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" [compat] DataAPI = "1" DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17, 0.18" Missings = "0.3, 0.4, 1.0" SortingAlgorithms = "0.3, 1.0" +StatsAPI = "1" julia = "1" [extras] diff --git a/docs/src/misc.md b/docs/src/misc.md index 79b55e06..66c84028 100644 --- a/docs/src/misc.md +++ b/docs/src/misc.md @@ -7,4 +7,6 @@ levelsmap indexmap indicatormat StatsBase.midpoints +pairwise +pairwise! ``` diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 70c812c0..2d234449 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -19,6 +19,7 @@ import LinearAlgebra: BlasReal, BlasFloat import Statistics: mean, mean!, var, varm, varm!, std, stdm, cov, covm, cor, corm, cov2cor!, unscaled_covzm, quantile, sqrt!, median, middle +import StatsAPI: pairwise, pairwise! 
## tackle compatibility issues @@ -157,6 +158,8 @@ export indexmap, # construct a map from element to index levelsmap, # construct a map from n unique elements to [1, ..., n] indicatormat, # construct indicator matrix + pairwise, # pairwise application of functions + pairwise!, # pairwise! application of functions # statistical models CoefTable, @@ -228,6 +231,7 @@ include("signalcorr.jl") include("partialcor.jl") include("empirical.jl") include("hist.jl") +include("pairwise.jl") include("misc.jl") include("sampling.jl") diff --git a/src/pairwise.jl b/src/pairwise.jl new file mode 100644 index 00000000..97e51a3f --- /dev/null +++ b/src/pairwise.jl @@ -0,0 +1,313 @@ +function _pairwise!(::Val{:none}, f, dest::AbstractMatrix, x, y, symmetric::Bool) + @inbounds for (i, xi) in enumerate(x), (j, yj) in enumerate(y) + symmetric && i > j && continue + + # For performance, diagonal is special-cased + if f === cor && eltype(dest) !== Union{} && i == j && xi === yj + # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 + dest[i, j] = float(cor(xi)) + else + dest[i, j] = f(xi, yj) + end + end + if symmetric + m, n = size(dest) + @inbounds for j in 1:n, i in (j+1):m + dest[i, j] = dest[j, i] + end + end + return dest +end + +function check_vectors(x, y, skipmissing::Symbol) + m = length(x) + n = length(y) + if !(all(xi -> xi isa AbstractVector, x) && all(yi -> yi isa AbstractVector, y)) + throw(ArgumentError("All entries in x and y must be vectors " * + "when skipmissing=:$skipmissing")) + end + if m > 1 + indsx = keys(first(x)) + for i in 2:m + keys(x[i]) == indsx || + throw(ArgumentError("All input vectors must have the same indices")) + end + end + if n > 1 + indsy = keys(first(y)) + for j in 2:n + keys(y[j]) == indsy || + throw(ArgumentError("All input vectors must have the same indices")) + end + end + if m > 1 && n > 1 + indsx == indsy || + throw(ArgumentError("All input vectors must have the same indices")) + end +end + +function 
_pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) + check_vectors(x, y, :pairwise) + @inbounds for (j, yj) in enumerate(y) + ynminds = .!ismissing.(yj) + @inbounds for (i, xi) in enumerate(x) + symmetric && i > j && continue + + if xi === yj + ynm = view(yj, ynminds) + # For performance, diagonal is special-cased + if f === cor && eltype(dest) !== Union{} && i == j + # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 + dest[i, j] = float(cor(xi)) + else + dest[i, j] = f(ynm, ynm) + end + else + nminds = .!ismissing.(xi) .& ynminds + xnm = view(xi, nminds) + ynm = view(yj, nminds) + dest[i, j] = f(xnm, ynm) + end + end + end + if symmetric + m, n = size(dest) + @inbounds for j in 1:n, i in (j+1):m + dest[i, j] = dest[j, i] + end + end + return dest +end + +function _pairwise!(::Val{:listwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) + check_vectors(x, y, :listwise) + m, n = size(dest) + nminds = .!ismissing.(first(x)) + @inbounds for xi in Iterators.drop(x, 1) + nminds .&= .!ismissing.(xi) + end + if x !== y + @inbounds for yj in y + nminds .&= .!ismissing.(yj) + end + end + + # Computing integer indices once for all vectors is faster + nminds′ = findall(nminds) + # TODO: check whether wrapping views in a custom array type which asserts + # that entries cannot be `missing` (similar to `skipmissing`) + # could offer better performance + return _pairwise!(Val(:none), f, dest, + [view(xi, nminds′) for xi in x], + [view(yi, nminds′) for yi in y], + symmetric) +end + +function _pairwise!(f, dest::AbstractMatrix, x, y; + symmetric::Bool=false, skipmissing::Symbol=:none) + if !(skipmissing in (:none, :pairwise, :listwise)) + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) + end + + x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? 
y : collect(y) + m = length(x′) + n = length(y′) + + size(dest) != (m, n) && + throw(DimensionMismatch("dest has dimensions $(size(dest)) but expected ($m, $n)")) + + Base.has_offset_axes(dest) && throw("dest indices must start at 1") + + return _pairwise!(Val(skipmissing), f, dest, x′, y′, symmetric) +end + +function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmissing} + x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y) + m = length(x′) + n = length(y′) + + T = Core.Compiler.return_type(f, Tuple{eltype(x′), eltype(y′)}) + Tsm = Core.Compiler.return_type((x, y) -> f(disallowmissing(x), disallowmissing(y)), + Tuple{eltype(x′), eltype(y′)}) + + if skipmissing === :none + dest = Matrix{T}(undef, m, n) + elseif skipmissing in (:pairwise, :listwise) + dest = Matrix{Tsm}(undef, m, n) + else + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) + end + + # Preserve inferred element type + isempty(dest) && return dest + + _pairwise!(f, dest, x′, y′, symmetric=symmetric, skipmissing=skipmissing) + + if isconcretetype(eltype(dest)) + return dest + else + # Final eltype depends on actual contents (consistent with map and broadcast) + U = mapreduce(typeof, promote_type, dest) + # V is inferred (contrary to U), but it only gives an upper bound for U + V = promote_type(T, Tsm) + return convert(Matrix{U}, dest)::Matrix{<:V} + end +end + +""" + pairwise!(f, dest::AbstractMatrix, x[, y]; + symmetric::Bool=false, skipmissing::Symbol=:none) + +Store in matrix `dest` the result of applying `f` to all possible pairs +of entries in iterators `x` and `y`, and return it. Rows correspond to +entries in `x` and columns to entries in `y`, and `dest` must therefore +be of size `length(x) × length(y)`. +If `y` is omitted then `x` is crossed with itself. 
+ +As a special case, if `f` is `cor`, diagonal cells for which entries +from `x` and `y` are identical (according to `===`) are set to one even +in the presence of `missing`, `NaN` or `Inf` entries. + +# Keyword arguments +- `symmetric::Bool=false`: If `true`, `f` is only called to compute + for the upper triangle of the matrix, and these values are copied + to fill the lower triangle. Only allowed when `y` is omitted. + Defaults to `true` when `f` is `cor` or `cov`. +- `skipmissing::Symbol=:none`: If `:none` (the default), missing values + in inputs are passed to `f` without any modification. + Use `:pairwise` to skip entries with a `missing` value in either + of the two vectors passed to `f` for a given pair of vectors in `x` and `y`. + Use `:listwise` to skip entries with a `missing` value in any of the + vectors in `x` or `y`; note that this might drop a large part of entries. + Only allowed when entries in `x` and `y` are vectors. + +# Examples +```jldoctest +julia> using StatsBase, Statistics + +julia> dest = zeros(3, 3); + +julia> x = [1 3 7 + 2 5 6 + 3 8 4 + 4 6 2]; + +julia> pairwise!(cor, dest, eachcol(x)); + +julia> dest +3×3 Matrix{Float64}: + 1.0 0.744208 -0.989778 + 0.744208 1.0 -0.68605 + -0.989778 -0.68605 1.0 + +julia> y = [1 3 missing + 2 5 6 + 3 missing 2 + 4 6 2]; + +julia> pairwise!(cor, dest, eachcol(y), skipmissing=:pairwise); + +julia> dest +3×3 Matrix{Float64}: + 1.0 0.928571 -0.866025 + 0.928571 1.0 -1.0 + -0.866025 -1.0 1.0 +``` +""" +function pairwise!(f, dest::AbstractMatrix, x, y=x; + symmetric::Bool=false, skipmissing::Symbol=:none) + if symmetric && x !== y + throw(ArgumentError("symmetric=true only makes sense passing " * + "a single set of variables (x === y)")) + end + + return _pairwise!(f, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) +end + +""" + pairwise(f, x[, y]; + symmetric::Bool=false, skipmissing::Symbol=:none) + +Return a matrix holding the result of applying `f` to all possible pairs +of entries in iterators 
`x` and `y`. Rows correspond to +entries in `x` and columns to entries in `y`. If `y` is omitted then a +square matrix crossing `x` with itself is returned. + +As a special case, if `f` is `cor`, diagonal cells for which entries +from `x` and `y` are identical (according to `===`) are set to one even +in the presence of `missing`, `NaN` or `Inf` entries. + +# Keyword arguments +- `symmetric::Bool=false`: If `true`, `f` is only called to compute + for the upper triangle of the matrix, and these values are copied + to fill the lower triangle. Only allowed when `y` is omitted. + Defaults to `true` when `f` is `cor` or `cov`. +- `skipmissing::Symbol=:none`: If `:none` (the default), missing values + in inputs are passed to `f` without any modification. + Use `:pairwise` to skip entries with a `missing` value in either + of the two vectors passed to `f` for a given pair of vectors in `x` and `y`. + Use `:listwise` to skip entries with a `missing` value in any of the + vectors in `x` or `y`; note that this might drop a large part of entries. + Only allowed when entries in `x` and `y` are vectors. + +# Examples +```jldoctest +julia> using StatsBase, Statistics + +julia> x = [1 3 7 + 2 5 6 + 3 8 4 + 4 6 2]; + +julia> pairwise(cor, eachcol(x)) +3×3 Matrix{Float64}: + 1.0 0.744208 -0.989778 + 0.744208 1.0 -0.68605 + -0.989778 -0.68605 1.0 + +julia> y = [1 3 missing + 2 5 6 + 3 missing 2 + 4 6 2]; + +julia> pairwise(cor, eachcol(y), skipmissing=:pairwise) +3×3 Matrix{Float64}: + 1.0 0.928571 -0.866025 + 0.928571 1.0 -1.0 + -0.866025 -1.0 1.0 +``` +""" +function pairwise(f, x, y=x; symmetric::Bool=false, skipmissing::Symbol=:none) + if symmetric && x !== y + throw(ArgumentError("symmetric=true only makes sense passing " * + "a single set of variables (x === y)")) + end + + return _pairwise(Val(skipmissing), f, x, y, symmetric) +end + +# cov(x) is faster than cov(x, x) +_cov(x, y) = x === y ? 
cov(x) : cov(x, y) + +pairwise!(::typeof(cov), dest::AbstractMatrix, x, y; + symmetric::Bool=false, skipmissing::Symbol=:none) = + pairwise!(_cov, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) + +pairwise(::typeof(cov), x, y; symmetric::Bool=false, skipmissing::Symbol=:none) = + pairwise(_cov, x, y, symmetric=symmetric, skipmissing=skipmissing) + +pairwise!(::typeof(cov), dest::AbstractMatrix, x; + symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise!(_cov, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) + +pairwise(::typeof(cov), x; symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise(_cov, x, x, symmetric=symmetric, skipmissing=skipmissing) + +pairwise!(::typeof(cor), dest::AbstractMatrix, x; + symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise!(cor, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) + +pairwise(::typeof(cor), x; symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise(cor, x, x, symmetric=symmetric, skipmissing=skipmissing) diff --git a/test/pairwise.jl b/test/pairwise.jl new file mode 100644 index 00000000..d3120965 --- /dev/null +++ b/test/pairwise.jl @@ -0,0 +1,261 @@ +using StatsBase +using Test, Random, Statistics, LinearAlgebra +using Missings + +const ≅ = isequal + +Random.seed!(1) + +# to avoid using specialized method +arbitrary_fun(x, y) = cor(x, y) + +@testset "pairwise and pairwise! 
with $f" for f in (arbitrary_fun, cor, cov) + @testset "basic interface" begin + x = [rand(10) for _ in 1:4] + y = [rand(Float32, 10) for _ in 1:5] + # to test case where inference of returned eltype fails + z = [Vector{Any}(rand(Float32, 10)) for _ in 1:5] + + res = @inferred pairwise(f, x, y) + @test res isa Matrix{Float64} + res2 = zeros(Float64, size(res)) + @test pairwise!(f, res2, x, y) === res2 + @test res == res2 == [f(xi, yi) for xi in x, yi in y] + + res = pairwise(f, y, z) + @test res isa Matrix{Float32} + res2 = zeros(Float32, size(res)) + @test pairwise!(f, res2, y, z) === res2 + @test res == res2 == [f(yi, zi) for yi in y, zi in z] + + res = pairwise(f, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) + @test res isa Matrix{Float64} + res2 = zeros(AbstractFloat, size(res)) + @test pairwise!(f, res2, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) === res2 + @test res == res2 == + [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]), + yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])] + @test res isa Matrix{Float64} + + @inferred pairwise(f, x, y) + + @test_throws ArgumentError pairwise(f, [Int[]], [Int[]]) + @test_throws ArgumentError pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + + res = pairwise(f, [], []) + @test size(res) == (0, 0) + @test res isa Matrix{Any} + res2 = zeros(0, 0) + @test pairwise!(f, res2, [], []) === res2 + + res = pairwise(f, Vector{Int}[], Vector{Int}[]) + @test size(res) == (0, 0) + @test res isa Matrix{Float64} + res2 = zeros(0, 0) + @test pairwise!(f, res2, Vector{Int}[], Vector{Int}[]) === res2 + + res = pairwise(f, [[1, 2]], Vector{Int}[]) + @test size(res) == (1, 0) + @test res isa Matrix{Float64} + res2 = zeros(1, 0) + @test pairwise!(f, res2, [[1, 2]], Vector{Int}[]) === res2 + + res = pairwise(f, Vector{Int}[], [[1, 2], [2, 3]]) + @test size(res) == (0, 2) + @test res isa Matrix{Float64} + res2 = zeros(0, 2) + @test pairwise!(f, res2, [], [[1, 2], [2, 3]]) === res2 + + @test_throws DimensionMismatch pairwise!(f, 
zeros(1, 2), x, y) + @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), [], []) + @test_throws DimensionMismatch pairwise!(f, zeros(0, 0), + [], [[1, 2], [2, 3]]) + end + + @testset "missing values handling interface" begin + xm = [ifelse.(rand(100) .> 0.9, missing, rand(100)) for _ in 1:4] + ym = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4] + zm = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4] + + res = pairwise(f, xm, ym) + @test res isa Matrix{Missing} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym) === res2 + @test res ≅ res2 ≅ [missing for xi in xm, yi in ym] + + res = pairwise(f, xm, ym, skipmissing=:pairwise) + @test res isa Matrix{Float64} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym, skipmissing=:pairwise) === res2 + @test res ≅ res2 + @test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym], + rtol=1e-6) + + res = pairwise(f, ym, zm, skipmissing=:pairwise) + @test res isa Matrix{Float32} + res2 = zeros(Union{Float32, Missing}, size(res)) + @test pairwise!(f, res2, ym, zm, skipmissing=:pairwise) === res2 + @test res ≅ res2 + @test isapprox(res, [f(collect.(skipmissings(yi, zi))...) 
for yi in ym, zi in zm], + rtol=1e-6) + + nminds = mapreduce(x -> .!ismissing.(x), + (x, y) -> x .& y, + [xm; ym]) + res = pairwise(f, xm, ym, skipmissing=:listwise) + @test res isa Matrix{Float64} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym, skipmissing=:listwise) === res2 + @test res ≅ res2 + @test isapprox(res, [f(view(xi, nminds), view(yi, nminds)) for xi in xm, yi in ym], + rtol=1e-6) + + if VERSION >= v"1.6.0-DEV" + # inference of cor fails so use an inferrable function + # to check that pairwise itself is inferrable + for skipmissing in (:none, :pairwise, :listwise) + g(x, y=x) = pairwise((x, y) -> x[1] * y[1], x, y, skipmissing=skipmissing) + @test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, + Vector{Vector{Union{Float64, Missing}}}}) == + Matrix{<: Union{Float64, Missing}} + if skipmissing in (:pairwise, :listwise) + @test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, + Vector{Vector{Union{Float64, Missing}}}}) == + Matrix{Float64} + end + end + end + + @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:something) + @test_throws ArgumentError pairwise!(f, zeros(Union{Float64, Missing}, + length(xm), length(ym)), xm, ym, + skipmissing=:something) + + # variable with only missings + xm = [fill(missing, 10), rand(10)] + ym = [rand(10), rand(10)] + + res = pairwise(f, xm, ym) + @test res isa Matrix{Union{Float64, Missing}} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym) === res2 + @test res ≅ res2 ≅ [f(xi, yi) for xi in xm, yi in ym] + + if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0 + @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:pairwise) + @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:listwise) + + 
res = zeros(Union{Float64, Missing}, length(xm), length(ym)) + @test_throws ArgumentError pairwise!(f, res, xm, ym, skipmissing=:pairwise) + @test_throws ArgumentError pairwise!(f, res, xm, ym, skipmissing=:listwise) + end + + for sm in (:pairwise, :listwise) + @test_throws ArgumentError pairwise(f, [[1, 2]], [1], skipmissing=sm) + @test_throws ArgumentError pairwise(f, [1], [[1, 2]], skipmissing=sm) + @test_throws ArgumentError pairwise(f, [1], [1], skipmissing=sm) + end + end + + @testset "iterators" begin + x = (v for v in [rand(10) for _ in 1:4]) + y = (v for v in [rand(10) for _ in 1:4]) + + res = @inferred pairwise(f, x, y) + res2 = zeros(size(res)) + @test pairwise!(f, res2, x, y) === res2 + @test res == res2 == pairwise(f, collect(x), collect(y)) + + res = @inferred(pairwise(f, x)) + res2 = zeros(size(res)) + @test pairwise!(f, res2, x) === res2 + @test res == res2 == pairwise(f, collect(x)) + end + + @testset "non-vector entries" begin + x = (Iterators.drop(v, 1) for v in [rand(10) for _ in 1:4]) + y = (Iterators.drop(v, 1) for v in [rand(10) for _ in 1:4]) + + @test pairwise((x, y) -> f(collect(x), collect(y)), x, y) == + [f(collect(xi), collect(yi)) for xi in x, yi in y] + @test pairwise((x, y) -> f(collect(x), collect(y)), x) == + [f(collect(xi1), collect(xi2)) for xi1 in x, xi2 in x] + @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, + skipmissing=:pairwise) + @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, + skipmissing=:listwise) + end + + @testset "two-argument method" begin + x = [rand(10) for _ in 1:4] + res = pairwise(f, x) + res2 = zeros(size(res)) + @test pairwise!(f, res2, x) === res2 + @test res == res2 == pairwise(f, x, x) + end + + @testset "symmetric" begin + x = [rand(10) for _ in 1:4] + y = [rand(10) for _ in 1:4] + + @test pairwise(f, x, x, symmetric=true) == + pairwise(f, x, symmetric=true) == + Symmetric(pairwise(f, x, x), :U) + + res = zeros(4, 4) + res2 = zeros(4, 4) + 
@test pairwise!(f, res, x, x, symmetric=true) === res + @test pairwise!(f, res2, x, symmetric=true) === res2 + @test res == res2 == Symmetric(pairwise(f, x, x), :U) + + @test_throws ArgumentError pairwise(f, x, y, symmetric=true) + @test_throws ArgumentError pairwise!(f, res, x, y, symmetric=true) + end + + @testset "cor corner cases" begin + # Integer inputs must give a Float64 output + res = pairwise(cor, [[1, 2, 3], [1, 5, 2]]) + @test res isa Matrix{Float64} + @test res == [cor(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), + yi in ([1, 2, 3], [1, 5, 2])] + + # NaNs are ignored for the diagonal + res = pairwise(cor, [[1, 2, NaN], [1, 5, 2]]) + @test res isa Matrix{Float64} + @test res ≅ [1.0 NaN + NaN 1.0] + + # missings are ignored for the diagonal + res = pairwise(cor, [[1, 2, 7], [1, 5, missing]]) + @test res isa Matrix{Union{Float64, Missing}} + @test res ≅ [1.0 missing + missing 1.0] + res = pairwise(cor, Vector{Union{Int, Missing}}[[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Union{Float64, Missing}} + @test res ≅ [1.0 missing + missing 1.0] + if VERSION >= v"1.5" + # except when eltype is Missing + res = pairwise(cor, [[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Missing} + @test res ≅ [missing missing + missing missing] + end + + for sm in (:pairwise, :listwise) + res = pairwise(cor, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm) + @test res isa Matrix{Float64} + @test res ≅ [1.0 NaN + NaN 1.0] + if VERSION >= v"1.5" + @test_throws ArgumentError pairwise(cor, [[missing, missing, missing], + [missing, missing, missing]], + skipmissing=sm) + end + end + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 500539c7..ca7be4b8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,6 +17,7 @@ tests = ["ambiguous", "rankcorr", "signalcorr", "misc", + "pairwise", "robust", "sampling", "wsampling", From 
2080127fd072b6e0671206f3bcd20c1f88a71167 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 2 May 2021 16:40:55 +0200 Subject: [PATCH 095/105] Bump version to 0.33.8 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8b7b3137..6878fa4f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.7" +version = "0.33.8" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From 6f81971aac604a7cd605663c13dfa8319c5e5e60 Mon Sep 17 00:00:00 2001 From: ignace-computing <44204565+ignace-computing@users.noreply.github.com> Date: Wed, 19 May 2021 14:08:31 +0200 Subject: [PATCH 096/105] a suggestion for improvement in the docs, imho (#692) --- src/weights.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/weights.jl b/src/weights.jl index a6bafedc..b5365162 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -611,6 +611,8 @@ weighted mean along dimension `dims`. # Examples ```julia +n = 20 +x = rand(n) w = rand(n) mean(x, weights(w)) ``` From 1018a899b56bf437d2bc8cda10cce2d8af8fda13 Mon Sep 17 00:00:00 2001 From: Yuval Date: Sun, 23 May 2021 17:31:48 +0300 Subject: [PATCH 097/105] `mean_and_var` doc fix --- src/moments.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/moments.jl b/src/moments.jl index c5d0ae5c..76562674 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -153,7 +153,7 @@ std(v::RealArray, w::AbstractWeights, dim::Int; mean=nothing, """ mean_and_var(x, [w::AbstractWeights], [dim]; corrected=false) -> (mean, var) -Return the mean and standard deviation of collection `x`. If `x` is an `AbstractArray`, +Return the mean and variance of collection `x`. If `x` is an `AbstractArray`, `dim` can be specified as a tuple to compute statistics over these dimensions. A weighting vector `w` can be specified to weight the estimates. 
Finally, bias correction is be applied to the variance calculation if `corrected=true`. From 8f61785fa113f93fbd0e86313a73173d14aadb51 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 14 Jul 2021 19:24:52 +0200 Subject: [PATCH 098/105] fix tests for change in default RNG on nightly (#699) * fix tests for change in default RNG on nightly * ditch redundant test --- test/sampling.jl | 7 ++++--- test/statmodels.jl | 6 ++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/sampling.jl b/test/sampling.jl index 66bb0231..15bf69f3 100644 --- a/test/sampling.jl +++ b/test/sampling.jl @@ -219,11 +219,12 @@ wv = Weights([zeros(5); 1:4; -1]) @test_throws ErrorException sample(a, wv, 1, replace=false) #### weighted sampling with dimension -Random.seed!(1); -@test sample([1, 2], Weights([1, 1]), (2,2)) == ones(2,2) +# weights respected; this works because of the 0-weight @test sample([1, 2], Weights([0, 1]), (2,2)) == [2 2 ; 2 2] -@test sample(collect(1:4), Weights(1:4), (2,2), replace=false) == [4 1; 3 2] +wm = sample(collect(1:4), Weights(1:4), (2,2), replace=false) +@test size(wm) == (2, 2) # correct shape +@test length(Set(wm)) == 4 # no duplicates in elements #### check that sample and sample! 
do the same thing diff --git a/test/statmodels.jl b/test/statmodels.jl index c72da568..da8824cf 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -56,8 +56,10 @@ x3 1.56734e-13 Bad 2 0.00 <1e-15 (Name = "x3", Estimate = 1.56734e-13, Comments = "Bad", df = 2, t = 2.68e-16, p = 1.345e-16) ] -Random.seed!(10) -m = rand(3,4) + +m = [0.11258244478647295 0.05664544616214151 0.38181274408522614 0.8197779704008801 + 0.36831406658084287 0.12078054506961555 0.8151038332483567 0.6699313951612162 + 0.3444540231363058 0.17957407667101322 0.2422083248151139 0.4530583319523316] ct = CoefTable(m, ["Estimate", "Stderror", "df", "p"], [], 4) @test sprint(show, ct) == """ ────────────────────────────────────────── From db6d8b9a4da5f3e6f94ea4570ceccf4166e521f4 Mon Sep 17 00:00:00 2001 From: Kristoffer Carlsson Date: Thu, 29 Jul 2021 02:03:48 +0200 Subject: [PATCH 099/105] fix call of floatrange on 1.7+ (#703) --- src/hist.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hist.jl b/src/hist.jl index 3607b2dd..d1690b0a 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -96,7 +96,7 @@ function histrange(lo::F, hi::F, n::Integer, closed::Symbol=:left) where F len += one(F) end end - Base.floatrange(start,step,len,divisor) + Base.floatrange(start,step,Int(len),divisor) end histrange(vs::NTuple{N,AbstractVector},nbins::NTuple{N,Integer},closed::Symbol) where {N} = From 6bd3433f46bdae354f5cee707047f33b956aa004 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Thu, 29 Jul 2021 08:47:46 -0700 Subject: [PATCH 100/105] Bump version to 0.33.9 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6878fa4f..a42c8799 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.8" +version = "0.33.9" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From f557775947be6d37623c19e0137e73d5d1577a04 Mon 
Sep 17 00:00:00 2001 From: Tim Holy Date: Sat, 21 Aug 2021 08:01:22 -0500 Subject: [PATCH 101/105] Generalize exception type (#707) In preparation for https://github.com/JuliaLang/julia/pull/41885 --- test/pairwise.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/pairwise.jl b/test/pairwise.jl index d3120965..09699b27 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -39,8 +39,8 @@ arbitrary_fun(x, y) = cor(x, y) @inferred pairwise(f, x, y) - @test_throws ArgumentError pairwise(f, [Int[]], [Int[]]) - @test_throws ArgumentError pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + @test_throws Union{ArgumentError,MethodError} pairwise(f, [Int[]], [Int[]]) + @test_throws Union{ArgumentError,MethodError} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) res = pairwise(f, [], []) @test size(res) == (0, 0) @@ -144,12 +144,12 @@ arbitrary_fun(x, y) = cor(x, y) @test res ≅ res2 ≅ [f(xi, yi) for xi in xm, yi in ym] if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0 - @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:pairwise) - @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:listwise) + @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:pairwise) + @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:listwise) res = zeros(Union{Float64, Missing}, length(xm), length(ym)) - @test_throws ArgumentError pairwise!(f, res, xm, ym, skipmissing=:pairwise) - @test_throws ArgumentError pairwise!(f, res, xm, ym, skipmissing=:listwise) + @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:pairwise) + @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:listwise) end for sm in (:pairwise, :listwise) From 7fcea247602853b64ed257c1dcdf1c4edfd9bd47 Mon Sep 17 00:00:00 2001 From: Jose Storopoli <43353831+storopoli@users.noreply.github.com> Date: Tue, 24 Aug 2021 13:49:48 -0300 Subject: [PATCH 102/105] Added 
Minimal Support for Reliability Scores (aka Cronbach's alpha) (#701) --- docs/src/scalarstats.md | 6 ++++ src/StatsBase.jl | 7 +++- src/reliability.jl | 74 ++++++++++++++++++++++++++++++++++++++ test/reliability.jl | 80 +++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 5 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 src/reliability.jl create mode 100644 test/reliability.jl diff --git a/docs/src/scalarstats.md b/docs/src/scalarstats.md index 1159b274..e8f7fb5f 100644 --- a/docs/src/scalarstats.md +++ b/docs/src/scalarstats.md @@ -62,3 +62,9 @@ modes summarystats describe ``` + +## Reliability Measures + +```@docs +cronbachalpha +``` diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 2d234449..6d3bf1ac 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -212,7 +212,11 @@ export standardize, AbstractDataTransform, # the type to represent a abstract data transformation ZScoreTransform, # the type to represent a z-score data transformation - UnitRangeTransform # the type to represent a 0-1 data transformation + UnitRangeTransform, # the type to represent a 0-1 data transformation + + # reliability + CronbachAlpha, # the type to represent Cronbach's alpha scores + cronbachalpha # function to compute Cronbach's alpha scores # source files @@ -232,6 +236,7 @@ include("partialcor.jl") include("empirical.jl") include("hist.jl") include("pairwise.jl") +include("reliability.jl") include("misc.jl") include("sampling.jl") diff --git a/src/reliability.jl b/src/reliability.jl new file mode 100644 index 00000000..aebb94b2 --- /dev/null +++ b/src/reliability.jl @@ -0,0 +1,74 @@ +struct CronbachAlpha{T <: Real} + alpha::T + dropped::Vector{T} +end + +function Base.show(io::IO, x::CronbachAlpha) + @printf(io, "Cronbach's alpha for all items: %.4f\n", x.alpha) + isempty(x.dropped) && return + println(io, "\nCronbach's alpha if an item is dropped:") + for (idx, val) in enumerate(x.dropped) + @printf(io, "item %i: %.4f\n", idx, val) + end +end 
+ +""" + cronbachalpha(covmatrix::AbstractMatrix{<:Real}) + +Calculate Cronbach's alpha (1951) from a covariance matrix `covmatrix` according to +the [formula](https://en.wikipedia.org/wiki/Cronbach%27s_alpha): + +```math +\\rho = \\frac{k}{k-1} (1 - \\frac{\\sum^k_{i=1} \\sigma^2_i}{\\sum_{i=1}^k \\sum_{j=1}^k \\sigma_{ij}}) +``` + +where ``k`` is the number of items, i.e. columns, ``\\sigma_i^2`` the item variance, +and ``\\sigma_{ij}`` the inter-item covariance. + +Returns a `CronbachAlpha` object that holds: + +* `alpha`: the Cronbach's alpha score for all items, i.e. columns, in `covmatrix`; and +* `dropped`: a vector giving Cronbach's alpha scores if a specific item, + i.e. column, is dropped from `covmatrix`. + +# Example +```jldoctest +julia> using StatsBase + +julia> cov_X = [10 6 6 6; + 6 11 6 6; + 6 6 12 6; + 6 6 6 13]; + +julia> cronbachalpha(cov_X) +Cronbach's alpha for all items: 0.8136 + +Cronbach's alpha if an item is dropped: +item 1: 0.7500 +item 2: 0.7606 +item 3: 0.7714 +item 4: 0.7826 +``` +""" +function cronbachalpha(covmatrix::AbstractMatrix{<:Real}) + isposdef(covmatrix) || throw(ArgumentError("Covariance matrix must be positive definite.")) + k = size(covmatrix, 2) + k > 1 || throw(ArgumentError("Covariance matrix must have more than one column.")) + v = vec(sum(covmatrix, dims=1)) + σ = sum(v) + for i in axes(v, 1) + v[i] -= covmatrix[i, i] + end + σ_diag = sum(i -> covmatrix[i, i], 1:k) + + alpha = k * (1 - σ_diag / σ) / (k - 1) + if k > 2 + dropped = typeof(alpha)[(k - 1) * (1 - (σ_diag - covmatrix[i, i]) / (σ - 2*v[i] - covmatrix[i, i])) / (k - 2) + for i in 1:k] + else + # if k = 2 do not produce dropped; this has to be also + # correctly handled in show + dropped = Vector{typeof(alpha)}() + end + return CronbachAlpha(alpha, dropped) +end diff --git a/test/reliability.jl b/test/reliability.jl new file mode 100644 index 00000000..916e097c --- /dev/null +++ b/test/reliability.jl @@ -0,0 +1,80 @@ +using StatsBase +using LinearAlgebra, 
Random, Test + +@testset "Cronbach's Alpha" begin + # basic vanilla test + cov_X = [10 6 6 6; + 6 11 6 6; + 6 6 12 6; + 6 6 6 13] + cronbach_X = cronbachalpha(cov_X) + @test cronbach_X isa CronbachAlpha{Float64} + @test cronbach_X.alpha ≈ 0.8135593220338981 + @test cronbach_X.dropped ≈ + [0.75, 0.7605633802816901, 0.7714285714285715, 0.782608695652174] + + # testing Rational + cov_rational = cov_X .// 1 + cronbach_rational = cronbachalpha(cov_rational) + @test cronbach_rational isa CronbachAlpha{Rational{Int}} + @test cronbach_rational.alpha == 48 // 59 + @test cronbach_rational.dropped == + [3 // 4, 54 // 71, 27 // 35, 18 // 23] + + # testing BigFloat + cov_bigfloat = BigFloat.(cov_X) + cronbach_bigfloat = cronbachalpha(cov_bigfloat) + @test cronbach_bigfloat isa CronbachAlpha{BigFloat} + @test cronbach_bigfloat.alpha ≈ 0.8135593220338981 + @test cronbach_bigfloat.dropped ≈ + [0.75, 0.7605633802816901, 0.7714285714285715, 0.782608695652174] + + # testing corner cases + @test_throws MethodError cronbachalpha([1.0, 2.0]) + cov_k2 = [10 6; + 6 11] + cronbach_k2 = cronbachalpha(cov_k2) + @test cronbach_k2.alpha ≈ 0.7272727272727273 + @test isempty(cronbach_k2.dropped) + + # testing when Matrix is not positive-definite + cov_not_pos = [-1 1; + -1 1] + @test_throws ArgumentError cronbachalpha(cov_not_pos) + + # testing with a zero + cov_zero = [1 2; + 0 1] + @test_throws ArgumentError cronbachalpha(cov_not_pos) + + # testing with one column + cov_k1 = reshape([1, 2], 2, 1) + @test_throws ArgumentError cronbachalpha(cov_k1) + + # testing with Missing + cov_missing = [1 2; + missing 1] + @test_throws MethodError cronbachalpha(cov_missing) + + + # testing Base.show + cronbach_X = cronbachalpha(cov_X) + io = IOBuffer() + show(io, cronbach_X) + str = String(take!(io)) + @test str == """ + Cronbach's alpha for all items: 0.8136 + + Cronbach's alpha if an item is dropped: + item 1: 0.7500 + item 2: 0.7606 + item 3: 0.7714 + item 4: 0.7826 + """ + # for two columns + io = 
IOBuffer() + show(io, cronbach_k2) + str = String(take!(io)) + @test str == "Cronbach's alpha for all items: 0.7273\n" + +end # @testset "Cronbach's Alpha" diff --git a/test/runtests.jl b/test/runtests.jl index ca7be4b8..7d30ecd8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,7 @@ tests = ["ambiguous", "signalcorr", "misc", "pairwise", + "reliability", "robust", "sampling", "wsampling", From 6138a11faa8b2d78a6dd4dfeacaaf5a3f2035ce7 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 25 Aug 2021 16:31:55 +0200 Subject: [PATCH 103/105] Cleanup docs (#708) Ensure that all exported methods are included in the manual, so that we can enable strict=true, which ensures that doctests pass. Update doctests to latest Julia version. --- docs/make.jl | 5 +++-- docs/src/counts.md | 2 +- docs/src/index.md | 2 +- docs/src/means.md | 17 ----------------- docs/src/robust.md | 2 ++ docs/src/sampling.md | 2 ++ docs/src/scalarstats.md | 22 ++++++++++++++++++++++ docs/src/statmodels.md | 6 ++++++ docs/src/transformations.md | 8 +++++++- docs/src/weights.md | 4 +++- src/hist.jl | 4 ++-- src/misc.jl | 2 +- src/transformations.jl | 16 ++++++++-------- src/weights.jl | 25 ++++++++++++++++++------- 14 files changed, 76 insertions(+), 41 deletions(-) delete mode 100644 docs/src/means.md diff --git a/docs/make.jl b/docs/make.jl index 11e56130..b4a3a985 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -10,7 +10,6 @@ makedocs( modules = [StatsBase], pages = ["index.md", "weights.md", - "means.md", "scalarstats.md", "robust.md", "deviation.md", @@ -23,7 +22,9 @@ makedocs( "multivariate.md", "misc.md", "statmodels.md", - "transformations.md"] + "transformations.md"], + strict=true, + checkdocs=:exports ) deploydocs( diff --git a/docs/src/counts.md b/docs/src/counts.md index 648d15d4..604f7926 100644 --- a/docs/src/counts.md +++ b/docs/src/counts.md @@ -15,5 +15,5 @@ addcounts!(r::AbstractArray, x::StatsBase.IntegerArray, levels::StatsBase.IntUni ```@docs countmap 
proportionmap -addcounts!{T}(cm::Dict{T}, x::AbstractArray{T}) +addcounts!(cm::Dict, x::Any) ``` diff --git a/docs/src/index.md b/docs/src/index.md index f6f9de5c..e1e3fe2e 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -12,7 +12,7 @@ end ```@contents -Pages = ["weights.md", "means.md", "scalarstats.md", "robust.md", "deviation.md", "cov.md", "counts.md", "ranking.md", "sampling.md", "empirical.md", "signalcorr.md", "misc.md", "statmodels.md", "transformations.md"] +Pages = ["weights.md", "scalarstats.md", "robust.md", "deviation.md", "cov.md", "counts.md", "ranking.md", "sampling.md", "empirical.md", "signalcorr.md", "misc.md", "statmodels.md", "transformations.md"] Depth = 2 ``` diff --git a/docs/src/means.md b/docs/src/means.md deleted file mode 100644 index 638f505a..00000000 --- a/docs/src/means.md +++ /dev/null @@ -1,17 +0,0 @@ -# Mean Functions - -The package provides functions to compute means of different kinds. - -```@docs -geomean -harmmean -genmean -``` - -The `mean` and `mean!` functions are also extended to accept a weight vector of type -`AbstractWeights` to compute weighted mean. - -```@docs -mean -mean! -``` diff --git a/docs/src/robust.md b/docs/src/robust.md index cf912884..0e3c7f31 100644 --- a/docs/src/robust.md +++ b/docs/src/robust.md @@ -2,6 +2,8 @@ ```@docs trim +trim! winsor +winsor! trimvar ``` diff --git a/docs/src/sampling.md b/docs/src/sampling.md index 4af68cbd..2e7e7951 100644 --- a/docs/src/sampling.md +++ b/docs/src/sampling.md @@ -7,6 +7,8 @@ The package provides functions for sampling from a given population (with or wit ```@docs sample sample! +wsample +wsample! ``` ## Algorithms diff --git a/docs/src/scalarstats.md b/docs/src/scalarstats.md index e8f7fb5f..4e27670d 100644 --- a/docs/src/scalarstats.md +++ b/docs/src/scalarstats.md @@ -2,6 +2,27 @@ The package implements functions for computing various statistics over an array of scalar real numbers. +## Weighted sum and mean + +```@docs +sum +sum! +wsum +wsum! 
+mean +mean! +``` + +## Means + +The package provides functions to compute means of different kinds. + +```@docs +geomean +harmmean +genmean +``` + ## Moments ```@docs @@ -21,6 +42,7 @@ span variation sem mad +mad! ``` ## Z-scores diff --git a/docs/src/statmodels.md b/docs/src/statmodels.md index 473d9766..b1882489 100644 --- a/docs/src/statmodels.md +++ b/docs/src/statmodels.md @@ -24,6 +24,7 @@ loglikelihood mss nobs nulldeviance +nullloglikelihood r2 rss score @@ -47,3 +48,8 @@ predict predict! residuals ``` + +An exception type is provided to signal convergence failures during model estimation: +```@docs +ConvergenceException +``` \ No newline at end of file diff --git a/docs/src/transformations.md b/docs/src/transformations.md index 381660d0..b0f23150 100644 --- a/docs/src/transformations.md +++ b/docs/src/transformations.md @@ -40,7 +40,7 @@ in a single call. fit(::Type{UnitRangeTransform}, X::AbstractArray{<:Real,2}; unit::Bool=true) ``` -## Additional Methods +## Methods ```@docs StatsBase.transform StatsBase.transform! @@ -48,3 +48,9 @@ StatsBase.reconstruct StatsBase.reconstruct! standardize ``` + +## Types +```@docs +UnitRangeTransform +ZScoreTransform +``` \ No newline at end of file diff --git a/docs/src/weights.md b/docs/src/weights.md index 73f01e0e..50f6c1bc 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -54,7 +54,7 @@ w = uweights(Float64, 3) ### `Weights` -The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights` and `ProbabilityWeights`. +The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights`, `ProbabilityWeights` and `UnitWeights`. 
```julia w = Weights([1., 2., 3.]) @@ -142,10 +142,12 @@ The following constructors are provided: AnalyticWeights FrequencyWeights ProbabilityWeights +UnitWeights Weights aweights fweights pweights eweights +uweights weights ``` diff --git a/src/hist.jl b/src/hist.jl index d1690b0a..f7afa333 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -137,7 +137,7 @@ Histograms can be fitted to data using the `fit` method. julia> using StatsBase julia> fit(Histogram, [2.], 1:3, closed=:left) -Histogram{Int64,1,Tuple{UnitRange{Int64}}} +Histogram{Int64, 1, Tuple{UnitRange{Int64}}} edges: 1:3 weights: [0, 1] @@ -145,7 +145,7 @@ closed: left isdensity: false julia> fit(Histogram, [2.], 1:3, closed=:right) -Histogram{Int64,1,Tuple{UnitRange{Int64}}} +Histogram{Int64, 1, Tuple{UnitRange{Int64}}} edges: 1:3 weights: [1, 0] diff --git a/src/misc.jl b/src/misc.jl index 7c15ccc9..25fc0f3d 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -126,7 +126,7 @@ it will be dense (default). julia> using StatsBase julia> indicatormat([1 2 2], 2) -2×3 Array{Bool,2}: +2×3 Matrix{Bool}: 1 0 0 0 1 1 ``` diff --git a/src/transformations.jl b/src/transformations.jl index 4406e3d9..a4214b5d 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -93,15 +93,15 @@ and return a `ZScoreTransform` transformation object. julia> using StatsBase julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.0 -0.5 0.5 0.0 1.0 2.0 julia> dt = fit(ZScoreTransform, X, dims=2) -ZScoreTransform{Float64}(2, 2, [0.0, 1.0], [0.5, 1.0]) +ZScoreTransform{Float64, Vector{Float64}}(2, 2, [0.0, 1.0], [0.5, 1.0]) julia> StatsBase.transform(dt, X) -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.0 -1.0 1.0 -1.0 0.0 1.0 ``` @@ -247,15 +247,15 @@ and return a `UnitRangeTransform` transformation object. 
julia> using StatsBase julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.0 -0.5 0.5 0.0 1.0 2.0 julia> dt = fit(UnitRangeTransform, X, dims=2) -UnitRangeTransform{Float64}(2, 2, true, [-0.5, 0.0], [1.0, 0.5]) +UnitRangeTransform{Float64, Vector{Float64}}(2, 2, true, [-0.5, 0.0], [1.0, 0.5]) julia> StatsBase.transform(dt, X) -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.5 0.0 1.0 0.0 0.5 1.0 ``` @@ -353,12 +353,12 @@ end julia> using StatsBase julia> standardize(ZScoreTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0], dims=2) -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.0 -1.0 1.0 -1.0 0.0 1.0 julia> standardize(UnitRangeTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0], dims=2) -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.5 0.0 1.0 0.0 0.5 1.0 ``` diff --git a/src/weights.jl b/src/weights.jl index b5365162..34fe4cd7 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -536,8 +536,9 @@ wsumtype(::Type{T}, ::Type{W}) where {T,W} = typeof(zero(T) * zero(W) + zero(T) wsumtype(::Type{T}, ::Type{T}) where {T<:BlasReal} = T """ - wsum!(R, A, w, dim; init=true) - + wsum!(R::AbstractArray, A::AbstractArray, + w::AbstractWeights{<:Real}, dim::Int; + init::Bool=true) Compute the weighted sum of `A` with weights `w` over the dimension `dim` and store the result in `R`. If `init=false`, the sum is added to `R` rather than starting from zero. @@ -562,9 +563,24 @@ end ## extended sum! and wsum +""" + sum!(R::AbstractArray, A::AbstractArray, + w::AbstractWeights{<:Real}, dim::Int; + init::Bool=true) + +Compute the weighted sum of `A` with weights `w` over the dimension `dim` and store +the result in `R`. If `init=false`, the sum is added to `R` rather than starting +from zero. 
+""" Base.sum!(R::AbstractArray, A::AbstractArray, w::AbstractWeights{<:Real}, dim::Int; init::Bool=true) = wsum!(R, A, w, dim; init=init) +""" + sum(v::AbstractArray, w::AbstractWeights{<:Real}; [dims]) + +Compute the weighted sum of an array `v` with weights `w`, +optionally over the dimension `dims`. +""" Base.sum(A::AbstractArray, w::AbstractWeights{<:Real}; dims::Union{Colon,Int}=:) = wsum(A, w, dims) @@ -576,11 +592,6 @@ end ##### Weighted means ##### -""" - wmean(v, w::AbstractVector) - -Compute the weighted mean of an array `v` with weights `w`. -""" function wmean(v::AbstractArray{<:Number}, w::AbstractVector) Base.depwarn("wmean is deprecated, use mean(v, weights(w)) instead.", :wmean) mean(v, weights(w)) From ef208fe2d16e5cd4d666963e5d4e239629c6821f Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Sun, 29 Aug 2021 10:23:58 -0700 Subject: [PATCH 104/105] Bump version to 0.33.10 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index a42c8799..f75e9453 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.9" +version = "0.33.10" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" From 0a179531f14d0140356fcf1f1d55bd0240030cf2 Mon Sep 17 00:00:00 2001 From: Andrew Baas Date: Tue, 31 Aug 2021 16:48:04 -0400 Subject: [PATCH 105/105] Fix _addcounts_radix_sort for empty input array (#706) --- src/counts.jl | 1 + test/counts.jl | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/counts.jl b/src/counts.jl index 21793a5f..58087059 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -336,6 +336,7 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128, radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T + isempty(sx) && return cm last_sx = sx[1] tmpcount = get(cm, last_sx, 0) + 1 
diff --git a/test/counts.jl b/test/counts.jl index 9f684df8..d7b6fea0 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -108,6 +108,9 @@ cm_any_itr = countmap((i for i in xx)) @test cm_any_itr isa Dict{Any,Int} # no knowledge about type @test cm_missing == cm +# with empty array +@test countmap(Int[]) == Dict{Int, Int}() + # testing the radixsort-based addcounts xx = repeat([6, 1, 3, 1], outer=100_000) cm = Dict{Int, Int}()