diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 00000000..7784f241 --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,26 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 * * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "2" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 00000000..d77d3a0c --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,11 @@ +name: TagBot +on: + schedule: + - cron: 0 * * * * +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d6dc8f05..439ac8a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,15 +52,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 - with: - version: '1' - - run: | - julia --project=docs -e ' - using Pkg - Pkg.develop(PackageSpec(path=pwd())) - Pkg.instantiate()' - - run: julia --project=docs docs/make.jl + - uses: julia-actions/julia-buildpkg@latest + - uses: julia-actions/julia-docdeploy@latest env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} diff --git a/Project.toml b/Project.toml index 5d99be84..8d2bd28e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,5 +1,5 @@ name = "Statistics" -uuid = "20745b16-79ce-11e8-11f9-7d13ad32a3b2" +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -7,8 +7,9 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [extras] +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Random", "Test"] +test = ["Dates", "Random", "Test"] diff --git a/README.md b/README.md index 063aa543..8b4e1001 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,16 @@ -## StatsBase.jl +# Statistics.jl -*StatsBase.jl* is a Julia package that provides basic support for statistics. Particularly, it implements a variety of statistics-related functions, such as scalar statistics, high-order moment computation, counting, ranking, covariances, sampling, and empirical density estimation. +[![Build status](https://github.com/JuliaLang/Statistics.jl/workflows/CI/badge.svg)](https://github.com/JuliaLang/Statistics.jl/actions?query=workflow%3ACI+branch%3Amaster) -- **Current Release**: - [![StatsBase](http://pkg.julialang.org/badges/StatsBase_0.5.svg)](http://pkg.julialang.org/?pkg=StatsBase) - [![StatsBase](http://pkg.julialang.org/badges/StatsBase_0.6.svg)](http://pkg.julialang.org/?pkg=StatsBase) -- **Build & Testing Status:** - [![Build Status](https://travis-ci.org/JuliaStats/StatsBase.jl.svg?branch=master)](https://travis-ci.org/JuliaStats/StatsBase.jl) - [![Build status](https://ci.appveyor.com/api/projects/status/fsut3j3onulvws1w?svg=true)](https://ci.appveyor.com/project/nalimilan/statsbase-jl) - [![Coverage Status](https://coveralls.io/repos/JuliaStats/StatsBase.jl/badge.svg?branch=master)](https://coveralls.io/r/JuliaStats/StatsBase.jl?branch=master) - [![Coverage Status](http://codecov.io/github/JuliaStats/StatsBase.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaStats/StatsBase.jl?branch=master) +Development repository for the Statistics standard library (stdlib) that ships with Julia. -- **Documentation**: [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] +#### Using the development version of Statistics.jl -[docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg -[docs-latest-url]: http://JuliaStats.github.io/StatsBase.jl/latest/ +If you want to develop this package, do the following steps: +- Clone the repo anywhere. +- In line 2 of the `Project.toml` file (the line that begins with `uuid = ...`), modify the UUID, e.g. change the `107` to `207`. +- Change the current directory to the Statistics repo you just cloned and start julia with `julia --project`. +- `import Statistics` will now load the files in the cloned repo instead of the Statistics stdlib. +- To test your changes, simply do `include("test/runtests.jl")`. -[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg -[docs-stable-url]: http://JuliaStats.github.io/StatsBase.jl/stable/ +If you need to build Julia from source with a git checkout of Statistics, then instead use `make DEPS_GIT=Statistics` when building Julia. The `Statistics` repo is in `stdlib/Statistics`, and created initially with a detached `HEAD`. If you're doing this from a pre-existing Julia repository, you may need to `make clean` beforehand. diff --git a/docs/src/empirical.md b/docs/src/empirical.md index 76b37237..abaadbc1 100644 --- a/docs/src/empirical.md +++ b/docs/src/empirical.md @@ -2,9 +2,9 @@ ## Histograms -The `Histogram` type represents data that has been tabulated into intervals -(known as *bins*) along the real line, or in higher dimensions, over the real -plane. +```@docs +Histogram +``` Histograms can be fitted to data using the `fit` method. diff --git a/docs/src/index.md b/docs/src/index.md index 26c5023e..bc931c90 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -13,4 +13,4 @@ corrections where necessary. Pages = ["weights.md", "scalarstats.md", "cov.md", "robust.md", "ranking.jl", "empirical.md"] Depth = 2 -``` \ No newline at end of file +``` diff --git a/docs/src/scalarstats.md b/docs/src/scalarstats.md index 1c50795b..f31313b8 100644 --- a/docs/src/scalarstats.md +++ b/docs/src/scalarstats.md @@ -71,4 +71,4 @@ modes ```@docs describe -``` \ No newline at end of file +``` diff --git a/docs/src/weights.md b/docs/src/weights.md index a273dfaa..2fcd46a9 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -64,15 +64,91 @@ w = ProbabilityWeights([0.2, 0.1, 0.3]) w = pweights([0.2, 0.1, 0.3]) ``` +### `UnitWeights` + +Unit weights are a special case in which all observations are given a weight equal to `1`. Using such weights is equivalent to computing unweighted statistics. + +This type can notably be used when implementing an algorithm so that a only a weighted variant has to be written. The unweighted variant is then obtained by passing a `UnitWeights` object. This is very efficient since no weights vector is actually allocated. + +```julia +w = uweights(3) +w = uweights(Float64, 3) +``` + ### `Weights` -The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights` and `ProbabilityWeights`. +The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights`, `ProbabilityWeights` and `UnitWeights`. ```julia w = Weights([1., 2., 3.]) w = weights([1., 2., 3.]) ``` +### Exponential weights: `eweights` + +Exponential weights are a common form of temporal weights which assign exponentially decreasing +weights to past observations. + +If `t` is a vector of temporal indices then for each index `i` we compute the weight as: + +``λ (1 - λ)^{1 - i}`` + +``λ`` is a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. +As this value approaches 0, the resulting weights will be almost equal, +while values closer to 1 will put greater weight on the tail elements of the vector. + +For example, the following call generates exponential weights for ten observations with ``λ = 0.3``. +```julia-repl +julia> eweights(1:10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` + +Simply passing the number of observations `n` is equivalent to passing in `1:n`. + +```julia-repl +julia> eweights(10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` + +Finally, you can construct exponential weights from an arbitrary subset of timestamps within a larger range. + +```julia-repl +julia> t +2019-01-01T01:00:00:2 hours:2019-01-01T05:00:00 + +julia> r +2019-01-01T01:00:00:1 hour:2019-01-02T01:00:00 + +julia> eweights(t, r, 0.3) +3-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.6122448979591837 + 1.249479383590171 +``` + +NOTE: This is equivalent to `eweights(something.(indexin(t, r)), 0.3)`, which is saying that for each value in `t` return the corresponding index for that value in `r`. +Since `indexin` returns `nothing` if there is no corresponding value from `t` in `r` we use `something` to eliminate that possibility. + ## Methods `AbstractWeights` implements the following methods: @@ -90,9 +166,12 @@ AbstractWeights AnalyticWeights FrequencyWeights ProbabilityWeights +UnitWeights Weights aweights fweights pweights +eweights +uweights weights -``` \ No newline at end of file +``` diff --git a/perf/sampling.jl b/perf/sampling.jl index 494395b4..dc65ff7e 100644 --- a/perf/sampling.jl +++ b/perf/sampling.jl @@ -6,7 +6,7 @@ using StatsBase import StatsBase: direct_sample!, xmultinom_sample! import StatsBase: knuths_sample!, fisher_yates_sample!, self_avoid_sample! -import StatsBase: seqsample_a!, seqsample_c! +import StatsBase: seqsample_a!, seqsample_c!, seqsample_d! ### generic sampling benchmarking @@ -42,6 +42,9 @@ tsample!(s::Seq_A, a, x) = seqsample_a!(a, x) mutable struct Seq_C <: NoRep end tsample!(s::Seq_C, a, x) = seqsample_c!(a, x) +mutable struct Seq_D <: NoRep end +tsample!(s::Seq_D, a, x) = seqsample_d!(a, x) + mutable struct Sample_NoRep <: NoRep end tsample!(s::Sample_NoRep, a, x) = sample!(a, x; replace=false, ordered=false) @@ -87,6 +90,7 @@ const procs2 = Proc[ SampleProc{Knuths}(), SampleProc{Sample_NoRep}(), SampleProc{Seq_A}(), SampleProc{Seq_C}(), + SampleProc{Seq_D}(), SampleProc{Sample_NoRep_Ord}() ] const cfgs2 = (Int, Int)[] @@ -110,4 +114,3 @@ println("Sampling Without Replacement") println("===================================") show(rtable2; unit=:mps, cfghead="(n, k)") println() - diff --git a/src/Statistics.jl b/src/Statistics.jl index f188ff54..53f6e4be 100644 --- a/src/Statistics.jl +++ b/src/Statistics.jl @@ -18,8 +18,8 @@ export std, stdm, var, varm, mean!, mean, # moments.jl skewness, kurtosis, # weights.jl - AbstractWeights, Weights, AnalyticWeights, FrequencyWeights, ProbabilityWeights, - weights, aweights, fweights, pweights, + AbstractWeights, Weights, AnalyticWeights, FrequencyWeights, ProbabilityWeights, UnitWeights, + weights, aweights, eweights, fweights, pweights, uweights, # scalarstats.jl geomean, harmmean, genmean, mode, modes, percentile, span, variation, sem, mad, mad!, iqr, genvar, totalvar, entropy, renyientropy, crossentropy, kldivergence, describe, @@ -264,6 +264,16 @@ _mean(::typeof(identity), A::AbstractArray, dims::Colon, w::AbstractArray) = _mean(::typeof(identity), A::AbstractArray, dims, w::AbstractArray) = _mean!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, Base.add_sum, A, dims), A, w) +function _mean(::typeof(identity), A::AbstractArray, dims, w::UnitWeights) + size(A, dims) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return mean(A, dims=dims) +end + +function _mean(::typeof(identity), A::AbstractArray, dims::Colon, w::UnitWeights) + length(A) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return mean(A) +end + ##### variances ##### # faster computation of real(conj(x)*y) @@ -451,78 +461,6 @@ function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon, varcorrection(w, corrected) * s end -""" - varcorrection(n::Integer, corrected=false) - -Compute a bias correction factor for calculating `var`, `std` and `cov` with -`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` -(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), -otherwise returns ``\\frac{1}{n}`` (i.e. no correction). -""" -@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) - -""" - varcorrection(w::Weights, corrected=false) - -Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError` -if `corrected=true`. -""" -@inline function varcorrection(w::Weights, corrected::Bool=false) - corrected && throw(ArgumentError("Weights type does not support bias correction: " * - "use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable.")) - 1 / w.sum -end - -""" - varcorrection(w::AnalyticWeights, corrected=false) - -* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` -* `corrected=false`: ``\\frac{1}{\\sum w}`` -""" -@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) - s = w.sum - - if corrected - sum_sn = sum(x -> (x / s) ^ 2, w) - 1 / (s * (1 - sum_sn)) - else - 1 / s - end -end - -""" - varcorrection(w::FrequencyWeights, corrected=false) - -* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}`` -* `corrected=false`: ``\\frac{1}{\\sum w}`` -""" -@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) - s = w.sum - - if corrected - 1 / (s - 1) - else - 1 / s - end -end - -""" - varcorrection(w::ProbabilityWeights, corrected=false) - -* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` -* `corrected=false`: ``\\frac{1}{\\sum w}`` -""" -@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) - s = w.sum - - if corrected - n = count(!iszero, w) - n / (s * (n - 1)) - else - 1 / s - end -end - """ var(itr; corrected::Bool=true, [weights::AbstractWeights], mean=nothing[, dims]) @@ -1425,6 +1363,18 @@ function _quantile(v::AbstractArray{V}, p, sorted::Bool, alpha::Real, beta::Real return out end +function _quantile(v::AbstractArray, p, sorted::Bool, + alpha::Real, beta::Real, w::UnitWeights) + length(v) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return quantile(v, p) +end + +function _quantile(v::AbstractArray, p::Real, sorted::Bool, + alpha::Real, beta::Real, w::UnitWeights) + length(v) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return quantile(v, p) +end + _quantile(v::AbstractArray, p::Real, sorted::Bool, alpha::Real, beta::Real, w::AbstractArray) = _quantile(v, [p], sorted, alpha, beta, w)[1] diff --git a/src/StatsBase.jl b/src/StatsBase.jl deleted file mode 100644 index af6a47aa..00000000 --- a/src/StatsBase.jl +++ /dev/null @@ -1,239 +0,0 @@ -__precompile__() - -module StatsBase - -import Base: length, isempty, eltype, values, sum, show, maximum, minimum, extrema -import Base.Cartesian: @nloops, @nref, @nextract -using Base: @irrational, @propagate_inbounds -import DataStructures: heapify!, heappop!, percolate_down! -using SortingAlgorithms -using Missings - -using Statistics -using LinearAlgebra -using Random -using Printf -using SparseArrays -import Random: rand, rand! -import LinearAlgebra: BlasReal, BlasFloat -import Statistics: mean, mean!, var, varm, varm!, std, stdm, cov, covm, - cor, corm, cov2cor!, unscaled_covzm, quantile, sqrt!, - median, middle - - ## tackle compatibility issues - -export - - ## weights - AbstractWeights, # abstract type to represent any weight vector - Weights, # to represent a generic weight vector - AnalyticWeights, # to represent an analytic/precision/reliability weight vector - FrequencyWeights, # to representing a frequency/case/repeat weight vector - ProbabilityWeights, # to representing a probability/sampling weight vector - weights, # construct a generic Weights vector - aweights, # construct an AnalyticWeights vector - fweights, # construct a FrequencyWeights vector - pweights, # construct a ProbabilityWeights vector - wsum, # weighted sum with vector as second argument - wsum!, # weighted sum across dimensions with provided storage - wmean, # weighted mean - wmean!, # weighted mean across dimensions with provided storage - wmedian, # weighted median - wquantile, # weighted quantile - - ## moments - skewness, # (standardized) skewness - kurtosis, # (excessive) kurtosis - moment, # central moment of given order - mean_and_var, # (mean, var) - mean_and_std, # (mean, std) - mean_and_cov, # (mean, cov) - - ## scalarstats - geomean, # geometric mean - harmmean, # harmonic mean - genmean, # generalized/power mean - middle, # the mean of two real numbers - mode, # find a mode from data (the first one) - modes, # find all modes from data - - zscore, # compute Z-scores - zscore!, # compute Z-scores inplace or to a pre-allocated array - - percentile, # quantile using percentage (instead of fraction) as argument - nquantile, # quantiles at [0:n]/n - - span, # The range minimum(x):maximum(x) - variation, # ratio of standard deviation to mean - sem, # standard error of the mean, i.e. sqrt(var / n) - mad, # median absolute deviation - iqr, # interquatile range - - genvar, # generalized variance - totalvar, # total variation - - entropy, # the entropy of a probability vector - renyientropy, # the Rényi (generalised) entropy of a probability vector - crossentropy, # cross entropy between two probability vectors - kldivergence, # K-L divergence between two probability vectors - - summarystats, # summary statistics - describe, # print the summary statistics - - # deviation - counteq, # count the number of equal pairs - countne, # count the number of non-equal pairs - sqL2dist, # squared L2 distance between two arrays - L2dist, # L2 distance between two arrays - L1dist, # L1 distance between two arrays - Linfdist, # L-inf distance between two arrays - gkldiv, # (Generalized) Kullback-Leibler divergence between two vectors - meanad, # mean absolute deviation - maxad, # maximum absolute deviation - msd, # mean squared deviation - rmsd, # root mean squared deviation - psnr, # peak signal-to-noise ratio (in dB) - - # cov - scattermat, # scatter matrix (i.e. unnormalized covariance) - cov2cor, # converts a covariance matrix to a correlation matrix - cor2cov, # converts a correlation matrix to a covariance matrix - CovarianceEstimator, # abstract type for covariance estimators - SimpleCovariance, # simple covariance estimator - - ## counts - addcounts!, # add counts to an accumulating array or map - counts, # count integer values in given arrays - proportions, # proportions of integer values in given arrays - # (normalized version of counts) - countmap, # count distinct values and return a map - proportionmap, # proportions of distinct values returned as a map - - ## ranking - ordinalrank, # ordinal ranking ("1234" ranking) - competerank, # competition ranking ("1 2 2 4" ranking) - denserank, # dense ranking ("1 2 2 3" ranking) - tiedrank, # tied ranking ("1 2.5 2.5 4" ranking) - - ## rankcorr - corspearman, # spearman's rank correlation - corkendall, # kendall's rank correlation - - ## partialcor - partialcor, # partial correlation - - ## signalcorr - autocov!, autocov, # auto covariance - autocor!, autocor, # auto correlation - crosscov!, crosscov, # cross covariance - crosscor!, crosscor, # cross correlation - pacf!, pacf, # partial auto-correlation - - ## sampling - samplepair, # draw a pair of distinct elements    - sample, # sampling from a population - sample!, # sampling from a population, with pre-allocated output - wsample, # sampling from a population with weights - wsample!, # weighted sampling, with pre-allocated output - - ## empirical - ecdf, # empirical cumulative distribution function - ECDF, # type for empirical cumulative distribution function - - AbstractHistogram, - Histogram, - midpoints, - # histrange, - - ## robust - trim, # trimmed set - trim!, # trimmed set - winsor, # Winsorized set - winsor!, # Winsorized set - trimvar, # variance of the mean of a trimmed set - - ## misc - rle, # run-length encoding - inverse_rle, # inverse run-length encoding - indexmap, # construct a map from element to index - levelsmap, # construct a map from n unique elements to [1, ..., n] - findat, # find the position within a for elements in b - indicatormat, # construct indicator matrix - - # statistical models - CoefTable, - StatisticalModel, - RegressionModel, - - adjr2, - adjr², - aic, - aicc, - bic, - coef, - coefnames, - coeftable, - confint, - deviance, - dof, - dof_residual, - fit, - fit!, - fitted, - informationmatrix, - isfitted, - islinear, - leverage, - loglikelihood, - meanresponse, - modelmatrix, - mss, - response, - nobs, - nulldeviance, - nullloglikelihood, - rss, - score, - stderror, - vcov, - predict, - predict!, - residuals, - r2, - r², - - ConvergenceException, - - # data standardization - standardize, - AbstractDataTransform, # the type to represent a abstract data transformation - ZScoreTransform, # the type to represent a z-score data transformation - UnitRangeTransform # the type to represent a 0-1 data transformation - -# source files - -include("common.jl") -include("weights.jl") -include("moments.jl") -include("scalarstats.jl") -include("robust.jl") -include("deviation.jl") -include("cov.jl") -include("counts.jl") -include("ranking.jl") -include("toeplitzsolvers.jl") -include("rankcorr.jl") -include("signalcorr.jl") -include("partialcor.jl") -include("empirical.jl") -include("hist.jl") -include("misc.jl") - -include("sampling.jl") -include("statmodels.jl") - -include("transformations.jl") - -include("deprecates.jl") - -end # module diff --git a/src/common.jl b/src/common.jl index 5f2fceec..0a1d4736 100644 --- a/src/common.jl +++ b/src/common.jl @@ -18,11 +18,4 @@ const IntegerArray{T<:Integer,N} = AbstractArray{T,N} const IntegerVector{T<:Integer} = AbstractArray{T,1} const IntegerMatrix{T<:Integer} = AbstractArray{T,2} -const RealFP = Union{Float32, Float64} - -## conversion from real to fp types - -fptype(::Type{T}) where {T<:Union{Float32,Bool,Int8,UInt8,Int16,UInt16}} = Float32 -fptype(::Type{T}) where {T<:Union{Float64,Int32,UInt32,Int64,UInt64,Int128,UInt128}} = Float64 -fptype(::Type{Complex{Float32}}) = Complex{Float32} -fptype(::Type{Complex{Float64}}) = Complex{Float64} \ No newline at end of file +const RealFP = Union{Float32, Float64} \ No newline at end of file diff --git a/src/counts.jl b/src/counts.jl index 84fc8dcf..58087059 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -48,12 +48,11 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv: m0 = levels[1] m1 = levels[end] b = m0 - 1 - w = values(wv) @inbounds for i in 1 : length(x) xi = x[i] if m0 <= xi <= m1 - r[xi - b] += w[i] + r[xi - b] += wv[i] end end return r @@ -160,13 +159,12 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, bx = mx0 - 1 by = my0 - 1 - w = values(wv) for i = 1:n xi = x[i] yi = y[i] if (mx0 <= xi <= mx1) && (my0 <= yi <= my1) - r[xi - bx, yi - by] += w[i] + r[xi - bx, yi - by] += wv[i] end end return r @@ -257,7 +255,9 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T +addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) + +function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM if radixsort_safe(T) && (alg == :auto || alg == :radixsort) @@ -271,7 +271,7 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T end """Dict-based addcounts method""" -function addcounts_dict!(cm::Dict{T}, x::AbstractArray{T}) where T +function addcounts_dict!(cm::Dict{T}, x) where T for v in x index = ht_keyindex2!(cm, v) if index > 0 @@ -288,14 +288,27 @@ end # faster results and less memory usage. However we still wish to enable others # to write generic algorithms, therefore the methods below still accept the # `alg` argument but it is ignored. -function addcounts!(cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) sumx = sum(x) cm[true] = get(cm, true, 0) + sumx cm[false] = get(cm, false, 0) + length(x) - sumx cm end -function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} +# specialized for `Bool` iterator +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) + sumx = 0 + len = 0 + for i in x + sumx += i + len += 1 + end + cm[true] = get(cm, true, 0) + sumx + cm[false] = get(cm, false, 0) + len - sumx + cm +end + +function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} counts = zeros(Int, 2^(8sizeof(T))) @inbounds for xi in x @@ -320,13 +333,13 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128, Float32, Float64} "Can the type be safely sorted by radixsort" -radixsort_safe(::Type{T}) where {T<:BaseRadixSortSafeTypes} = true -radixsort_safe(::Type) = false +radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T + isempty(sx) && return cm last_sx = sx[1] tmpcount = get(cm, last_sx, 0) + 1 - + # now the data is sorted: can just run through and accumulate values before # adding into the Dict @inbounds for i in 2:length(sx) @@ -355,15 +368,20 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T return _addcounts_radix_sort_loop!(cm, sx) end +# fall-back for `x` an iterator +function addcounts_radixsort!(cm::Dict{T}, x) where T + sx = sort!(collect(x), alg = RadixSort) + return _addcounts_radix_sort_loop!(cm, sx) +end + function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} n = length(x) length(wv) == n || throw(DimensionMismatch()) - w = values(wv) z = zero(W) for i = 1 : n @inbounds xi = x[i] - @inbounds wi = w[i] + @inbounds wi = wv[i] cm[xi] = get(cm, xi, z) + wi end return cm @@ -372,9 +390,10 @@ end """ countmap(x; alg = :auto) + countmap(x::AbstractVector, w::AbstractVector{<:Real}; alg = :auto) Return a dictionary mapping each unique value in `x` to its number -of occurrences. +of occurrences. A vector of weights `w` can be provided when `x` is a vector. - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. @@ -389,7 +408,7 @@ of occurrences. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) +countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) diff --git a/src/cov.jl b/src/cov.jl index 72333e0a..d59cc4bf 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -15,7 +15,7 @@ function _symmetrize!(a::DenseMatrix) return a end -function _scalevars(x::DenseMatrix, s::DenseVector, dims::Int) +function _scalevars(x::DenseMatrix, s::AbstractWeights, dims::Int) dims == 1 ? Diagonal(s) * x : dims == 2 ? x * Diagonal(s) : error("dims should be either 1 or 2.") @@ -27,9 +27,9 @@ _unscaled_covzm(x::DenseMatrix, dims::Colon) = unscaled_covzm(x) _unscaled_covzm(x::DenseMatrix, dims::Integer) = unscaled_covzm(x, dims) _unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Colon) = - _symmetrize!(unscaled_covzm(x, _scalevars(x, values(wv)))) + _symmetrize!(unscaled_covzm(x, _scalevars(x, wv))) _unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Integer) = - _symmetrize!(unscaled_covzm(x, _scalevars(x, values(wv), dims), dims)) + _symmetrize!(unscaled_covzm(x, _scalevars(x, wv, dims), dims)) """ scattermat(X; mean=nothing, dims=1[, weights::AbstractWeights]) diff --git a/src/deprecates.jl b/src/deprecates.jl index cb9c55c9..9e409b33 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -35,3 +35,7 @@ end @deprecate wmedian(v::RealVector, w::RealVector) median(v, weights(w)) @deprecate quantile(v::AbstractArray{<:Real}) quantile(v, [.0, .25, .5, .75, 1.0]) + +### Deprecated September 2019 +@deprecate sum(A::AbstractArray, w::AbstractWeights, dims::Int) sum(A, w, dims=dims) +@deprecate values(wv::AbstractWeights) convert(Vector, wv) diff --git a/src/empirical.jl b/src/empirical.jl index ee458831..02d88067 100644 --- a/src/empirical.jl +++ b/src/empirical.jl @@ -1,45 +1,51 @@ # Empirical estimation of CDF and PDF - ## Empirical CDF -struct ECDF{T <: AbstractVector{<:Real}} +struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}} sorted_values::T + weights::W end function (ecdf::ECDF)(x::Real) - searchsortedlast(ecdf.sorted_values, x) / length(ecdf.sorted_values) + isnan(x) && return NaN + n = searchsortedlast(ecdf.sorted_values, x) + evenweights = isempty(ecdf.weights) + weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights) + partialsum = evenweights ? n : sum(view(ecdf.weights, 1:n)) + partialsum / weightsum end function (ecdf::ECDF)(v::RealVector) + evenweights = isempty(ecdf.weights) + weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights) ord = sortperm(v) m = length(v) r = similar(ecdf.sorted_values, m) - r0 = 0 + r0 = zero(weightsum) i = 1 - n = length(ecdf.sorted_values) - for x in ecdf.sorted_values + for (j, x) in enumerate(ecdf.sorted_values) while i <= m && x > v[ord[i]] r[ord[i]] = r0 i += 1 end - r0 += 1 + r0 += evenweights ? 1 : ecdf.weights[j] if i > m break end end while i <= m - r[ord[i]] = n + r[ord[i]] = weightsum i += 1 end - return r / n + return r / weightsum end """ - ecdf(X) + ecdf(X; weights::AbstractWeights) Return an empirical cumulative distribution function (ECDF) based on a vector of samples -given in `X`. +given in `X`. Optionally providing `weights` returns a weighted ECDF. Note: this function that returns a callable composite type, which can then be applied to evaluate CDF values on other samples. @@ -47,7 +53,13 @@ evaluate CDF values on other samples. `extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which function is inside the interval ``(0,1)``; the function is defined for the whole real line. """ -ecdf(X::RealVector{T}) where T<:Real = ECDF(sort(X)) +function ecdf(X::RealVector; weights::AbstractVector{<:Real}=Weights(Float64[])) + any(isnan, X) && throw(ArgumentError("ecdf can not include NaN values")) + isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(X)) and $(length(weights))")) + ord = sortperm(X) + ECDF(X[ord], isempty(weights) ? weights : Weights(weights[ord])) +end Base.minimum(ecdf::ECDF) = first(ecdf.sorted_values) diff --git a/src/hist.jl b/src/hist.jl index 8cef89a1..f7afa333 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -115,6 +115,72 @@ end abstract type AbstractHistogram{T<:Real,N,E} end # N-dimensional histogram object +""" + Histogram <: AbstractHistogram + +The `Histogram` type represents data that has been tabulated into intervals +(known as *bins*) along the real line, or in higher dimensions, over a real space. +Histograms can be fitted to data using the `fit` method. + +# Fields +* edges: An iterator that contains the boundaries of the bins in each dimension. +* weights: An array that contains the weight of each bin. +* closed: A symbol with value `:right` or `:left` indicating on which side bins + (half-open intervals or higher-dimensional analogues thereof) are closed. + See below for an example. +* isdensity: There are two interpretations of a `Histogram`. If `isdensity=false` the weight of a bin corresponds to the amount of a quantity in the bin. + If `isdensity=true` then it corresponds to the density (amount / volume) of the quantity in the bin. See below for an example. + +# Examples +## Example illustrating `closed` +```jldoctest +julia> using StatsBase + +julia> fit(Histogram, [2.], 1:3, closed=:left) +Histogram{Int64, 1, Tuple{UnitRange{Int64}}} +edges: + 1:3 +weights: [0, 1] +closed: left +isdensity: false + +julia> fit(Histogram, [2.], 1:3, closed=:right) +Histogram{Int64, 1, Tuple{UnitRange{Int64}}} +edges: + 1:3 +weights: [1, 0] +closed: right +isdensity: false +``` +## Example illustrating `isdensity` +```julia +julia> using StatsBase, LinearAlgebra + +julia> bins = [0,1,7]; # a small and a large bin + +julia> obs = [0.5, 1.5, 1.5, 2.5]; # one observation in the small bin and three in the large + +julia> h = fit(Histogram, obs, bins) +Histogram{Int64,1,Tuple{Array{Int64,1}}} +edges: + [0, 1, 7] +weights: [1, 3] +closed: left +isdensity: false + +julia> # observe isdensity = false and the weights field records the number of observations in each bin + +julia> normalize(h, mode=:density) +Histogram{Float64,1,Tuple{Array{Int64,1}}} +edges: + [0, 1, 7] +weights: [1.0, 0.5] +closed: left +isdensity: true + +julia> # observe isdensity = true and weights tells us the number of observation per binsize in each bin +``` +""" mutable struct Histogram{T<:Real,N,E} <: AbstractHistogram{T,N,E} edges::E weights::Array{T,N} @@ -252,8 +318,6 @@ function append!(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::Ab end h end -append!(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights) where {T,N} = append!(h, vs, values(wv)) - # Turn kwargs nbins into a type-stable tuple of integers: function _nbins_tuple(vs::NTuple{N,AbstractVector}, nbins) where N diff --git a/src/misc.jl b/src/misc.jl index 8d09e311..25fc0f3d 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -18,7 +18,7 @@ julia> rle([1,1,1,2,2,3,3,3,3,2,2,2]) ([1, 2, 3, 2], [3, 2, 4, 3]) ``` """ -function rle(v::Vector{T}) where T +function rle(v::AbstractVector{T}) where T n = length(v) vals = T[] lens = Int[] @@ -31,7 +31,7 @@ function rle(v::Vector{T}) where T i = 2 @inbounds while i <= n vi = v[i] - if vi == cv + if isequal(vi, cv) cl += 1 else push!(vals, cv) @@ -126,9 +126,9 @@ it will be dense (default). julia> using StatsBase julia> indicatormat([1 2 2], 2) -2×3 Array{Bool,2}: - true false false - false true true +2×3 Matrix{Bool}: + 1 0 0 + 0 1 1 ``` """ function indicatormat(x::IntegerArray, k::Integer; sparse::Bool=false) diff --git a/src/pairwise.jl b/src/pairwise.jl new file mode 100644 index 00000000..97e51a3f --- /dev/null +++ b/src/pairwise.jl @@ -0,0 +1,313 @@ +function _pairwise!(::Val{:none}, f, dest::AbstractMatrix, x, y, symmetric::Bool) + @inbounds for (i, xi) in enumerate(x), (j, yj) in enumerate(y) + symmetric && i > j && continue + + # For performance, diagonal is special-cased + if f === cor && eltype(dest) !== Union{} && i == j && xi === yj + # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 + dest[i, j] = float(cor(xi)) + else + dest[i, j] = f(xi, yj) + end + end + if symmetric + m, n = size(dest) + @inbounds for j in 1:n, i in (j+1):m + dest[i, j] = dest[j, i] + end + end + return dest +end + +function check_vectors(x, y, skipmissing::Symbol) + m = length(x) + n = length(y) + if !(all(xi -> xi isa AbstractVector, x) && all(yi -> yi isa AbstractVector, y)) + throw(ArgumentError("All entries in x and y must be vectors " * + "when skipmissing=:$skipmissing")) + end + if m > 1 + indsx = keys(first(x)) + for i in 2:m + keys(x[i]) == indsx || + throw(ArgumentError("All input vectors must have the same indices")) + end + end + if n > 1 + indsy = keys(first(y)) + for j in 2:n + keys(y[j]) == indsy || + throw(ArgumentError("All input vectors must have the same indices")) + end + end + if m > 1 && n > 1 + indsx == indsy || + throw(ArgumentError("All input vectors must have the same indices")) + end +end + +function _pairwise!(::Val{:pairwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) + check_vectors(x, y, :pairwise) + @inbounds for (j, yj) in enumerate(y) + ynminds = .!ismissing.(yj) + @inbounds for (i, xi) in enumerate(x) + symmetric && i > j && continue + + if xi === yj + ynm = view(yj, ynminds) + # For performance, diagonal is special-cased + if f === cor && eltype(dest) !== Union{} && i == j + # TODO: float() will not be needed after JuliaLang/Statistics.jl#61 + dest[i, j] = float(cor(xi)) + else + dest[i, j] = f(ynm, ynm) + end + else + nminds = .!ismissing.(xi) .& ynminds + xnm = view(xi, nminds) + ynm = view(yj, nminds) + dest[i, j] = f(xnm, ynm) + end + end + end + if symmetric + m, n = size(dest) + @inbounds for j in 1:n, i in (j+1):m + dest[i, j] = dest[j, i] + end + end + return dest +end + +function _pairwise!(::Val{:listwise}, f, dest::AbstractMatrix, x, y, symmetric::Bool) + check_vectors(x, y, :listwise) + m, n = size(dest) + nminds = .!ismissing.(first(x)) + @inbounds for xi in Iterators.drop(x, 1) + nminds .&= .!ismissing.(xi) + end + if x !== y + @inbounds for yj in y + nminds .&= .!ismissing.(yj) + end + end + + # Computing integer indices once for all vectors is faster + nminds′ = findall(nminds) + # TODO: check whether wrapping views in a custom array type which asserts + # that entries cannot be `missing` (similar to `skipmissing`) + # could offer better performance + return _pairwise!(Val(:none), f, dest, + [view(xi, nminds′) for xi in x], + [view(yi, nminds′) for yi in y], + symmetric) +end + +function _pairwise!(f, dest::AbstractMatrix, x, y; + symmetric::Bool=false, skipmissing::Symbol=:none) + if !(skipmissing in (:none, :pairwise, :listwise)) + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) + end + + x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y) + m = length(x′) + n = length(y′) + + size(dest) != (m, n) && + throw(DimensionMismatch("dest has dimensions $(size(dest)) but expected ($m, $n)")) + + Base.has_offset_axes(dest) && throw("dest indices must start at 1") + + return _pairwise!(Val(skipmissing), f, dest, x′, y′, symmetric) +end + +function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmissing} + x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x) + y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y) + m = length(x′) + n = length(y′) + + T = Core.Compiler.return_type(f, Tuple{eltype(x′), eltype(y′)}) + Tsm = Core.Compiler.return_type((x, y) -> f(disallowmissing(x), disallowmissing(y)), + Tuple{eltype(x′), eltype(y′)}) + + if skipmissing === :none + dest = Matrix{T}(undef, m, n) + elseif skipmissing in (:pairwise, :listwise) + dest = Matrix{Tsm}(undef, m, n) + else + throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) + end + + # Preserve inferred element type + isempty(dest) && return dest + + _pairwise!(f, dest, x′, y′, symmetric=symmetric, skipmissing=skipmissing) + + if isconcretetype(eltype(dest)) + return dest + else + # Final eltype depends on actual contents (consistent with map and broadcast) + U = mapreduce(typeof, promote_type, dest) + # V is inferred (contrary to U), but it only gives an upper bound for U + V = promote_type(T, Tsm) + return convert(Matrix{U}, dest)::Matrix{<:V} + end +end + +""" + pairwise!(f, dest::AbstractMatrix, x[, y]; + symmetric::Bool=false, skipmissing::Symbol=:none) + +Store in matrix `dest` the result of applying `f` to all possible pairs +of entries in iterators `x` and `y`, and return it. Rows correspond to +entries in `x` and columns to entries in `y`, and `dest` must therefore +be of size `length(x) × length(y)`. +If `y` is omitted then `x` is crossed with itself. + +As a special case, if `f` is `cor`, diagonal cells for which entries +from `x` and `y` are identical (according to `===`) are set to one even +in the presence `missing`, `NaN` or `Inf` entries. + +# Keyword arguments +- `symmetric::Bool=false`: If `true`, `f` is only called to compute + for the lower triangle of the matrix, and these values are copied + to fill the upper triangle. Only allowed when `y` is omitted. + Defaults to `true` when `f` is `cor` or `cov`. +- `skipmissing::Symbol=:none`: If `:none` (the default), missing values + in inputs are passed to `f` without any modification. + Use `:pairwise` to skip entries with a `missing` value in either + of the two vectors passed to `f` for a given pair of vectors in `x` and `y`. + Use `:listwise` to skip entries with a `missing` value in any of the + vectors in `x` or `y`; note that this might drop a large part of entries. + Only allowed when entries in `x` and `y` are vectors. + +# Examples +```jldoctest +julia> using StatsBase, Statistics + +julia> dest = zeros(3, 3); + +julia> x = [1 3 7 + 2 5 6 + 3 8 4 + 4 6 2]; + +julia> pairwise!(cor, dest, eachcol(x)); + +julia> dest +3×3 Matrix{Float64}: + 1.0 0.744208 -0.989778 + 0.744208 1.0 -0.68605 + -0.989778 -0.68605 1.0 + +julia> y = [1 3 missing + 2 5 6 + 3 missing 2 + 4 6 2]; + +julia> pairwise!(cor, dest, eachcol(y), skipmissing=:pairwise); + +julia> dest +3×3 Matrix{Float64}: + 1.0 0.928571 -0.866025 + 0.928571 1.0 -1.0 + -0.866025 -1.0 1.0 +``` +""" +function pairwise!(f, dest::AbstractMatrix, x, y=x; + symmetric::Bool=false, skipmissing::Symbol=:none) + if symmetric && x !== y + throw(ArgumentError("symmetric=true only makes sense passing " * + "a single set of variables (x === y)")) + end + + return _pairwise!(f, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) +end + +""" + pairwise(f, x[, y]; + symmetric::Bool=false, skipmissing::Symbol=:none) + +Return a matrix holding the result of applying `f` to all possible pairs +of entries in iterators `x` and `y`. Rows correspond to +entries in `x` and columns to entries in `y`. If `y` is omitted then a +square matrix crossing `x` with itself is returned. + +As a special case, if `f` is `cor`, diagonal cells for which entries +from `x` and `y` are identical (according to `===`) are set to one even +in the presence `missing`, `NaN` or `Inf` entries. + +# Keyword arguments +- `symmetric::Bool=false`: If `true`, `f` is only called to compute + for the lower triangle of the matrix, and these values are copied + to fill the upper triangle. Only allowed when `y` is omitted. + Defaults to `true` when `f` is `cor` or `cov`. +- `skipmissing::Symbol=:none`: If `:none` (the default), missing values + in inputs are passed to `f` without any modification. + Use `:pairwise` to skip entries with a `missing` value in either + of the two vectors passed to `f` for a given pair of vectors in `x` and `y`. + Use `:listwise` to skip entries with a `missing` value in any of the + vectors in `x` or `y`; note that this might drop a large part of entries. + Only allowed when entries in `x` and `y` are vectors. + +# Examples +```jldoctest +julia> using StatsBase, Statistics + +julia> x = [1 3 7 + 2 5 6 + 3 8 4 + 4 6 2]; + +julia> pairwise(cor, eachcol(x)) +3×3 Matrix{Float64}: + 1.0 0.744208 -0.989778 + 0.744208 1.0 -0.68605 + -0.989778 -0.68605 1.0 + +julia> y = [1 3 missing + 2 5 6 + 3 missing 2 + 4 6 2]; + +julia> pairwise(cor, eachcol(y), skipmissing=:pairwise) +3×3 Matrix{Float64}: + 1.0 0.928571 -0.866025 + 0.928571 1.0 -1.0 + -0.866025 -1.0 1.0 +``` +""" +function pairwise(f, x, y=x; symmetric::Bool=false, skipmissing::Symbol=:none) + if symmetric && x !== y + throw(ArgumentError("symmetric=true only makes sense passing " * + "a single set of variables (x === y)")) + end + + return _pairwise(Val(skipmissing), f, x, y, symmetric) +end + +# cov(x) is faster than cov(x, x) +_cov(x, y) = x === y ? cov(x) : cov(x, y) + +pairwise!(::typeof(cov), dest::AbstractMatrix, x, y; + symmetric::Bool=false, skipmissing::Symbol=:none) = + pairwise!(_cov, dest, x, y, symmetric=symmetric, skipmissing=skipmissing) + +pairwise(::typeof(cov), x, y; symmetric::Bool=false, skipmissing::Symbol=:none) = + pairwise(_cov, x, y, symmetric=symmetric, skipmissing=skipmissing) + +pairwise!(::typeof(cov), dest::AbstractMatrix, x; + symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise!(_cov, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) + +pairwise(::typeof(cov), x; symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise(_cov, x, x, symmetric=symmetric, skipmissing=skipmissing) + +pairwise!(::typeof(cor), dest::AbstractMatrix, x; + symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise!(cor, dest, x, x, symmetric=symmetric, skipmissing=skipmissing) + +pairwise(::typeof(cor), x; symmetric::Bool=true, skipmissing::Symbol=:none) = + pairwise(cor, x, x, symmetric=symmetric, skipmissing=skipmissing) diff --git a/src/rankcorr.jl b/src/rankcorr.jl index ea804105..3751b067 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -17,14 +17,103 @@ Compute Spearman's rank correlation coefficient. If `x` and `y` are vectors, the output is a float, otherwise it's a matrix corresponding to the pairwise correlations of the columns of `x` and `y`. """ -corspearman(x::RealVector, y::RealVector) = cor(tiedrank(x), tiedrank(y)) +function corspearman(x::RealVector, y::RealVector) + n = length(x) + n == length(y) || throw(DimensionMismatch("vectors must have same length")) + (any(isnan, x) || any(isnan, y)) && return NaN + return cor(tiedrank(x), tiedrank(y)) +end + +function corspearman(X::RealMatrix, y::RealVector) + size(X, 1) == length(y) || + throw(DimensionMismatch("X and y have inconsistent dimensions")) + n = size(X, 2) + C = Matrix{Float64}(I, n, 1) + any(isnan, y) && return fill!(C, NaN) + yrank = tiedrank(y) + for j = 1:n + Xj = view(X, :, j) + if any(isnan, Xj) + C[j,1] = NaN + else + Xjrank = tiedrank(Xj) + C[j,1] = cor(Xjrank, yrank) + end + end + return C +end -corspearman(X::RealMatrix, Y::RealMatrix) = - cor(mapslices(tiedrank, X, dims=1), mapslices(tiedrank, Y, dims=1)) -corspearman(X::RealMatrix, y::RealVector) = cor(mapslices(tiedrank, X, dims=1), tiedrank(y)) -corspearman(x::RealVector, Y::RealMatrix) = cor(tiedrank(x), mapslices(tiedrank, Y, dims=1)) +function corspearman(x::RealVector, Y::RealMatrix) + size(Y, 1) == length(x) || + throw(DimensionMismatch("x and Y have inconsistent dimensions")) + n = size(Y, 2) + C = Matrix{Float64}(I, 1, n) + any(isnan, x) && return fill!(C, NaN) + xrank = tiedrank(x) + for j = 1:n + Yj = view(Y, :, j) + if any(isnan, Yj) + C[1,j] = NaN + else + Yjrank = tiedrank(Yj) + C[1,j] = cor(xrank, Yjrank) + end + end + return C +end + +function corspearman(X::RealMatrix) + n = size(X, 2) + C = Matrix{Float64}(I, n, n) + anynan = Vector{Bool}(undef, n) + for j = 1:n + Xj = view(X, :, j) + anynan[j] = any(isnan, Xj) + if anynan[j] + C[:,j] .= NaN + C[j,:] .= NaN + C[j,j] = 1 + continue + end + Xjrank = tiedrank(Xj) + for i = 1:(j-1) + Xi = view(X, :, i) + if anynan[i] + C[i,j] = C[j,i] = NaN + else + Xirank = tiedrank(Xi) + C[i,j] = C[j,i] = cor(Xjrank, Xirank) + end + end + end + return C +end -corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) +function corspearman(X::RealMatrix, Y::RealMatrix) + size(X, 1) == size(Y, 1) || + throw(ArgumentError("number of rows in each array must match")) + nr = size(X, 2) + nc = size(Y, 2) + C = Matrix{Float64}(undef, nr, nc) + for j = 1:nr + Xj = view(X, :, j) + if any(isnan, Xj) + C[j,:] .= NaN + continue + end + Xjrank = tiedrank(Xj) + for i = 1:nc + Yi = view(Y, :, i) + if any(isnan, Yi) + C[j,i] = NaN + else + Yirank = tiedrank(Yi) + C[j,i] = cor(Xjrank, Yirank) + end + end + end + return C +end ####################################### @@ -33,66 +122,50 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z)) # ####################################### -# Knight JASA (1966) - -function corkendall!(x::RealVector, y::RealVector) +# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” +# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. +# JSTOR, www.jstor.org/stable/2282833. +function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integer}=sortperm(x)) if any(isnan, x) || any(isnan, y) return NaN end n = length(x) if n != length(y) error("Vectors must have same length") end # Initial sorting - pm = sortperm(y) - x[:] = x[pm] - y[:] = y[pm] - pm[:] = sortperm(x) - x[:] = x[pm] - - # Counting ties in x and y - iT = 1 - nT = 0 - iU = 1 - nU = 0 - for i = 2:n - if x[i] == x[i-1] - iT += 1 - else - nT += iT*(iT - 1) - iT = 1 - end - if y[i] == y[i-1] - iU += 1 - else - nU += iU*(iU - 1) - iU = 1 + permute!(x, permx) + permute!(y, permx) + + # Use widen to avoid overflows on both 32bit and 64bit + npairs = div(widen(n) * (n - 1), 2) + ntiesx = ndoubleties = nswaps = widen(0) + k = 0 + + @inbounds for i = 2:n + if x[i - 1] == x[i] + k += 1 + elseif k > 0 + # Sort the corresponding chunk of y, so the rows of hcat(x,y) are + # sorted first on x, then (where x values are tied) on y. Hence + # double ties can be counted by calling countties. + sort!(view(y, (i - k - 1):(i - 1))) + ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here + ndoubleties += countties(y, i - k - 1, i - 1) + k = 0 end end - if iT > 1 nT += iT*(iT - 1) end - nT = div(nT,2) - if iU > 1 nU += iU*(iU - 1) end - nU = div(nU,2) - - # Sort y after x - y[:] = y[pm] - - # Calculate double ties - iV = 1 - nV = 0 - jV = 1 - for i = 2:n - if x[i] == x[i-1] && y[i] == y[i-1] - iV += 1 - else - nV += iV*(iV - 1) - iV = 1 - end + if k > 0 + sort!(view(y, (n - k):n)) + ntiesx += div(widen(k) * (k + 1), 2) + ndoubleties += countties(y, n - k, n) end - if iV > 1 nV += iV*(iV - 1) end - nV = div(nV,2) - nD = div(n*(n - 1),2) - return (nD - nT - nU + nV - 2swaps!(y)) / (sqrt(nD - nT) * sqrt(nD - nU)) -end + nswaps = merge_sort!(y, 1, n) + ntiesy = countties(y, 1, n) + # Calls to float below prevent possible overflow errors when + # length(x) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) + (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / + sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) +end """ corkendall(x, y=x) @@ -100,21 +173,40 @@ end Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either matrices or vectors. """ -corkendall(x::RealVector, y::RealVector) = corkendall!(float(copy(x)), float(copy(y))) +corkendall(x::RealVector, y::RealVector) = corkendall!(copy(x), copy(y)) -corkendall(X::RealMatrix, y::RealVector) = Float64[corkendall!(float(X[:,i]), float(copy(y))) for i in 1:size(X, 2)] - -corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y,2); reshape(Float64[corkendall!(float(copy(x)), float(Y[:,i])) for i in 1:n], 1, n)) +function corkendall(X::RealMatrix, y::RealVector) + permy = sortperm(y) + return([corkendall!(copy(y), X[:,i], permy) for i in 1:size(X, 2)]) +end -corkendall(X::RealMatrix, Y::RealMatrix) = Float64[corkendall!(float(X[:,i]), float(Y[:,j])) for i in 1:size(X, 2), j in 1:size(Y, 2)] +function corkendall(x::RealVector, Y::RealMatrix) + n = size(Y, 2) + permx = sortperm(x) + return(reshape([corkendall!(copy(x), Y[:,i], permx) for i in 1:n], 1, n)) +end function corkendall(X::RealMatrix) n = size(X, 2) - C = Matrix{eltype(X)}(I, n, n) + C = Matrix{Float64}(I, n, n) for j = 2:n - for i = 1:j-1 - C[i,j] = corkendall!(X[:,i],X[:,j]) - C[j,i] = C[i,j] + permx = sortperm(X[:,j]) + for i = 1:j - 1 + C[j,i] = corkendall!(X[:,j], X[:,i], permx) + C[i,j] = C[j,i] + end + end + return C +end + +function corkendall(X::RealMatrix, Y::RealMatrix) + nr = size(X, 2) + nc = size(Y, 2) + C = Matrix{Float64}(undef, nr, nc) + for j = 1:nr + permx = sortperm(X[:,j]) + for i = 1:nc + C[j,i] = corkendall!(X[:,j], Y[:,i], permx) end end return C @@ -122,32 +214,111 @@ end # Auxilliary functions for Kendall's rank correlation -function swaps!(x::RealVector) - n = length(x) - if n == 1 return 0 end - n2 = div(n, 2) - xl = view(x, 1:n2) - xr = view(x, n2+1:n) - nsl = swaps!(xl) - nsr = swaps!(xr) - sort!(xl) - sort!(xr) - return nsl + nsr + mswaps(xl,xr) +""" + countties(x::RealVector, lo::Integer, hi::Integer) + +Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. +""" +function countties(x::AbstractVector, lo::Integer, hi::Integer) + # Use of widen below prevents possible overflow errors when + # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) + thistiecount = result = widen(0) + checkbounds(x, lo:hi) + @inbounds for i = (lo + 1):hi + if x[i] == x[i - 1] + thistiecount += 1 + elseif thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + thistiecount = widen(0) + end + end + + if thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + end + result end -function mswaps(x::RealVector, y::RealVector) - i = 1 - j = 1 - nSwaps = 0 - n = length(x) - while i <= n && j <= length(y) - if y[j] < x[i] - nSwaps += n - i + 1 +# Tests appear to show that a value of 64 is optimal, +# but note that the equivalent constant in base/sort.jl is 20. +const SMALL_THRESHOLD = 64 + +# merge_sort! copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +""" + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) + +Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. +""" +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0)) + # Use of widen below prevents possible overflow errors when + # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) + nswaps = widen(0) + @inbounds if lo < hi + hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi) + + m = midpoint(lo, hi) + (length(t) < m - lo + 1) && resize!(t, m - lo + 1) + + nswaps = merge_sort!(v, lo, m, t) + nswaps += merge_sort!(v, m + 1, hi, t) + + i, j = 1, lo + while j <= m + t[i] = v[j] + i += 1 j += 1 - else + end + + i, k = 1, lo + while k < j <= hi + if v[j] < t[i] + v[k] = v[j] + j += 1 + nswaps += m - lo + 1 - (i - 1) + else + v[k] = t[i] + i += 1 + end + k += 1 + end + while k < j + v[k] = t[i] + k += 1 i += 1 end end - return nSwaps + return nswaps end +# insertion_sort! and midpoint copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01) +midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) + +""" + insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + +Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance. +""" +function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + if lo == hi return widen(0) end + nswaps = widen(0) + @inbounds for i = lo + 1:hi + j = i + x = v[i] + while j > lo + if x < v[j - 1] + nswaps += 1 + v[j] = v[j - 1] + j -= 1 + continue + end + break + end + v[j] = x + end + return nswaps +end diff --git a/src/ranking.jl b/src/ranking.jl index 90e4f7bf..9ad868be 100644 --- a/src/ranking.jl +++ b/src/ranking.jl @@ -6,65 +6,70 @@ # The implementations here follow this wikipedia page. # - function _check_randparams(rks, x, p) n = length(rks) length(x) == length(p) == n || raise_dimerror() return n end +# ranking helper function: calls sortperm(x) and then ranking method f! +function _rank(f!, x::AbstractArray, R::Type=Int; sortkwargs...) + rks = similar(x, R) + ord = reshape(sortperm(vec(x); sortkwargs...), size(x)) + return f!(rks, x, ord) +end +# ranking helper function for arrays with missing values +function _rank(f!, x::AbstractArray{>: Missing}, R::Type=Int; sortkwargs...) + inds = findall(!ismissing, vec(x)) + isempty(inds) && return Array{Union{R, Missing}}(missing, size(x)) + xv = convert(AbstractVector{Int}, view(vec(x), inds)) + ordv = sortperm(xv; sortkwargs...) + rks = Array{Union{R, Missing}}(missing, size(x)) + f!(view(rks, inds), xv, ordv) + return rks +end # Ordinal ranking ("1234 ranking") -- use the literal order resulted from sort -function ordinalrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) - n = _check_randparams(rks, x, p) - - if n > 0 - i = 1 - while i <= n - rks[p[i]] = i - i += 1 - end +function _ordinalrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) + _check_randparams(rks, x, p) + @inbounds for i in eachindex(p) + rks[p[i]] = i end - return rks end """ - ordinalrank(x; lt = isless, rev::Bool = false) + ordinalrank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [ordinal ranking](https://en.wikipedia.org/wiki/Ranking#Ordinal_ranking_.28.221234.22_ranking.29) -("1234" ranking) of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. -All items in `x` are given distinct, successive ranks based on their -position in `sort(x; lt = lt, rev = rev)`. +("1234" ranking) of an array. Supports the same keyword arguments as the `sort` function. +All items in `x` are given distinct, successive ranks based on their position +in the sorted vector. Missing values are assigned rank `missing`. """ -ordinalrank(x::AbstractArray; lt = isless, rev::Bool = false) = - ordinalrank!(Array{Int}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) +ordinalrank(x::AbstractArray; sortkwargs...) = + _rank(_ordinalrank!, x; sortkwargs...) # Competition ranking ("1224" ranking) -- resolve tied ranks using min -function competerank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) +function _competerank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) n = _check_randparams(rks, x, p) - if n > 0 + @inbounds if n > 0 p1 = p[1] v = x[p1] rks[p1] = k = 1 - i = 2 - while i <= n + for i in 2:n pi = p[i] xi = x[pi] - if xi == v - rks[pi] = k - else - rks[pi] = k = i + if xi != v v = xi + k = i end - i += 1 + rks[pi] = k end end @@ -73,39 +78,35 @@ end """ - competerank(x; lt = isless, rev::Bool = false) + competerank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [standard competition ranking](http://en.wikipedia.org/wiki/Ranking#Standard_competition_ranking_.28.221224.22_ranking.29) -("1224" ranking) of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. -Items that compare equal are given the same rank, then a gap is left -in the rankings the size of the number of tied items - 1. +("1224" ranking) of an array. Supports the same keyword arguments as the `sort` function. +Equal (*"tied"*) items are given the same rank, and the next rank comes after a gap +that is equal to the number of tied items - 1. Missing values are assigned rank `missing`. """ -competerank(x::AbstractArray; lt = isless, rev::Bool = false) = - competerank!(Array{Int}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) +competerank(x::AbstractArray; sortkwargs...) = + _rank(_competerank!, x; sortkwargs...) # Dense ranking ("1223" ranking) -- resolve tied ranks using min -function denserank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) +function _denserank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) n = _check_randparams(rks, x, p) - if n > 0 + @inbounds if n > 0 p1 = p[1] v = x[p1] rks[p1] = k = 1 - i = 2 - while i <= n + for i in 2:n pi = p[i] xi = x[pi] - if xi == v - rks[pi] = k - else - rks[pi] = (k += 1) + if xi != v v = xi + k += 1 end - i += 1 + rks[pi] = k end end @@ -114,29 +115,27 @@ end """ - denserank(x) + denserank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [dense ranking](http://en.wikipedia.org/wiki/Ranking#Dense_ranking_.28.221223.22_ranking.29) -("1223" ranking) of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. Items that -compare equal receive the same ranking, and the next subsequent rank is +("1223" ranking) of an array. Supports the same keyword arguments as the `sort` function. +Equal items receive the same rank, and the next subsequent rank is assigned with no gap. Missing values are assigned rank `missing`. """ -denserank(x::AbstractArray; lt = isless, rev::Bool = false) = - denserank!(Array{Int}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) +denserank(x::AbstractArray; sortkwargs...) = + _rank(_denserank!, x; sortkwargs...) # Tied ranking ("1 2.5 2.5 4" ranking) -- resolve tied ranks using average -function tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) +function _tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) n = _check_randparams(rks, x, p) - if n > 0 + @inbounds if n > 0 v = x[p[1]] s = 1 # starting index of current range - e = 2 # pass-by-end index of current range - while e <= n + for e in 2:n # e is pass-by-end index of current range cx = x[p[e]] if cx != v # fill average rank to s : e-1 @@ -148,10 +147,9 @@ function tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) s = e v = cx end - e += 1 end - # the last range (e == n+1) + # the last range ar = (s + n) / 2 for i = s : n rks[p[i]] = ar @@ -161,33 +159,15 @@ function tiedrank!(rks::AbstractArray, x::AbstractArray, p::IntegerArray) return rks end -# order (aka. rank), resolving ties using the mean rank """ - tiedrank(x) + tiedrank(x; lt=isless, by=identity, rev::Bool=false, ...) Return the [tied ranking](http://en.wikipedia.org/wiki/Ranking#Fractional_ranking_.28.221_2.5_2.5_4.22_ranking.29), also called fractional or "1 2.5 2.5 4" ranking, -of an array. The `lt` keyword allows providing a custom "less -than" function; use `rev=true` to reverse the sorting order. -Items that compare equal receive the mean of the -rankings they would have been assigned under ordinal ranking. +of an array. Supports the same keyword arguments as the `sort` function. +Equal (*"tied"*) items receive the mean of the ranks they would +have been assigned under the ordinal ranking (see [`ordinalrank`](@ref)). Missing values are assigned rank `missing`. """ -tiedrank(x::AbstractArray; lt = isless, rev::Bool = false) = - tiedrank!(Array{Float64}(undef, size(x)), x, sortperm(x; lt = lt, rev = rev)) - -for (f, f!, S) in zip([:ordinalrank, :competerank, :denserank, :tiedrank], - [:ordinalrank!, :competerank!, :denserank!, :tiedrank!], - [Int, Int, Int, Float64]) - @eval begin - function $f(x::AbstractArray{>: Missing}; lt = isless, rev::Bool = false) - inds = findall(!ismissing, x) - isempty(inds) && return Array{Union{$S, Missing}}(missing, size(x)) - xv = convert(Array{Int}, view(x, inds)) - sp = sortperm(xv; lt = lt, rev = rev) - rks = Vector{Union{$S, Missing}}(missing, length(x)) - $(f!)(view(rks, inds), xv, sp) - rks - end - end -end \ No newline at end of file +tiedrank(x::AbstractArray; sortkwargs...) = + _rank(_tiedrank!, x, Float64; sortkwargs...) diff --git a/src/reliability.jl b/src/reliability.jl new file mode 100644 index 00000000..aebb94b2 --- /dev/null +++ b/src/reliability.jl @@ -0,0 +1,74 @@ +struct CronbachAlpha{T <: Real} + alpha::T + dropped::Vector{T} +end + +function Base.show(io::IO, x::CronbachAlpha) + @printf(io, "Cronbach's alpha for all items: %.4f\n", x.alpha) + isempty(x.dropped) && return + println(io, "\nCronbach's alpha if an item is dropped:") + for (idx, val) in enumerate(x.dropped) + @printf(io, "item %i: %.4f\n", idx, val) + end +end + +""" + cronbachalpha(covmatrix::AbstractMatrix{<:Real}) + +Calculate Cronbach's alpha (1951) from a covariance matrix `covmatrix` according to +the [formula](https://en.wikipedia.org/wiki/Cronbach%27s_alpha): + +```math +\\rho = \\frac{k}{k-1} (1 - \\frac{\\sum^k_{i=1} \\sigma^2_i}{\\sum_{i=1}^k \\sum_{j=1}^k \\sigma_{ij}}) +``` + +where ``k`` is the number of items, i.e. columns, ``\\sigma_i^2`` the item variance, +and ``\\sigma_{ij}`` the inter-item covariance. + +Returns a `CronbachAlpha` object that holds: + +* `alpha`: the Cronbach's alpha score for all items, i.e. columns, in `covmatrix`; and +* `dropped`: a vector giving Cronbach's alpha scores if a specific item, + i.e. column, is dropped from `covmatrix`. + +# Example +```jldoctest +julia> using StatsBase + +julia> cov_X = [10 6 6 6; + 6 11 6 6; + 6 6 12 6; + 6 6 6 13]; + +julia> cronbachalpha(cov_X) +Cronbach's alpha for all items: 0.8136 + +Cronbach's alpha if an item is dropped: +item 1: 0.7500 +item 2: 0.7606 +item 3: 0.7714 +item 4: 0.7826 +``` +""" +function cronbachalpha(covmatrix::AbstractMatrix{<:Real}) + isposdef(covmatrix) || throw(ArgumentError("Covariance matrix must be positive definite.")) + k = size(covmatrix, 2) + k > 1 || throw(ArgumentError("Covariance matrix must have more than one column.")) + v = vec(sum(covmatrix, dims=1)) + σ = sum(v) + for i in axes(v, 1) + v[i] -= covmatrix[i, i] + end + σ_diag = sum(i -> covmatrix[i, i], 1:k) + + alpha = k * (1 - σ_diag / σ) / (k - 1) + if k > 2 + dropped = typeof(alpha)[(k - 1) * (1 - (σ_diag - covmatrix[i, i]) / (σ - 2*v[i] - covmatrix[i, i])) / (k - 2) + for i in 1:k] + else + # if k = 2 do not produce dropped; this has to be also + # correctly handled in show + dropped = Vector{typeof(alpha)}() + end + return CronbachAlpha(alpha, dropped) +end diff --git a/src/robust.jl b/src/robust.jl index c1d93264..a186be4b 100644 --- a/src/robust.jl +++ b/src/robust.jl @@ -7,96 +7,103 @@ ############################# # Trimmed set +"Return the upper and lower bound elements used by `trim` and `winsor`" +function uplo(x::AbstractVector; prop::Real=0.0, count::Integer=0) + n = length(x) + n > 0 || throw(ArgumentError("x can not be empty.")) + + if count == 0 + 0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5.")) + count = floor(Int, n * prop) + else + prop == 0 || throw(ArgumentError("prop and count can not both be > 0.")) + 0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2.")) + end + + # indices for lowest count values + x2 = copy(x) + lo = partialsort!(x2, 1:count+1)[end] + # indices for largest count values + up = partialsort!(x2, n-count:n)[1] + + up, lo +end + """ - trim(x; prop=0.0, count=0) + trim(x::AbstractVector; prop=0.0, count=0) + +Return an iterator of all elements of `x` that omits either `count` or proportion +`prop` of the highest and lowest elements. -Return a copy of `x` with either `count` or proportion `prop` of the highest -and lowest elements removed. To compute the trimmed mean of `x` use -`mean(trim(x))`; to compute the variance use `trimvar(x)` (see [`trimvar`](@ref)). +The number of trimmed elements could be smaller than specified if several +elements equal the lower or upper bound. + +To compute the trimmed mean of `x` use `mean(trim(x))`; +to compute the variance use `trimvar(x)` (see [`trimvar`](@ref)). # Example ```julia -julia> trim([1,2,3,4,5], prop=0.2) +julia> collect(trim([5,2,4,3,1], prop=0.2)) 3-element Array{Int64,1}: 2 - 3 4 + 3 ``` """ function trim(x::AbstractVector; prop::Real=0.0, count::Integer=0) - trim!(copy(x); prop=prop, count=count) + up, lo = uplo(x; prop=prop, count=count) + + (xi for xi in x if lo <= xi <= up) end """ - trim!(x; prop=0.0, count=0) + trim!(x::AbstractVector; prop=0.0, count=0) A variant of [`trim`](@ref) that modifies `x` in place. """ function trim!(x::AbstractVector; prop::Real=0.0, count::Integer=0) - n = length(x) - n > 0 || throw(ArgumentError("x can not be empty.")) - - if count == 0 - 0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5.")) - count = floor(Int, n * prop) - else - prop == 0 || throw(ArgumentError("prop and count can not both be > 0.")) - 0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2.")) - end - - partialsort!(x, (n-count+1):n) - partialsort!(x, 1:count) - deleteat!(x, (n-count+1):n) - deleteat!(x, 1:count) - + up, lo = uplo(x; prop=prop, count=count) + ix = (i for (i,xi) in enumerate(x) if lo > xi || xi > up) + deleteat!(x, ix) return x end """ - winsor(x; prop=0.0, count=0) + winsor(x::AbstractVector; prop=0.0, count=0) -Return a copy of `x` with either `count` or proportion `prop` of the lowest -elements of `x` replaced with the next-lowest, and an equal number of the -highest elements replaced with the previous-highest. To compute the Winsorized -mean of `x` use `mean(winsor(x))`. +Return an iterator of all elements of `x` that replaces either `count` or +proportion `prop` of the highest elements with the previous-highest element +and an equal number of the lowest elements with the next-lowest element. + +The number of replaced elements could be smaller than specified if several +elements equal the lower or upper bound. + +To compute the Winsorized mean of `x` use `mean(winsor(x))`. # Example ```julia -julia> winsor([1,2,3,4,5], prop=0.2) +julia> collect(winsor([5,2,3,4,1], prop=0.2)) 5-element Array{Int64,1}: - 2 + 4 2 3 4 - 4 + 2 ``` """ function winsor(x::AbstractVector; prop::Real=0.0, count::Integer=0) - winsor!(copy(x); prop=prop, count=count) + up, lo = uplo(x; prop=prop, count=count) + + (clamp(xi, lo, up) for xi in x) end """ - winsor!(x; prop=0.0, count=0) + winsor!(x::AbstractVector; prop=0.0, count=0) A variant of [`winsor`](@ref) that modifies vector `x` in place. """ function winsor!(x::AbstractVector; prop::Real=0.0, count::Integer=0) - n = length(x) - n > 0 || throw(ArgumentError("x can not be empty.")) - - if count == 0 - 0 <= prop < 0.5 || throw(ArgumentError("prop must satisfy 0 ≤ prop < 0.5.")) - count = floor(Int, n * prop) - else - prop == 0 || throw(ArgumentError("prop and count can not both be > 0.")) - 0 <= count < n/2 || throw(ArgumentError("count must satisfy 0 ≤ count < length(x)/2.")) - end - - partialsort!(x, (n-count+1):n) - partialsort!(x, 1:count) - x[1:count] .= x[count+1] - x[n-count+1:end] .= x[n-count] - + copyto!(x, winsor(x; prop=prop, count=count)) return x end diff --git a/src/sampling.jl b/src/sampling.jl index 1e0d4797..d12fd56e 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -5,12 +5,12 @@ # ########################################################### -using Random: RangeGenerator, Random.GLOBAL_RNG +using Random: Sampler, Random.GLOBAL_RNG ### Algorithms for sampling with replacement function direct_sample!(rng::AbstractRNG, a::UnitRange, x::AbstractArray) - s = RangeGenerator(1:length(a)) + s = Sampler(rng, 1:length(a)) b = a[1] - 1 if b == 0 for i = 1:length(x) @@ -34,7 +34,7 @@ and set `x[j] = a[i]`, with `n=length(a)` and `k=length(x)`. This algorithm consumes `k` random numbers. """ function direct_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) - s = RangeGenerator(1:length(a)) + s = Sampler(rng, 1:length(a)) for i = 1:length(x) @inbounds x[i] = a[rand(rng, s)] end @@ -42,6 +42,49 @@ function direct_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) end direct_sample!(a::AbstractArray, x::AbstractArray) = direct_sample!(Random.GLOBAL_RNG, a, x) +# check whether we can use T to store indices 1:n exactly, and +# use some heuristics to decide whether it is beneficial for k samples +# (true for a subset of hardware-supported numeric types) +_storeindices(n, k, ::Type{T}) where {T<:Integer} = n ≤ typemax(T) +_storeindices(n, k, ::Type{T}) where {T<:Union{Float32,Float64}} = k < 22 && n ≤ maxintfloat(T) +_storeindices(n, k, ::Type{Complex{T}}) where {T} = _storeindices(n, k, T) +_storeindices(n, k, ::Type{Rational{T}}) where {T} = k < 16 && _storeindices(n, k, T) +_storeindices(n, k, T) = false +storeindices(n, k, ::Type{T}) where {T<:Base.HWNumber} = _storeindices(n, k, T) +storeindices(n, k, T) = false + +# order results of a sampler that does not order automatically +function sample_ordered!(sampler!, rng::AbstractRNG, a::AbstractArray, x::AbstractArray) + n, k = length(a), length(x) + # todo: if eltype(x) <: Real && eltype(a) <: Real, + # in some cases it might be faster to check + # issorted(a) to see if we can just sort x + if storeindices(n, k, eltype(x)) + sort!(sampler!(rng, Base.OneTo(n), x), by=real, lt=<) + @inbounds for i = 1:k + x[i] = a[Int(x[i])] + end + else + indices = Array{Int}(undef, k) + sort!(sampler!(rng, Base.OneTo(n), indices)) + @inbounds for i = 1:k + x[i] = a[indices[i]] + end + end + return x +end + +# special case of a range can be done more efficiently +sample_ordered!(sampler!, rng::AbstractRNG, a::AbstractRange, x::AbstractArray) = + sort!(sampler!(rng, a, x), rev=step(a)<0) + +# weighted case: +sample_ordered!(sampler!, rng::AbstractRNG, a::AbstractArray, + wv::AbstractWeights, x::AbstractArray) = + sample_ordered!(rng, a, x) do rng, a, x + sampler!(rng, a, wv, x) + end + ### draw a pair of distinct integers in [1:n] """ @@ -107,7 +150,7 @@ function knuths_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray; end # scan remaining - s = RangeGenerator(1:k) + s = Sampler(rng, 1:k) for i = k+1:n if rand(rng) * i < k # keep it with probability k / i @inbounds x[rand(rng, s)] = a[i] @@ -185,7 +228,7 @@ function self_avoid_sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray s = Set{Int}() sizehint!(s, k) - rgen = RangeGenerator(1:n) + rgen = Sampler(rng, 1:n) # first one idx = rand(rng, rgen) @@ -286,7 +329,93 @@ function seqsample_c!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) end seqsample_c!(a::AbstractArray, x::AbstractArray) = seqsample_c!(Random.GLOBAL_RNG, a, x) -## TODO: implement Algorithm D (page 716 - 717) +""" + seqsample_d!([rng], a::AbstractArray, x::AbstractArray) + +Random subsequence sampling using algorithm D described in the following paper (page 716-17): +Jeffrey Scott Vitter. "Faster Methods for Random Sampling". Communications of the ACM, +27 (7), July 1984. + +This algorithm consumes ``O(k)`` random numbers, with `k=length(x)`. +The outputs are ordered. +""" +function seqsample_d!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray) + N = length(a) + n = length(x) + n <= N || error("length(x) should not exceed length(a)") + + i = 0 + j = 0 + + vprime = exp(-randexp(rng)/n) + q1 = N - n + 1 + q2 = q1 / N + alpha = 1 / 13 # choose alpha value + threshold = alpha * n + + while n > 1 && threshold < N + while true + local X + while true + X = N * (1 - vprime) + s = trunc(Int, X) + if s < q1 + break + end + vprime = exp(-randexp(rng)/n) + end + + y = rand(rng) / q2 + lhs = exp(log(y) / (n - 1)) + rhs = ((q1 - s) / q1) * (N / (N - X)) + + if lhs <= rhs + vprime = lhs / rhs + break + end + + if n - 1 > s + bottom = N - n + limit = N - s + else + bottom = N - s - 1 + limit = q1 + end + + top = N - 1 + + while top >= limit + y = y * top / bottom + bottom -= 1 + top -= 1 + end + + if log(y) < (n - 1)*(log(N) - log(N - X)) + vprime = exp(-randexp(rng) / (n-1)) + break + end + vprime = exp(-randexp(rng)/n) + end + + j += 1 + i += s+1 + @inbounds x[j] = a[i] + N = N - s - 1 + n -= 1 + q1 -= s + q2 = q1 / N + threshold -= alpha + end + + if n > 1 + seqsample_a!(rng, a[i+1:end], @view x[j+1:end]) + else + s = trunc(Int, N * vprime) + @inbounds x[j+=1] = a[i+=s+1] + end +end + +seqsample_d!(a::AbstractArray, x::AbstractArray) = seqsample_d!(Random.GLOBAL_RNG, a, x) ### Interface functions (poly-algorithms) @@ -310,21 +439,24 @@ Draw a random sample of `length(x)` elements from an array `a` and store the result in `x`. A polyalgorithm is used for sampling. Sampling probabilities are proportional to the weights given in `wv`, if provided. `replace` dictates whether sampling is performed with -replacement and `order` dictates whether an ordered sample, also called -a sequential sample, should be taken. +replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). """ function sample!(rng::AbstractRNG, a::AbstractArray, x::AbstractArray; replace::Bool=true, ordered::Bool=false) + 1 == firstindex(a) == firstindex(x) || + throw(ArgumentError("non 1-based arrays are not supported")) n = length(a) k = length(x) k == 0 && return x if replace # with replacement if ordered - sort!(direct_sample!(rng, a, x)) + sample_ordered!(direct_sample!, rng, a, x) else direct_sample!(rng, a, x) end @@ -362,8 +494,9 @@ sample!(a::AbstractArray, x::AbstractArray; replace::Bool=true, ordered::Bool=fa Select a random, optionally weighted sample of size `n` from an array `a` using a polyalgorithm. Sampling probabilities are proportional to the weights given in `wv`, if provided. `replace` dictates whether sampling is performed -with replacement and `order` dictates whether an ordered sample, also called -a sequential sample, should be taken. +with replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). @@ -382,8 +515,9 @@ sample(a::AbstractArray, n::Integer; replace::Bool=true, ordered::Bool=false) = Select a random, optionally weighted sample from an array `a` specifying the dimensions `dims` of the output array. Sampling probabilities are proportional to the weights given in `wv`, if provided. `replace` dictates -whether sampling is performed with replacement and `order` dictates whether -an ordered sample, also called a sequential sample, should be taken. +whether sampling is performed with replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). @@ -412,13 +546,12 @@ Optionally specify a random number generator `rng` as the first argument """ function sample(rng::AbstractRNG, wv::AbstractWeights) t = rand(rng) * sum(wv) - w = values(wv) - n = length(w) + n = length(wv) i = 1 - cw = w[1] + cw = wv[1] while cw < t && i < n i += 1 - @inbounds cw += w[i] + @inbounds cw += wv[i] end return i end @@ -530,10 +663,10 @@ function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, # create alias table ap = Vector{Float64}(undef, n) alias = Vector{Int}(undef, n) - make_alias_table!(values(wv), sum(wv), ap, alias) + make_alias_table!(wv, sum(wv), ap, alias) # sampling - s = RangeGenerator(1:n) + s = Sampler(rng, 1:n) for i = 1:length(x) j = rand(rng, s) x[i] = rand(rng) < ap[j] ? a[j] : a[alias[j]] @@ -561,7 +694,7 @@ function naive_wsample_norep!(rng::AbstractRNG, a::AbstractArray, k = length(x) w = Vector{Float64}(undef, n) - copyto!(w, values(wv)) + copyto!(w, wv) wsum = sum(wv) for i = 1:k @@ -696,7 +829,8 @@ Noting `k=length(x)` and `n=length(a)`, this algorithm takes ``O(k \\log(k) \\lo processing time to draw ``k`` elements. It consumes ``O(k \\log(n / k))`` random numbers. """ function efraimidis_aexpj_wsample_norep!(rng::AbstractRNG, a::AbstractArray, - wv::AbstractWeights, x::AbstractArray) + wv::AbstractWeights, x::AbstractArray; + ordered::Bool=false) n = length(a) length(wv) == n || throw(DimensionMismatch("a and wv must be of same length (got $n and $(length(wv))).")) k = length(x) @@ -739,24 +873,36 @@ function efraimidis_aexpj_wsample_norep!(rng::AbstractRNG, a::AbstractArray, threshold = pq[1].first X = threshold * randexp(rng) end - - # fill output array with items in descending order - @inbounds for i in k:-1:1 - x[i] = a[heappop!(pq).second] + if ordered + # fill output array with items sorted as in a + sort!(pq, by=last) + @inbounds for i in 1:k + x[i] = a[pq[i].second] + end + else + # fill output array with items in descending order + @inbounds for i in k:-1:1 + x[i] = a[heappop!(pq).second] + end end return x end -efraimidis_aexpj_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = - efraimidis_aexpj_wsample_norep!(Random.GLOBAL_RNG, a, wv, x) +efraimidis_aexpj_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray; + ordered::Bool=false) = + efraimidis_aexpj_wsample_norep!(Random.GLOBAL_RNG, a, wv, x; ordered=ordered) function sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::AbstractArray; replace::Bool=true, ordered::Bool=false) + 1 == firstindex(a) == firstindex(wv) == firstindex(x) || + throw(ArgumentError("non 1-based arrays are not supported")) n = length(a) k = length(x) if replace if ordered - sort!(direct_sample!(rng, a, wv, x)) + sample_ordered!(rng, a, wv, x) do rng, a, wv, x + sample!(rng, a, wv, x; replace=true, ordered=false) + end else if n < 40 direct_sample!(rng, a, wv, x) @@ -771,16 +917,13 @@ function sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::Abs end else k <= n || error("Cannot draw $n samples from $k samples without replacement.") - - efraimidis_aexpj_wsample_norep!(rng, a, wv, x) - if ordered - sort!(x) - end + efraimidis_aexpj_wsample_norep!(rng, a, wv, x; ordered=ordered) end return x end -sample!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = - sample!(Random.GLOBAL_RNG, a, wv, x) +sample!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray; + replace::Bool=true, ordered::Bool=false) = + sample!(Random.GLOBAL_RNG, a, wv, x; replace=replace, ordered=ordered) sample(rng::AbstractRNG, a::AbstractArray{T}, wv::AbstractWeights, n::Integer; replace::Bool=true, ordered::Bool=false) where {T} = @@ -803,8 +946,9 @@ sample(a::AbstractArray, wv::AbstractWeights, dims::Dims; Select a weighted sample from an array `a` and store the result in `x`. Sampling probabilities are proportional to the weights given in `w`. `replace` dictates -whether sampling is performed with replacement and `order` dictates whether an -ordered sample, also called a sequential sample, should be taken. +whether sampling is performed with replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). @@ -837,8 +981,9 @@ wsample(a::AbstractArray, w::RealVector) = wsample(Random.GLOBAL_RNG, a, w) Select a weighted random sample of size `n` from `a` with probabilities proportional to the weights given in `w` if `a` is present, otherwise select a random sample of size `n` of the weights given in `w`. `replace` dictates whether sampling is performed with -replacement and `order` dictates whether an ordered sample, also called a sequential -sample, should be taken. +replacement. `ordered` dictates whether +an ordered sample (also called a sequential sample, i.e. a sample where +items appear in the same order as in `a`) should be taken. Optionally specify a random number generator `rng` as the first argument (defaults to `Random.GLOBAL_RNG`). diff --git a/src/scalarstats.jl b/src/scalarstats.jl index f8fb6c95..87213be6 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -47,10 +47,11 @@ end # compute mode, given the range of integer values """ mode(a, [r]) + mode(a::AbstractArray, wv::AbstractWeights) Return the mode (most common number) of an array, optionally -over a specified range `r`. If several modes exist, the first -one (in order of appearance) is returned. +over a specified range `r` or weighted via a vector `wv`. +If several modes exist, the first one (in order of appearance) is returned. """ function mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) @@ -75,9 +76,10 @@ end """ modes(a, [r])::Vector + mode(a::AbstractArray, wv::AbstractWeights)::Vector Return all modes (most common numbers) of an array, optionally over a -specified range `r`. +specified range `r` or weighted via vector `wv`. """ function modes(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer r0 = r[1] @@ -158,6 +160,47 @@ function modes(a) return [x for (x, c) in cnts if c == mc] end +# Weighted mode of arbitrary vectors of values +function mode(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) + length(a) == length(wv) || + throw(ArgumentError("data and weight vectors must be the same size, got $(length(a)) and $(length(wv))")) + + # Iterate through the data + mv = first(a) + mw = first(wv) + weights = Dict{eltype(a), T}() + for (x, w) in zip(a, wv) + _w = get!(weights, x, zero(T)) + w + if _w > mw + mv = x + mw = _w + end + weights[x] = _w + end + + return mv +end + +function modes(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) + length(a) == length(wv) || + throw(ArgumentError("data and weight vectors must be the same size, got $(length(a)) and $(length(wv))")) + + # Iterate through the data + mw = first(wv) + weights = Dict{eltype(a), T}() + for (x, w) in zip(a, wv) + _w = get!(weights, x, zero(T)) + w + if _w > mw + mw = _w + end + weights[x] = _w + end + + # find values corresponding to maximum counts + return [x for (x, w) in weights if w == mw] +end ############################# # @@ -222,7 +265,7 @@ function mad(x; center=nothing, normalize::Union{Bool, Nothing}=nothing, constan # Knowing the eltype allows allocating a single array able to hold both original values # and differences from the center, instead of two arrays S = isconcretetype(T) ? promote_type(T, typeof(middle(zero(T)))) : T - x2 = x isa AbstractArray ? LinearAlgebra.copy_oftype(x, S) : collect(S, x) + x2 = x isa AbstractArray ? copyto!(similar(x, S), x) : collect(S, x) c = center === nothing ? median!(x2) : center if isconcretetype(T) x2 .= abs.(x2 .- c) @@ -458,7 +501,7 @@ function describe(a::AbstractArray{T}) where T<:Union{Real,Missing} R = typeof(m) n = length(a) ns = length(s) - qs = if m == 0 || n == 0 + qs = if ns == 0 R[NaN, NaN, NaN, NaN, NaN] elseif T >: Missing quantile!(s, [0.00, 0.25, 0.50, 0.75, 1.00]) diff --git a/src/signalcorr.jl b/src/signalcorr.jl index 5daa3524..1adf9791 100644 --- a/src/signalcorr.jl +++ b/src/signalcorr.jl @@ -14,7 +14,8 @@ default_laglen(lx::Int) = min(lx-1, round(Int,10*log10(lx))) check_lags(lx::Int, lags::AbstractVector) = (maximum(lags) < lx || error("lags must be less than the sample length.")) -function demean_col!(z::AbstractVector{T}, x::AbstractMatrix{T}, j::Int, demean::Bool) where T<:RealFP +function demean_col!(z::RealVector, x::RealMatrix, j::Int, demean::Bool) + T = eltype(z) m = size(x, 1) @assert m == length(z) b = m * (j-1) @@ -42,7 +43,8 @@ end default_autolags(lx::Int) = 0 : default_laglen(lx) -_autodot(x::AbstractVector{<:RealFP}, lx::Int, l::Int) = dot(x, 1:lx-l, x, 1+l:lx) +_autodot(x::AbstractVector{<:RealFP}, lx::Int, l::Int) = dot(x, 1:(lx-l), x, (1+l):lx) +_autodot(x::RealVector, lx::Int, l::Int) = dot(view(x, 1:(lx-l)), view(x, (1+l):lx)) ## autocov @@ -53,18 +55,19 @@ Compute the autocovariance of a vector or matrix `x` at `lags` and store the res in `r`. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the autocovariance. -If `x` is a vector, `r` must be a vector of the same length as `x`. +If `x` is a vector, `r` must be a vector of the same length as `lags`. If `x` is a matrix, `r` must be a matrix of size `(length(lags), size(x,2))`, and where each column in the result will correspond to a column in `x`. The output is not normalized. See [`autocor!`](@ref) for a method with normalization. """ -function autocov!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocov!(r::RealVector, x::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) length(r) == m || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z::Vector{T} = demean ? x .- mean(x) : x for k = 1 : m # foreach lag value r[k] = _autodot(z, lx, lags[k]) / lx @@ -72,13 +75,14 @@ function autocov!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; deme return r end -function autocov!(r::RealMatrix, x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocov!(r::RealMatrix, x::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) size(r) == (m, ns) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z = Vector{T}(undef, lx) for j = 1 : ns demean_col!(z, x, j, demean) @@ -97,7 +101,7 @@ Compute the autocovariance of a vector or matrix `x`, optionally specifying the `lags` at which to compute the autocovariance. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the autocovariance. -If `x` is a vector, return a vector of the same length as `x`. +If `x` is a vector, return a vector of the same length as `lags`. If `x` is a matrix, return a matrix of size `(length(lags), size(x,2))`, where each column in the result corresponds to a column in `x`. @@ -106,15 +110,18 @@ When left unspecified, the lags used are the integers from 0 to The output is not normalized. See [`autocor`](@ref) for a function with normalization. """ -function autocov(x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocov!(Vector{fptype(T)}(undef, length(lags)), float(x), lags; demean=demean) +function autocov(x::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(eltype(x))}(undef, length(lags)) + autocov!(out, x, lags; demean=demean) end -function autocov(x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocov!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), lags; demean=demean) +function autocov(x::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(eltype(x))}(undef, length(lags), size(x,2)) + autocov!(out, x, lags; demean=demean) end -autocov(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = autocov(x, default_autolags(size(x,1)); demean=demean) +autocov(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = + autocov(x, default_autolags(size(x,1)); demean=demean) ## autocor @@ -125,19 +132,20 @@ Compute the autocorrelation function (ACF) of a vector or matrix `x` at `lags` and store the result in `r`. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the ACF. -If `x` is a vector, `r` must be a vector of the same length as `x`. +If `x` is a vector, `r` must be a vector of the same length as `lags`. If `x` is a matrix, `r` must be a matrix of size `(length(lags), size(x,2))`, and where each column in the result will correspond to a column in `x`. The output is normalized by the variance of `x`, i.e. so that the lag 0 autocorrelation is 1. See [`autocov!`](@ref) for the unnormalized form. """ -function autocor!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocor!(r::RealVector, x::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) length(r) == m || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z::Vector{T} = demean ? x .- mean(x) : x zz = dot(z, z) for k = 1 : m # foreach lag value @@ -146,13 +154,14 @@ function autocor!(r::RealVector, x::AbstractVector{T}, lags::IntegerVector; deme return r end -function autocor!(r::RealMatrix, x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function autocor!(r::RealMatrix, x::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) size(r) == (m, ns) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) z = Vector{T}(undef, lx) for j = 1 : ns demean_col!(z, x, j, demean) @@ -172,7 +181,7 @@ Compute the autocorrelation function (ACF) of a vector or matrix `x`, optionally specifying the `lags`. `demean` denotes whether the mean of `x` should be subtracted from `x` before computing the ACF. -If `x` is a vector, return a vector of the same length as `x`. +If `x` is a vector, return a vector of the same length as `lags`. If `x` is a matrix, return a matrix of size `(length(lags), size(x,2))`, where each column in the result corresponds to a column in `x`. @@ -182,15 +191,18 @@ When left unspecified, the lags used are the integers from 0 to The output is normalized by the variance of `x`, i.e. so that the lag 0 autocorrelation is 1. See [`autocov`](@ref) for the unnormalized form. """ -function autocor(x::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocor!(Vector{fptype(T)}(undef, length(lags)), float(x), lags; demean=demean) +function autocor(x::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(eltype(x))}(undef, length(lags)) + autocor!(out, x, lags; demean=demean) end -function autocor(x::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - autocor!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), lags; demean=demean) +function autocor(x::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(eltype(x))}(undef, length(lags), size(x,2)) + autocor!(out, x, lags; demean=demean) end -autocor(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = autocor(x, default_autolags(size(x,1)); demean=demean) +autocor(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = + autocor(x, default_autolags(size(x,1)); demean=demean) ####################################### @@ -201,8 +213,20 @@ autocor(x::AbstractVecOrMat{<:Real}; demean::Bool=true) = autocor(x, default_aut default_crosslags(lx::Int) = (l=default_laglen(lx); -l:l) -_crossdot(x::AbstractVector{T}, y::AbstractVector{T}, lx::Int, l::Int) where {T<:RealFP} = - (l >= 0 ? dot(x, 1:lx-l, y, 1+l:lx) : dot(x, 1-l:lx, y, 1:lx+l)) +function _crossdot(x::AbstractVector{T}, y::AbstractVector{T}, lx::Int, l::Int) where {T<:RealFP} + if l >= 0 + dot(x, 1:(lx-l), y, (1+l):lx) + else + dot(x, (1-l):lx, y, 1:(lx+l)) + end +end +function _crossdot(x::RealVector, y::RealVector, lx::Int, l::Int) + if l >= 0 + dot(view(x, 1:(lx-l)), view(y, (1+l):lx)) + else + dot(view(x, (1-l):lx), view(y, 1:(lx+l))) + end +end ## crosscov @@ -222,13 +246,15 @@ three-dimensional array of size `(length(lags), size(x, 2), size(y, 2))`. The output is not normalized. See [`crosscor!`](@ref) for a function with normalization. """ -function crosscov!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::RealVector, x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) (length(y) == lx && length(r) == m) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? x .- mean(x) : x + S = typeof(zero(eltype(y)) / 1) zy::Vector{T} = demean ? y .- mean(y) : y for k = 1 : m # foreach lag value r[k] = _crossdot(zx, zy, lx, lags[k]) / lx @@ -236,15 +262,17 @@ function crosscov!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, la return r end -function crosscov!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::RealMatrix, x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) (length(y) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx = Vector{T}(undef, lx) - zy::Vector{T} = demean ? y .- mean(y) : y + S = typeof(zero(eltype(y)) / 1) + zy::Vector{S} = demean ? y .- mean(y) : y for j = 1 : ns demean_col!(zx, x, j, demean) for k = 1 : m @@ -254,15 +282,17 @@ function crosscov!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, la return r end -function crosscov!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::RealMatrix, x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = length(x) ns = size(y, 2) m = length(lags) (size(y, 1) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? x .- mean(x) : x - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) for j = 1 : ns demean_col!(zy, y, j, demean) for k = 1 : m @@ -272,7 +302,7 @@ function crosscov!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, la return r end -function crosscov!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscov!(r::AbstractArray{<:Real,3}, x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) nx = size(x, 2) ny = size(y, 2) @@ -281,6 +311,7 @@ function crosscov!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri check_lags(lx, lags) # cached (centered) columns of x + T = typeof(zero(eltype(x)) / 1) zxs = Vector{T}[] sizehint!(zxs, nx) for j = 1 : nx @@ -294,8 +325,8 @@ function crosscov!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri push!(zxs, xj) end - zx = Vector{T}(undef, lx) - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) for j = 1 : ny demean_col!(zy, y, j, demean) for i = 1 : nx @@ -325,23 +356,28 @@ When left unspecified, the lags used are the integers from The output is not normalized. See [`crosscor`](@ref) for a function with normalization. """ -function crosscov(x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Vector{fptype(T)}(undef, length(lags)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(Base.promote_eltype(x, y))}(undef, length(lags)) + crosscov!(out, x, y, lags; demean=demean) end -function crosscov(x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(x,2)) + crosscov!(out, x, y, lags; demean=demean) end -function crosscov(x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Matrix{fptype(T)}(undef, length(lags), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(y,2)) + crosscov!(out, x, y, lags; demean=demean) end -function crosscov(x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscov!(Array{fptype(T),3}(undef, length(lags), size(x,2), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscov(x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Array{float(Base.promote_eltype(x, y)),3}(undef, length(lags), size(x,2), size(y,2)) + crosscov!(out, x, y, lags; demean=demean) end -crosscov(x::AbstractVecOrMat{T}, y::AbstractVecOrMat{T}; demean::Bool=true) where {T<:Real} = crosscov(x, y, default_crosslags(size(x,1)); demean=demean) +crosscov(x::AbstractVecOrMat{<:Real}, y::AbstractVecOrMat{<:Real}; demean::Bool=true) = + crosscov(x, y, default_crosslags(size(x,1)); demean=demean) ## crosscor @@ -361,13 +397,15 @@ three-dimensional array of size `(length(lags), size(x, 2), size(y, 2))`. The output is normalized by `sqrt(var(x)*var(y))`. See [`crosscov!`](@ref) for the unnormalized form. """ -function crosscor!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::RealVector, x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = length(x) m = length(lags) (length(y) == lx && length(r) == m) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? x .- mean(x) : x + S = typeof(zero(eltype(y)) / 1) zy::Vector{T} = demean ? y .- mean(y) : y sc = sqrt(dot(zx, zx) * dot(zy, zy)) for k = 1 : m # foreach lag value @@ -376,15 +414,17 @@ function crosscor!(r::RealVector, x::AbstractVector{T}, y::AbstractVector{T}, la return r end -function crosscor!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::RealMatrix, x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) ns = size(x, 2) m = length(lags) (length(y) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx = Vector{T}(undef, lx) - zy::Vector{T} = demean ? y .- mean(y) : y + S = typeof(zero(eltype(y)) / 1) + zy::Vector{S} = demean ? y .- mean(y) : y yy = dot(zy, zy) for j = 1 : ns demean_col!(zx, x, j, demean) @@ -396,15 +436,17 @@ function crosscor!(r::RealMatrix, x::AbstractMatrix{T}, y::AbstractVector{T}, la return r end -function crosscor!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::RealMatrix, x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = length(x) ns = size(y, 2) m = length(lags) (size(y, 1) == lx && size(r) == (m, ns)) || throw(DimensionMismatch()) check_lags(lx, lags) + T = typeof(zero(eltype(x)) / 1) zx::Vector{T} = demean ? x .- mean(x) : x - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) xx = dot(zx, zx) for j = 1 : ns demean_col!(zy, y, j, demean) @@ -416,7 +458,7 @@ function crosscor!(r::RealMatrix, x::AbstractVector{T}, y::AbstractMatrix{T}, la return r end -function crosscor!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:RealFP +function crosscor!(r::AbstractArray{<:Real,3}, x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) lx = size(x, 1) nx = size(x, 2) ny = size(y, 2) @@ -425,6 +467,7 @@ function crosscor!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri check_lags(lx, lags) # cached (centered) columns of x + T = typeof(zero(eltype(x)) / 1) zxs = Vector{T}[] sizehint!(zxs, nx) xxs = Vector{T}(undef, nx) @@ -441,8 +484,8 @@ function crosscor!(r::AbstractArray{T,3}, x::AbstractMatrix{T}, y::AbstractMatri xxs[j] = dot(xj, xj) end - zx = Vector{T}(undef, lx) - zy = Vector{T}(undef, lx) + S = typeof(zero(eltype(y)) / 1) + zy = Vector{S}(undef, lx) for j = 1 : ny demean_col!(zy, y, j, demean) yy = dot(zy, zy) @@ -474,23 +517,28 @@ When left unspecified, the lags used are the integers from The output is normalized by `sqrt(var(x)*var(y))`. See [`crosscov`](@ref) for the unnormalized form. """ -function crosscor(x::AbstractVector{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Vector{fptype(T)}(undef, length(lags)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealVector, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Vector{float(Base.promote_eltype(x, y))}(undef, length(lags)) + crosscor!(out, x, y, lags; demean=demean) end -function crosscor(x::AbstractMatrix{T}, y::AbstractVector{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Matrix{fptype(T)}(undef, length(lags), size(x,2)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealMatrix, y::RealVector, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(x,2)) + crosscor!(out, x, y, lags; demean=demean) end -function crosscor(x::AbstractVector{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Matrix{fptype(T)}(undef, length(lags), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealVector, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Matrix{float(Base.promote_eltype(x, y))}(undef, length(lags), size(y,2)) + crosscor!(out, x, y, lags; demean=demean) end -function crosscor(x::AbstractMatrix{T}, y::AbstractMatrix{T}, lags::IntegerVector; demean::Bool=true) where T<:Real - crosscor!(Array{fptype(T),3}(undef, length(lags), size(x,2), size(y,2)), float(x), float(y), lags; demean=demean) +function crosscor(x::RealMatrix, y::RealMatrix, lags::IntegerVector; demean::Bool=true) + out = Array{float(Base.promote_eltype(x, y)),3}(undef, length(lags), size(x,2), size(y,2)) + crosscor!(out, x, y, lags; demean=demean) end -crosscor(x::AbstractVecOrMat{T}, y::AbstractVecOrMat{T}; demean::Bool=true) where {T<:Real} = crosscor(x, y, default_crosslags(size(x,1)); demean=demean) +crosscor(x::AbstractVecOrMat{<:Real}, y::AbstractVecOrMat{<:Real}; demean::Bool=true) = + crosscor(x, y, default_crosslags(size(x,1)); demean=demean) ####################################### @@ -501,9 +549,9 @@ crosscor(x::AbstractVecOrMat{T}, y::AbstractVecOrMat{T}; demean::Bool=true) wher # ####################################### -function pacf_regress!(r::RealMatrix, X::AbstractMatrix{T}, lags::IntegerVector, mk::Integer) where T<:RealFP +function pacf_regress!(r::RealMatrix, X::RealMatrix, lags::IntegerVector, mk::Integer) lx = size(X, 1) - tmpX = ones(T, lx, mk + 1) + tmpX = ones(eltype(X), lx, mk + 1) for j = 1 : size(X,2) for l = 1 : mk for i = 1+l:lx @@ -573,10 +621,11 @@ If `x` is a vector, return a vector of the same length as `lags`. If `x` is a matrix, return a matrix of size `(length(lags), size(x, 2))`, where each column in the result corresponds to a column in `x`. """ -function pacf(X::AbstractMatrix{T}, lags::IntegerVector; method::Symbol=:regression) where T<:Real - pacf!(Matrix{fptype(T)}(undef, length(lags), size(X,2)), float(X), lags; method=method) +function pacf(X::RealMatrix, lags::IntegerVector; method::Symbol=:regression) + out = Matrix{float(eltype(X))}(undef, length(lags), size(X,2)) + pacf!(out, float(X), lags; method=method) end -function pacf(x::AbstractVector{T}, lags::IntegerVector; method::Symbol=:regression) where T<:Real +function pacf(x::RealVector, lags::IntegerVector; method::Symbol=:regression) vec(pacf(reshape(x, length(x), 1), lags, method=method)) end diff --git a/src/statmodels.jl b/src/statmodels.jl index a0386120..0e2b4af2 100644 --- a/src/statmodels.jl +++ b/src/statmodels.jl @@ -3,211 +3,237 @@ abstract type StatisticalModel end """ - coef(obj::StatisticalModel) + coef(model::StatisticalModel) Return the coefficients of the model. """ -coef(obj::StatisticalModel) = error("coef is not defined for $(typeof(obj)).") +coef(model::StatisticalModel) = error("coef is not defined for $(typeof(model)).") """ - coefnames(obj::StatisticalModel) + coefnames(model::StatisticalModel) Return the names of the coefficients. """ -coefnames(obj::StatisticalModel) = error("coefnames is not defined for $(typeof(obj)).") +coefnames(model::StatisticalModel) = error("coefnames is not defined for $(typeof(model)).") """ - coeftable(obj::StatisticalModel; level::Real=0.95) + coeftable(model::StatisticalModel; level::Real=0.95) -Return a table of class `CoefTable` with coefficients and related statistics. +Return a table with coefficients and related statistics of the model. `level` determines the level for confidence intervals (by default, 95%). + +The returned `CoefTable` object implements the +[Tables.jl](https://github.com/JuliaData/Tables.jl/) interface, and can be +converted e.g. to a `DataFrame` via `using DataFrames; DataFrame(coeftable(model))`. """ -coeftable(obj::StatisticalModel) = error("coeftable is not defined for $(typeof(obj)).") +coeftable(model::StatisticalModel) = error("coeftable is not defined for $(typeof(model)).") """ - confint(obj::StatisticalModel; level::Real=0.95) + confint(model::StatisticalModel; level::Real=0.95) Compute confidence intervals for coefficients, with confidence level `level` (by default 95%). """ -confint(obj::StatisticalModel) = error("confint is not defined for $(typeof(obj)).") +confint(model::StatisticalModel) = error("confint is not defined for $(typeof(model)).") """ - deviance(obj::StatisticalModel) + deviance(model::StatisticalModel) Return the deviance of the model relative to a reference, which is usually when applicable the saturated model. It is equal, *up to a constant*, to ``-2 \\log L``, with ``L`` the likelihood of the model. """ -deviance(obj::StatisticalModel) = error("deviance is not defined for $(typeof(obj)).") +deviance(model::StatisticalModel) = error("deviance is not defined for $(typeof(model)).") """ - islinear(obj::StatisticalModel) + islinear(model::StatisticalModel) Indicate whether the model is linear. """ -islinear(obj::StatisticalModel) = error("islinear is not defined for $(typeof(obj)).") +islinear(model::StatisticalModel) = error("islinear is not defined for $(typeof(model)).") """ - nulldeviance(obj::StatisticalModel) + nulldeviance(model::StatisticalModel) Return the deviance of the null model, that is the one including only the intercept. """ -nulldeviance(obj::StatisticalModel) = error("nulldeviance is not defined for $(typeof(obj)).") +nulldeviance(model::StatisticalModel) = + error("nulldeviance is not defined for $(typeof(model)).") """ - loglikelihood(obj::StatisticalModel) + loglikelihood(model::StatisticalModel) Return the log-likelihood of the model. """ -loglikelihood(obj::StatisticalModel) = error("loglikelihood is not defined for $(typeof(obj)).") +loglikelihood(model::StatisticalModel) = + error("loglikelihood is not defined for $(typeof(model)).") """ - loglikelihood(obj::StatisticalModel) + loglikelihood(model::StatisticalModel) -Return the log-likelihood of the null model corresponding to model `obj`. +Return the log-likelihood of the null model corresponding to `model`. This is usually the model containing only the intercept. """ -nullloglikelihood(obj::StatisticalModel) = error("nullloglikelihood is not defined for $(typeof(obj)).") +nullloglikelihood(model::StatisticalModel) = + error("nullloglikelihood is not defined for $(typeof(model)).") + +""" + loglikelihood(model::StatisticalModel, ::Colon) + +Return a vector of each observation's contribution to the log-likelihood of the model. +In other words, this is the vector of the pointwise log-likelihood contributions. + +In general, `sum(loglikehood(model, :)) == loglikelihood(model)`. +""" +loglikelihood(model::StatisticalModel, ::Colon) = + error("loglikelihood(model::StatisticalModel, ::Colon) is not defined for $(typeof(model)).") + +""" + loglikelihood(model::StatisticalModel, observation) + +Return the contribution of `observation` to the log-likelihood of `model`. +""" +loglikelihood(model::StatisticalModel, observation) = + error("loglikelihood(model::StatisticalModel, observation) is not defined for $(typeof(model)).") """ - score(obj::StatisticalModel) + score(model::StatisticalModel) -Return the score of the statistical model. The score is the gradient of the +Return the score of the model, that is the gradient of the log-likelihood with respect to the coefficients. """ -score(obj::StatisticalModel) = error("score is not defined for $(typeof(obj)).") +score(model::StatisticalModel) = error("score is not defined for $(typeof(model)).") """ - nobs(obj::StatisticalModel) + nobs(model::StatisticalModel) Return the number of independent observations on which the model was fitted. Be careful when using this information, as the definition of an independent observation may vary depending on the model, on the format used to pass the data, on the sampling plan (if specified), etc. """ -nobs(obj::StatisticalModel) = error("nobs is not defined for $(typeof(obj)).") +nobs(model::StatisticalModel) = error("nobs is not defined for $(typeof(model)).") """ - dof(obj::StatisticalModel) + dof(model::StatisticalModel) Return the number of degrees of freedom consumed in the model, including when applicable the intercept and the distribution's dispersion parameter. """ -dof(obj::StatisticalModel) = error("dof is not defined for $(typeof(obj)).") +dof(model::StatisticalModel) = error("dof is not defined for $(typeof(model)).") """ - mss(obj::StatisticalModel) + mss(model::StatisticalModel) Return the model sum of squares. """ -mss(obj::StatisticalModel) = error("mss is not defined for $(typeof(obj)).") +mss(model::StatisticalModel) = error("mss is not defined for $(typeof(model)).") """ - rss(obj::StatisticalModel) + rss(model::StatisticalModel) -Return the residual sum of squares. +Return the residual sum of squares of the model. """ -rss(obj::StatisticalModel) = error("rss is not defined for $(typeof(obj)).") +rss(model::StatisticalModel) = error("rss is not defined for $(typeof(model)).") """ informationmatrix(model::StatisticalModel; expected::Bool = true) -Return the information matrix. By default the Fisher information matrix is returned, -while the observed information matrix can be requested with `expected = false`. +Return the information matrix of the model. By default the Fisher information matrix +is returned, while the observed information matrix can be requested with `expected = false`. """ informationmatrix(model::StatisticalModel; expected::Bool = true) = - error("informationmatrix is not defined for $(typeof(obj)).") + error("informationmatrix is not defined for $(typeof(model)).") """ - stderror(obj::StatisticalModel) + stderror(model::StatisticalModel) Return the standard errors for the coefficients of the model. """ -stderror(obj::StatisticalModel) = sqrt.(diag(vcov(obj))) +stderror(model::StatisticalModel) = sqrt.(diag(vcov(model))) """ - vcov(obj::StatisticalModel) + vcov(model::StatisticalModel) Return the variance-covariance matrix for the coefficients of the model. """ -vcov(obj::StatisticalModel) = error("vcov is not defined for $(typeof(obj)).") +vcov(model::StatisticalModel) = error("vcov is not defined for $(typeof(model)).") """ - weights(obj::StatisticalModel) + weights(model::StatisticalModel) Return the weights used in the model. """ -weights(obj::StatisticalModel) = error("weights is not defined for $(typeof(obj)).") +weights(model::StatisticalModel) = error("weights is not defined for $(typeof(model)).") """ - isfitted(obj::StatisticalModel) + isfitted(model::StatisticalModel) Indicate whether the model has been fitted. """ -isfitted(obj::StatisticalModel) = error("isfitted is not defined for $(typeof(obj)).") +isfitted(model::StatisticalModel) = error("isfitted is not defined for $(typeof(model)).") """ Fit a statistical model. """ -fit(obj::StatisticalModel, args...) = error("fit is not defined for $(typeof(obj)).") +fit(model::StatisticalModel, args...) = error("fit is not defined for $(typeof(model)).") """ Fit a statistical model in-place. """ -fit!(obj::StatisticalModel, args...) = error("fit! is not defined for $(typeof(obj)).") +fit!(model::StatisticalModel, args...) = error("fit! is not defined for $(typeof(model)).") """ - aic(obj::StatisticalModel) + aic(model::StatisticalModel) Akaike's Information Criterion, defined as ``-2 \\log L + 2k``, with ``L`` the likelihood of the model, and `k` its number of consumed degrees of freedom (as returned by [`dof`](@ref)). """ -aic(obj::StatisticalModel) = -2loglikelihood(obj) + 2dof(obj) +aic(model::StatisticalModel) = -2loglikelihood(model) + 2dof(model) """ - aicc(obj::StatisticalModel) + aicc(model::StatisticalModel) Corrected Akaike's Information Criterion for small sample sizes (Hurvich and Tsai 1989), defined as ``-2 \\log L + 2k + 2k(k-1)/(n-k-1)``, with ``L`` the likelihood of the model, ``k`` its number of consumed degrees of freedom (as returned by [`dof`](@ref)), and ``n`` the number of observations (as returned by [`nobs`](@ref)). """ -function aicc(obj::StatisticalModel) - k = dof(obj) - n = nobs(obj) - -2loglikelihood(obj) + 2k + 2k*(k+1)/(n-k-1) +function aicc(model::StatisticalModel) + k = dof(model) + n = nobs(model) + -2loglikelihood(model) + 2k + 2k*(k+1)/(n-k-1) end """ - bic(obj::StatisticalModel) + bic(model::StatisticalModel) Bayesian Information Criterion, defined as ``-2 \\log L + k \\log n``, with ``L`` the likelihood of the model, ``k`` its number of consumed degrees of freedom (as returned by [`dof`](@ref)), and ``n`` the number of observations (as returned by [`nobs`](@ref)). """ -bic(obj::StatisticalModel) = -2loglikelihood(obj) + dof(obj)*log(nobs(obj)) +bic(model::StatisticalModel) = -2loglikelihood(model) + dof(model)*log(nobs(model)) """ - r2(obj::StatisticalModel) - r²(obj::StatisticalModel) + r2(model::StatisticalModel) + r²(model::StatisticalModel) Coefficient of determination (R-squared). For a linear model, the R² is defined as ``ESS/TSS``, with ``ESS`` the explained sum of squares and ``TSS`` the total sum of squares. """ -function r2(obj::StatisticalModel) +function r2(model::StatisticalModel) Base.depwarn("The default r² method for linear models is deprecated. " * "Packages should define their own methods.", :r2) - mss(obj) / deviance(obj) + mss(model) / deviance(model) end """ - r2(obj::StatisticalModel, variant::Symbol) - r²(obj::StatisticalModel, variant::Symbol) + r2(model::StatisticalModel, variant::Symbol) + r²(model::StatisticalModel, variant::Symbol) Pseudo-coefficient of determination (pseudo R-squared). @@ -216,33 +242,43 @@ Supported variants are: - `:MacFadden` (a.k.a. likelihood ratio index), defined as ``1 - \\log (L)/\\log (L_0)``; - `:CoxSnell`, defined as ``1 - (L_0/L)^{2/n}``; - `:Nagelkerke`, defined as ``(1 - (L_0/L)^{2/n})/(1 - L_0^{2/n})``. +- `:devianceratio`, defined as ``1 - D/D_0``. In the above formulas, ``L`` is the likelihood of the model, ``L_0`` is the likelihood of the null model (the model with only an intercept), -``n`` is the number of observations, ``y_i`` are the responses, -``\\hat{y}_i`` are fitted values and ``\\bar{y}`` is the average response. - -Cox and Snell's R² should match the classical R² for linear models. -""" -function r2(obj::StatisticalModel, variant::Symbol) - ll = loglikelihood(obj) - ll0 = nullloglikelihood(obj) - if variant == :McFadden - 1 - ll/ll0 - elseif variant == :CoxSnell - 1 - exp(2 * (ll0 - ll) / nobs(obj)) - elseif variant == :Nagelkerke - (1 - exp(2 * (ll0 - ll) / nobs(obj))) / (1 - exp(2 * ll0 / nobs(obj))) +``D`` is the deviance of the model (from the saturated model), +``D_0`` is the deviance of the null model, +``n`` is the number of observations (given by [`nobs`](@ref)). + +The Cox-Snell and the deviance ratio variants both match the classical definition of R² +for linear models. +""" +function r2(model::StatisticalModel, variant::Symbol) + loglikbased = (:McFadden, :CoxSnell, :Nagelkerke) + if variant in loglikbased + ll = loglikelihood(model) + ll0 = nullloglikelihood(model) + if variant == :McFadden + 1 - ll/ll0 + elseif variant == :CoxSnell + 1 - exp(2 * (ll0 - ll) / nobs(model)) + elseif variant == :Nagelkerke + (1 - exp(2 * (ll0 - ll) / nobs(model))) / (1 - exp(2 * ll0 / nobs(model))) + end + elseif variant == :devianceratio + dev = deviance(model) + dev0 = nulldeviance(model) + 1 - dev/dev0 else - error("variant must be one of :McFadden, :CoxSnell or :Nagelkerke") + error("variant must be one of $(join(loglikbased, ", ")) or :devianceratio") end end const r² = r2 """ - adjr2(obj::StatisticalModel) - adjr²(obj::StatisticalModel) + adjr2(model::StatisticalModel) + adjr²(model::StatisticalModel) Adjusted coefficient of determination (adjusted R-squared). @@ -250,29 +286,35 @@ For linear models, the adjusted R² is defined as ``1 - (1 - (1-R^2)(n-1)/(n-p)) the coefficient of determination, ``n`` the number of observations, and ``p`` the number of coefficients (including the intercept). This definition is generally known as the Wherry Formula I. """ -adjr2(obj::StatisticalModel) = error("adjr2 is not defined for $(typeof(obj)).") +adjr2(model::StatisticalModel) = error("adjr2 is not defined for $(typeof(model)).") """ - adjr2(obj::StatisticalModel, variant::Symbol) - adjr²(obj::StatisticalModel, variant::Symbol) + adjr2(model::StatisticalModel, variant::Symbol) + adjr²(model::StatisticalModel, variant::Symbol) Adjusted pseudo-coefficient of determination (adjusted pseudo R-squared). For nonlinear models, one of the several pseudo R² definitions must be chosen via `variant`. -The only currently supported variant is `:MacFadden`, defined as ``1 - (\\log (L) - k)/\\log (L0)``. -In this formula, ``L`` is the likelihood of the model, ``L0`` that of the null model -(the model including only the intercept). These two quantities are taken to be minus half -`deviance` of the corresponding models. ``k`` is the number of consumed degrees of freedom -of the model (as returned by [`dof`](@ref)). -""" -function adjr2(obj::StatisticalModel, variant::Symbol) - ll = loglikelihood(obj) - ll0 = nullloglikelihood(obj) - k = dof(obj) +The only currently supported variants are `:MacFadden`, defined as ``1 - (\\log (L) - k)/\\log (L0)`` and +`:devianceratio`, defined as ``1 - (D/(n-k))/(D_0/(n-1))``. +In these formulas, ``L`` is the likelihood of the model, ``L0`` that of the null model +(the model including only the intercept), ``D`` is the deviance of the model, +``D_0`` is the deviance of the null model, ``n`` is the number of observations (given by [`nobs`](@ref)) and +``k`` is the number of consumed degrees of freedom of the model (as returned by [`dof`](@ref)). +""" +function adjr2(model::StatisticalModel, variant::Symbol) + k = dof(model) if variant == :McFadden + ll = loglikelihood(model) + ll0 = nullloglikelihood(model) 1 - (ll - k)/ll0 + elseif variant == :devianceratio + n = nobs(model) + dev = deviance(model) + dev0 = nulldeviance(model) + 1 - (dev*(n-1))/(dev0*(n-k)) else - error(":McFadden is the only currently supported variant") + error("variant must be one of :McFadden or :devianceratio") end end @@ -281,57 +323,81 @@ const adjr² = adjr2 abstract type RegressionModel <: StatisticalModel end """ - fitted(obj::RegressionModel) + fitted(model::RegressionModel) Return the fitted values of the model. """ -fitted(obj::RegressionModel) = error("fitted is not defined for $(typeof(obj)).") +fitted(model::RegressionModel) = error("fitted is not defined for $(typeof(model)).") """ - response(obj::RegressionModel) + response(model::RegressionModel) Return the model response (a.k.a. the dependent variable). """ -response(obj::RegressionModel) = error("response is not defined for $(typeof(obj)).") +response(model::RegressionModel) = error("response is not defined for $(typeof(model)).") """ - meanresponse(obj::RegressionModel) + responsename(model::RegressionModel) + +Return the name of the model response (a.k.a. the dependent variable). +""" +responsename(model::RegressionModel) = error("responsename is not defined for $(typeof(model)).") + +""" + meanresponse(model::RegressionModel) Return the mean of the response. """ -meanresponse(obj::RegressionModel) = error("meanresponse is not defined for $(typeof(obj)).") +meanresponse(model::RegressionModel) = error("meanresponse is not defined for $(typeof(model)).") """ - modelmatrix(obj::RegressionModel) + modelmatrix(model::RegressionModel) Return the model matrix (a.k.a. the design matrix). """ -modelmatrix(obj::RegressionModel) = error("modelmatrix is not defined for $(typeof(obj)).") +modelmatrix(model::RegressionModel) = error("modelmatrix is not defined for $(typeof(model)).") """ - leverage(obj::RegressionModel) + crossmodelmatrix(model::RegressionModel) -Return the diagonal of the projection matrix. +Return `X'X` where `X` is the model matrix of `model`. +This function will return a pre-computed matrix stored in `model` if possible. """ -leverage(obj::RegressionModel) = error("leverage is not defined for $(typeof(obj)).") +crossmodelmatrix(model::RegressionModel) = (x = modelmatrix(model); Symmetric(x' * x)) """ - residuals(obj::RegressionModel) + leverage(model::RegressionModel) + +Return the diagonal of the projection matrix of the model. +""" +leverage(model::RegressionModel) = error("leverage is not defined for $(typeof(model)).") + +""" + cooksdistance(model::RegressionModel) + +Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) +for each observation in linear model `model`, giving an estimate of the influence +of each data point. +""" +cooksdistance(model::RegressionModel) = error("cooksdistance is not defined for $(typeof(model)).") + +""" + residuals(model::RegressionModel) Return the residuals of the model. """ -residuals(obj::RegressionModel) = error("residuals is not defined for $(typeof(obj)).") +residuals(model::RegressionModel) = error("residuals is not defined for $(typeof(model)).") """ - predict(obj::RegressionModel, [newX]) + predict(model::RegressionModel, [newX]) -Form the predicted response of model `obj`. An object with new covariate values `newX` can be supplied, -which should have the same type and structure as that used to fit `obj`; e.g. for a GLM +Form the predicted response of `model`. An object with new covariate values `newX` can be supplied, +which should have the same type and structure as that used to fit `model`; e.g. for a GLM it would generally be a `DataFrame` with the same variable names as the original predictors. """ function predict end -predict(obj::RegressionModel) = error("predict is not defined for $(typeof(obj)).") +predict(model::RegressionModel) = error("predict is not defined for $(typeof(model)).") """ predict! @@ -340,21 +406,21 @@ In-place version of [`predict`](@ref). """ function predict! end -predict!(obj::RegressionModel) = error("predict! is not defined for $(typeof(obj)).") +predict!(model::RegressionModel) = error("predict! is not defined for $(typeof(model)).") """ - dof_residual(obj::RegressionModel) + dof_residual(model::RegressionModel) Return the residual degrees of freedom of the model. """ -dof_residual(obj::RegressionModel) = error("dof_residual is not defined for $(typeof(obj)).") +dof_residual(model::RegressionModel) = error("dof_residual is not defined for $(typeof(model)).") """ - params(obj) + params(model) Return all parameters of a model. """ -params(obj) = error("params is not defined for $(typeof(obj))") +params(model) = error("params is not defined for $(typeof(model))") function params! end ## coefficient tables with specialized show method @@ -386,17 +452,41 @@ mutable struct CoefTable end end +Base.length(ct::CoefTable) = length(ct.cols[1]) +function Base.eltype(ct::CoefTable) + names = isempty(ct.rownms) ? + tuple(Symbol.(ct.colnms)...) : + tuple(Symbol("Name"), Symbol.(ct.colnms)...) + types = isempty(ct.rownms) ? + Tuple{eltype.(ct.cols)...} : + Tuple{eltype(ct.rownms), eltype.(ct.cols)...} + NamedTuple{names, types} +end + +function Base.iterate(ct::CoefTable, i::Integer=1) + if i in 1:length(ct) + cols = getindex.(ct.cols, Ref(i)) + nt = isempty(ct.rownms) ? + eltype(ct)(tuple(cols...)) : + eltype(ct)(tuple(ct.rownms[i], cols...)) + (nt, i+1) + else + nothing + end +end + """ Show a p-value using 6 characters, either using the standard 0.XXXX representation or as , :≥, :(isless), :(isequal)] # isless and < to place nice with NaN + @eval begin + Base.$op(x::Union{TestStat, PValue}, y::Real) = $op(x.v, y) + Base.$op(y::Real, x::Union{TestStat, PValue}) = $op(y, x.v) + Base.$op(x1::Union{TestStat, PValue}, x2::Union{TestStat, PValue}) = $op(x1.v, x2.v) + end +end + +Base.hash(x::Union{TestStat, PValue}, h::UInt) = hash(x.v, h) + +# necessary to avoid a method ambiguity with isless(::TestStat, NaN) +Base.isless(x::Union{TestStat, PValue}, y::AbstractFloat) = isless(x.v, y) +Base.isless(y::AbstractFloat, x::Union{TestStat, PValue},) = isless(y, x.v) +Base.isequal(y::AbstractFloat, x::Union{TestStat, PValue}) = isequal(y, x.v) +Base.isequal(x::Union{TestStat, PValue}, y::AbstractFloat) = isequal(x.v, y) + +Base.isapprox(x::Union{TestStat, PValue}, y::Real; kwargs...) = isapprox(x.v, y; kwargs...) +Base.isapprox(y::Real, x::Union{TestStat, PValue}; kwargs...) = isapprox(y, x.v; kwargs...) +Base.isapprox(x1::Union{TestStat, PValue}, x2::Union{TestStat, PValue}; kwargs...) = isapprox(x1.v, x2.v; kwargs...) + """Wrap a string so that show omits quotes""" struct NoQuote @@ -431,7 +545,7 @@ function show(io::IO, ct::CoefTable) rownms = [lpad("[$i]",floor(Integer, log10(nr))+3) for i in 1:nr] end mat = [j == 1 ? NoQuote(rownms[i]) : - j-1 == ct.pvalcol ? PValue(cols[j-1][i]) : + j-1 == ct.pvalcol ? NoQuote(sprint(show, PValue(cols[j-1][i]))) : j-1 in ct.teststatcol ? TestStat(cols[j-1][i]) : cols[j-1][i] isa AbstractString ? NoQuote(cols[j-1][i]) : cols[j-1][i] for i in 1:nr, j in 1:nc+1] @@ -457,6 +571,54 @@ function show(io::IO, ct::CoefTable) nothing end +function show(io::IO, ::MIME"text/markdown", ct::CoefTable) + cols = ct.cols; rownms = ct.rownms; colnms = ct.colnms; + nc = length(cols) + nr = length(cols[1]) + if length(rownms) == 0 + rownms = [lpad("[$i]",floor(Integer, log10(nr))+3) for i in 1:nr] + end + mat = [j == 1 ? NoQuote(rownms[i]) : + j-1 == ct.pvalcol ? NoQuote(sprint(show, PValue(cols[j-1][i]))) : + j-1 in ct.teststatcol ? TestStat(cols[j-1][i]) : + cols[j-1][i] isa AbstractString ? NoQuote(cols[j-1][i]) : cols[j-1][i] + for i in 1:nr, j in 1:nc+1] + # Code inspired by print_matrix in Base + io = IOContext(io, :compact=>true, :limit=>false) + A = Base.alignment(io, mat, 1:size(mat, 1), 1:size(mat, 2), + typemax(Int), typemax(Int), 3) + nmswidths = pushfirst!(length.(colnms), 0) + A = [nmswidths[i] > sum(A[i]) ? (A[i][1]+nmswidths[i]-sum(A[i]), A[i][2]) : A[i] + for i in 1:length(A)] + totwidth = sum(sum.(A)) + 2 * (length(A) - 1) + + # not using Markdown stdlib here because that won't give us nice decimal + # alignment (even if that is lost when rendering to HTML, it's still nice + # when looking at the markdown itself) + + print(io, '|', ' '^(sum(A[1])+1)) + for j in 1:length(colnms) + print(io, " | ", lpad(colnms[j], sum(A[j+1]))) + end + + println(io, " |") + print(io, '|', rpad(':', sum(A[1])+2, '-')) + for j in 1:length(colnms) + _pad = j-1 in [ct.teststatcol; ct.pvalcol] ? rpad : lpad + print(io, '|', _pad(':', sum(A[j+1])+2, '-')) + end + println(io, '|') + + for i in 1:size(mat, 1) + print(io, "| ") + Base.print_matrix_row(io, mat, A, i, 1:size(mat, 2), " | ") + print(io, " |") + i != size(mat, 1) && println(io) + end + + nothing +end + """ ConvergenceException(iters::Int, lastchange::Real=NaN, tol::Real=NaN) diff --git a/src/transformations.jl b/src/transformations.jl index 4d15353a..a4214b5d 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -8,72 +8,81 @@ abstract type AbstractDataTransform end Apply transformation `t` to vector or matrix `x` in place. """ -transform!(t::AbstractDataTransform, x::AbstractArray{<:Real,1}) = transform!(x, t, x) -transform!(t::AbstractDataTransform, x::AbstractArray{<:Real,2}) = transform!(x, t, x) +transform!(t::AbstractDataTransform, x::AbstractMatrix{<:Real}) = + transform!(x, t, x) +transform!(t::AbstractDataTransform, x::AbstractVector{<:Real}) = + (transform!(t, reshape(x, :, 1)); x) """ transform(t::AbstractDataTransform, x) -Return a row-standardized vector or matrix `x` using `t` transformation. +Return a standardized copy of vector or matrix `x` using transformation `t`. """ -transform(t::AbstractDataTransform, x::AbstractArray{<:Real,1}) = transform!(similar(x), t, x) -transform(t::AbstractDataTransform, x::AbstractArray{<:Real,2}) = transform!(similar(x), t, x) +transform(t::AbstractDataTransform, x::AbstractMatrix{<:Real}) = + transform!(similar(x), t, x) +transform(t::AbstractDataTransform, x::AbstractVector{<:Real}) = + vec(transform(t, reshape(x, :, 1))) # reconstruct the original data from transformed values """ reconstruct!(t::AbstractDataTransform, y) -Perform an in-place reconstruction into an original data scale from a row-transformed -vector or matrix `y` using `t` transformation. +Perform an in-place reconstruction into an original data scale from a transformed +vector or matrix `y` using transformation `t`. """ -reconstruct!(t::AbstractDataTransform, y::AbstractArray{<:Real,1}) = reconstruct!(y, t, y) -reconstruct!(t::AbstractDataTransform, y::AbstractArray{<:Real,2}) = reconstruct!(y, t, y) +reconstruct!(t::AbstractDataTransform, y::AbstractMatrix{<:Real}) = + reconstruct!(y, t, y) +reconstruct!(t::AbstractDataTransform, y::AbstractVector{<:Real}) = + (reconstruct!(t, reshape(y, :, 1)); y) """ reconstruct(t::AbstractDataTransform, y) -Return a reconstruction of an originally scaled data from a row-transformed vector -or matrix `y` using `t` transformation. +Return a reconstruction of an originally scaled data from a transformed vector +or matrix `y` using transformation `t`. """ -reconstruct(t::AbstractDataTransform, y::AbstractArray{<:Real,1}) = reconstruct!(similar(y), t, y) -reconstruct(t::AbstractDataTransform, y::AbstractArray{<:Real,2}) = reconstruct!(similar(y), t, y) +reconstruct(t::AbstractDataTransform, y::AbstractMatrix{<:Real}) = + reconstruct!(similar(y), t, y) +reconstruct(t::AbstractDataTransform, y::AbstractVector{<:Real}) = + vec(reconstruct(t, reshape(y, :, 1))) """ - Standardization (Z-score transformation) +Standardization (Z-score transformation) """ -struct ZScoreTransform{T<:Real} <: AbstractDataTransform - dim::Int - mean::Vector{T} - scale::Vector{T} +struct ZScoreTransform{T<:Real, U<:AbstractVector{T}} <: AbstractDataTransform + len::Int + dims::Int + mean::U + scale::U - function ZScoreTransform(d::Int, m::Vector{T}, s::Vector{T}) where T + function ZScoreTransform(l::Int, dims::Int, m::U, s::U) where {T<:Real, U<:AbstractVector{T}} lenm = length(m) lens = length(s) - lenm == d || lenm == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - lens == d || lens == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - new{T}(d, m, s) + lenm == l || lenm == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + lens == l || lens == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + new{T, U}(l, dims, m, s) end end function Base.getproperty(t::ZScoreTransform, p::Symbol) if p === :indim || p === :outdim - return t.dim + return t.len else return getfield(t, p) end end """ - fit(ZScoreTransform, X; center=true, scale=true) + fit(ZScoreTransform, X; dims=nothing, center=true, scale=true) -Fit standardization parameters to `X` and return a `ZScoreTransform` transformation object. - -# Arguments - -* `data`: matrix of samples to fit transformation parameters. +Fit standardization parameters to vector or matrix `X` +and return a `ZScoreTransform` transformation object. # Keyword arguments +* `dims`: if `1` fit standardization parameters in column-wise fashion; + if `2` fit in row-wise fashion. The default is `nothing`, which is equivalent to `dims=2` with a deprecation warning. + * `center`: if `true` (the default) center data so that its mean is zero. * `scale`: if `true` (the default) scale the data so that its variance is equal to one. @@ -84,105 +93,135 @@ Fit standardization parameters to `X` and return a `ZScoreTransform` transformat julia> using StatsBase julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.0 -0.5 0.5 0.0 1.0 2.0 -julia> dt = fit(ZScoreTransform, X) -ZScoreTransform{Float64}(2, [0.0, 1.0], [0.5, 1.0]) +julia> dt = fit(ZScoreTransform, X, dims=2) +ZScoreTransform{Float64, Vector{Float64}}(2, 2, [0.0, 1.0], [0.5, 1.0]) julia> StatsBase.transform(dt, X) -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.0 -1.0 1.0 -1.0 0.0 1.0 ``` """ -function fit(::Type{ZScoreTransform}, X::AbstractArray{<:Real,2}; center::Bool=true, scale::Bool=true) - d, n = size(X) - n >= 2 || error("X must contain at least two columns.") - - T = eltype(X) - m, s = mean_and_std(X, 2) - - return ZScoreTransform(d, (center ? vec(m) : zeros(T, 0)), - (scale ? vec(s) : zeros(T, 0))) +function fit(::Type{ZScoreTransform}, X::AbstractMatrix{<:Real}; + dims::Union{Integer,Nothing}=nothing, center::Bool=true, scale::Bool=true) + if dims === nothing + Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) + dims = 2 + end + if dims == 1 + n, l = size(X) + n >= 2 || error("X must contain at least two rows.") + m, s = mean_and_std(X, 1) + elseif dims == 2 + l, n = size(X) + n >= 2 || error("X must contain at least two columns.") + m, s = mean_and_std(X, 2) + else + throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) + end + return ZScoreTransform(l, dims, (center ? vec(m) : similar(m, 0)), + (scale ? vec(s) : similar(s, 0))) end -function transform!(y::AbstractVecOrMat{<:Real}, t::ZScoreTransform, x::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(y,2) - size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) +function fit(::Type{ZScoreTransform}, X::AbstractVector{<:Real}; + dims::Integer=1, center::Bool=true, scale::Bool=true) + if dims != 1 + throw(DomainError(dims, "fit only accepts dims=1 over a vector. Try fit(t, x, dims=1).")) + end - m = t.mean - s = t.scale + return fit(ZScoreTransform, reshape(X, :, 1); dims=dims, center=center, scale=scale) +end - if isempty(m) - if isempty(s) - if x !== y - copyto!(y, x) +function transform!(y::AbstractMatrix{<:Real}, t::ZScoreTransform, x::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent dimensions.")) + n = size(y,1) + size(x,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) + + m = t.mean + s = t.scale + + if isempty(m) + if isempty(s) + if x !== y + copyto!(y, x) + end + else + broadcast!(/, y, x, s') end else - broadcast!(/, y, x, s) - end - else - if isempty(s) - broadcast!(-, y, x, m) - else - broadcast!((x,m,s)->(x-m)/s, y, x, m, s) + if isempty(s) + broadcast!(-, y, x, m') + else + broadcast!((x,m,s)->(x-m)/s, y, x, m', s') + end end + elseif t.dims == 2 + t_ = ZScoreTransform(t.len, 1, t.mean, t.scale) + transform!(y', t_, x') end return y end -function reconstruct!(x::AbstractVecOrMat{<:Real}, t::ZScoreTransform, y::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(y,2) - size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) - - m = t.mean - s = t.scale - - if isempty(m) - if isempty(s) - if y !== x - copyto!(x, y) +function reconstruct!(x::AbstractMatrix{<:Real}, t::ZScoreTransform, y::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent dimensions.")) + n = size(y,1) + size(x,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) + + m = t.mean + s = t.scale + + if isempty(m) + if isempty(s) + if y !== x + copyto!(x, y) + end + else + broadcast!(*, x, y, s') end else - broadcast!(*, x, y, s) - end - else - if isempty(s) - broadcast!(+, x, y, m) - else - broadcast!((y,m,s)->y*s+m, x, y, m, s) + if isempty(s) + broadcast!(+, x, y, m') + else + broadcast!((y,m,s)->y*s+m, x, y, m', s') + end end + elseif t.dims == 2 + t_ = ZScoreTransform(t.len, 1, t.mean, t.scale) + reconstruct!(x', t_, y') end return x end """ - Unit range normalization +Unit range normalization """ -struct UnitRangeTransform{T<:Real} <: AbstractDataTransform - dim::Int +struct UnitRangeTransform{T<:Real, U<:AbstractVector} <: AbstractDataTransform + len::Int + dims::Int unit::Bool - min::Vector{T} - scale::Vector{T} + min::U + scale::U - function UnitRangeTransform(d::Int, unit::Bool, min::Vector{T}, max::Vector{T}) where {T} + function UnitRangeTransform(l::Int, dims::Int, unit::Bool, min::U, max::U) where {T, U<:AbstractVector{T}} lenmin = length(min) lenmax = length(max) - lenmin == d || lenmin == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - lenmax == d || lenmax == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) - new{T}(d, unit, min, max) + lenmin == l || lenmin == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + lenmax == l || lenmax == 0 || throw(DimensionMismatch("Inconsistent dimensions.")) + new{T, U}(l, dims, unit, min, max) end end function Base.getproperty(t::UnitRangeTransform, p::Symbol) if p === :indim || p === :outdim - return t.dim + return t.len else return getfield(t, p) end @@ -190,19 +229,17 @@ end # fit a unit transform """ - fit(UnitRangeTransform, X; center=true, scale=true) - -Fit a scaling parameters to `X` and return transformation description. + fit(UnitRangeTransform, X; dims=nothing, unit=true) -# Arguments - -* `data`: matrix of samples to fit transformation parameters. +Fit a scaling parameters to vector or matrix `X` +and return a `UnitRangeTransform` transformation object. # Keyword arguments -* `center`: if `true` (the default) centere data around zero. +* `dims`: if `1` fit standardization parameters in column-wise fashion; + if `2` fit in row-wise fashion. The default is `nothing`. -* `scale`: if `true` (the default) perform variance scaling. +* `unit`: if `true` (the default) shift the minimum data to zero. # Examples @@ -210,77 +247,102 @@ Fit a scaling parameters to `X` and return transformation description. julia> using StatsBase julia> X = [0.0 -0.5 0.5; 0.0 1.0 2.0] -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.0 -0.5 0.5 0.0 1.0 2.0 -julia> dt = fit(UnitRangeTransform, X) -UnitRangeTransform{Float64}(2, true, [-0.5, 0.0], [1.0, 0.5]) +julia> dt = fit(UnitRangeTransform, X, dims=2) +UnitRangeTransform{Float64, Vector{Float64}}(2, 2, true, [-0.5, 0.0], [1.0, 0.5]) julia> StatsBase.transform(dt, X) -2×3 Array{Float64,2}: +2×3 Matrix{Float64}: 0.5 0.0 1.0 0.0 0.5 1.0 ``` """ -function fit(::Type{UnitRangeTransform}, X::AbstractArray{<:Real,2}; unit::Bool=true) - d, n = size(X) - - tmin = X[:, 1] - tmax = X[:, 1] - for j = 2:n - @inbounds for i = 1:d - if X[i, j] < tmin[i] - tmin[i] = X[i, j] - elseif X[i, j] > tmax[i] - tmax[i] = X[i, j] - end - end +function fit(::Type{UnitRangeTransform}, X::AbstractMatrix{<:Real}; + dims::Union{Integer,Nothing}=nothing, unit::Bool=true) + if dims === nothing + Base.depwarn("fit(t, x) is deprecated: use fit(t, x, dims=2) instead", :fit) + dims = 2 end - for i = 1:d - @inbounds tmax[i] = 1 / (tmax[i] - tmin[i]) + dims ∈ (1, 2) || throw(DomainError(dims, "fit only accept dims to be 1 or 2.")) + tmin, tmax = _compute_extrema(X, dims) + @. tmax = 1 / (tmax - tmin) + l = length(tmin) + return UnitRangeTransform(l, dims, unit, tmin, tmax) +end + +function _compute_extrema(X::AbstractMatrix, dims::Integer) + dims == 2 && return _compute_extrema(X', 1) + l = size(X, 2) + tmin = similar(X, l) + tmax = similar(X, l) + for i in 1:l + @inbounds tmin[i], tmax[i] = extrema(@view(X[:, i])) end - return UnitRangeTransform(d, unit, tmin, tmax) + return tmin, tmax end -function transform!(y::AbstractVecOrMat{<:Real}, t::UnitRangeTransform, x::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(x,2) - size(y,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) +function fit(::Type{UnitRangeTransform}, X::AbstractVector{<:Real}; + dims::Integer=1, unit::Bool=true) + if dims != 1 + throw(DomainError(dims, "fit only accept dims=1 over a vector. Try fit(t, x, dims=1).")) + end + tmin, tmax = extrema(X) + tmax = 1 / (tmax - tmin) + return UnitRangeTransform(1, dims, unit, [tmin], [tmax]) +end - tmin = t.min - tscale = t.scale +function transform!(y::AbstractMatrix{<:Real}, t::UnitRangeTransform, x::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent dimensions.")) + n = size(x,1) + size(y,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) - if t.unit - broadcast!((x,s,m) -> (x-m)*s, y, x, tscale, tmin) - else - broadcast!(*, y, x, tscale) + tmin = t.min + tscale = t.scale + + if t.unit + broadcast!((x,s,m)->(x-m)*s, y, x, tscale', tmin') + else + broadcast!(*, y, x, tscale') + end + elseif t.dims == 2 + t_ = UnitRangeTransform(t.len, 1, t.unit, t.min, t.scale) + transform!(y', t_, x') end return y end -function reconstruct!(x::AbstractVecOrMat{<:Real}, t::UnitRangeTransform, y::AbstractVecOrMat{<:Real}) - d = t.dim - size(x,1) == size(y,1) == d || throw(DimensionMismatch("Inconsistent dimensions.")) - n = size(y,2) - size(x,2) == n || throw(DimensionMismatch("Inconsistent dimensions.")) +function reconstruct!(x::AbstractMatrix{<:Real}, t::UnitRangeTransform, y::AbstractMatrix{<:Real}) + if t.dims == 1 + l = t.len + size(x,2) == size(y,2) == l || throw(DimensionMismatch("Inconsistent dimensions.")) + n = size(y,1) + size(x,1) == n || throw(DimensionMismatch("Inconsistent dimensions.")) - tmin = t.min - tscale = t.scale + tmin = t.min + tscale = t.scale - if t.unit - broadcast!((y,s,m) -> y/s + m, x, y, tscale, tmin) - else - broadcast!(/, x, y, tscale) + if t.unit + broadcast!((y,s,m)->y/s+m, x, y, tscale', tmin') + else + broadcast!(/, x, y, tscale') + end + elseif t.dims == 2 + t_ = UnitRangeTransform(t.len, 1, t.unit, t.min, t.scale) + reconstruct!(x', t_, y') end return x end """ - standardize(DT, X; kwargs...) + standardize(DT, X; dims=nothing, kwargs...) -Return a row-standardized matrix `X` using `DT` transformation which is a subtype of `AbstractDataTransform`: + Return a standardized copy of vector or matrix `X` along dimensions `dims` + using transformation `DT` which is a subtype of `AbstractDataTransform`: - `ZScoreTransform` - `UnitRangeTransform` @@ -290,17 +352,17 @@ Return a row-standardized matrix `X` using `DT` transformation which is a subtyp ```jldoctest julia> using StatsBase -julia> standardize(ZScoreTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0]) -2×3 Array{Float64,2}: +julia> standardize(ZScoreTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0], dims=2) +2×3 Matrix{Float64}: 0.0 -1.0 1.0 -1.0 0.0 1.0 -julia> standardize(UnitRangeTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0]) -2×3 Array{Float64,2}: +julia> standardize(UnitRangeTransform, [0.0 -0.5 0.5; 0.0 1.0 2.0], dims=2) +2×3 Matrix{Float64}: 0.5 0.0 1.0 0.0 0.5 1.0 ``` """ -function standardize(::Type{DT}, X::AbstractArray{<:Real,2}; kwargs...) where {DT<:AbstractDataTransform} +function standardize(::Type{DT}, X::AbstractVecOrMat{<:Real}; kwargs...) where {DT <: AbstractDataTransform} return transform(fit(DT, X; kwargs...), X) end diff --git a/src/weights.jl b/src/weights.jl index 7497a410..58ea878b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -28,19 +28,30 @@ macro weights(name) values::V sum::S end - $(esc(name))(vs) = $(esc(name))(vs, sum(vs)) + $(esc(name))(values::AbstractVector{<:Real}) = $(esc(name))(values, sum(values)) end end -Base.eltype(wv::AbstractWeights) = eltype(wv.values) Base.length(wv::AbstractWeights) = length(wv.values) -Base.values(wv::AbstractWeights) = wv.values Base.sum(wv::AbstractWeights) = wv.sum Base.isempty(wv::AbstractWeights) = isempty(wv.values) - -Base.getindex(wv::AbstractWeights, i::Int) = getindex(wv.values, i) Base.size(wv::AbstractWeights) = size(wv.values) +Base.convert(::Type{Vector}, wv::AbstractWeights) = convert(Vector, wv.values) + +Base.@propagate_inbounds function Base.getindex(wv::AbstractWeights, i::Integer) + @boundscheck checkbounds(wv, i) + @inbounds wv.values[i] +end + +Base.@propagate_inbounds function Base.getindex(wv::W, i::AbstractArray) where W <: AbstractWeights + @boundscheck checkbounds(wv, i) + @inbounds v = wv.values[i] + W(v, sum(v)) +end + +Base.getindex(wv::W, ::Colon) where {W <: AbstractWeights} = W(copy(wv.values), sum(wv)) + Base.@propagate_inbounds function Base.setindex!(wv::AbstractWeights, v::Real, i::Int) s = v - wv[i] wv.values[i] = v @@ -48,6 +59,16 @@ Base.@propagate_inbounds function Base.setindex!(wv::AbstractWeights, v::Real, i v end +""" + varcorrection(n::Integer, corrected=false) + +Compute a bias correction factor for calculating `var`, `std` and `cov` with +`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), +otherwise returns ``\\frac{1}{n}`` (i.e. no correction). +""" +@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) + @weights Weights @doc """ @@ -73,6 +94,18 @@ See the documentation for [`Weights`](@ref) for more details. weights(vs::AbstractVector{<:Real}) = Weights(vs) weights(vs::AbstractArray{<:Real}) = Weights(vec(vs)) +""" + varcorrection(w::Weights, corrected=false) + +Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError` +if `corrected=true`. +""" +@inline function varcorrection(w::Weights, corrected::Bool=false) + corrected && throw(ArgumentError("Weights type does not support bias correction: " * + "use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable.")) + 1 / w.sum +end + @weights AnalyticWeights @doc """ @@ -102,6 +135,23 @@ See the documentation for [`AnalyticWeights`](@ref) for more details. aweights(vs::AbstractVector{<:Real}) = AnalyticWeights(vs) aweights(vs::AbstractArray{<:Real}) = AnalyticWeights(vec(vs)) +""" + varcorrection(w::AnalyticWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) + s = w.sum + + if corrected + sum_sn = sum(x -> (x / s) ^ 2, w) + 1 / (s * (1 - sum_sn)) + else + 1 / s + end +end + @weights FrequencyWeights @doc """ @@ -129,6 +179,22 @@ See the documentation for [`FrequencyWeights`](@ref) for more details. fweights(vs::AbstractVector{<:Real}) = FrequencyWeights(vs) fweights(vs::AbstractArray{<:Real}) = FrequencyWeights(vec(vs)) +""" + varcorrection(w::FrequencyWeights, corrected=false) + +* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) + s = w.sum + + if corrected + 1 / (s - 1) + else + 1 / s + end +end + @weights ProbabilityWeights @doc """ @@ -157,7 +223,153 @@ See the documentation for [`ProbabilityWeights`](@ref) for more details. pweights(vs::AbstractVector{<:Real}) = ProbabilityWeights(vs) pweights(vs::AbstractArray{<:Real}) = ProbabilityWeights(vec(vs)) -##### Equality tests ##### +""" + varcorrection(w::ProbabilityWeights, corrected=false) + +* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}``, where ``n`` equals `count(!iszero, w)` +* `corrected=false`: ``\\frac{1}{\\sum w}`` +""" +@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) + s = w.sum + + if corrected + n = count(!iszero, w) + n / (s * (n - 1)) + else + 1 / s + end +end + +""" + eweights(t::AbstractVector{<:Integer}, λ::Real) + eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real) where T + eweights(n::Integer, λ::Real) + +Construct a [`Weights`](@ref) vector which assigns exponentially decreasing weights to past +observations, which in this case corresponds to larger integer values `i` in `t`. +If an integer `n` is provided, weights are generated for values from 1 to `n` +(equivalent to `t = 1:n`). + +For each element `i` in `t` the weight value is computed as: + +``λ (1 - λ)^{1 - i}`` + +# Arguments + +- `t::AbstractVector`: temporal indices or timestamps +- `r::StepRange`: a larger range to use when constructing weights from a subset of timestamps +- `n::Integer`: if provided instead of `t`, temporal indices are taken to be `1:n` +- `λ::Real`: a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. + As this value approaches 0, the resulting weights will be almost equal, + while values closer to 1 will put greater weight on the tail elements of the vector. + +# Examples +```julia-repl +julia> eweights(1:10, 0.3) +10-element Weights{Float64,Float64,Array{Float64,1}}: + 0.3 + 0.42857142857142855 + 0.6122448979591837 + 0.8746355685131197 + 1.249479383590171 + 1.7849705479859588 + 2.549957925694227 + 3.642797036706039 + 5.203995766722913 + 7.434279666747019 +``` +""" +function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer + 0 < λ <= 1 || throw(ArgumentError("Smoothing factor must be between 0 and 1")) + + w0 = map(t) do i + i > 0 || throw(ArgumentError("Time indices must be non-zero positive integers")) + λ * (1 - λ)^(1 - i) + end + + s = sum(w0) + Weights(w0, s) +end + +eweights(n::Integer, λ::Real) = eweights(1:n, λ) +eweights(t::AbstractVector, r::AbstractRange, λ::Real) = + eweights(something.(indexin(t, r)), λ) + +# NOTE: no variance correction is implemented for exponential weights + +struct UnitWeights{T<:Real} <: AbstractWeights{Int, T, V where V<:Vector{T}} + len::Int +end + +@doc """ + UnitWeights{T}(s) + +Construct a `UnitWeights` vector with length `s` and weight elements of type `T`. +All weight elements are identically one. +""" UnitWeights + +Base.sum(wv::UnitWeights{T}) where T = convert(T, length(wv)) +Base.isempty(wv::UnitWeights) = iszero(wv.len) +Base.length(wv::UnitWeights) = wv.len +Base.size(wv::UnitWeights) = tuple(length(wv)) + +Base.convert(::Type{Vector}, wv::UnitWeights{T}) where {T} = ones(T, length(wv)) + +Base.@propagate_inbounds function Base.getindex(wv::UnitWeights{T}, i::Integer) where T + @boundscheck checkbounds(wv, i) + one(T) +end + +Base.@propagate_inbounds function Base.getindex(wv::UnitWeights{T}, i::AbstractArray{<:Int}) where T + @boundscheck checkbounds(wv, i) + UnitWeights{T}(length(i)) +end + +function Base.getindex(wv::UnitWeights{T}, i::AbstractArray{Bool}) where T + length(wv) == length(i) || throw(DimensionMismatch()) + UnitWeights{T}(count(i)) +end + +Base.getindex(wv::UnitWeights{T}, ::Colon) where {T} = UnitWeights{T}(wv.len) + +""" + uweights(s::Integer) + uweights(::Type{T}, s::Integer) where T<:Real + +Construct a `UnitWeights` vector with length `s` and weight elements of type `T`. +All weight elements are identically one. + +# Examples +```julia-repl +julia> uweights(3) +3-element UnitWeights{Int64}: + 1 + 1 + 1 + +julia> uweights(Float64, 3) +3-element UnitWeights{Float64}: + 1.0 + 1.0 + 1.0 +``` +""" +uweights(s::Int) = UnitWeights{Int}(s) +uweights(::Type{T}, s::Int) where {T<:Real} = UnitWeights{T}(s) + +""" + varcorrection(w::UnitWeights, corrected=false) + +* `corrected=true`: ``\\frac{n}{n - 1}``, where ``n`` is the length of the weight vector +* `corrected=false`: ``\\frac{1}{n}``, where ``n`` is the length of the weight vector + +This definition is equivalent to the correction applied to unweighted data. +""" +@inline function varcorrection(w::UnitWeights, corrected::Bool=false) + corrected ? (1 / (w.len - 1)) : (1 / w.len) +end + +#### Equality tests ##### for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) @eval begin @@ -166,5 +378,8 @@ for w in (AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights) end end +Base.isequal(x::UnitWeights, y::UnitWeights) = isequal(x.len, y.len) +Base.:(==)(x::UnitWeights, y::UnitWeights) = (x.len == y.len) + Base.isequal(x::AbstractWeights, y::AbstractWeights) = false -Base.:(==)(x::AbstractWeights, y::AbstractWeights) = false \ No newline at end of file +Base.:(==)(x::AbstractWeights, y::AbstractWeights) = false diff --git a/src/wsum.jl b/src/wsum.jl index 92418561..0a245665 100644 --- a/src/wsum.jl +++ b/src/wsum.jl @@ -21,6 +21,22 @@ function _wsum(A::AbstractArray, dims::Colon, w::AbstractArray{<:Real}) s end +function _wsum(A::AbstractArray, dims, w::UnitWeights) + size(A, dims) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return sum(A, dims=dims) +end + +function _wsum(A::AbstractArray, dims::Colon, w::UnitWeights) + length(A) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return sum(A) +end + +# To fix ambiguity +function _wsum(A::AbstractArray{<:BlasReal}, dims::Colon, w::UnitWeights) + length(A) != length(w) && throw(DimensionMismatch("Inconsistent array dimension.")) + return sum(A) +end + wsum!(r::AbstractArray, A::AbstractArray; init::Bool=true, weights::AbstractArray) = _wsum!(r, A, weights; init=init) @@ -141,7 +157,7 @@ end # Optimized method for weighted sum with BlasReal # dot cannot be used for other types as it uses + rather than add_sum for accumulation, # and therefore does not return the correct type -_wsum(A::AbstractArray{T}, dims::Colon, w::AbstractArray{T}) where {T<:BlasReal} = +_wsum(A::AbstractArray{<:BlasReal}, dims::Colon, w::AbstractArray{<:BlasReal}) = dot(vec(A), vec(w)) # Optimized methods for weighted sum over dimensions with BlasReal diff --git a/test/counts.jl b/test/counts.jl index 2fd50832..d7b6fea0 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -80,6 +80,14 @@ cm = countmap(x) @test cm["a"] == 3 @test cm["b"] == 2 @test cm["c"] == 1 + +# iterator, non-radixsort +cm_missing = countmap(skipmissing(x)) +cm_any_itr = countmap((i for i in x)) +@test cm_missing == cm_any_itr == cm +@test cm_missing isa Dict{String, Int} +@test cm_any_itr isa Dict{Any, Int} + pm = proportionmap(x) @test pm["a"] ≈ (1/2) @test pm["b"] ≈ (1/3) @@ -91,6 +99,18 @@ xx = repeat([6, 1, 3, 1], outer=100_000) cm = countmap(xx) @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +# with iterator +cm_missing = countmap(skipmissing(xx)) +@test cm_missing isa Dict{Int, Int} +@test cm_missing == cm + +cm_any_itr = countmap((i for i in xx)) +@test cm_any_itr isa Dict{Any,Int} # no knowledge about type +@test cm_missing == cm + +# with empty array +@test countmap(Int[]) == Dict{Int, Int}() + # testing the radixsort-based addcounts xx = repeat([6, 1, 3, 1], outer=100_000) cm = Dict{Int, Int}() @@ -99,11 +119,20 @@ StatsBase.addcounts_radixsort!(cm,xx) xx2 = repeat([7, 1, 3, 1], outer=100_000) StatsBase.addcounts_radixsort!(cm,xx2) @test cm == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) +# with iterator +cm_missing = Dict{Int, Int}() +StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx)) +@test cm_missing == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx2)) +@test cm_missing == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) # testing the Dict-based addcounts cm = Dict{Int, Int}() +cm_itr = Dict{Int, Int}() StatsBase.addcounts_dict!(cm,xx) -@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) +@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +@test cm_itr isa Dict{Int, Int} cm = countmap(x, weights(w)) @test cm["a"] == 5.5 @@ -119,11 +148,16 @@ pm = proportionmap(x, weights(w)) # testing small bits type bx = [true, false, true, true, false] -@test countmap(bx) == Dict(true => 3, false => 2) +cm_bx_missing = countmap(skipmissing(bx)) +@test cm_bx_missing == countmap(bx) == Dict(true => 3, false => 2) +@test cm_bx_missing isa Dict{Bool, Int} for T in [UInt8, UInt16, Int8, Int16] tx = T[typemin(T), 8, typemax(T), 19, 8] - @test countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) + cm_tx_missing = countmap(tx_missing) + @test cm_tx_missing == countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + @test cm_tx_missing isa Dict{T, Int} end @testset "views" begin diff --git a/test/empirical.jl b/test/empirical.jl index 311d64bd..0c22f341 100644 --- a/test/empirical.jl +++ b/test/empirical.jl @@ -12,4 +12,44 @@ using Test fnecdf = ecdf([0.5]) @test fnecdf([zeros(5000); ones(5000)]) == [zeros(5000); ones(5000)] @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 0.5) + @test isnan(ecdf([1,2,3])(NaN)) + @test_throws ArgumentError ecdf([1, NaN]) +end + +@testset "Weighted ECDF" begin + x = randn(10000000) + w1 = rand(10000000) + w2 = weights(w1) + fnecdf = ecdf(x, weights=w1) + fnecdfalt = ecdf(x, weights=w2) + @test fnecdf.sorted_values == fnecdfalt.sorted_values + @test fnecdf.weights == fnecdfalt.weights + @test fnecdf.weights != w1 # check that w wasn't accidently modified in place + @test fnecdfalt.weights != w2 + y = [-1.96, -1.644854, -1.281552, -0.6744898, 0, 0.6744898, 1.281552, 1.644854, 1.96] + @test isapprox(fnecdf(y), [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975], atol=1e-3) + @test isapprox(fnecdf(1.96), 0.975, atol=1e-3) + @test fnecdf(y) ≈ map(fnecdf, y) + @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x) + fnecdf = ecdf([1.0, 0.5], weights=weights([3, 1])) + @test fnecdf(0.75) == 0.25 + @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 1.0) + @test_throws ArgumentError ecdf(rand(8), weights=weights(rand(10))) + # Check frequency weights + v = randn(100) + r = rand(1:100, 100) + vv = vcat(fill.(v, r)...) # repeat elements of v according to r + fw = fweights(r) + frecdf1 = ecdf(v, weights=fw) + frecdf2 = ecdf(vv) + @test frecdf1(y) ≈ frecdf2(y) + # Check probability weights + a = randn(100) + b = rand(100) + b̃ = abs(10randn()) * b + bw1 = pweights(b) + bw2 = pweights(b̃) + precdf1 = ecdf(a, weights=bw1) + precdf2 = ecdf(a, weights=bw2) + @test precdf1(y) ≈ precdf2(y) end diff --git a/test/misc.jl b/test/misc.jl index 6773d2c2..63fc24df 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -15,6 +15,17 @@ vals, lens = rle(z) @test lens == [2, 2, 1, 1, 3] @test inverse_rle(vals, lens) == z +z = BitArray([true, true, false, false, true]) +(vals, lens) = rle(z) +@test vals == [true, false, true] +@test lens == [2, 2, 1] + +z = [1, 1, 2, missing, 2, 3, 1, missing, missing, 3, 3, 3, 3] +vals, lens = rle(z) +@test isequal(vals, [1, 2, missing, 2, 3, 1, missing, 3]) +@test lens == [2, 1, 1, 1, 1, 1, 2, 4] +@test isequal(inverse_rle(vals, lens), z) + # levelsmap a = [1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3, 3, 3, 2] b = [true, false, false, true, false, true, true, false] diff --git a/test/moments.jl b/test/moments.jl index b1dbbbe8..e867767e 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -56,8 +56,8 @@ expected_std = sqrt.(expected_var) end x = rand(5, 6) -w1 = rand(5) -w2 = rand(6) +w1 = [0.57, 5.10, 0.91, 1.72, 0.0] +w2 = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] @testset "Uncorrected with $f" for f in weight_funcs wv1 = f(w1) diff --git a/test/pairwise.jl b/test/pairwise.jl new file mode 100644 index 00000000..09699b27 --- /dev/null +++ b/test/pairwise.jl @@ -0,0 +1,261 @@ +using StatsBase +using Test, Random, Statistics, LinearAlgebra +using Missings + +const ≅ = isequal + +Random.seed!(1) + +# to avoid using specialized method +arbitrary_fun(x, y) = cor(x, y) + +@testset "pairwise and pairwise! with $f" for f in (arbitrary_fun, cor, cov) + @testset "basic interface" begin + x = [rand(10) for _ in 1:4] + y = [rand(Float32, 10) for _ in 1:5] + # to test case where inference of returned eltype fails + z = [Vector{Any}(rand(Float32, 10)) for _ in 1:5] + + res = @inferred pairwise(f, x, y) + @test res isa Matrix{Float64} + res2 = zeros(Float64, size(res)) + @test pairwise!(f, res2, x, y) === res2 + @test res == res2 == [f(xi, yi) for xi in x, yi in y] + + res = pairwise(f, y, z) + @test res isa Matrix{Float32} + res2 = zeros(Float32, size(res)) + @test pairwise!(f, res2, y, z) === res2 + @test res == res2 == [f(yi, zi) for yi in y, zi in z] + + res = pairwise(f, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) + @test res isa Matrix{Float64} + res2 = zeros(AbstractFloat, size(res)) + @test pairwise!(f, res2, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]]) === res2 + @test res == res2 == + [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]), + yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])] + @test res isa Matrix{Float64} + + @inferred pairwise(f, x, y) + + @test_throws Union{ArgumentError,MethodError} pairwise(f, [Int[]], [Int[]]) + @test_throws Union{ArgumentError,MethodError} pairwise!(f, zeros(1, 1), [Int[]], [Int[]]) + + res = pairwise(f, [], []) + @test size(res) == (0, 0) + @test res isa Matrix{Any} + res2 = zeros(0, 0) + @test pairwise!(f, res2, [], []) === res2 + + res = pairwise(f, Vector{Int}[], Vector{Int}[]) + @test size(res) == (0, 0) + @test res isa Matrix{Float64} + res2 = zeros(0, 0) + @test pairwise!(f, res2, Vector{Int}[], Vector{Int}[]) === res2 + + res = pairwise(f, [[1, 2]], Vector{Int}[]) + @test size(res) == (1, 0) + @test res isa Matrix{Float64} + res2 = zeros(1, 0) + @test pairwise!(f, res2, [[1, 2]], Vector{Int}[]) === res2 + + res = pairwise(f, Vector{Int}[], [[1, 2], [2, 3]]) + @test size(res) == (0, 2) + @test res isa Matrix{Float64} + res2 = zeros(0, 2) + @test pairwise!(f, res2, [], [[1, 2], [2, 3]]) === res2 + + @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), x, y) + @test_throws DimensionMismatch pairwise!(f, zeros(1, 2), [], []) + @test_throws DimensionMismatch pairwise!(f, zeros(0, 0), + [], [[1, 2], [2, 3]]) + end + + @testset "missing values handling interface" begin + xm = [ifelse.(rand(100) .> 0.9, missing, rand(100)) for _ in 1:4] + ym = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4] + zm = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4] + + res = pairwise(f, xm, ym) + @test res isa Matrix{Missing} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym) === res2 + @test res ≅ res2 ≅ [missing for xi in xm, yi in ym] + + res = pairwise(f, xm, ym, skipmissing=:pairwise) + @test res isa Matrix{Float64} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym, skipmissing=:pairwise) === res2 + @test res ≅ res2 + @test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym], + rtol=1e-6) + + res = pairwise(f, ym, zm, skipmissing=:pairwise) + @test res isa Matrix{Float32} + res2 = zeros(Union{Float32, Missing}, size(res)) + @test pairwise!(f, res2, ym, zm, skipmissing=:pairwise) === res2 + @test res ≅ res2 + @test isapprox(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm], + rtol=1e-6) + + nminds = mapreduce(x -> .!ismissing.(x), + (x, y) -> x .& y, + [xm; ym]) + res = pairwise(f, xm, ym, skipmissing=:listwise) + @test res isa Matrix{Float64} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym, skipmissing=:listwise) === res2 + @test res ≅ res2 + @test isapprox(res, [f(view(xi, nminds), view(yi, nminds)) for xi in xm, yi in ym], + rtol=1e-6) + + if VERSION >= v"1.6.0-DEV" + # inference of cor fails so use an inferrable function + # to check that pairwise itself is inferrable + for skipmissing in (:none, :pairwise, :listwise) + g(x, y=x) = pairwise((x, y) -> x[1] * y[1], x, y, skipmissing=skipmissing) + @test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, + Vector{Vector{Union{Float64, Missing}}}}) == + Matrix{<: Union{Float64, Missing}} + if skipmissing in (:pairwise, :listwise) + @test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) == + Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}, + Vector{Vector{Union{Float64, Missing}}}}) == + Matrix{Float64} + end + end + end + + @test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:something) + @test_throws ArgumentError pairwise!(f, zeros(Union{Float64, Missing}, + length(xm), length(ym)), xm, ym, + skipmissing=:something) + + # variable with only missings + xm = [fill(missing, 10), rand(10)] + ym = [rand(10), rand(10)] + + res = pairwise(f, xm, ym) + @test res isa Matrix{Union{Float64, Missing}} + res2 = zeros(Union{Float64, Missing}, size(res)) + @test pairwise!(f, res2, xm, ym) === res2 + @test res ≅ res2 ≅ [f(xi, yi) for xi in xm, yi in ym] + + if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0 + @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:pairwise) + @test_throws Union{ArgumentError,MethodError} pairwise(f, xm, ym, skipmissing=:listwise) + + res = zeros(Union{Float64, Missing}, length(xm), length(ym)) + @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:pairwise) + @test_throws Union{ArgumentError,MethodError} pairwise!(f, res, xm, ym, skipmissing=:listwise) + end + + for sm in (:pairwise, :listwise) + @test_throws ArgumentError pairwise(f, [[1, 2]], [1], skipmissing=sm) + @test_throws ArgumentError pairwise(f, [1], [[1, 2]], skipmissing=sm) + @test_throws ArgumentError pairwise(f, [1], [1], skipmissing=sm) + end + end + + @testset "iterators" begin + x = (v for v in [rand(10) for _ in 1:4]) + y = (v for v in [rand(10) for _ in 1:4]) + + res = @inferred pairwise(f, x, y) + res2 = zeros(size(res)) + @test pairwise!(f, res2, x, y) === res2 + @test res == res2 == pairwise(f, collect(x), collect(y)) + + res = @inferred(pairwise(f, x)) + res2 = zeros(size(res)) + @test pairwise!(f, res2, x) === res2 + @test res == res2 == pairwise(f, collect(x)) + end + + @testset "non-vector entries" begin + x = (Iterators.drop(v, 1) for v in [rand(10) for _ in 1:4]) + y = (Iterators.drop(v, 1) for v in [rand(10) for _ in 1:4]) + + @test pairwise((x, y) -> f(collect(x), collect(y)), x, y) == + [f(collect(xi), collect(yi)) for xi in x, yi in y] + @test pairwise((x, y) -> f(collect(x), collect(y)), x) == + [f(collect(xi1), collect(xi2)) for xi1 in x, xi2 in x] + @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, + skipmissing=:pairwise) + @test_throws ArgumentError pairwise((x, y) -> f(collect(x), collect(y)), x, y, + skipmissing=:listwise) + end + + @testset "two-argument method" begin + x = [rand(10) for _ in 1:4] + res = pairwise(f, x) + res2 = zeros(size(res)) + @test pairwise!(f, res2, x) === res2 + @test res == res2 == pairwise(f, x, x) + end + + @testset "symmetric" begin + x = [rand(10) for _ in 1:4] + y = [rand(10) for _ in 1:4] + + @test pairwise(f, x, x, symmetric=true) == + pairwise(f, x, symmetric=true) == + Symmetric(pairwise(f, x, x), :U) + + res = zeros(4, 4) + res2 = zeros(4, 4) + @test pairwise!(f, res, x, x, symmetric=true) === res + @test pairwise!(f, res2, x, symmetric=true) === res2 + @test res == res2 == Symmetric(pairwise(f, x, x), :U) + + @test_throws ArgumentError pairwise(f, x, y, symmetric=true) + @test_throws ArgumentError pairwise!(f, res, x, y, symmetric=true) + end + + @testset "cor corner cases" begin + # Integer inputs must give a Float64 output + res = pairwise(cor, [[1, 2, 3], [1, 5, 2]]) + @test res isa Matrix{Float64} + @test res == [cor(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]), + yi in ([1, 2, 3], [1, 5, 2])] + + # NaNs are ignored for the diagonal + res = pairwise(cor, [[1, 2, NaN], [1, 5, 2]]) + @test res isa Matrix{Float64} + @test res ≅ [1.0 NaN + NaN 1.0] + + # missings are ignored for the diagonal + res = pairwise(cor, [[1, 2, 7], [1, 5, missing]]) + @test res isa Matrix{Union{Float64, Missing}} + @test res ≅ [1.0 missing + missing 1.0] + res = pairwise(cor, Vector{Union{Int, Missing}}[[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Union{Float64, Missing}} + @test res ≅ [1.0 missing + missing 1.0] + if VERSION >= v"1.5" + # except when eltype is Missing + res = pairwise(cor, [[missing, missing, missing], + [missing, missing, missing]]) + @test res isa Matrix{Missing} + @test res ≅ [missing missing + missing missing] + end + + for sm in (:pairwise, :listwise) + res = pairwise(cor, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm) + @test res isa Matrix{Float64} + @test res ≅ [1.0 NaN + NaN 1.0] + if VERSION >= v"1.5" + @test_throws ArgumentError pairwise(cor, [[missing, missing, missing], + [missing, missing, missing]], + skipmissing=sm) + end + end + end +end \ No newline at end of file diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 8669ed64..7356dbdd 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -2,10 +2,11 @@ using Statistics using Test X = Float64[1 0; 2 1; 3 0; 4 1; 5 10] +Y = Float64[5 5 6; 3 4 1; 4 0 4; 2 6 1; 5 7 10] x1 = X[:,1] x2 = X[:,2] -y = [5, 3, 4, 2, 5] +y = Y[:,1] # corspearman @@ -23,20 +24,138 @@ c22 = corspearman(x2, x2) @test corspearman(X, X) ≈ [c11 c12; c12 c22] @test corspearman(X) ≈ [c11 c12; c12 c22] +@test corspearman(X, Y) == + [corspearman(X[:,i], Y[:,j]) for i in axes(X, 2), j in axes(Y, 2)] # corkendall -@test corkendall(x1, y) ≈ -0.105409255338946 -@test corkendall(x2, y) ≈ -0.117851130197758 +# Check error, handling of NaN, Inf etc +@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3]) +@test isnan(corkendall([1,2], [3,NaN])) +@test isnan(corkendall([1,1,1], [1,2,3])) +@test corkendall([-Inf,-0.0,Inf],[1,2,3]) == 1.0 -@test corkendall(X, y) ≈ [-0.105409255338946, -0.117851130197758] -@test corkendall(y, X) ≈ [-0.105409255338946 -0.117851130197758] +# Test, with exact equality, some known results. +# RealVector, RealVector +@test corkendall(x1, y) == -1/sqrt(90) +@test corkendall(x2, y) == -1/sqrt(72) +# RealMatrix, RealVector +@test corkendall(X, y) == [-1/sqrt(90), -1/sqrt(72)] +# RealVector, RealMatrix +@test corkendall(y, X) == [-1/sqrt(90) -1/sqrt(72)] + +# n = 78_000 tests for overflow errors on 32 bit +# Testing for overflow errors on 64bit would require n be too large for practicality +# This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD. +n = 78_000 +# Test with many repeats +@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90) +@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72) +@test corkendall(repeat(X, n), repeat(y, n)) ≈ [-1/sqrt(90), -1/sqrt(72)] +@test corkendall(repeat(y, n), repeat(X, n)) ≈ [-1/sqrt(90) -1/sqrt(72)] +@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0 + +# Test with no repeats, note testing for exact equality +@test corkendall(collect(1:n), collect(1:n)) == 1.0 +@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0 + +# All elements identical should yield NaN +@test isnan(corkendall(repeat([1], n), collect(1:n))) c11 = corkendall(x1, x1) c12 = corkendall(x1, x2) c22 = corkendall(x2, x2) -@test c11 ≈ 1.0 -@test c22 ≈ 1.0 +# RealMatrix, RealMatrix @test corkendall(X, X) ≈ [c11 c12; c12 c22] +# RealMatrix @test corkendall(X) ≈ [c11 c12; c12 c22] + +@test c11 == 1.0 +@test c22 == 1.0 +@test c12 == 3/sqrt(20) + +# Finished testing for overflow, so redefine n for speedier tests +n = 100 + +@test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22] +@test corkendall(repeat(X, n)) ≈ [c11 c12; c12 c22] + +# All eight three-element permutations +z = [1 1 1; + 1 1 2; + 1 2 2; + 1 2 2; + 1 2 1; + 2 1 2; + 1 1 2; + 2 2 2] + +@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z[:,1], z) == [1 0 1/3] +@test corkendall(z, z[:,1]) == [1; 0; 1/3] + +z = float(z) +@test corkendall(z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z, z) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(z[:,1], z) == [1 0 1/3] +@test corkendall(z, z[:,1]) == [1; 0; 1/3] + +w = repeat(z, n) +@test corkendall(w) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(w, w) == [1 0 1/3; 0 1 0; 1/3 0 1] +@test corkendall(w[:,1], w) == [1 0 1/3] +@test corkendall(w, w[:,1]) == [1; 0; 1/3] + +Statistics.midpoint(1,10) == 5 +Statistics.midpoint(1,widen(10)) == 5 + + +# NaN handling + +Xnan = copy(X) +Xnan[1,1] = NaN +Ynan = copy(Y) +Ynan[2,1] = NaN + +for f in (corspearman, corkendall) + @test isnan(f([1.0, NaN, 2.0], [2.0, 1.0, 3.4])) + @test all(isnan, f([1.0, NaN], [1 2; 3 4])) + @test all(isnan, f([1 2; 3 4], [1.0, NaN])) + @test isequal(f([1 NaN; NaN 4]), [1 NaN; NaN 1]) + @test all(isnan, f([1 NaN; NaN 4], [1 NaN; NaN 4])) + @test all(isnan, f([1 NaN; NaN 4], [NaN 1; NaN 4])) + + @test isequal(f(Xnan, Ynan), + [f(Xnan[:,i], Ynan[:,j]) for i in axes(Xnan, 2), j in axes(Ynan, 2)]) + @test isequal(f(Xnan), + [i == j ? 1.0 : f(Xnan[:,i], Xnan[:,j]) + for i in axes(Xnan, 2), j in axes(Xnan, 2)]) + for k in 1:2 + @test isequal(f(Xnan[:,k], Ynan), + [f(Xnan[:,k], Ynan[:,j]) for i in 1:1, j in axes(Ynan, 2)]) + # TODO: fix corkendall (PR#659) + if f === corspearman + @test isequal(f(Xnan, Ynan[:,k]), + [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2), j in 1:1]) + else + @test isequal(f(Xnan, Ynan[:,k]), + [f(Xnan[:,i], Ynan[:,k]) for i in axes(Xnan, 2)]) + end + end +end + + +# Wrong dimensions + +@test_throws DimensionMismatch corspearman([1], [1, 2]) +@test_throws DimensionMismatch corspearman([1], [1 2; 3 4]) +@test_throws DimensionMismatch corspearman([1 2; 3 4], [1]) +@test_throws ArgumentError corspearman([1 2; 3 4: 4 6], [1 2; 3 4]) + +# TODO: fix corkendall to match corspearman (PR#659) +@test_throws ErrorException corkendall([1], [1, 2]) +@test_throws ErrorException corkendall([1], [1 2; 3 4]) +@test_throws ErrorException corkendall([1 2; 3 4], [1]) +@test_throws ArgumentError corkendall([1 2; 3 4: 4 6], [1 2; 3 4]) \ No newline at end of file diff --git a/test/reliability.jl b/test/reliability.jl new file mode 100644 index 00000000..916e097c --- /dev/null +++ b/test/reliability.jl @@ -0,0 +1,80 @@ +using StatsBase +using LinearAlgebra, Random, Test + +@testset "Cronbach's Alpha" begin + # basic vanilla test + cov_X = [10 6 6 6; + 6 11 6 6; + 6 6 12 6; + 6 6 6 13] + cronbach_X = cronbachalpha(cov_X) + @test cronbach_X isa CronbachAlpha{Float64} + @test cronbach_X.alpha ≈ 0.8135593220338981 + @test cronbach_X.dropped ≈ + [0.75, 0.7605633802816901, 0.7714285714285715, 0.782608695652174] + + # testing Rational + cov_rational = cov_X .// 1 + cronbach_rational = cronbachalpha(cov_rational) + @test cronbach_rational isa CronbachAlpha{Rational{Int}} + @test cronbach_rational.alpha == 48 // 59 + @test cronbach_rational.dropped == + [3 // 4, 54 // 71, 27 // 35, 18 // 23] + + # testing BigFloat + cov_bigfloat = BigFloat.(cov_X) + cronbach_bigfloat = cronbachalpha(cov_bigfloat) + @test cronbach_bigfloat isa CronbachAlpha{BigFloat} + @test cronbach_bigfloat.alpha ≈ 0.8135593220338981 + @test cronbach_bigfloat.dropped ≈ + [0.75, 0.7605633802816901, 0.7714285714285715, 0.782608695652174] + + # testing corner cases + @test_throws MethodError cronbachalpha([1.0, 2.0]) + cov_k2 = [10 6; + 6 11] + cronbach_k2 = cronbachalpha(cov_k2) + @test cronbach_k2.alpha ≈ 0.7272727272727273 + @test isempty(cronbach_k2.dropped) + + # testing when Matrix is not positive-definite + cov_not_pos = [-1 1; + -1 1] + @test_throws ArgumentError cronbachalpha(cov_not_pos) + + # testing with a zero + cov_zero = [1 2; + 0 1] + @test_throws ArgumentError cronbachalpha(cov_not_pos) + + # testing with one column + cov_k1 = reshape([1, 2], 2, 1) + @test_throws ArgumentError cronbachalpha(cov_k1) + + # testing with Missing + cov_missing = [1 2; + missing 1] + @test_throws MethodError cronbachalpha(cov_missing) + + + # testing Base.show + cronbach_X = cronbachalpha(cov_X) + io = IOBuffer() + show(io, cronbach_X) + str = String(take!(io)) + @test str == """ + Cronbach's alpha for all items: 0.8136 + + Cronbach's alpha if an item is dropped: + item 1: 0.7500 + item 2: 0.7606 + item 3: 0.7714 + item 4: 0.7826 + """ + # for two columns + io = IOBuffer() + show(io, cronbach_k2) + str = String(take!(io)) + @test str == "Cronbach's alpha for all items: 0.7273\n" + +end # @testset "Cronbach's Alpha" diff --git a/test/robust.jl b/test/robust.jl index e9bcdb51..07a72368 100644 --- a/test/robust.jl +++ b/test/robust.jl @@ -3,39 +3,40 @@ using Test ### Trimming outliers -@test trim([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test trim([1,2,3,4,5,6,7,8], prop=0.2) == [2,3,4,5,6,7] -@test trim([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,5,6] -@test trim([1,2,3,4,5,6,7,8], count=1) == [2,3,4,5,6,7] -@test trim([1,2,3,4,5,6,7,8,9], count=3) == [4,5,6] +@test collect(trim([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1] +@test collect(trim([8,2,3,4,5,6,7,1], prop=0.2)) == [2,3,4,5,6,7] +@test collect(trim([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,5,6] +@test collect(trim([8,7,6,5,4,3,2,1], count=1)) == [7,6,5,4,3,2] +@test collect(trim([1,2,3,4,5,6,7,8,9], count=3)) == [4,5,6] @test_throws ArgumentError trim([]) @test_throws ArgumentError trim([1,2,3,4,5], prop=0.5) -@test trim!([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test trim!([1,2,3,4,5,6,7,8], prop=0.2) == [2,3,4,5,6,7] -@test trim!([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,5,6] -@test trim!([1,2,3,4,5,6,7,8], count=1) == [2,3,4,5,6,7] -@test trim!([1,2,3,4,5,6,7,8,9], count=3) == [4,5,6] +@test collect(trim!([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1] +@test collect(trim!([8,2,3,4,5,6,7,1], prop=0.2)) == [2,3,4,5,6,7] +@test collect(trim!([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,5,6] +@test collect(trim!([8,7,6,5,4,3,2,1], count=1)) == [7,6,5,4,3,2] +@test collect(trim!([1,2,3,4,5,6,7,8,9], count=3)) == [4,5,6] @test_throws ArgumentError trim!([]) @test_throws ArgumentError trim!([1,2,3,4,5], prop=0.5) -@test winsor([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test winsor([1,2,3,4,5,6,7,8], prop=0.2) == [2,2,3,4,5,6,7,7] -@test winsor([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,4,4,4,5,6,6,6,6] -@test winsor([1,2,3,4,5,6,7,8], count=1) == [2,2,3,4,5,6,7,7] -@test winsor([1,2,3,4,5,6,7,8,9], count=3) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1] +@test collect(winsor([8,2,3,4,5,6,7,1], prop=0.2)) == [7,2,3,4,5,6,7,2] +@test collect(winsor([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor([1,2,3,4,5,6,7,8], count=1)) == [2,2,3,4,5,6,7,7] +@test collect(winsor([8,7,6,5,4,3,2,1], count=1)) == [7,7,6,5,4,3,2,2] +@test collect(winsor([1,2,3,4,5,6,7,8,9], count=3)) == [4,4,4,4,5,6,6,6,6] @test_throws ArgumentError winsor([]) @test_throws ArgumentError winsor([1,2,3,4,5], prop=0.5) -@test winsor!([1,2,3,4,5,6,7,8], prop=0.1) == [1,2,3,4,5,6,7,8] -@test winsor!([1,2,3,4,5,6,7,8], prop=0.2) == [2,2,3,4,5,6,7,7] -@test winsor!([1,2,3,4,5,6,7,8,9], prop=0.4) == [4,4,4,4,5,6,6,6,6] -@test winsor!([1,2,3,4,5,6,7,8], count=1) == [2,2,3,4,5,6,7,7] -@test winsor!([1,2,3,4,5,6,7,8,9], count=3) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor!([8,2,3,4,5,6,7,1], prop=0.1)) == [8,2,3,4,5,6,7,1] +@test collect(winsor!([8,2,3,4,5,6,7,1], prop=0.2)) == [7,2,3,4,5,6,7,2] +@test collect(winsor!([1,2,3,4,5,6,7,8,9], prop=0.4)) == [4,4,4,4,5,6,6,6,6] +@test collect(winsor!([8,7,6,5,4,3,2,1], count=1)) == [7,7,6,5,4,3,2,2] +@test collect(winsor!([1,2,3,4,5,6,7,8,9], count=3)) == [4,4,4,4,5,6,6,6,6] @test_throws ArgumentError winsor!([]) @test_throws ArgumentError winsor!([1,2,3,4,5], prop=0.5) diff --git a/test/sampling.jl b/test/sampling.jl index 4f307d87..15bf69f3 100644 --- a/test/sampling.jl +++ b/test/sampling.jl @@ -1,5 +1,5 @@ using StatsBase -using Test, Random +using Test, Random, StableRNGs Random.seed!(1234) @@ -27,19 +27,19 @@ end #### sample with replacement -function check_sample_wrep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false) +function check_sample_wrep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false, rev::Bool=false) vmin, vmax = vrgn (amin, amax) = extrema(a) @test vmin <= amin <= amax <= vmax n = vmax - vmin + 1 p0 = fill(1/n, n) if ordered - @test issorted(a) + @test issorted(a; rev=rev) if ptol > 0 @test isapprox(proportions(a, vmin:vmax), p0, atol=ptol) end else - @test !issorted(a) + @test !issorted(a; rev=rev) ncols = size(a,2) if ncols == 1 @test isapprox(proportions(a, vmin:vmax), p0, atol=ptol) @@ -68,30 +68,36 @@ test_rng_use(direct_sample!, 1:10, zeros(Int, 6)) a = sample(3:12, n) check_sample_wrep(a, (3, 12), 5.0e-3; ordered=false) -a = sample(3:12, n; ordered=true) -check_sample_wrep(a, (3, 12), 5.0e-3; ordered=true) +for rev in (true, false), T in (Int, Int16, Float64, Float16, BigInt, ComplexF64, Rational{Int}) + r = rev ? reverse(3:12) : (3:12) + r = T===Int ? r : T.(r) + aa = Int.(sample(r, n; ordered=true)) + check_sample_wrep(aa, (3, 12), 5.0e-3; ordered=true, rev=rev) -a = sample(3:12, 10; ordered=true) -check_sample_wrep(a, (3, 12), 0; ordered=true) + aa = Int.(sample(r, 10; ordered=true)) + check_sample_wrep(aa, (3, 12), 0; ordered=true, rev=rev) +end + +@test StatsBase._storeindices(1, 1, BigFloat) == StatsBase._storeindices(1, 1, BigFloat) == false test_rng_use(sample, 1:10, 10) @testset "sampling pairs" begin - Random.seed!(1) + rng = StableRNG(1) - @test samplepair(2) === (1, 2) - @test samplepair(10) === (8, 2) + @test samplepair(rng, 2) === (2, 1) + @test samplepair(rng, 10) === (5, 6) - @test samplepair([3, 4, 2, 6, 8]) === (2, 6) - @test samplepair([1, 2]) === (1, 2) + @test samplepair(rng, [3, 4, 2, 6, 8]) === (3, 8) + @test samplepair(rng, [1, 2]) === (1, 2) end test_rng_use(samplepair, 1000) #### sample without replacement -function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false) +function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=false, rev::Bool=false) # each column of a for one run vmin, vmax = vrgn @@ -103,7 +109,7 @@ function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=fa aj = view(a,:,j) @assert allunique(aj) if ordered - @assert issorted(aj) + @assert issorted(aj, rev=rev) end end @@ -122,7 +128,7 @@ function check_sample_norep(a::AbstractArray, vrgn, ptol::Real; ordered::Bool=fa end import StatsBase: knuths_sample!, fisher_yates_sample!, self_avoid_sample! -import StatsBase: seqsample_a!, seqsample_c! +import StatsBase: seqsample_a!, seqsample_c!, seqsample_d! a = zeros(Int, 5, n) for j = 1:size(a,2) @@ -164,12 +170,23 @@ check_sample_norep(a, (3, 12), 5.0e-3; ordered=true) test_rng_use(seqsample_c!, 1:10, zeros(Int, 6)) +a = zeros(Int, 5, n) +for j = 1:size(a,2) + seqsample_d!(3:12, view(a,:,j)) +end +check_sample_norep(a, (3, 12), 5.0e-3; ordered=true) + +test_rng_use(seqsample_d!, 1:10, zeros(Int, 6)) + a = sample(3:12, 5; replace=false) check_sample_norep(a, (3, 12), 0; ordered=false) a = sample(3:12, 5; replace=false, ordered=true) check_sample_norep(a, (3, 12), 0; ordered=true) +a = sample(reverse(3:12), 5; replace=false, ordered=true) +check_sample_norep(a, (3, 12), 0; ordered=true, rev=true) + # tests of multidimensional sampling a = sample(3:12, (2, 2); replace=false) @@ -202,8 +219,29 @@ wv = Weights([zeros(5); 1:4; -1]) @test_throws ErrorException sample(a, wv, 1, replace=false) #### weighted sampling with dimension -Random.seed!(1); -@test sample([1, 2], Weights([1, 1]), (2,2)) == ones(2,2) +# weights respected; this works because of the 0-weight @test sample([1, 2], Weights([0, 1]), (2,2)) == [2 2 ; 2 2] -@test sample(collect(1:4), Weights(1:4), (2,2), replace=false) == [4 1; 3 2] +wm = sample(collect(1:4), Weights(1:4), (2,2), replace=false) +@test size(wm) == (2, 2) # correct shape +@test length(Set(wm)) == 4 # no duplicates in elements + + +#### check that sample and sample! do the same thing +function test_same(;kws...) + wv = Weights(rand(20)) + Random.seed!(1) + x1 = sample(1:20, wv, 10; kws...) + Random.seed!(1) + x2 = zeros(Int, 10) + sample!(1:20, wv, x2; kws...) + @test x1 == x2 +end + +test_same() +test_same(replace=true) +test_same(replace=false) +test_same(replace=true, ordered=true) +test_same(replace=false, ordered=true) +test_same(replace=true, ordered=false) +test_same(replace=false, ordered=false) diff --git a/test/scalarstats.jl b/test/scalarstats.jl index cc4dfc42..0859e504 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -1,6 +1,7 @@ -using StatsBase +using Statistics using Test using DelimitedFiles +using Statistics ##### Location @@ -43,10 +44,24 @@ using DelimitedFiles @test modes(skipmissing([1, missing, missing, 3, 2, 2, missing])) == [2] @test sort(modes(skipmissing([1, missing, 3, 3, 2, 2, missing]))) == [2, 3] +d1 = [1, 2, 3, 3, 4, 5, 5, 3] +d2 = ['a', 'b', 'c', 'c', 'd', 'e', 'e', 'c'] +wv = weights([0.1:0.1:0.7; 0.1]) +@test mode(d1) == 3 +@test mode(d2) == 'c' +@test mode(d1, wv) == 5 +@test mode(d2, wv) == 'e' +@test sort(modes(d1[1:end-1], weights(ones(7)))) == [3, 5] +@test sort(modes(d1, weights([.9, .1, .1, .1, .9, .1, .1, .1]))) == [1, 4] + @test_throws ArgumentError mode(Int[]) @test_throws ArgumentError modes(Int[]) @test_throws ArgumentError mode(Any[]) @test_throws ArgumentError modes(Any[]) +@test_throws ArgumentError mode([], weights(Float64[])) +@test_throws ArgumentError modes([], weights(Float64[])) +@test_throws ArgumentError mode([1, 2, 3], weights([0.1, 0.3])) +@test_throws ArgumentError modes([1, 2, 3], weights([0.1, 0.3])) ##### Dispersion @@ -65,19 +80,19 @@ using DelimitedFiles @test mad(1:5; center=3, normalize=true) ≈ 1.4826022185056018 @test mad(skipmissing([missing; 1:5; missing]); center=3, normalize=true) ≈ 1.4826022185056018 -@test StatsBase.mad!([1:5;]; center=3, normalize=true) ≈ 1.4826022185056018 +@test mad!([1:5;]; center=3, normalize=true) ≈ 1.4826022185056018 @test mad(1:5, normalize=true) ≈ 1.4826022185056018 @test mad(1:5, normalize=false) ≈ 1.0 @test mad(skipmissing([missing; 1:5; missing]), normalize=true) ≈ 1.4826022185056018 @test mad(skipmissing([missing; 1:5; missing]), normalize=false) ≈ 1.0 -@test StatsBase.mad!([1:5;], normalize=false) ≈ 1.0 +@test mad!([1:5;], normalize=false) ≈ 1.0 @test mad(1:5, center=3, normalize=false) ≈ 1.0 @test mad(skipmissing([missing; 1:5; missing]), center=3, normalize=false) ≈ 1.0 -@test StatsBase.mad!([1:5;], center=3, normalize=false) ≈ 1.0 +@test mad!([1:5;], center=3, normalize=false) ≈ 1.0 @test mad((x for x in (1, 2.1)), normalize=false) ≈ 0.55 @test mad(Any[1, 2.1], normalize=false) ≈ 0.55 @test mad(Union{Int,Missing}[1, 2], normalize=false) ≈ 0.5 -@test_throws ArgumentError mad(Int[]) +@test_throws ArgumentError mad(Int[], normalize = true) # Issue 197 @test mad(1:2, normalize=true) ≈ 0.7413011092528009 @@ -159,7 +174,7 @@ scale = rand() ##### describe s = describe(1:5) -@test isa(s, StatsBase.SummaryStats) +@test isa(s, Statistics.SummaryStats) @test s.min == 1.0 @test s.max == 5.0 @test s.mean ≈ 3.0 @@ -170,7 +185,7 @@ s = describe(1:5) @test s.nmiss = 0 @test s.isnumeric -@test sprint(show, StatsBase.describe(1:5)) == """ +@test sprint(show, describe(1:5)) == """ Summary Stats: Length: 5 Missing Count: 0 @@ -184,7 +199,7 @@ s = describe(1:5) """ s = describe([1:5; missing]) -@test isa(s, StatsBase.SummaryStats) +@test isa(s, Statistics.SummaryStats) @test s.min == 1.0 @test s.max == 5.0 @test s.mean ≈ 3.0 @@ -196,7 +211,7 @@ s = describe([1:5; missing]) @test s.isnumeric s = describe(["a", "b"]) -@test isa(s, StatsBase.SummaryStats) +@test isa(s, Statistics.SummaryStats) @test s.min === NaN @test s.max === NaN @test s.mean === NaN @@ -206,3 +221,31 @@ s = describe(["a", "b"]) @test s.nobs == 2 @test s.nmiss == 0 @test !s.isnumeric + +# Issue #631 +s = describe([-2, -1, 0, 1, 2, missing]) +@test isa(s, Statistics.SummaryStats) +@test s.min == -2.0 +@test s.max == 2.0 +@test s.mean ≈ 0.0 +@test s.median ≈ 0.0 +@test s.q25 ≈ -1.0 +@test s.q75 ≈ +1.0 + +# Issue #631 +s = describe(zeros(10)) +@test isa(s, Statistics.SummaryStats) +@test s.min == 0.0 +@test s.max == 0.0 +@test s.mean ≈ 0.0 +@test s.median ≈ 0.0 +@test s.q25 ≈ 0.0 +@test s.q75 ≈ 0.0 + +# Issue #631 +s = describe(Union{Float64,Missing}[missing, missing]) +@test isa(s, Statistics.SummaryStats) +@test s.nobs == 2 +@test s.nmiss == 2 +@test isnan(s.mean) +@test isnan(s.median) diff --git a/test/signalcorr.jl b/test/signalcorr.jl index 78a14f4f..2dd9d366 100644 --- a/test/signalcorr.jl +++ b/test/signalcorr.jl @@ -22,6 +22,9 @@ x = [-2.133252557240862 -.7445937365828654; x1 = view(x, :, 1) x2 = view(x, :, 2) +realx = convert(AbstractMatrix{Real}, x) +realx1 = convert(AbstractVector{Real}, x1) +realx2 = convert(AbstractVector{Real}, x2) # autocov & autocorr @@ -40,7 +43,9 @@ racovx1 = [1.839214242630635709475, -0.088687020167434751916] @test autocov(x1) ≈ racovx1 +@test autocov(realx1) ≈ racovx1 @test autocov(x) ≈ [autocov(x1) autocov(x2)] +@test autocov(realx) ≈ [autocov(realx1) autocov(realx2)] racorx1 = [0.999999999999999888978, -0.221173011668873431557, @@ -54,7 +59,9 @@ racorx1 = [0.999999999999999888978, -0.048220059475281865091] @test autocor(x1) ≈ racorx1 +@test autocor(realx1) ≈ racorx1 @test autocor(x) ≈ [autocor(x1) autocor(x2)] +@test autocor(realx) ≈ [autocor(realx1) autocor(realx2)] # crosscov & crosscor @@ -76,10 +83,14 @@ c11 = crosscov(x1, x1) c12 = crosscov(x1, x2) c21 = crosscov(x2, x1) c22 = crosscov(x2, x2) +@test crosscov(realx1, realx2) ≈ c12 @test crosscov(x, x1) ≈ [c11 c21] +@test crosscov(realx, realx1) ≈ [c11 c21] @test crosscov(x1, x) ≈ [c11 c12] +@test crosscov(realx1, realx) ≈ [c11 c12] @test crosscov(x, x) ≈ cat([c11 c21], [c12 c22], dims=3) +@test crosscov(realx, realx) ≈ cat([c11 c21], [c12 c22], dims=3) rcor0 = [0.230940107675850, -0.230940107675850, @@ -98,10 +109,14 @@ c11 = crosscor(x1, x1) c12 = crosscor(x1, x2) c21 = crosscor(x2, x1) c22 = crosscor(x2, x2) +@test crosscor(realx1, realx2) ≈ c12 @test crosscor(x, x1) ≈ [c11 c21] +@test crosscor(realx, realx1) ≈ [c11 c21] @test crosscor(x1, x) ≈ [c11 c12] +@test crosscor(realx1, realx) ≈ [c11 c12] @test crosscor(x, x) ≈ cat([c11 c21], [c12 c22], dims=3) +@test crosscor(realx, realx) ≈ cat([c11 c21], [c12 c22], dims=3) ## pacf @@ -119,4 +134,3 @@ rpacfy = [-0.221173011668873, -0.175020669835420] @test pacf(x[:,1], 1:4, method=:yulewalker) ≈ rpacfy - diff --git a/test/statmodels.jl b/test/statmodels.jl index 80acafe1..da8824cf 100644 --- a/test/statmodels.jl +++ b/test/statmodels.jl @@ -1,4 +1,5 @@ using StatsBase +using StatsBase: PValue, TestStat using Test, Random v1 = [1.45666, -23.14, 1.56734e-13] @@ -6,9 +7,14 @@ v2 = ["Good", "Great", "Bad"] v3 = [1, 56, 2] v4 = [-12.56, 0.1326, 2.68e-16] v5 = [0.12, 0.3467, 1.345e-16] -@test sprint(show, CoefTable(Any[v1, v2, v3, v4, v5], - ["Estimate", "Comments", "df", "t", "p"], - ["x1", "x2", "x3"], 5, 4)) == """ +ct = CoefTable(Any[v1, v2, v3, v4, v5], + ["Estimate", "Comments", "df", "t", "p"], + ["x1", "x2", "x3"], 5, 4) +ct_noname = CoefTable(Any[v1, v2, v3, v4, v5], + ["Estimate", "Comments", "df", "t", "p"], + [], 5, 4) + +@test sprint(show, ct) == """ ─────────────────────────────────────────────── Estimate Comments df t p ─────────────────────────────────────────────── @@ -17,9 +23,45 @@ x2 -23.14 Great 56 0.13 0.3467 x3 1.56734e-13 Bad 2 0.00 <1e-15 ───────────────────────────────────────────────""" -Random.seed!(10) -m = rand(3,4) -@test sprint(show, CoefTable(m, ["Estimate", "Stderror", "df", "p"], [], 4)) == """ +@test sprint(show, ct_noname) == """ +──────────────────────────────────────────────── + Estimate Comments df t p +──────────────────────────────────────────────── +[1] 1.45666 Good 1 -12.56 0.1200 +[2] -23.14 Great 56 0.13 0.3467 +[3] 1.56734e-13 Bad 2 0.00 <1e-15 +────────────────────────────────────────────────""" + +@test sprint(show, MIME"text/markdown"(), ct) == """ +| | Estimate | Comments | df | t | p | +|:---|--------------:|---------:|---:|-------:|:-------| +| x1 | 1.45666 | Good | 1 | -12.56 | 0.1200 | +| x2 | -23.14 | Great | 56 | 0.13 | 0.3467 | +| x3 | 1.56734e-13 | Bad | 2 | 0.00 | <1e-15 |""" + +@test sprint(show, MIME"text/markdown"(), ct_noname) == """ +| | Estimate | Comments | df | t | p | +|:----|--------------:|---------:|---:|-------:|:-------| +| [1] | 1.45666 | Good | 1 | -12.56 | 0.1200 | +| [2] | -23.14 | Great | 56 | 0.13 | 0.3467 | +| [3] | 1.56734e-13 | Bad | 2 | 0.00 | <1e-15 |""" + +@test length(ct) === 3 +@test eltype(ct) == + NamedTuple{(:Name, :Estimate, :Comments, :df, :t, :p), + Tuple{String,Float64,String,Int,Float64,Float64}} +@test collect(ct) == [ + (Name = "x1", Estimate = 1.45666, Comments = "Good", df = 1, t = -12.56, p = 0.12) + (Name = "x2", Estimate = -23.14, Comments = "Great", df = 56, t = 0.1326, p = 0.3467) + (Name = "x3", Estimate = 1.56734e-13, Comments = "Bad", df = 2, t = 2.68e-16, p = 1.345e-16) +] + + +m = [0.11258244478647295 0.05664544616214151 0.38181274408522614 0.8197779704008801 + 0.36831406658084287 0.12078054506961555 0.8151038332483567 0.6699313951612162 + 0.3444540231363058 0.17957407667101322 0.2422083248151139 0.4530583319523316] +ct = CoefTable(m, ["Estimate", "Stderror", "df", "p"], [], 4) +@test sprint(show, ct) == """ ────────────────────────────────────────── Estimate Stderror df p ────────────────────────────────────────── @@ -27,13 +69,57 @@ m = rand(3,4) [2] 0.368314 0.120781 0.815104 0.6699 [3] 0.344454 0.179574 0.242208 0.4531 ──────────────────────────────────────────""" +@test length(ct) === 3 +@test eltype(ct) == + NamedTuple{(:Estimate, :Stderror, :df, :p), + Tuple{Float64,Float64,Float64,Float64}} +@test collect(ct) == [ + (Estimate = 0.11258244478647295, Stderror = 0.05664544616214151, + df = 0.38181274408522614, p = 0.8197779704008801) + (Estimate = 0.36831406658084287, Stderror = 0.12078054506961555, + df = 0.8151038332483567, p = 0.6699313951612162) + (Estimate = 0.3444540231363058, Stderror = 0.17957407667101322, + df = 0.2422083248151139, p = 0.4530583319523316) +] + +@test sprint(show, PValue(1.0)) == "1.0000" +@test sprint(show, PValue(1e-1)) == "0.1000" +if VERSION > v"1.6.0-DEV" + @test sprint(show, PValue(1e-5)) == "<1e-04" +else + @test sprint(show, PValue(1e-5)) == "<1e-4" +end +@test sprint(show, PValue(NaN)) == "NaN" +@test_throws ErrorException PValue(-0.1) +@test_throws ErrorException PValue(1.1) + +@test sprint(show, TestStat(NaN)) == "NaN" +@test sprint(show, TestStat(1e-1)) == "0.10" +@test sprint(show, TestStat(1e-5)) == "0.00" +@test sprint(show, TestStat(π)) == "3.14" + +@testset "Union{PValue, TestStat} is Real" begin + vals = [0.0, Rational(1,3), NaN] + for T in [PValue, TestStat], + f in (==, <, ≤, >, ≥, isless, isequal), + lhs in vals, rhs in vals + # make sure that T behaves like a Real, + # regardless of whether it's on the LHS, RHS or both + @test f(T(lhs), T(rhs)) == f(lhs, rhs) + @test f(lhs, T(rhs)) == f(lhs, rhs) + @test f(T(lhs), rhs) == f(lhs, rhs) + end -@test sprint(show, StatsBase.PValue(1.0)) == "1.0000" -@test sprint(show, StatsBase.PValue(1e-1)) == "0.1000" -@test sprint(show, StatsBase.PValue(1e-5)) == "<1e-4" -@test sprint(show, StatsBase.PValue(NaN)) == "NaN" -@test_throws ErrorException StatsBase.PValue(-0.1) -@test_throws ErrorException StatsBase.PValue(1.1) + # the (approximate) equality operators get a bit more attention + for T in [PValue, TestStat] + @test T(Rational(1,3)) ≈ T(1/3) + @test Rational(1,3) ≈ T(1/3) atol=0.01 + @test T(Rational(1,3)) isa Real + @test T(T(0.05)) === T(0.05) + @test hash(T(0.05)) == hash(0.05) + @test hash(T(0.05), UInt(42)) == hash(0.05, UInt(42)) + end +end @test sprint(showerror, ConvergenceException(10)) == "failure to converge after 10 iterations." diff --git a/test/transformations.jl b/test/transformations.jl index 393fb27e..7d8e2b0a 100644 --- a/test/transformations.jl +++ b/test/transformations.jl @@ -1,66 +1,182 @@ using StatsBase -import StatsBase: transform, reconstruct +import StatsBase: transform, reconstruct, transform!, reconstruct! using Statistics using Test @testset "Transformations" begin + # matrix X = rand(5, 8) + X_ = copy(X) - t = fit(ZScoreTransform, X; center=false, scale=false) + t = fit(ZScoreTransform, X, dims=1, center=false, scale=false) Y = transform(t, X) @test isa(t, AbstractDataTransform) @test isempty(t.mean) @test isempty(t.scale) @test isequal(X, Y) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(ZScoreTransform, X; center=false) + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, center=false) Y = transform(t, X) @test isempty(t.mean) - @test length(t.scale) == 5 - @test Y ≈ X ./ std(X, dims=2) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] + @test length(t.scale) == 8 + @test Y ≈ X ./ std(X, dims=1) @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(ZScoreTransform, X; scale=false) + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, scale=false) Y = transform(t, X) - @test length(t.mean) == 5 + @test length(t.mean) == 8 @test isempty(t.scale) - @test Y ≈ X .- mean(X, dims=2) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] + @test Y ≈ X .- mean(X, dims=1) @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(ZScoreTransform, X) + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1) + Y = transform(t, X) + @test length(t.mean) == 8 + @test length(t.scale) == 8 + @test Y ≈ (X .- mean(X, dims=1)) ./ std(X, dims=1) + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(ZScoreTransform, X, dims=1) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=2) Y = transform(t, X) @test length(t.mean) == 5 @test length(t.scale) == 5 @test Y ≈ (X .- mean(X, dims=2)) ./ std(X, dims=2) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(ZScoreTransform, X, dims=2) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1, unit=false) + Y = transform(t, X) + @test length(t.min) == 8 + @test length(t.scale) == 8 + @test Y ≈ X ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(UnitRangeTransform, X) + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1) + Y = transform(t, X) + @test isa(t, AbstractDataTransform) + @test length(t.min) == 8 + @test length(t.scale) == 8 + @test Y ≈ (X .- minimum(X, dims=1)) ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(UnitRangeTransform, X, dims=1) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=2) Y = transform(t, X) @test isa(t, AbstractDataTransform) @test length(t.min) == 5 @test length(t.scale) == 5 @test Y ≈ (X .- minimum(X, dims=2)) ./ (maximum(X, dims=2) .- minimum(X, dims=2)) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - t = fit(UnitRangeTransform, X; unit=false) + # vector + X = rand(10) + X_ = copy(X) + + t = fit(ZScoreTransform, X, dims=1, center=false, scale=false) Y = transform(t, X) - @test length(t.min) == 5 - @test length(t.scale) == 5 - @test Y ≈ X ./ (maximum(X, dims=2) .- minimum(X, dims=2)) - @test transform(t, X[:,1]) ≈ Y[:,1] - @test reconstruct(t, Y[:,1]) ≈ X[:,1] + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, center=false) + Y = transform(t, X) + @test Y ≈ X ./ std(X, dims=1) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1, scale=false) + Y = transform(t, X) + @test Y ≈ X .- mean(X, dims=1) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(ZScoreTransform, X, dims=1) + Y = transform(t, X) + @test Y ≈ (X .- mean(X, dims=1)) ./ std(X, dims=1) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(ZScoreTransform, X, dims=1) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1) + Y = transform(t, X) + @test Y ≈ (X .- minimum(X, dims=1)) ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test transform(t, X) ≈ Y @test reconstruct(t, Y) ≈ X + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ + + X = copy(X_) + t = fit(UnitRangeTransform, X, dims=1, unit=false) + Y = transform(t, X) + @test Y ≈ X ./ (maximum(X, dims=1) .- minimum(X, dims=1)) + @test transform(t, X) ≈ Y + @test reconstruct(t, Y) ≈ X + @test Y ≈ standardize(UnitRangeTransform, X, dims=1, unit=false) + @test transform!(t, X) === X + @test isequal(X, Y) + @test reconstruct!(t, Y) === Y + @test Y ≈ X_ - @test Y == standardize(UnitRangeTransform, X; unit=false) end diff --git a/test/weights.jl b/test/weights.jl index 72888fb0..a72c2208 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -1,10 +1,11 @@ using Statistics -using LinearAlgebra, Random, SparseArrays, Test +using LinearAlgebra, Random, SparseArrays, Test, Dates @testset "Weights" begin weight_funcs = (weights, aweights, fweights, pweights) -# Construction +## Construction + @testset "$f" for f in weight_funcs @test isa(f([1, 2, 3]), AbstractWeights{Int}) @test isa(f([1., 2., 3.]), AbstractWeights{Float64}) @@ -17,7 +18,7 @@ weight_funcs = (weights, aweights, fweights, pweights) wv = f(w) @test eltype(wv) === Float64 @test length(wv) === 3 - @test values(wv) === w + @test wv == w @test sum(wv) === 6.0 @test !isempty(wv) @@ -25,7 +26,7 @@ weight_funcs = (weights, aweights, fweights, pweights) bv = f(b) @test eltype(bv) === Bool @test length(bv) === 3 - @test values(bv) === b + @test convert(Vector, bv) == b @test sum(bv) === 3 @test !isempty(bv) end @@ -37,20 +38,20 @@ end # Check getindex & sum @test wv[1] === 1. @test sum(wv) === 6. - @test values(wv) == w + @test wv == w # Test setindex! success @test (wv[1] = 4) === 4 # setindex! returns original val @test wv[1] === 4. # value correctly converted and set @test sum(wv) === 9. # sum updated - @test values(wv) == [4., 2., 3.] # Test state of all values + @test wv == [4., 2., 3.] # Test state of all values # Test mulivalue setindex! wv[1:2] = [3., 5.] @test wv[1] === 3. @test wv[2] === 5. @test sum(wv) === 11. - @test values(wv) == [3., 5., 3.] # Test state of all values + @test wv == [3., 5., 3.] # Test state of all values # Test failed setindex! due to conversion error w = [1, 2, 3] @@ -59,7 +60,7 @@ end @test_throws InexactError wv[1] = 1.5 # Returns original value @test wv[1] === 1 # value not updated @test sum(wv) === 6 # sum not corrupted - @test values(wv) == [1, 2, 3] # Test state of all values + @test wv == [1, 2, 3] # Test state of all values end @testset "$f, isequal and ==" for f in weight_funcs @@ -90,6 +91,24 @@ end @test x == y end +@testset "Unit weights" begin + wv = uweights(Float64, 3) + @test wv[1] === 1. + @test wv[1:3] == fill(1.0, 3) + @test wv[:] == fill(1.0, 3) + @test !isempty(wv) + @test length(wv) === 3 + @test size(wv) === (3,) + @test sum(wv) === 3. + @test wv == fill(1.0, 3) + @test Statistics.varcorrection(wv) == 1/3 + @test !isequal(wv, fweights(fill(1.0, 3))) + @test isequal(wv, uweights(3)) + @test wv != fweights(fill(1.0, 3)) + @test wv == uweights(3) + @test wv[[true, false, false]] == uweights(Float64, 1) +end + @testset "Mean $f" for f in weight_funcs @test mean([1:3;], weights=f([1.0, 1.0, 0.5])) ≈ 1.8 @test mean(1:3, weights=f([1.0, 1.0, 0.5])) ≈ 1.8 @@ -106,7 +125,6 @@ end end end - @testset "Quantile fweights" begin data = ( [7, 1, 2, 4, 10], @@ -298,7 +316,6 @@ end @test_throws ArgumentError quantile(1:4, 0.1, weights=f(1:4), alpha=2, beta=2) end - @testset "Median $f" for f in weight_funcs data = [4, 3, 2, 1] wt = [0, 0, 0, 0] @@ -326,4 +343,86 @@ end quantile(data, 0.5, weights=f(wt)) atol = 1e-5 end -end # @testset Weights \ No newline at end of file +@testset "Sum, mean, quantiles and variance for unit weights" begin + a = reshape(1.0:27.0, 3, 3, 3) + wt = uweights(Float64, 3) + + @test Statistics.wsum([1.0, 2.0, 3.0], weights=wt) ≈ 6.0 + @test mean([1.0, 2.0, 3.0], weights=wt) ≈ 2.0 + + @test Statistics.wsum(a, weights=wt, dims=1) ≈ sum(a, dims=1) + @test Statistics.wsum(a, weights=wt, dims=2) ≈ sum(a, dims=2) + @test Statistics.wsum(a, weights=wt, dims=3) ≈ sum(a, dims=3) + + @test Statistics.wsum(a, weights=wt, dims=1) ≈ sum(a, dims=1) + @test Statistics.wsum(a, weights=wt, dims=2) ≈ sum(a, dims=2) + @test Statistics.wsum(a, weights=wt, dims=3) ≈ sum(a, dims=3) + + @test mean(a, weights=wt, dims=1) ≈ mean(a, dims=1) + @test mean(a, weights=wt, dims=2) ≈ mean(a, dims=2) + @test mean(a, weights=wt, dims=3) ≈ mean(a, dims=3) + + @test_throws DimensionMismatch Statistics.wsum(a, weights=wt) + @test_throws DimensionMismatch Statistics.wsum(a, weights=wt, dims=4) + @test_throws DimensionMismatch Statistics.wsum(a, weights=wt, dims=4) + @test_throws DimensionMismatch mean(a, weights=wt, dims=4) + + @test quantile([1.0, 4.0, 6.0, 8.0, 10.0], [0.5], weights=uweights(5)) ≈ [6.0] + @test quantile([1.0, 4.0, 6.0, 8.0, 10.0], 0.5, weights=uweights(5)) ≈ 6.0 + @test median([1.0, 4.0, 6.0, 8.0, 10.0], weights=uweights(5)) ≈ 6.0 + + @test_throws DimensionMismatch var(a, weights=uweights(Float64, 27)) +end + +@testset "Exponential Weights" begin + @testset "Usage" begin + θ = 5.25 + λ = 1 - exp(-1 / θ) # simple conversion for the more common/readable method + v = [λ*(1-λ)^(1-i) for i = 1:4] + w = Weights(v) + + @test round.(w, digits=4) == [0.1734, 0.2098, 0.2539, 0.3071] + + @testset "basic" begin + @test eweights(1:4, λ) ≈ w + end + + @testset "1:n" begin + @test eweights(4, λ) ≈ w + end + + @testset "indexin" begin + v = [λ*(1-λ)^(1-i) for i = 1:10] + + # Test that we should be able to skip indices easily + @test eweights([1, 3, 5, 7], 1:10, λ) ≈ Weights(v[[1, 3, 5, 7]]) + + # This should also work with actual time types + t1 = DateTime(2019, 1, 1, 1) + tx = t1 + Hour(7) + tn = DateTime(2019, 1, 2, 1) + + @test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ) ≈ Weights(v[[1, 3, 5, 7]]) + end + end + + @testset "Empty" begin + @test eweights(0, 0.3) == Weights(Float64[]) + @test eweights(1:0, 0.3) == Weights(Float64[]) + @test eweights(Int[], 1:10, 0.4) == Weights(Float64[]) + end + + @testset "Failure Conditions" begin + # λ > 1.0 + @test_throws ArgumentError eweights(1, 1.1) + + # time indices are not all positive non-zero integers + @test_throws ArgumentError eweights([0, 1, 2, 3], 0.3) + + # Passing in an array of bools will work because Bool <: Integer, + # but any `false` values will trigger the same argument error as 0.0 + @test_throws ArgumentError eweights([true, false, true, true], 0.3) + end +end + +end # @testset Weights diff --git a/test/wsampling.jl b/test/wsampling.jl index fc9a10a3..5ff725f7 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -5,19 +5,21 @@ Random.seed!(1234) #### weighted sample with replacement -function check_wsample_wrep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; ordered::Bool=false) +function check_wsample_wrep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; + ordered::Bool=false, rev::Bool=false) K = length(wv) (vmin, vmax) = vrgn (amin, amax) = extrema(a) @test vmin <= amin <= amax <= vmax - p0 = values(wv) ./ sum(wv) + p0 = wv ./ sum(wv) + rev && reverse!(p0) if ordered - @test issorted(a) + @test issorted(a; rev=rev) if ptol > 0 @test isapprox(proportions(a, vmin:vmax), p0, atol=ptol) end else - @test !issorted(a) + @test !issorted(a; rev=rev) ncols = size(a,2) if ncols == 1 @test isapprox(proportions(a, vmin:vmax), p0, atol=ptol) @@ -32,7 +34,7 @@ end import StatsBase: direct_sample!, alias_sample! -n = 10^5 +n = 10^6 wv = weights([0.2, 0.8, 0.4, 0.6]) a = direct_sample!(4:7, wv, zeros(Int, n, 3)) @@ -45,13 +47,20 @@ check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) a = sample(4:7, wv, n; ordered=false) check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) -a = sample(4:7, wv, n; ordered=true) -check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=true) +for rev in (true, false), T in (Int, Int16, Float64, Float16, BigInt, ComplexF64, Rational{Int}) + r = rev ? reverse(4:7) : (4:7) + r = T===Int ? r : T.(r) + aa = Int.(sample(r, wv, n; ordered=true)) + check_wsample_wrep(aa, (4, 7), wv, 5.0e-3; ordered=true, rev=rev) + aa = Int.(sample(r, wv, 10; ordered=true)) + check_wsample_wrep(aa, (4, 7), wv, -1; ordered=true, rev=rev) +end #### weighted sampling without replacement -function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; ordered::Bool=false) +function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; + ordered::Bool=false, rev::Bool=false) # each column of a for one run vmin, vmax = vrgn @@ -63,12 +72,13 @@ function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol:: aj = view(a,:,j) @assert allunique(aj) if ordered - @assert issorted(aj) + @assert issorted(aj; rev=rev) end end if ptol > 0 - p0 = values(wv) ./ sum(wv) + p0 = wv ./ sum(wv) + rev && reverse!(p0) @test isapprox(proportions(a[1,:], vmin:vmax), p0, atol=ptol) end end @@ -110,5 +120,9 @@ test_rng_use(efraimidis_aexpj_wsample_norep!, 4:7, wv, zeros(Int, 2)) a = sample(4:7, wv, 3; replace=false, ordered=false) check_wsample_norep(a, (4, 7), wv, -1; ordered=false) -a = sample(4:7, wv, 3; replace=false, ordered=true) -check_wsample_norep(a, (4, 7), wv, -1; ordered=true) +for rev in (true, false), T in (Int, Int16, Float64, Float16, BigInt, ComplexF64, Rational{Int}) + r = rev ? reverse(4:7) : (4:7) + r = T===Int ? r : T.(r) + aa = Int.(sample(r, wv, 3; replace=false, ordered=true)) + check_wsample_norep(aa, (4, 7), wv, -1; ordered=true, rev=rev) +end