Taking weighting seriously #487
base: master
Changes from 81 commits
@@ -12,8 +12,8 @@ julia> using DataFrames, GLM, StatsBase

 julia> data = DataFrame(X=[1,2,3], Y=[2,4,7])
 3×2 DataFrame
- Row │ X      Y
-     │ Int64  Int64
+ Row │ X      Y
+     │ Int64  Int64

> Review comment: trailing whitespace probably should be stripped.

 ─────┼──────────────
    1 │     1      2
    2 │     2      4
@@ -61,7 +61,7 @@ julia> dof(ols)
 3

 julia> dof_residual(ols)
-1.0
+1

 julia> round(aic(ols); digits=5)
 5.84252
@@ -91,8 +91,8 @@ julia> round.(vcov(ols); digits=5)
 ```jldoctest
 julia> data = DataFrame(X=[1,2,2], Y=[1,0,1])
 3×2 DataFrame
- Row │ X      Y
-     │ Int64  Int64
+ Row │ X      Y
+     │ Int64  Int64
 ─────┼──────────────
    1 │     1      1
    2 │     2      0
@@ -196,8 +196,8 @@ julia> using GLM, RDatasets

 julia> form = dataset("datasets", "Formaldehyde")
 6×2 DataFrame
- Row │ Carb     OptDen
-     │ Float64  Float64
+ Row │ Carb     OptDen
+     │ Float64  Float64
 ─────┼──────────────────
    1 │     0.1    0.086
    2 │     0.3    0.269
|
@@ -350,8 +350,8 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,21,25,13,13], | |
Outcome = categorical([1,2,3,1,2,3,1,2,3]), | ||
Treatment = categorical([1,1,1,2,2,2,3,3,3])) | ||
9×3 DataFrame | ||
Row │ Counts Outcome Treatment | ||
│ Float64 Cat… Cat… | ||
Row │ Counts Outcome Treatment | ||
│ Float64 Cat… Cat… | ||
─────┼───────────────────────────── | ||
1 │ 18.0 1 1 | ||
2 │ 17.0 2 1 | ||
|
@@ -390,29 +390,8 @@ In this example, we choose the best model from a set of λs, based on minimum BIC.
 ```jldoctest
 julia> using GLM, RDatasets, StatsBase, DataFrames, Optim

-julia> trees = DataFrame(dataset("datasets", "trees"))
-31×3 DataFrame
- Row │ Girth    Height  Volume
-     │ Float64  Int64   Float64
-─────┼──────────────────────────
-   1 │     8.3      70     10.3
-   2 │     8.6      65     10.3
-   3 │     8.8      63     10.2
-   4 │    10.5      72     16.4
-   5 │    10.7      81     18.8
-   6 │    10.8      83     19.7
-   7 │    11.0      66     15.6
-   8 │    11.0      75     18.2
-  ⋮  │    ⋮       ⋮        ⋮
-  25 │    16.3      77     42.6
-  26 │    17.3      81     55.4
-  27 │    17.5      82     55.7
-  28 │    17.9      80     58.3
-  29 │    18.0      80     51.5
-  30 │    18.0      80     51.0
-  31 │    20.6      87     77.0
-                16 rows omitted
+julia> trees = DataFrame(dataset("datasets", "trees"));

 julia> bic_glm(λ) = bic(glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(λ)));

 julia> optimal_bic = optimize(bic_glm, -1.0, 1.0);
@@ -123,6 +123,110 @@ x: 4 -0.032673 0.0797865 -0.41 0.6831 -0.191048 0.125702
 ───────────────────────────────────────────────────────────────────────────
 ```

## Weighting
Both `lm` and `glm` allow weighted estimation. The three different
[types of weights](https://juliastats.org/StatsBase.jl/stable/weights/) defined in
[StatsBase.jl](https://github.com/JuliaStats/StatsBase.jl) can be used to fit a model:

> Review comment: what about …
- `AnalyticWeights` describe a non-random relative importance (usually between 0 and 1) for
  each observation. These weights may also be referred to as reliability weights, precision
  weights or inverse variance weights. These are typically used when the observations being
  weighted are aggregate values (e.g., averages) with differing variances.
- `FrequencyWeights` describe the number of times (or frequency) each observation was
  observed. These weights may also be referred to as case weights or repeat weights.
- `ProbabilityWeights` represent the inverse of the sampling probability for each
  observation, providing a correction mechanism for under- or over-sampling certain
  population groups. These weights may also be referred to as sampling weights.

> Review comment (on lines +136 to +140): Let's use the same wording as in StatsBase for
> simplicity. If we want to improve it, we'll change it everywhere.
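As a rough illustration of the distinction (a plain-Julia sketch with made-up numbers, not GLM.jl internals): all three weight types agree on point estimates such as a weighted mean, but frequency weights change the implied sample size, which is what later drives differences in standard errors and log-likelihoods.

```julia
# Sketch: the three weight types agree on point estimates (here, a weighted
# mean) but imply different effective sample sizes. All numbers are made up.
x = [2.0, 4.0, 6.0]
w = [1.0, 2.0, 3.0]

wmean = sum(w .* x) / sum(w)   # same value whatever the weight type

n_freq  = sum(w)       # FrequencyWeights: each row counts w[i] times -> 6.0
n_other = length(x)    # Analytic/ProbabilityWeights: still 3 observations

println((wmean, n_freq, n_other))   # → (4.666666666666667, 6.0, 3)
```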
> Review comment: can we add a comment somewhere how these weights are later treated in
> estimation?

To indicate which kind of weights should be used, the vector of weights must be wrapped in
one of the three weights types, and then passed to the `weights` keyword argument.
Short-hand functions `aweights`, `fweights`, and `pweights` can be used to construct
`AnalyticWeights`, `FrequencyWeights`, and `ProbabilityWeights`, respectively.
||||||||||||||||||||||
We illustrate the API with randomly generated data. | ||||||||||||||||||||||
|
||||||||||||||||||||||
```jldoctest weights | ||||||||||||||||||||||
julia> using StableRNGs, DataFrames, GLM | ||||||||||||||||||||||
|
||||||||||||||||||||||
julia> data = DataFrame(y = rand(StableRNG(1), 100), x = randn(StableRNG(2), 100), weights = repeat([1, 2, 3, 4], 25), ); | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The result seems inconsistent with the comment below. Passing |
||||||||||||||||||||||
|
||||||||||||||||||||||
julia> m = lm(@formula(y ~ x), data)
LinearModel

y ~ 1 + x

Coefficients:
──────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
──────────────────────────────────────────────────────────────────────────
(Intercept)   0.517369   0.0280232   18.46    <1e-32   0.461758  0.57298
x            -0.0500249  0.0307201   -1.63    0.1066  -0.110988  0.0109382
──────────────────────────────────────────────────────────────────────────

julia> m_aweights = lm(@formula(y ~ x), data, wts=aweights(data.weights))
LinearModel

y ~ 1 + x

Coefficients:
──────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
──────────────────────────────────────────────────────────────────────────
(Intercept)   0.51673    0.0270707   19.09    <1e-34   0.463009  0.570451
x            -0.0478667  0.0308395   -1.55    0.1239  -0.109067  0.0133333
──────────────────────────────────────────────────────────────────────────

julia> m_fweights = lm(@formula(y ~ x), data, wts=fweights(data.weights))
LinearModel

y ~ 1 + x

Coefficients:
─────────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)   Lower 95%   Upper 95%
─────────────────────────────────────────────────────────────────────────────
(Intercept)   0.51673    0.0170172   30.37    <1e-84   0.483213   0.550246
x            -0.0478667  0.0193863   -2.47    0.0142  -0.0860494  -0.00968394
─────────────────────────────────────────────────────────────────────────────

julia> m_pweights = lm(@formula(y ~ x), data, wts=pweights(data.weights))
LinearModel

y ~ 1 + x

Coefficients:
───────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%   Upper 95%
───────────────────────────────────────────────────────────────────────────
(Intercept)   0.51673    0.0288654   17.90    <1e-31   0.459447  0.574012
x            -0.0478667  0.0266884   -1.79    0.0760  -0.100829  0.00509556
───────────────────────────────────────────────────────────────────────────
```
!!! warning

    In the old API, weights were passed as `AbstractVector`s and were silently treated as
    `FrequencyWeights` in the internal computation of standard errors and related
    quantities. Passing weights as an `AbstractVector` is still allowed for backward
    compatibility, but it is deprecated. When weights are passed following the old API,
    they are now coerced to `FrequencyWeights` and a deprecation warning is issued.

The type of the weights affects the variance of the estimated coefficients and the
quantities involving this variance. The coefficient point estimates are the same
regardless of the type of weights.
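That invariance of the point estimates can be checked with plain weighted least squares (a sketch with made-up data, independent of GLM.jl): the normal-equations solution depends only on relative weights, so rescaling the weight vector, or reinterpreting it as a different weight type, leaves the coefficients unchanged.

```julia
using LinearAlgebra

# Sketch (made-up data): weighted least squares point estimates depend only
# on relative weights, so all three weight types yield the same coefficients.
X = [ones(4) [1.0, 2.0, 3.0, 4.0]]
y = [1.1, 1.9, 3.2, 3.9]
w = [1.0, 2.0, 3.0, 4.0]

wls(wv) = (X' * Diagonal(wv) * X) \ (X' * Diagonal(wv) * y)

β  = wls(w)
β2 = wls(10 .* w)   # rescaled weights: same solution up to rounding

@assert isapprox(β, β2; rtol=1e-10)
```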
```jldoctest weights
julia> loglikelihood(m_aweights)
-16.296307561384253

julia> loglikelihood(m_fweights)
-25.51860961756451

julia> loglikelihood(m_pweights)
-16.296307561384253
```
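The pattern above — the frequency-weighted log-likelihood differing from the other two — is consistent with the usual convention that frequency weights multiply each observation's log-density, inflating the effective sample size to the sum of the weights. A plain-Julia sketch of that convention (assumed formulas with made-up residuals, for illustration only, not GLM.jl's source):

```julia
# Sketch: under frequency weighting each observation contributes w[i] copies
# of its Gaussian log-density, so the total depends on n_eff = sum(w).
# Residuals, variance, and weights below are all made up.
gauss_ll(r, s2) = -0.5 * (log(2π * s2) + r^2 / s2)

r  = [0.1, -0.2, 0.15]
w  = [1.0, 2.0, 3.0]
s2 = 0.02

ll_plain = sum(gauss_ll.(r, s2))        # unweighted: n_eff = 3
ll_freq  = sum(w .* gauss_ll.(r, s2))   # frequency-weighted: n_eff = 6

# With unit weights the two conventions coincide:
@assert sum(ones(3) .* gauss_ll.(r, s2)) == ll_plain
```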
## Comparing models with F-test

Comparisons between two or more linear models can be performed using the `ftest` function,
@@ -176,8 +280,8 @@ Many of the methods provided by this package have names similar to those in [R](
 - `vcov`: variance-covariance matrix of the coefficient estimates

-Note that the canonical link for negative binomial regression is `NegativeBinomialLink`, but
-in practice one typically uses `LogLink`.
+Note that the canonical link for negative binomial regression is `NegativeBinomialLink`,
+but in practice one typically uses `LogLink`.

 ```jldoctest methods
 julia> using GLM, DataFrames, StatsBase
@@ -209,7 +313,9 @@ julia> round.(predict(mdl, test_data); digits=8) | |||||||||||||||||||||
9.33333333 | ||||||||||||||||||||||
``` | ||||||||||||||||||||||
|
||||||||||||||||||||||
The [`cooksdistance`](@ref) method computes [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) for each observation used to fit a linear model, giving an estimate of the influence of each data point. | ||||||||||||||||||||||
The [`cooksdistance`](@ref) method computes | ||||||||||||||||||||||
[Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) for each observation | ||||||||||||||||||||||
used to fit a linear model, giving an estimate of the influence of each data point. | ||||||||||||||||||||||
Note that it's currently only implemented for linear models without weights. | ||||||||||||||||||||||
|
||||||||||||||||||||||
```jldoctest methods
@@ -11,17 +11,18 @@ module GLM
 import LinearAlgebra: cholesky, cholesky!
 import Statistics: cor
 import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual,
-    loglikelihood, nullloglikelihood, nobs, stderror, vcov,
-    residuals, predict, predict!,
-    fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue
+    loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, predict!,
+    fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
+    PValue, weights, leverage
 import StatsFuns: xlogy
 import SpecialFunctions: erfc, erfcinv, digamma, trigamma
 import StatsModels: hasintercept
 import Tables
 export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
-    loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
+    loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, predict!,
     fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
-    cooksdistance, hasintercept, dispersion
+    cooksdistance, hasintercept, dispersion, weights, AnalyticWeights, ProbabilityWeights, FrequencyWeights,
+    UnitWeights, uweights, fweights, pweights, aweights, leverage

> Review comment: Add the description of weights types to …

 export
     # types
> Review comment: Then I would add a weighted `lm` putting lower weight to observation 10
> in dataset III (an outlier), to show how the results change. Of course these are soft
> suggestions, but they would show the use of the things that we implement here.