Merge pull request #109 from dramanica:anova_lmer
Implement lrt for lmer object
ejolly authored Aug 5, 2022
2 parents b9106b5 + 578e853 commit a92ae31
Showing 5 changed files with 384 additions and 61 deletions.
117 changes: 114 additions & 3 deletions pymer4/models/Lmer.py
@@ -7,7 +7,7 @@

import os

#os.environ["R_HOME"] = "/Users/Esh/anaconda3/envs/pymer4_dev/lib/R"
# os.environ["R_HOME"] = "/Users/Esh/anaconda3/envs/pymer4_dev/lib/R"
from copy import copy
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects
@@ -58,7 +58,8 @@ class Lmer(object):
data (pd.DataFrame): model copy of input data
grps (dict): groups and number of observations per groups recognized by lmer
design_matrix (pd.DataFrame): model design matrix determined by lmer
AIC (float): model akaike information criterion
AIC (float): model Akaike information criterion
BIC (float): model Bayesian information criterion
logLike (float): model Log-likelihood
family (string): model family
warnings (list): warnings output from R or Python
@@ -93,6 +94,7 @@ def __init__(self, formula, data, family="gaussian"):
self.data = copy(data)
self.grps = None
self.AIC = None
self.BIC = None
self.logLike = None
self.warnings = []
self.ranef_var = None
@@ -478,7 +480,11 @@ def fit(
"The rpy2, lme4, or lmerTest API appears to have changed again. Please file a bug report at https://github.com/ejolly/pymer4/issues with your R, Python, rpy2, lme4, and lmerTest versions and the OS you're running pymer4 on. Apologies."
)

self.AIC = unsum.rx2("AICtab")[0]
# self.AIC = unsum.rx2("AICtab")[0]
# the above is incorrect: the AICtab of summary.lmerMod gives the deviance (which in this
# context is really -2 * logLik), NOT the AIC
self.AIC = stats.AIC(self.model_obj)
self.BIC = stats.BIC(self.model_obj)
self.logLike = unsum.rx2("logLik")[0]

# First check for lme4 printed messages (e.g. convergence info is usually here instead of in warnings)
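For reference, the distinction drawn in the AIC comment in this hunk can be spelled out numerically. The sketch below is illustrative only (not part of the diff); model is assumed to be an already-fitted Lmer instance and n_params its number of estimated parameters:

# lme4's AICtab in summary() reports the deviance, i.e. -2 * logLik
deviance = -2 * model.logLike
# the AIC adds a complexity penalty on top of that: AIC = -2 * logLik + 2 * n_params,
# which is (up to how parameters are counted) what R's stats::AIC, and now self.AIC, returns
aic_by_hand = deviance + 2 * n_params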
@@ -1023,6 +1029,10 @@ def summary(self):

if not self.fitted:
raise RuntimeError("Model must be fitted to generate summary!")
if self._REML:
print("Linear mixed model fit by REML [’lmerMod’]")
else:
print("Linear mixed model fit by maximum likelihood ['lmerMod']")

print("Formula: {}\n".format(self.formula))
print("Family: {}\t Inference: {}\n".format(self.family, self.sig_type))
@@ -1553,3 +1563,104 @@ def plot(
if ylabel:
ax.set_ylabel(ylabel)
return ax

def confint(
self,
parm=None,
level=0.95,
method="Wald",
zeta=None,
nsim=500,
boot_type="perc",
quiet=False,
oldnames=False,
):
"""
Compute confidence intervals on the parameters of a Lmer object (this is a wrapper for confint.merMod in lme4).
Args:
self (Lmer): the Lmer object for which confidence intervals should be computed
parm (list of str): parameter names for which intervals are sought. Specified by an integer vector of positions
(leave blank to compute ci for all parameters)
level (float): confidence level <1, typically above 0.90
method (str): which method to compute confidence intervals; 'profile', 'Wald' (default), or 'boot'
(parametric bootstrap)
zeta (float): (for method = "profile" only:) likelihood cutoff (if not specified, as by default, computed
from level in R).
nsim (int): number of bootstrap samples if bootstrapped confidence intervals are requested; default 500
boot_type (str): bootstrap confidence interval type (one of "perc","basic","norm", as defined in boot.ci in R)
quiet (bool): (logical) suppress messages about computationally intensive profiling?
oldnames: (logical) use old-style names for variance-covariance parameters, e.g. ".sig01", rather than newer
(more informative) names such as "sd_(Intercept)|Subject"?
Returns:
pd.DataFrame: confidence intervals for the parameters of interest
Examples:
The following examples demonstrate how to get different types of confidence intervals.
The default Wald estimates for all parameters
>>> model.confint()
Bootstrap estimate for the variance component of the random intercept from factor "Group" and the error
variance. You will need more than 100 replicates for a real application
>>> model.confint(method="boot", nsim=100, parm=["sd_(Intercept)|Group","sigma"])
Same as above but using oldnames for those variances, which is still the default in lme4
>>> model.confint(method="boot", nsim=100, oldnames=True, parm=[".sig01",".sigma"])
"""

# record messages going to screen
r_console = []

def _f(x):
r_console.append(x)

callbacks.consolewrite_warnerror = _f
# create string for parm
parm_string = ""
if parm is not None:
parm_string = """,parm=c('"""
for i, this_parm in enumerate(parm):
if i != 0:
parm_string = parm_string + """,'"""
parm_string = parm_string + this_parm + """'"""
parm_string = parm_string + """)"""
# create string of command
rstring = (
"""
function(model){
out_ci <- as.data.frame(confint(model"""
+ parm_string
+ """,level="""
+ str(level)
+ """,method='"""
+ method
+ """'"""
+ ((""",zeta=""" + str(zeta)) if zeta is not None else """""")
+ """,nsim="""
+ str(nsim)
+ """,boot.type='"""
+ boot_type
+ """'"""
+ """,quiet="""
+ ("""TRUE""" if quiet else """FALSE""")
+ """,oldNames="""
+ ("""TRUE""" if oldnames else """FALSE""")
+ """))
list(out_ci,rownames(out_ci))
}"""
)
confint_func = robjects.r(rstring)
out_summary, out_rownames = confint_func(self.model_obj)
df = pd.DataFrame(out_summary)
df.index = out_rownames

# restore outputs
callbacks.consolewrite_warnerror = consolewrite_warning_backup
for message in r_console:
print(message)
return df
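
A hedged usage sketch of the new confint method follows (illustrative only, not part of the diff; the data frame df and the formula are assumptions modeled on the docstring examples above):

from pymer4.models import Lmer

# fit a hypothetical mixed model; substitute your own data
model = Lmer("DV ~ IV3 + (1 | Group)", data=df)
model.fit(summarize=False)

# default Wald intervals for all parameters
wald_cis = model.confint()

# parametric-bootstrap intervals for the random-intercept SD and the residual SD
# (use far more than 100 simulations in a real analysis)
boot_cis = model.confint(
    method="boot", nsim=100, parm=["sd_(Intercept)|Group", "sigma"]
)

For the bootstrap call above, the rstring assembled by confint resolves on the R side to roughly confint(model, parm=c('sd_(Intercept)|Group','sigma'), level=0.95, method='boot', nsim=100, boot.type='perc', quiet=FALSE, oldNames=FALSE).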
145 changes: 89 additions & 56 deletions pymer4/stats.py
@@ -15,14 +15,19 @@

import numpy as np
import pandas as pd
import copy
from scipy.special import expit
from scipy.stats import pearsonr, spearmanr, ttest_ind, ttest_rel, ttest_1samp
from functools import partial
from itertools import product

from pymer4.utils import (
_check_random_state,
_welch_ingredients,
_mean_diff,
_get_params,
_lrt,
_sig_stars,
)
from joblib import Parallel, delayed

@@ -544,62 +549,90 @@ def welch_dof(x, y):
raise TypeError("Both x and y must be 1d numpy arrays")


# def lrt(models):
# """
# WARNING EXPERIMENTAL!
# Compute a likelihood ratio test between models. This produces similar but not identical results to R's anova() function when comparing models. Will automatically determine the the model order based on comparing all models to the one that has the fewest parameters.

# Todo:
# 0) Figure out discrepancy with R result
# 1) Generalize function to perform LRT, or vuong test
# 2) Offer nested and non-nested vuong test, as well as AIC/BIC correction
# 3) Given a single model expand out to all separate term tests
# """

# raise NotImplementedError("This function is not yet implemented")

# if not isinstance(models, list):
# models = [models]
# if len(models) < 2:
# raise ValueError("Must have at least 2 models to perform comparison")

# # Get number of coefs for each model
# all_params = []
# for m in models:
# all_params.append(_get_params(m))

# # Sort from fewest params to most
# all_params = np.array(all_params)
# idx = np.argsort(all_params)
# all_params = all_params[idx]
# models = np.array(models)[idx]

# model_pairs = list(product(models, repeat=2))

# model_pairs = model_pairs[1 : len(models)]
# s = []
# for p in model_pairs:
# s.append(_lrt(p))
# out = pd.DataFrame()
# for i, m in enumerate(models):
# pval = s[i - 1] if i > 0 else np.nan
# out = out.append(
# pd.DataFrame(
# {
# "model": m.formula,
# "DF": m.coefs.loc["Intercept", "DF"],
# "AIC": m.AIC,
# "BIC": m.BIC,
# "log-likelihood": m.logLike,
# "P-val": pval,
# },
# index=[0],
# ),
# ignore_index=True,
# )
# out["Sig"] = out["P-val"].apply(lambda x: _sig_stars(x))
# out = out[["model", "log-likelihood", "AIC", "BIC", "DF", "P-val", "Sig"]]
# return out
def lrt(models, refit=True):
"""
Compute a likelihood ratio test between two Lmer models. This produces identical results to R's anova() function when comparing models. The model order is determined automatically by sorting models from the one with the fewest parameters to the one with the most.
# Possible additions:
# 1) Generalize function to perform LRT, or vuong test
# 2) Offer nested and non-nested vuong test, as well as AIC/BIC correction
# 3) Given a single model expand out to all separate term tests
Args:
models (list): a list of two Lmer models to be compared
refit (bool): should REML models be refitted as ML before comparison (defaults to True)
Returns:
df (pandas.DataFrame): dataframe of the anova results
"""
models_list = copy.deepcopy(models)
if not isinstance(models_list, list):
models_list = [models_list]
if len(models_list) < 2:
raise ValueError("Must have 2 models to perform comparison")

# refit models if needed
refitted = False
if refit:
for i, m in enumerate(models_list):
if m._REML:
refitted = True
m.fit(REML=False, summarize=False)
models_list[i] = m

# Get number of coefs for each model
all_params = []
for m in models_list:
all_params.append(_get_params(m))
all_params = np.array(all_params)
idx = np.argsort(all_params)
all_params = all_params[idx]
models_list = np.array(models_list)[idx]
out = pd.DataFrame()
for i, m in enumerate(models_list):
df = _get_params(m) - (_get_params(models_list[i - 1])) if i > 0 else np.nan
chisq = (
((-2 * models_list[i - 1].logLike) - (-2 * m.logLike)) if i > 0 else np.nan
)
pval = _lrt([models_list[index] for index in [i - 1, i]]) if i > 0 else np.nan
out = pd.concat(
[
out,
pd.DataFrame(
{
"model": m.formula,
"npar": _get_params(m),
"AIC": m.AIC,
"BIC": m.BIC,
"deviance": -2 * m.logLike,
"log-likelihood": m.logLike,
"Chisq": chisq,
"Df": df,
"P-val": pval,
},
index=[0],
),
],
ignore_index=True,
)
out["Sig"] = out["P-val"].apply(lambda x: _sig_stars(x))
out = out[
[
"model",
"npar",
"AIC",
"BIC",
"log-likelihood",
"deviance",
"Chisq",
"Df",
"P-val",
"Sig",
]
]

if refitted:
print("refitting model(s) with ML (instead of REML)")
return out.fillna("")


def rsquared(y, res, has_constant=True):
Expand Down
14 changes: 14 additions & 0 deletions pymer4/tests/test_models.py
@@ -204,6 +204,20 @@ def test_gaussian_lmm():
assert isinstance(out, pd.DataFrame)
assert out.shape == (model.data.shape[0], 2)

# Test confint
# Wald confidence interval
wald_confint = model.confint()
assert isinstance(wald_confint, pd.DataFrame)
assert wald_confint.shape == (8, 2)
# there should be no estimates for the random effects
assert wald_confint["2.5 %"].isna().sum() == 5
# bootstrapped confidence intervals
boot_confint = model.confint(method="boot", nsim=10)
assert isinstance(boot_confint, pd.DataFrame)
assert boot_confint.shape == (8, 2)
# CIs for random effects should be estimated by bootstrapping
assert boot_confint["2.5 %"].isna().sum() == 0

# Smoketest for old_optimizer
model.fit(summarize=False, old_optimizer=True)
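
The new tests above exercise confint only; a companion smoke test for lrt might look like the sketch below (hypothetical, not part of this commit; names like df and model are assumptions about the surrounding test fixture):

# hypothetical smoke test for the new lrt function
from pymer4.stats import lrt

model_null = Lmer("DV ~ 1 + (1 | Group)", data=df)
model_null.fit(summarize=False)
lrt_table = lrt([model_null, model])
assert isinstance(lrt_table, pd.DataFrame)
# one row per model, with the comparison columns built in stats.lrt
assert lrt_table.shape[0] == 2
assert "Chisq" in lrt_table.columns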
