CDCgov · zsusswein · Aug 29, 2024 · Jun 30, 2024 · Jun 30, 2024 · Jul 2, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,6 +14,7 @@ repos:
     -   id: parsable-R
     -   id: no-browser-statement
     -   id: no-print-statement
+        exclude: '^tests/testthat/test-print\.R$'
     -   id: no-debug-statement
     -   id: deps-in-desc
 -   repo: https://github.com/pre-commit/pre-commit-hooks
@@ -25,6 +26,6 @@ repos:
         files: '^\.Rbuildignore$'
     -   id: end-of-file-fixer
-        exclude: '\.Rd'
+        exclude: '\.Rd|tests/testthat/_snaps/'
 -   repo: https://github.com/pre-commit-ci/pre-commit-ci-config
     rev: v1.6.1
     hooks:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,7 +24,8 @@ URL: https://github.com/cdcgov/cfa-gam-rt,
 BugReports: https://github.com/cdcgov/cfa-gam-rt/issues
 Suggests:
     testthat (>= 3.0.0),
-    pkgdown
+    pkgdown,
+    withr
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(fit_model,RtGam_bam)
+S3method(fit_model,RtGam_gam)
+S3method(fit_model,default)
+S3method(print,RtGam)
 export(RtGam)
+export(check_diagnostics)
 export(penalty_dim_heuristic)
 export(smooth_dim_heuristic)
 importFrom(rlang,abort)
diff --git a/R/RtGam.R b/R/RtGam.R
@@ -18,8 +18,8 @@
 #' @param reference_date The associated date on which the count of incident
 #'   `cases` occurred. Missing dates are not allowed and dates can only occur
 #'   once.
-#' @param group The grouping variable for the case/reference-date pair. Not
-#'   yet implemented and a value other than `NULL` will throw an error.
+#' @param group The grouping variable for the case/reference-date pair. Not yet
+#'   implemented and a value other than `NULL` will throw an error.
 #' @param k An integer, the _total_ dimension of all the smoothing basis
 #'   functions. Defaults to `smooth_dim_heuristic(length(cases))`, which picks a
 #'   reasonable estimate based on the number of provided data points. This total
@@ -32,37 +32,85 @@
 #'   time. An increase in this value above the default should be done carefully.
 #'   See [penalty_dim_heuristic()] for more information on `m` and when to
 #'   consider changing the default.
+#' @param backend One of `gam` or `bam`; defaults to `gam`. In general, models
+#'   should be fit with [mgcv::gam()]. If [mgcv::gam()] is too slow,
+#'   [mgcv::bam()] converges more quickly but introduces some additional
+#'   numerical error. Note that the `bam` backend uses the `discrete = TRUE`
+#'   option for an additional speedup. See [mgcv::bam()] for more information.
+#' @param warn_for_diagnostic_failure Should warnings be issued for
+#'   automatically identified diagnostic issues? Defaults to TRUE. A list of
+#'   quantitative model diagnostics can be inspected in the `diagnostics` slot
+#'   of the returned `RtGam` object.
+#' @param ... Additional arguments passed to the specified modelling backend.
+#'   For example, the default negative binomial error structure could be changed
+#'   to poisson in the default [mgcv::gam] backend by passing `family =
+#'   "poisson"`.
 #' @seealso [smooth_dim_heuristic()] more information on the smoothing basis
-#'   dimension and [mgcv::choose.k] for more general guidance on GAMs from
-#'   `mgcv`
+#'   dimension, [mgcv::choose.k] for more general guidance on GAMs from `mgcv`,
+#'   and [mgcv::gam]/[mgcv::bam] for documentation on arguments to the model
+#'   fitting functions.
-#' @return Stub function: NULL
+#' @return A fitted `RtGam` object, including a `diagnostics` element.
 #' @export
 #' @examples
-#' cases <- c(1, 2, 3)
-#' reference_date <- as.Date(c("2023-01-01", "2023-01-02", "2023-01-03"))
-#' mod <- RtGam::RtGam(cases, reference_date)
+#' withr::with_seed(12345, {
+#'   cases <- rpois(20, 10)
+#' })
+#' reference_date <- seq.Date(
+#'   from = as.Date("2023-01-01"),
+#'   length.out = 20,
+#'   by = "day"
+#' )
+#' fit <- RtGam(cases, reference_date)
+#' fit
 RtGam <- function(cases,
                   reference_date,
                   group = NULL,
                   k = smooth_dim_heuristic(length(cases)),
-                  m = penalty_dim_heuristic(length(unique(reference_date)))) {
+                  m = penalty_dim_heuristic(length(unique(reference_date))),
+                  backend = "gam",
+                  warn_for_diagnostic_failure = TRUE,
+                  ...) {
   check_required_inputs_provided(
     cases,
     reference_date,
     group,
     k,
-    m
+    m,
+    backend
   )
   validate(cases, reference_date, group, k, m)
 
-  df <- prepare_inputs(cases, reference_date, group)
+  df <- dataset_creator(cases, reference_date, group, backend)
   formula <- formula_creator(
     k = k,
     m = m,
     is_grouped = !rlang::is_null(group)
   )
 
-  invisible(NULL)
+  fit <- do.call(
+    fit_model,
+    list(
+      data = df,
+      formula = formula,
+      ...
+    )
+  )
+  diagnostics <- calculate_diagnostics(fit)
+
+  RtGam_object <- new_RtGam(
+    fit = fit,
+    df = df,
+    group = group,
+    k = k,
+    m = m,
+    backend = backend,
+    formula = formula,
+    diagnostics = diagnostics
+  )
+
+  check_diagnostics(RtGam_object, warn_for_diagnostic_failure)
+
+  return(RtGam_object)
 }
 
 #' Propose total smoothing basis dimension from number of data points
@@ -201,6 +249,8 @@ smooth_dim_heuristic <- function(n) {
 #' ## Very slow
 #'
 #' Decreasing the penalty basis dimension makes the model less demanding to fit.
+#' `mgcv` describes an adaptive penalty with 10 basis dimensions and 200 data
+#' points as roughly equivalent to fitting 10 GAMs each from 20 data points.
 #' Using a single penalty throughout the model is much simpler than using an
 #' adaptive smooth and should be preferred where possible. See
 #' `[mgcv::smooth.construct.ad.smooth.spec]` for more information on how the

diff --git a/R/checkers.R b/R/checkers.R
@@ -103,6 +103,7 @@ check_required_inputs_provided <- function(cases,
                                            group,
                                            k,
                                            m,
+                                           backend,
                                            call = rlang::caller_env()) {
   rlang::check_required(cases, "cases", call = call)
   rlang::check_required(reference_date, "reference_date", call = call)
@@ -128,6 +129,23 @@ check_no_missingness <- function(x, arg = "x", call = rlang::caller_env()) {
   }
 }
 
+#' Check that no element of a vector exceeds a maximum
+#'
+#' Missing values are skipped by this check, so `NA` elements never trigger
+#' the error.
+#'
+#' @param x Vector to check.
+#' @param arg Name of the argument being checked, used in the error message.
+#' @param max Largest permitted value (inclusive).
+#' @param call Calling environment to report in the error.
+#' @return Invisibly `NULL`. Aborts with an error of class
+#'   `RtGam_invalid_input` if any non-missing element is greater than `max`.
+#' @noRd
+check_elements_below_max <- function(x, arg, max, call = rlang::caller_env()) {
+  # Less than or equal to `max`, or is NA. Keep this elementwise: collapsing it
+  # with `all()` here would make `which()` in the message always report 1.
+  is_below_max <- (x <= max) | is.na(x)
+  if (!all(is_below_max)) {
+    cli::cli_abort(
+      c("{.arg {arg}} has elements larger than {.val {max}}",
+        "!" = "All elements must be {.val {max}} or less",
+        "i" = "Elements {.val {which(!is_below_max)}} are larger"
+      ),
+      class = "RtGam_invalid_input",
+      call = call
+    )
+  }
+  invisible()
+}
+
+
 check_elements_above_min <- function(x, arg, min, call = rlang::caller_env()) {
   # Greater than or equal to 0 or is NA
   is_above_min <- (x >= min) | is.na(x)

diff --git a/R/prepare_inputs.R → R/dataset_creator.R b/R/prepare_inputs.R → R/dataset_creator.R
@@ -2,7 +2,9 @@
 #'
 #' @inheritParams RtGam
 #' @return A dataframe for mgcv
-prepare_inputs <- function(cases, reference_date, group) {
+dataset_creator <- function(cases, reference_date, group, backend) {
+  cases_int <- integerify_cases(cases)
+
   timestep <- dates_to_timesteps(
     reference_date,
     min_supplied_date = min(reference_date),
@@ -13,12 +15,33 @@ prepare_inputs <- function(cases, reference_date, group) {
     group <- rep(NA, length(cases))
   }
 
-  data.frame(
-    cases = cases,
+  dat <- data.frame(
+    cases = cases_int,
     timestep = timestep,
     reference_date = reference_date,
     group = group
   )
+
+  class(dat) <- c(glue::glue("RtGam_{backend}"), class(dat))
+  dat
+}
+
+#' Convert cases to an integer vector if needed
+#'
+#' Integer input is returned untouched. Anything else triggers a warning and
+#' is then passed through `as.integer()`.
+#'
+#' @param cases The user-supplied cases vector
+#' @return `cases` as an integer vector
+#' @noRd
+integerify_cases <- function(cases) {
+  if (rlang::is_integer(cases)) {
+    return(cases)
+  }
+
+  cli::cli_warn(c(
+    "Coercing {.arg cases} to an integer vector",
+    "i" = "{.arg cases} is a {.obj_type_friendly {cases}}",
+    "x" = "RtGam uses a count model, requiring integer-valued cases"
+  ))
+  as.integer(cases)
+}
 
 #' Convert an arbitrary vector of dates to a vector of timesteps

diff --git a/R/diagnostics.R b/R/diagnostics.R
@@ -0,0 +1,108 @@
+#' Check quantitative diagnostics from a fitted RtGam model
+#'
+#' Retrieves the diagnostics stored in the `diagnostics` element of a fitted
+#' model, covering convergence, effective degrees of freedom, and residual
+#' autocorrelation. If `warn_for_diagnostic_failure` is set to TRUE, potential
+#' diagnostic issues are also reported as they are detected. The diagnostics
+#' are invisibly returned as a list.
+#'
+#' @param fit A fitted `RtGam` model object. This should be the result of
+#'   calling `RtGam::RtGam()` with appropriate data.
+#' @param warn_for_diagnostic_failure A logical value indicating whether to
+#'   report potential issues flagged by the diagnostic checks. Defaults to
+#'   TRUE, meaning that issues are reported by default.
+#'
+#' @return Invisibly returns a list containing diagnostic results:
+#'   - `model_converged`: Logical indicating if the model has converged.
+#'   - `k_prime`: The maximum available number of degrees of freedom that could
+#'   be used in the GAM fit.
+#'   - `k_edf`: Estimated degrees of freedom actually used by the smooth terms
+#'   in the model.
+#'   - `k_index`: The ratio of the residual variance of differenced
+#'   near-neighbor residuals to the overall residual variance. This should be
+#'   near 1 or above.
+#'   - `k_p_value`: P-value for testing if k' is adequate for modeling the data.
+#'   - `k_to_edf_ratio`: Ratio of the effective degrees of freedom of the
+#'   smooth terms to k' (that is, `k_edf / k_prime`). The edf should be well
+#'   below k'; values above 0.9 are flagged.
+#'   - `residual_autocorrelation`: Autocorrelation coefficients for residuals
+#'   from lag 1 up to lag 7 or one-seventh of the series length (rounded),
+#'   whichever is smaller.
+#'
+#' @export
+#' @seealso [mgcv::k.check] for a description of the diagnostic tests,
+#'   [mgcv::choose.k] for a discussion of choosing the basis dimension, and
+#'   Wood, Simon N. Generalized additive models: an introduction with R.
+#'   Chapman and Hall/CRC, 2017. for a derivation of the metrics.
+#' @examples
+#' withr::with_seed(12345, {
+#'   cases <- rpois(20, 10)
+#' })
+#' reference_date <- seq.Date(
+#'   from = as.Date("2023-01-01"),
+#'   length.out = 20,
+#'   by = "day"
+#' )
+#' fit <- RtGam::RtGam(cases, reference_date)
+#' check_diagnostics(fit)
+check_diagnostics <- function(fit, warn_for_diagnostic_failure = TRUE) {
+  stored_diagnostics <- fit[["diagnostics"]]
+
+  # Reporting is optional; the stored values are handed back either way
+  if (warn_for_diagnostic_failure) {
+    issue_diagnostic_warnings(stored_diagnostics)
+  }
+
+  invisible(stored_diagnostics)
+}
+
+#' Compute quantitative diagnostics for a fitted `mgcv` model
+#'
+#' @param fit A fitted model as returned by [mgcv::gam()] or [mgcv::bam()].
+#' @return A named list with elements `model_converged`, `k_prime`, `k_edf`,
+#'   `k_index`, `k_p_value`, `k_to_edf_ratio` (edf divided by k'), and
+#'   `residual_autocorrelation` (lag 1 and above). The `k_*` elements hold one
+#'   value per smooth term.
+#' @noRd
+calculate_diagnostics <- function(fit) {
+  converged <- fit$converged
+  # One row per smooth; columns are k', edf, k-index and p-value. Index by
+  # column so that values from different smooths are never mixed together,
+  # which single-number indexing of the matrix does with more than one row.
+  k_check <- mgcv::k.check(fit)
+  k_prime <- unname(k_check[, 1L])
+  k_edf <- unname(k_check[, 2L])
+
+  max_lag <- min(7, round(nrow(fit$model) / 7))
+  rho <- stats::acf(fit$residuals, plot = FALSE, lag.max = max_lag)[[1]][, , 1]
+
+  list(
+    model_converged = converged,
+    k_prime = k_prime,
+    k_edf = k_edf,
+    k_index = unname(k_check[, 3L]),
+    k_p_value = unname(k_check[, 4L]),
+    k_to_edf_ratio = k_edf / k_prime,
+    # Drop lag 0. `rho[-1]` is empty when only lag 0 was computed, whereas
+    # `rho[2:length(rho)]` would count backwards and return `c(NA, rho[1])`.
+    residual_autocorrelation = rho[-1]
+  )
+}
+
+#' Report potential problems found in a list of model diagnostics
+#'
+#' @param diagnostics A list as produced by `calculate_diagnostics()`.
+#' @return Invisibly `NULL`; called for the `cli` output it emits.
+#' @noRd
+issue_diagnostic_warnings <- function(diagnostics) {
+  if (!diagnostics[["model_converged"]]) {
+    cli::cli_alert_danger(
+      c("Model failed to converge. Inference is not reliable.")
+    )
+  }
+  # The k-check diagnostics can hold one value per smooth, so each comparison
+  # is collapsed with `any()` before reaching `if ()`. `na.rm = TRUE` treats an
+  # unavailable value as "no issue detected" instead of erroring.
+  if (any(diagnostics[["k_to_edf_ratio"]] > 0.9, na.rm = TRUE)) {
+    cli::cli_bullets(c(
+      "x" = "Effective degrees of freedom is near the supplied upper bound",
+      "!" = "Consider increasing {.arg k}",
+      "*" = "Actual: {.val {round(diagnostics[['k_edf']], 3)}}",
+      "*" = "Upper bound: {.val {diagnostics[['k_prime']]}}"
+    ))
+  }
+  if (any(diagnostics[["k_p_value"]] < 0.05, na.rm = TRUE)) {
+    cli::cli_bullets(
+      c(
+        "!" = "k-index for one or more smooths is below 1",
+        "*" = "k-index: {.val {round(diagnostics[['k_index']], 3)}}",
+        "*" = "Associated p-value: {.val {round(diagnostics[['k_p_value']],
+                                                   2)}}",
+        "!" = "Suggests potential unmodeled residual trend.
+                 Inspect model and/or consider increasing {.arg k}"
+      )
+    )
+  }
+  # An empty autocorrelation vector (too few rows for any lag) yields FALSE
+  if (any(abs(diagnostics[["residual_autocorrelation"]]) > 0.5, na.rm = TRUE)) {
+    cli::cli_bullets(c(
+      "x" = "Residual autocorrelation present",
+      "*" = "Rho: {.val {round(diagnostics[['residual_autocorrelation']],
+                                 2)}}",
+      "*" = "Inspect manually with {.code acf(residuals(fit$model))}",
+      "!" = "Consider increasing {.arg k} and/or
+               specifying {.arg rho} with {.arg backend} bam"
+    ))
+  }
+  invisible(NULL)
+}
diff --git a/R/fit_model.R b/R/fit_model.R
@@ -0,0 +1,58 @@
+#' Fit the model with the requested backend
+#'
+#' S3 generic that dispatches on the class of `data`.
+#'
+#' @param data Data to fit the model to; its class selects the method.
+#' @param formula Model formula handed on to the method.
+#' @param ... Further arguments passed on to methods.
+#' @noRd
+fit_model <- function(data, formula, ...) {
+  UseMethod("fit_model", data)
+}
+
+#' Fit the model with [mgcv::gam()]
+#'
+#' `family` and `method` are named arguments, so a value supplied by the
+#' caller replaces the default shown in the signature.
+#'
+#' @param data Data passed to [mgcv::gam()] as `data`.
+#' @param formula Model formula passed to [mgcv::gam()].
+#' @param family,method Passed to [mgcv::gam()] under the same names.
+#' @param ... Additional arguments forwarded to [mgcv::gam()].
+#' @return The value returned by [mgcv::gam()].
+#' @export
+#' @noRd
+fit_model.RtGam_gam <- function(data, formula, family = "nb",
+                                method = "REML", ...) {
+  mgcv::gam(
+    formula = formula, family = family,
+    data = data, method = method,
+    ...
+  )
+}
+
+#' Fit the model with [mgcv::bam()]
+#'
+#' `family`, `method` and `discrete` are named arguments, so a value supplied
+#' by the caller replaces the default shown in the signature.
+#'
+#' @param data Data passed to [mgcv::bam()] as `data`.
+#' @param formula Model formula passed to [mgcv::bam()].
+#' @param family,method,discrete Passed to [mgcv::bam()] under the same names.
+#' @param ... Additional arguments forwarded to [mgcv::bam()].
+#' @return The value returned by [mgcv::bam()].
+#' @export
+#' @noRd
+fit_model.RtGam_bam <- function(data, formula, family = "nb",
+                                method = "fREML", discrete = TRUE, ...) {
+  mgcv::bam(
+    formula = formula, family = family,
+    data = data, method = method,
+    discrete = discrete,
+    ...
+  )
+}
+
+#' Used to throw informative error if non-supported backend supplied
+#'
+#' @param data Data whose classes did not match any other `fit_model()` method.
+#' @param formula,... Unused; accepted to match the generic.
+#' @return Does not return; always aborts with an error of class
+#'   `RtGam_invalid_input`.
+#' @export
+#' @noRd
+fit_model.default <- function(
+    data,
+    formula,
+    ...) {
+  # `dataset_creator()` prepends the class "RtGam_<backend>"; strip the prefix
+  # so the message shows the value the user passed as `backend`.
+  requested_backend <- sub("^RtGam_", "", class(data)[1])
+  # Qualify `methods()`: package code cannot rely on `utils` being attached.
+  all_methods <- as.character(utils::methods("fit_model"))
+  # Drop fit_model.default and reduce the method names to backend names
+  supported_backends <- sub(
+    "^fit_model\\.RtGam_",
+    "",
+    setdiff(all_methods, "fit_model.default")
+  )
+
+  cli::cli_abort(
+    c("Requested {.field backend} {.val {requested_backend}} not supported",
+      "!" = "Supported backends: {.val {supported_backends}}"
+    ),
+    class = "RtGam_invalid_input"
+  )
+}