Move data creation for estimate_truncation() to data_raw (#584)

* Add script to create data for estimate truncation example * Remove data creation from estimate_truncation example * Generate and store dataset * Function to add truncation to dataset * Add NEWS item * Use usethis to save data * Replace hardcoding with seq() * Save data * Move function from utilities.R to data-raw * Improve comment in example * Document the truncation example dataset * Add reviewers * Use saveRDS to save data * Shorten name to example_truncated * Add punctuation and remove extra comment tag * Don't load here package * Use usethis to save data * Resave generated data and delete old one * Replace data generation in test with stored example data * CI fixes: Add usethis and here packages to setup-r-dependencies action * Fix typo in package name * Improve example Co-authored-by: Sebastian Funk <sebastian.funk@lshtm.ac.uk> * Improve NEWS item * Generate docs * Use example_truncated dataset directly in tests --------- Co-authored-by: Sebastian Funk <sebastian.funk@lshtm.ac.uk>
epiforecasts · Feb 29, 2024 · 0c5c576 · 0c5c576
1 parent c7213ce
commit 0c5c576
Show file tree

Hide file tree

Showing 11 changed files with 114 additions and 123 deletions.
diff --git a/.github/workflows/lint-only-changed-files.yaml b/.github/workflows/lint-only-changed-files.yaml
@@ -23,6 +23,8 @@ jobs:
         with:
           dependencies: NA
           extra-packages: |
+            here
+            usethis
             stan-dev/cmdstanr
             any::gh
             any::lintr

diff --git a/NEWS.md b/NEWS.md
@@ -26,6 +26,7 @@
 * Added input checking to `estimate_infections()`, `estimate_secondary()`, `estimate_truncation()`, `simulate_infections()`, and `epinow()`. `check_reports_valid()` has been added to validate the reports dataset passed to these functions. Tests are added to check `check_reports_valid()`. As part of input validation, the various `*_opts()` functions now return subclasses of the same name as the functions and are tested against passed arguments to ensure the right `*_opts()` is passed to the right argument. For example, the `obs` argument in `estimate_secondary()` is expected to only receive arguments passed through `obs_opts()` and will error otherwise. By @jamesmbaazam in #476 and reviewed by @sbfnk and @seabbs.
 * Added the possibility of specifying a fixed observation scaling. By @sbfnk in #550 and reviewed by @seabbs.
 * Added the possibility of specifying fixed overdispersion. By @sbfnk in #560 and reviewed by @seabbs.
+* The example in `estimate_truncation()` has been simplified. The package now ships with a dataset `example_truncated`, which is used in the `estimate_truncation()` example and tests. The steps for creating the `example_truncated` is stored in `./data-raw/estimate-truncation.R`. By @jamesmbaazam in #584 and reviewed by @seabbs and @sbfnk.
 
 ## Model changes
 

diff --git a/R/data.R b/R/data.R
@@ -45,3 +45,19 @@
 #' An example data frame of observed cases
 #' @format A data frame containing cases reported on each date.
 "example_confirmed"
+
+#' Example Case Data Set with Truncation
+#'
+#' @description `r lifecycle::badge("stable")`
+#' An example dataset of observed cases with truncation applied.
+#' This data is generated internally for use in the example of
+#' `estimate_truncation()`. For details on how the data is generated, see
+#' <https://github.com/epiforecasts/EpiNow2/blob/main/data-raw/estimate-truncation.R> #nolint
+#' @format A list of `data.table`s containing cases reported on each date until
+#' a point of truncation.
+#' Each element of the list is a `data.table` with the following columns:
+#' \describe{
+#'   \item{date}{Date of case report.}
+#'   \item{confirm}{Number of confirmed cases.}
+#' }
+"example_truncated"
diff --git a/R/estimate_truncation.R b/R/estimate_truncation.R
@@ -83,49 +83,9 @@
 #' old_opts <- options()
 #' options(mc.cores = ifelse(interactive(), 4, 1))
 #'
-#' # get example case counts
-#' reported_cases <- example_confirmed[1:60]
-#'
-#' # define example truncation distribution (note not integer adjusted)
-#' trunc <- dist_spec(
-#'   mean = convert_to_logmean(3, 2),
-#'   mean_sd = 0.1,
-#'   sd = convert_to_logsd(3, 2),
-#'   sd_sd = 0.1,
-#'   max = 10
-#' )
-#'
-#' # apply truncation to example data
-#' construct_truncation <- function(index, cases, dist) {
-#'   set.seed(index)
-#'   if (dist$dist == 0) {
-#'     dfunc <- dlnorm
-#'   } else {
-#'     dfunc <- dgamma
-#'   }
-#'   cmf <- cumsum(
-#'     dfunc(
-#'       1:(dist$max + 1),
-#'       rnorm(1, dist$mean_mean, dist$mean_sd),
-#'       rnorm(1, dist$sd_mean, dist$sd_sd)
-#'     )
-#'   )
-#'   cmf <- cmf / cmf[dist$max + 1]
-#'   cmf <- rev(cmf)[-1]
-#'   trunc_cases <- data.table::copy(cases)[1:(.N - index)]
-#'   trunc_cases[
-#'     (.N - length(cmf) + 1):.N, confirm := as.integer(confirm * cmf)
-#'   ]
-#'   return(trunc_cases)
-#' }
-#' example_data <- purrr::map(c(20, 15, 10, 0),
-#'   construct_truncation,
-#'   cases = reported_cases,
-#'   dist = trunc
-#' )
-#'
 #' # fit model to example data
-#' est <- estimate_truncation(example_data,
+#' # See [example_truncated] for more details
+#' est <- estimate_truncation(example_truncated,
 #'   verbose = interactive(),
 #'   chains = 2, iter = 2000
 #' )

diff --git a/data-raw/estimate-truncation.R b/data-raw/estimate-truncation.R
@@ -0,0 +1,60 @@
+library("EpiNow2")
+
+#' Apply truncation to a data set
+#'
+#' @param index Index from which to truncate
+#' @param data Data set
+#' @param dist Truncation distribution
+#' @importFrom stats dgamma dlnorm
+#'
+#' @return A truncated data set
+#' @keywords internal
+apply_truncation <- function(index, data, dist) {
+  set.seed(index)
+  if (dist$dist == 0) {
+    dfunc <- dlnorm
+  } else {
+    dfunc <- dgamma
+  }
+  cmf <- cumsum(
+    dfunc(
+      1:(dist$max + 1),
+      rnorm(1, dist$mean_mean, dist$mean_sd),
+      rnorm(1, dist$sd_mean, dist$sd_sd)
+    )
+  )
+  cmf <- cmf / cmf[dist$max + 1]
+  cmf <- rev(cmf)[-1]
+  trunc_data <- data.table::copy(data)[1:(.N - index)]
+  trunc_data[
+    (.N - length(cmf) + 1):.N, confirm := as.integer(confirm * cmf)
+  ]
+  return(trunc_data)
+}
+
+# get example case counts
+reported_cases <- example_confirmed[1:60]
+
+# define example truncation distribution (note not integer adjusted)
+trunc_dist <- dist_spec(
+  mean = convert_to_logmean(3, 2),
+  mean_sd = 0.1,
+  sd = convert_to_logsd(3, 2),
+  sd_sd = 0.1,
+  max = 10
+)
+
+# Use the make_truncated_data() function to generate example data for
+# an example using estimate_truncation()
+example_truncated <- purrr::map(
+  seq(20, 0, -5),
+  apply_truncation,
+  data = reported_cases,
+  dist = trunc_dist
+)
+
+usethis::use_data(
+  example_truncated,
+  overwrite = TRUE,
+  compress = "xz"
+)
diff --git a/data/example_truncated.rda b/data/example_truncated.rda
diff --git a/inst/extdata/example_truncated.rds b/inst/extdata/example_truncated.rds
diff --git a/inst/extdata/example_truncation_data.rds b/inst/extdata/example_truncation_data.rds
diff --git a/man/estimate_truncation.Rd b/man/estimate_truncation.Rd
diff --git a/man/example_truncated.Rd b/man/example_truncated.Rd
diff --git a/tests/testthat/test-estimate_truncation.R b/tests/testthat/test-estimate_truncation.R
@@ -6,44 +6,10 @@ futile.logger::flog.threshold("FATAL")
 old_opts <- options()
 options(mc.cores = ifelse(interactive(), 4, 1))
 
-# get example case counts
-reported_cases <- example_confirmed[1:60]
-
-# define example truncation distribution (note not integer adjusted)
-trunc_dist <- dist_spec(
-  mean = convert_to_logmean(3, 2),
-  mean_sd = 0.1,
-  sd = convert_to_logsd(3, 2),
-  sd_sd = 0.1,
-  max = 10
-)
-
-# apply truncation to example data
-construct_truncation <- function(index, cases, dist) {
-  set.seed(index)
-  cmf <- cumsum(
-    dlnorm(
-      1:(dist$max + 1),
-      rnorm(1, dist$mean_mean, dist$mean_sd),
-      rnorm(1, dist$sd_mean, dist$sd_sd)
-    )
-  )
-  cmf <- cmf / cmf[dist$max + 1]
-  cmf <- rev(cmf)[-1]
-  trunc_cases <- data.table::copy(cases)[1:(.N - index)]
-  trunc_cases[(.N - length(cmf) + 1):.N, confirm := as.integer(confirm * cmf)]
-  return(trunc_cases)
-}
-example_data <- purrr::map(c(20, 15, 10, 0),
-  construct_truncation,
-  cases = reported_cases,
-  dist = trunc_dist
-)
-
 test_that("estimate_truncation can return values from simulated data and plot
            them", {
   # fit model to example data
-  est <- estimate_truncation(example_data,
+  est <- estimate_truncation(example_truncated,
     verbose = FALSE, chains = 2, iter = 1000, warmup = 250
   )
   expect_equal(
@@ -59,7 +25,7 @@ test_that("estimate_truncation can return values from simulated data with the
   # fit model to example data
   skip_on_os("windows")
   output <- capture.output(suppressMessages(suppressWarnings(
-    est <- estimate_truncation(example_data,
+    est <- estimate_truncation(example_truncated,
       verbose = FALSE, chains = 2, iter = 1000, warmup = 250,
       stan = stan_opts(backend = "cmdstanr")
   ))))
@@ -73,13 +39,13 @@ test_that("estimate_truncation can return values from simulated data with the
 
 test_that("deprecated arguments are recognised", {
   options(warn = 2)
-  expect_error(estimate_truncation(example_data,
+  expect_error(estimate_truncation(example_truncated,
     verbose = FALSE, trunc_max = 10
   ), "deprecated")
-  expect_error(estimate_truncation(example_data,
+  expect_error(estimate_truncation(example_truncated,
     verbose = FALSE, max_truncation = 10
   ), "deprecated")
-  expect_error(estimate_truncation(example_data,
+  expect_error(estimate_truncation(example_truncated,
     verbose = FALSE, trunc_dist = "lognormal"
   ), "deprecated")
 })