From 6e870b602f6ce0e302b38fd896e008beac2e8d16 Mon Sep 17 00:00:00 2001
From: Sebastian Funk <sebastian.funk@lshtm.ac.uk>
Date: Fri, 18 Oct 2024 14:58:43 +0100
Subject: [PATCH] actually deprecate bootstrapping

---
 R/contact_matrix.r              | 348 +++++++++++++++-----------------
 R/globals.R                     |   1 +
 man/contact_matrix.Rd           |   6 +-
 tests/testthat/test-agegroups.r |   2 +-
 4 files changed, 168 insertions(+), 189 deletions(-)

diff --git a/R/contact_matrix.r b/R/contact_matrix.r
index 7b5ba29..61311d8 100644
--- a/R/contact_matrix.r
+++ b/R/contact_matrix.r
@@ -1,6 +1,6 @@
 #' Generate a contact matrix from diary survey data
 #'
-#' Samples a contact survey using a bootstrap
+#' Samples a contact survey
 #'
 #' @param survey a [survey()] object
 #' @param countries limit to one or more countries; if not given, will use all countries in the survey; these can be given as country names or 2-letter (ISO Alpha-2) country codes
@@ -19,7 +19,7 @@
 #' @param weigh.dayofweek whether to weigh social contacts data by the day of the week (weight (5/7 / N_week / N) for weekdays and (2/7 / N_weekend / N) for weekends)
 #' @param weigh.age whether to weigh social contacts data by the age of the participants (vs. the populations' age distribution)
 #' @param weight.threshold threshold value for the standardized weights before running an additional standardisation (default 'NA' = no cutoff)
-#' @param sample.all.age.groups what to do if bootstrapping fails to sample participants from one or more age groups; if FALSE (default), corresponding rows will be set to NA, if TRUE the sample will be discarded and a new one taken instead
+#' @param sample.all.age.groups what to do if sampling participants (with `sample.participants = TRUE`) fails to sample participants from one or more age groups; if FALSE (default), corresponding rows will be set to NA, if TRUE the sample will be discarded and a new one taken instead
 #' @param return.part.weights boolean to return the participant weights
 #' @param return.demography boolean to explicitly return demography data that corresponds to the survey data (default 'NA' = if demography data is requested by other function parameters)
 #' @param per.capita whether to return a matrix with contact rates per capita (default is FALSE and not possible if 'counts=TRUE' or 'split=TRUE')
@@ -35,7 +35,7 @@
 #' data(polymod)
 #' contact_matrix(polymod, countries = "United Kingdom", age.limits = c(0, 1, 5, 15))
 #' @author Sebastian Funk
-contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, filter, n = 1, bootstrap, counts = FALSE, symmetric = FALSE, split = FALSE, sample.participants = FALSE, estimated.participant.age = c("mean", "sample", "missing"), estimated.contact.age = c("mean", "sample", "missing"), missing.participant.age = c("remove", "keep"), missing.contact.age = c("remove", "sample", "keep", "ignore"), weights = NULL, weigh.dayofweek = FALSE, weigh.age = FALSE, weight.threshold = NA, sample.all.age.groups = FALSE, return.part.weights = FALSE, return.demography = NA, per.capita = FALSE, ...) {
+contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, filter, counts = FALSE, symmetric = FALSE, split = FALSE, sample.participants = FALSE, estimated.participant.age = c("mean", "sample", "missing"), estimated.contact.age = c("mean", "sample", "missing"), missing.participant.age = c("remove", "keep"), missing.contact.age = c("remove", "sample", "keep", "ignore"), weights = NULL, weigh.dayofweek = FALSE, weigh.age = FALSE, weight.threshold = NA, sample.all.age.groups = FALSE, return.part.weights = FALSE, return.demography = NA, per.capita = FALSE, ...) {
   surveys <- c("participants", "contacts")
 
   dot.args <- list(...)
@@ -223,7 +223,7 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
   if (missing.contact.age == "remove" &&
     nrow(survey$contacts[is.na(get(columns[["contact.age"]])) |
       get(columns[["contact.age"]]) < min(age.limits)]) > 0) {
-    if (n == 1 && !missing.contact.age.set) {
+    if (!missing.contact.age.set) {
       message(
         "Removing participants that have contacts without age information. ",
         "To change this behaviour, set the 'missing.contact.age' option"
@@ -241,7 +241,7 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
   if (missing.contact.age == "ignore" &&
     nrow(survey$contacts[is.na(get(columns[["contact.age"]])) |
       get(columns[["contact.age"]]) < min(age.limits)]) > 0) {
-    if (n == 1 && !missing.contact.age.set) {
+    if (!missing.contact.age.set) {
       message(
         "Ignore contacts without age information. ",
         "To change this behaviour, set the 'missing.contact.age' option"
@@ -583,192 +583,179 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
       right = FALSE
     )]
 
-  ## Bootstrap
-  if (n > 1 && !bootstrap) {
-    warning(
-      "n > 1 does not make sense if not bootstrapping. ",
-      "Will return just one sample."
-    )
-    n <- 1
-  }
   ret <- list()
-  for (i in seq_len(n)) {
-    if (sample.participants) {
-      good.sample <- FALSE
-      while (!good.sample) {
-        ## take a sample from the participants
-        part.sample <- sample(participant_ids, replace = TRUE)
-        part.age.limits <-
-          unique(survey$participants[
-            get(columns[["id"]]) %in% part.sample,
-            lower.age.limit
-          ])
-        good.sample <- !sample.all.age.groups ||
-          (length(setdiff(age.limits, part.age.limits)) == 0)
-
-        sample.table <-
-          data.table(id = part.sample, weight = 1)
-        sample.table <-
-          sample.table[, list(bootstrap.weight = sum(weight)), by = id]
-        setnames(sample.table, "id", columns[["id"]])
-        setkeyv(sample.table, columns[["id"]])
-
-        sampled.contacts <- merge(survey$contacts, sample.table)
-        sampled.contacts[, sampled.weight := weight * bootstrap.weight]
-
-        sampled.participants <-
-          merge(survey$participants, sample.table)
-        sampled.participants[, sampled.weight := weight * bootstrap.weight]
-      }
-    } else {
-      ## just use all participants
-      sampled.contacts <- survey$contacts
-      sampled.contacts[, sampled.weight := weight]
-      sampled.participants <- survey$participants
-      sampled.participants[, sampled.weight := weight]
+  if (sample.participants) {
+    good.sample <- FALSE
+    while (!good.sample) {
+      ## take a sample from the participants
+      part.sample <- sample(participant_ids, replace = TRUE)
+      part.age.limits <-
+        unique(survey$participants[
+          get(columns[["id"]]) %in% part.sample,
+          lower.age.limit
+        ])
+      good.sample <- !sample.all.age.groups ||
+        (length(setdiff(age.limits, part.age.limits)) == 0)
+
+      sample.table <-
+        data.table(id = part.sample, weight = 1)
+      sample.table <-
+        sample.table[, list(bootstrap.weight = sum(weight)), by = id]
+      setnames(sample.table, "id", columns[["id"]])
+      setkeyv(sample.table, columns[["id"]])
+
+      sampled.contacts <- merge(survey$contacts, sample.table)
+      sampled.contacts[, sampled.weight := weight * bootstrap.weight]
+
+      sampled.participants <-
+        merge(survey$participants, sample.table)
+      sampled.participants[, sampled.weight := weight * bootstrap.weight]
     }
+  } else {
+    ## just use all participants
+    sampled.contacts <- survey$contacts
+    sampled.contacts[, sampled.weight := weight]
+    sampled.participants <- survey$participants
+    sampled.participants[, sampled.weight := weight]
+  }
 
-    ## calculate weighted contact matrix
-    weighted.matrix <-
-      xtabs(
-        data = sampled.contacts,
-        formula = sampled.weight ~ age.group + contact.age.group,
-        addNA = TRUE
-      )
-
-    dims <- dim(weighted.matrix)
-    dim.names <- dimnames(weighted.matrix)
+  ## calculate weighted contact matrix
+  weighted.matrix <-
+    xtabs(
+      data = sampled.contacts,
+      formula = sampled.weight ~ age.group + contact.age.group,
+      addNA = TRUE
+    )
 
-    if (!counts) { ## normalise to give mean number of contacts
-      ## calculate normalisation vector
-      norm.vector <-
-        xtabs(
-          data = sampled.participants,
-          formula = sampled.weight ~ age.group, addNA = TRUE
-        )
+  dims <- dim(weighted.matrix)
+  dim.names <- dimnames(weighted.matrix)
 
-      ## normalise contact matrix
-      weighted.matrix <-
-        array(apply(weighted.matrix, 2, function(x) x / norm.vector),
-          dim = dims,
-          dimnames = dim.names
-        )
-      ## set non-existent data to NA
-      weighted.matrix[is.nan(weighted.matrix)] <- NA_real_
-    }
+  if (!counts) { ## normalise to give mean number of contacts
+    ## calculate normalisation vector
+    norm.vector <-
+      xtabs(
+        data = sampled.participants,
+        formula = sampled.weight ~ age.group, addNA = TRUE
+      )
 
-    ## construct a warning in case there are NAs
-    na.headers <- anyNA(dimnames(weighted.matrix), recursive = TRUE)
-    na.content <- anyNA(weighted.matrix)
-    na.present <- na.headers || na.content
-
-    if (na.present) {
-      warning.suggestion <- "  Consider "
-      if (na.headers) {
-        warning.suggestion <- paste0(warning.suggestion, "setting ")
-        suggested.options <- NULL
-        if (anyNA(rownames(weighted.matrix))) {
-          suggested.options <- c(suggested.options, "'missing.participant.age'")
-        }
-        if (anyNA(colnames(weighted.matrix))) {
-          suggested.options <- c(suggested.options, "'missing.contact.age'")
-        }
+    ## normalise contact matrix
+    weighted.matrix <-
+      array(apply(weighted.matrix, 2, function(x) x / norm.vector),
+        dim = dims,
+        dimnames = dim.names
+      )
+    ## set non-existent data to NA
+    weighted.matrix[is.nan(weighted.matrix)] <- NA_real_
+  }
 
-        warning.suggestion <-
-          paste0(warning.suggestion, paste(suggested.options, collapse = " and "))
-        if (na.content) {
-          warning.suggestion <- paste0(warning.suggestion, ", and ")
-        } else {
-          warning.suggestion <- paste0(warning.suggestion, ".")
-        }
+  ## construct a warning in case there are NAs
+  na.headers <- anyNA(dimnames(weighted.matrix), recursive = TRUE)
+  na.content <- anyNA(weighted.matrix)
+  na.present <- na.headers || na.content
+
+  if (na.present) {
+    warning.suggestion <- "  Consider "
+    if (na.headers) {
+      warning.suggestion <- paste0(warning.suggestion, "setting ")
+      suggested.options <- NULL
+      if (anyNA(rownames(weighted.matrix))) {
+        suggested.options <- c(suggested.options, "'missing.participant.age'")
       }
-      if (na.content) {
-        warning.suggestion <- paste0(warning.suggestion, "adjusting the age limits.")
+      if (anyNA(colnames(weighted.matrix))) {
+        suggested.options <- c(suggested.options, "'missing.contact.age'")
       }
-    }
 
-    if (symmetric && prod(dim(as.matrix(weighted.matrix))) > 1) {
-      if (counts) {
-        warning(
-          "'symmetric=TRUE' does not make sense with 'counts=TRUE'; ",
-          "will not make matrix symmetric."
-        )
-      } else if (na.present) {
-        warning(
-          "'symmetric=TRUE' does not work with missing data; ",
-          "will not make matrix symmetric\n",
-          warning.suggestion
-        )
+      warning.suggestion <-
+        paste0(warning.suggestion, paste(suggested.options, collapse = " and "))
+      if (na.content) {
+        warning.suggestion <- paste0(warning.suggestion, ", and ")
       } else {
-        ## set c_{ij} N_i and c_{ji} N_j (which should both be equal) to
-        ## 0.5 * their sum; then c_{ij} is that sum / N_i
-        normalised.weighted.matrix <- diag(survey.pop$population) %*% weighted.matrix
-        weighted.matrix <- 0.5 * diag(1 / survey.pop$population) %*%
-          (normalised.weighted.matrix + t(normalised.weighted.matrix))
+        warning.suggestion <- paste0(warning.suggestion, ".")
       }
     }
+    if (na.content) {
+      warning.suggestion <- paste0(warning.suggestion, "adjusting the age limits.")
+    }
+  }
 
-    ret[[i]] <- list()
+  if (symmetric && prod(dim(as.matrix(weighted.matrix))) > 1) {
+    if (counts) {
+      warning(
+        "'symmetric=TRUE' does not make sense with 'counts=TRUE'; ",
+        "will not make matrix symmetric."
+      )
+    } else if (na.present) {
+      warning(
+        "'symmetric=TRUE' does not work with missing data; ",
+        "will not make matrix symmetric\n",
+        warning.suggestion
+      )
+    } else {
+      ## set c_{ij} N_i and c_{ji} N_j (which should both be equal) to
+      ## 0.5 * their sum; then c_{ij} is that sum / N_i
+      normalised.weighted.matrix <- diag(survey.pop$population) %*% weighted.matrix
+      weighted.matrix <- 0.5 * diag(1 / survey.pop$population) %*%
+        (normalised.weighted.matrix + t(normalised.weighted.matrix))
+    }
+  }
 
-    if (split) {
-      if (counts) {
-        warning(
-          "'split=TRUE' does not make sense with 'counts=TRUE'; ",
-          "will not split the contact matrix."
-        )
-      } else if (na.present) {
-        warning(
-          "'split=TRUE' does not work with missing data; ",
-          "will not split contact.matrix.\n",
-          warning.suggestion
-        )
-        ret[[i]][["mean.contacts"]] <- NA
-        ret[[i]][["normalisation"]] <- NA
-        ret[[i]][["contacts"]] <- rep(NA, nrow(weighted.matrix))
-      } else {
-        ## get rid of name but preserve row and column names
-        weighted.matrix <- unname(weighted.matrix)
+  if (split) {
+    if (counts) {
+      warning(
+        "'split=TRUE' does not make sense with 'counts=TRUE'; ",
+        "will not split the contact matrix."
+      )
+    } else if (na.present) {
+      warning(
+        "'split=TRUE' does not work with missing data; ",
+        "will not split contact.matrix.\n",
+        warning.suggestion
+      )
+      ret[["mean.contacts"]] <- NA
+      ret[["normalisation"]] <- NA
+      ret[["contacts"]] <- rep(NA, nrow(weighted.matrix))
+    } else {
+      ## get rid of name but preserve row and column names
+      weighted.matrix <- unname(weighted.matrix)
 
-        nb.contacts <- rowSums(weighted.matrix)
-        mean.contacts <- sum(survey.pop$population * nb.contacts) /
+      nb.contacts <- rowSums(weighted.matrix)
+      mean.contacts <- sum(survey.pop$population * nb.contacts) /
           sum(survey.pop$population)
-        spectrum.matrix <- weighted.matrix
-        spectrum.matrix[is.na(spectrum.matrix)] <- 0
-        spectrum <- as.numeric(eigen(spectrum.matrix, only.values = TRUE)$values[1])
-        ret[[i]][["mean.contacts"]] <- mean.contacts
-        ret[[i]][["normalisation"]] <- spectrum / mean.contacts
-
-        age.proportions <- survey.pop$population / sum(survey.pop$population)
-        weighted.matrix <-
-          diag(1 / nb.contacts) %*% weighted.matrix %*% diag(1 / age.proportions)
-        nb.contacts <- nb.contacts / spectrum
-        ret[[i]][["contacts"]] <- nb.contacts
-      }
-    }
+      spectrum.matrix <- weighted.matrix
+      spectrum.matrix[is.na(spectrum.matrix)] <- 0
+      spectrum <- as.numeric(eigen(spectrum.matrix, only.values = TRUE)$values[1])
+      ret[["mean.contacts"]] <- mean.contacts
+      ret[["normalisation"]] <- spectrum / mean.contacts
 
-    # make sure the dim.names are retained after symmetric or split procedure
-    dimnames(weighted.matrix) <- dim.names
+      age.proportions <- survey.pop$population / sum(survey.pop$population)
+      weighted.matrix <-
+        diag(1 / nb.contacts) %*% weighted.matrix %*% diag(1 / age.proportions)
+      nb.contacts <- nb.contacts / spectrum
+      ret[["contacts"]] <- nb.contacts
+    }
+  }
+  # make sure the dim.names are retained after symmetric or split procedure
+  dimnames(weighted.matrix) <- dim.names
 
-    ret[[i]][["matrix"]] <- weighted.matrix
+  ret[["matrix"]] <- weighted.matrix
 
-    # option to add matrix per capita, i.e. the contact rate of age i with one individual of age j in the population.
-    if (per.capita) {
-      if (counts) {
-        warning(
-          "'per.capita=TRUE' does not make sense with 'counts=TRUE'; ",
-          "will not return the contact matrix per capita."
-        )
-      } else if (split) {
-        warning(
-          "'per.capita=TRUE' does not make sense with 'split=TRUE'; ",
-          "will not return the contact matrix per capita."
-        )
-      } else {
-        survey.pop$population
-        weighted.matrix.per.capita <- weighted.matrix / matrix(rep(survey.pop$population, nrow(survey.pop)), ncol = nrow(survey.pop), byrow = TRUE)
-        weighted.matrix.per.capita
-        ret[[i]][["matrix.per.capita"]] <- weighted.matrix.per.capita
-      }
+  # option to add matrix per capita, i.e. the contact rate of age i with one individual of age j in the population.
+  if (per.capita) {
+    if (counts) {
+      warning(
+        "'per.capita=TRUE' does not make sense with 'counts=TRUE'; ",
+        "will not return the contact matrix per capita."
+      )
+    } else if (split) {
+      warning(
+        "'per.capita=TRUE' does not make sense with 'split=TRUE'; ",
+        "will not return the contact matrix per capita."
+      )
+    } else {
+      survey.pop$population
+      weighted.matrix.per.capita <- weighted.matrix / matrix(rep(survey.pop$population, nrow(survey.pop)), ncol = nrow(survey.pop), byrow = TRUE)
+      weighted.matrix.per.capita
+      ret[["matrix.per.capita"]] <- weighted.matrix.per.capita
     }
   }
 
@@ -795,20 +782,13 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
   setnames(part.pop, c("age.group", "participants"))
   part.pop[, proportion := participants / sum(participants)]
 
-  # set function output
-  if (length(ret) > 1) {
-    return_value <- list(matrices = ret)
-  } else {
-    return_value <- ret[[1]]
-  }
-
-  if (!is.null(return_value)) {
+  if (!is.null(ret)) {
     if (need.survey.pop && (is.na(return.demography) || return.demography)) {
       # change survey.pop$age.group factors into characters (cfr. part.pop)
       survey.pop[, age.group := as.character(age.group)]
-      return_value[["demography"]] <- survey.pop[]
+      ret[["demography"]] <- survey.pop[]
     }
-    return_value[["participants"]] <- part.pop[]
+    ret[["participants"]] <- part.pop[]
   }
 
   # option to return participant weights
@@ -832,10 +812,10 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
     # set name of last column
     names(part.weights)[ncol(part.weights)] <- "participants"
 
-    # add proportion and add to return_value
+    # add proportion and add to ret
     part.weights[, proportion := participants / sum(participants)]
-    return_value[["participants.weights"]] <- part.weights[]
+    ret[["participants.weights"]] <- part.weights[]
   }
 
-  return(return_value)
+  return(ret)
 }
diff --git a/R/globals.R b/R/globals.R
index 549da04..2bfd358 100644
--- a/R/globals.R
+++ b/R/globals.R
@@ -4,6 +4,7 @@ utils::globalVariables(c(
   "..age.unit", # <clean.survey>
   "..high", # <clean.survey>
   "..low", # <clean.survey>
+  "n", # <contact_matrix>
   "lower.age.limit", # <contact_matrix>
   "age.group", # <contact_matrix>
   "population", # <contact_matrix>
diff --git a/man/contact_matrix.Rd b/man/contact_matrix.Rd
index 8d2d4aa..635b371 100644
--- a/man/contact_matrix.Rd
+++ b/man/contact_matrix.Rd
@@ -10,8 +10,6 @@ contact_matrix(
   survey.pop,
   age.limits,
   filter,
-  n = 1,
-  bootstrap,
   counts = FALSE,
   symmetric = FALSE,
   split = FALSE,
@@ -66,7 +64,7 @@ contact_matrix(
 
 \item{weight.threshold}{threshold value for the standardized weights before running an additional standardisation (default 'NA' = no cutoff)}
 
-\item{sample.all.age.groups}{what to do if bootstrapping fails to sample participants from one or more age groups; if FALSE (default), corresponding rows will be set to NA, if TRUE the sample will be discarded and a new one taken instead}
+\item{sample.all.age.groups}{what to do if sampling participants (with \code{sample.participants = TRUE}) fails to sample participants from one or more age groups; if FALSE (default), corresponding rows will be set to NA, if TRUE the sample will be discarded and a new one taken instead}
 
 \item{return.part.weights}{boolean to return the participant weights}
 
@@ -80,7 +78,7 @@ contact_matrix(
 a contact matrix, and the underlying demography of the surveyed population
 }
 \description{
-Samples a contact survey using a bootstrap
+Samples a contact survey
 }
 \examples{
 data(polymod)
diff --git a/tests/testthat/test-agegroups.r b/tests/testthat/test-agegroups.r
index efb3595..b7d257a 100644
--- a/tests/testthat/test-agegroups.r
+++ b/tests/testthat/test-agegroups.r
@@ -21,7 +21,7 @@ test_that("age groups are ordered factors", {
   ages <- seq_len(50)
   age_limits <- c(0, 5, 10)
   groups <- reduce_agegroups(ages, age_limits)
-  age_groups <- limits_to_agegroups(groups)
+  age_groups <- limits_to_agegroups(groups, notation = "dashes")
   expect_s3_class(age_groups, "ordered")
   expect_s3_class(age_groups, "factor")
 })