don't automatically take mean for ranges (#139)

epiforecasts · Sep 30, 2024 · 3b87bad · 3b87bad
1 parent c6dec25
commit 3b87bad
Show file tree

Hide file tree

Showing 7 changed files with 79 additions and 80 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,7 @@
 * The speed of loading surveys has been increased.
 * An error has been fixed causing NA contact matrices if any 5-year age band in the population data was missing.
 * Results of function calls accessing Zenodo repository are now cached for speedup and to avoid multiple web requests
+* A bug was fixed where ages given as ranges had been set to the average of estimated ones
 
 # socialmixr 0.3.2
 

diff --git a/R/check.r b/R/check.r
@@ -9,7 +9,7 @@ check <- function(x, ...) UseMethod("check")
 #' @param x A [survey()] object
 #' @param columns deprecated argument, ignored
 #' @param id.column the column in both the `participants` and `contacts` data frames that links contacts to participants
-#' @param participant.age.column the column in the `participants` data frame containing participants' age
+#' @param participant.age.column the column in the `participants` data frame containing participants' age; if this does not exist, at least columns "..._exact", "..._est_min" and "..._est_max" must (see the `estimated.participant.age` option in [contact_matrix()])
 #' @param country.column the column in the `participants` data frame containing the country in which the participant was queried
 #' @param year.column the column in the `participants` data frame containing the year in which the participant was queried
 #' @param contact.age.column the column in the `contacts` data frame containing the age of contacts; if this does not exist, at least columns "..._exact", "..._est_min" and "..._est_max" must (see the `estimated.contact.age` option in [contact_matrix()])
@@ -46,15 +46,16 @@ check.survey <- function(x, columns, id.column = "part_id", participant.age.colu
   }
 
   if (!(participant.age.column %in% colnames(x$participants))) {
+    exact.column <- paste(participant.age.column, "exact", sep = "_")
     min.column <- paste(participant.age.column, "est_min", sep = "_")
     max.column <- paste(participant.age.column, "est_max", sep = "_")
 
-    if (!(min.column %in% colnames(x$participant) && 
-           max.column %in% colnames(x$participant))) {
+    if (!((exact.column %in% colnames(x$participants)) ||
+      (min.column %in% colnames(x$participants) && max.column %in% colnames(x$participants)))) {
       warning(
         "participant age column '", participant.age.column,
-        "' or columns to estimate participant age ('", min.column, "' and '", 
-        max.column, "') do not exist in the participant data frame"
+        "' or columns to estimate participant age ('", exact.column, "' or '",
+        min.column, "' and '", max.column, "') do not exist in the participant data frame"
       )
       success <- FALSE
     }

diff --git a/R/clean.r b/R/clean.r
@@ -39,12 +39,14 @@ clean.survey <- function(x, country.column = "country", participant.age.column =
 
   if (nrow(x$participants) > 0 &&
     participant.age.column %in% colnames(x$participants) &&
-    !is.numeric(x$participants[, get(participant.age.column)])) {
+    (!is.numeric(x$participants[, get(participant.age.column)]) ||
+       anyNA(x$participants[, get(participant.age.column)]))
+  ) {
     ## set any entries not containing numbers to NA
     x$participants <- x$participants[,
       paste(participant.age.column) := fifelse(
         grepl("[0-9]", get(participant.age.column)),
-        get(participant.age.column),
+        as.character(get(participant.age.column)),
         NA_character_
       )
     ]
@@ -89,18 +91,14 @@ clean.survey <- function(x, country.column = "country", participant.age.column =
           seconds_in_year,
       ]
     }
-    # include mean, though it would be better not to assign an average to an "exact age column"
-    x$participants <- x$participants[,
-      paste(participant.age.column) := (..low + ..high) / 2
-    ]
-    # include included min and max age
-    x$participants <- x$participants[,
-      paste0(participant.age.column,'_est_min') := ..low
-    ]
+
     x$participants <- x$participants[,
-    paste0(participant.age.column,'_est_max') := ..high
+      paste(participant.age.column, "exact", sep = "_") := suppressWarnings(
+        as.integer(get(participant.age.column))
+      )
     ]
-
+    x$participants[, paste(participant.age.column) := NULL]
+
     x$participants[, ..high := NULL]
     x$participants[, ..low := NULL]
     x$participants[, ..age.unit := NULL]

diff --git a/R/contact_matrix.r b/R/contact_matrix.r
@@ -13,7 +13,7 @@
 #' @param symmetric whether to make matrix symmetric, such that \eqn{c_{ij}N_i = c_{ji}N_j}.
 #' @param split whether to split the number of contacts and assortativity
 #' @param sample.participants whether to sample participants randomly (with replacement); done multiple times this can be used to assess uncertainty in the generated contact matrices. See the "Bootstrapping" section in the vignette for how to do this..
-#' @param estimated.participant.age if set to "mean" (default), people whose ages are given as a range (in columns named "..._est_min" and "..._est_max") but not exactly (if the "part_age" column contains NA) will have their age set to the mid-point of the range; if set to "sample", the age will be sampled from the range; if set to "missing", age ranges will be treated as missing
+#' @param estimated.participant.age if set to "mean" (default), people whose ages are given as a range (in columns named "..._est_min" and "..._est_max") but not exactly (in a column named "..._exact") will have their age set to the mid-point of the range; if set to "sample", the age will be sampled from the range; if set to "missing", age ranges will be treated as missing
 #' @param estimated.contact.age if set to "mean" (default), contacts whose ages are given as a range (in columns named "..._est_min" and "..._est_max") but not exactly (in a column named "..._exact") will have their age set to the mid-point of the range; if set to "sample", the age will be sampled from the range; if set to "missing", age ranges will be treated as missing
 #' @param missing.participant.age if set to "remove" (default), participants without age information are removed; if set to "keep", participants with missing age are kept and treated as a separate age group
 #' @param missing.contact.age if set to "remove" (default), participants that have contacts without age information are removed; if set to "sample", contacts without age information are sampled from all the contacts of participants of the same age group; if set to "keep", contacts with missing age are kept and treated as a separate age group; if set to "ignore", contact with missing age are ignored in the contact analysis
@@ -128,62 +128,24 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
   }
 
   ## check maximum participant age in the data
+  part_exact.column <- paste(columns[["participant.age"]], "exact", sep = "_")
   part_min.column <- paste(columns[["participant.age"]], "est_min", sep = "_")
   part_max.column <- paste(columns[["participant.age"]], "est_max", sep = "_")
 
-  if (!(columns[["participant.age"]] %in% colnames(survey$participants))) {
+  if (part_exact.column %in% colnames(survey$participants)) {
+    survey$participants[,
+      paste(columns[["participant.age"]]) := as.integer(get(part_exact.column))
+    ]
+  } else if (!(columns[["participant.age"]] %in% colnames(survey$participants))) {
     survey$participants[, paste(columns[["participant.age"]]) := NA_integer_]
   }
 
-  if (part_max.column %in% colnames(survey$participants)) {
-    max.age <- max(
-      c(
-        survey$participants[, get(columns[["participant.age"]])],
-        survey$participants[, get(part_max.column)]
-      ),
-      na.rm = TRUE
-    ) + 1
-  } else {
-    max.age <- max(
-      survey$participants[, get(columns[["participant.age"]])], na.rm = TRUE
-    ) + 1
-  }
-
-  if (missing(age.limits)) {
-    all.ages <-
-      unique(as.integer(survey$participants[, get(columns[["participant.age"]])]))
-    all.ages <- all.ages[!is.na(all.ages)]
-    all.ages <- sort(all.ages)
-    age.limits <- union(0, all.ages)
-  }
-
-  ## check if any filters have been requested
-  if (!missing(filter)) {
-    missing_columns <- list()
-    for (table in surveys) {
-      if (nrow(survey[[table]]) > 0) {
-        missing_columns <-
-          c(missing_columns, list(setdiff(names(filter), colnames(survey[[table]]))))
-        ## filter contact data
-        for (column in names(filter)) {
-          if (column %in% colnames(survey[[table]])) {
-            survey[[table]] <- survey[[table]][get(column) == filter[[column]]]
-          }
-        }
-      }
-    }
-    missing_all <- do.call(intersect, missing_columns)
-    if (length(missing_all) > 0) {
-      warning("filter column(s) ", toString(missing_all), " not found")
-    }
-  }
-
   ## sample estimated participant ages
   if (part_min.column %in% colnames(survey$participants) &&
     part_max.column %in% colnames(survey$participants)) {
     if (estimated.participant.age == "mean") {
       survey$participants[
-        is.na(get(columns[["participant.age"]])) &
+        is.na(get(part_exact.column)) &
           !is.na(get(part_min.column)) & !is.na(get(part_max.column)),
         paste(columns[["participant.age"]]) :=
           as.integer(rowMeans(.SD)),
@@ -205,6 +167,27 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
     # note: do nothing when "missing" is specified
   }
 
+  if (part_max.column %in% colnames(survey$participants)) {
+    max.age <- max(
+      c(
+        survey$participants[, get(part_exact.column)],
+        survey$participants[, get(part_max.column)]
+      ),
+      na.rm = TRUE
+    ) + 1
+  } else {
+    max.age <- max(
+      survey$participants[, get(columns[["participant.age"]])], na.rm = TRUE
+    ) + 1
+  }
+
+  if (missing(age.limits)) {
+    all.ages <-
+      unique(as.integer(survey$participants[, get(columns[["participant.age"]])]))
+    all.ages <- all.ages[!is.na(all.ages)]
+    all.ages <- sort(all.ages)
+    age.limits <- union(0, all.ages)
+  }
 
   if (missing.participant.age == "remove" &&
     nrow(survey$participants[is.na(get(columns[["participant.age"]])) |
@@ -225,15 +208,12 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
   max.column <- paste(columns[["contact.age"]], "est_max", sep = "_")
 
   ## set contact age if it's not in the data
-  if (!(columns[["contact.age"]] %in% colnames(survey$contacts))) {
+  if (exact.column %in% colnames(survey$contacts)) {
+    survey$contacts[,
+      paste(columns[["contact.age"]]) := as.integer(get(exact.column))
+    ]
+  } else {
     survey$contacts[, paste(columns[["contact.age"]]) := NA_integer_]
-
-    if (exact.column %in% colnames(survey$contacts)) {
-      survey$contacts[
-        !is.na(get(exact.column)),
-        paste(columns[["contact.age"]]) := get(exact.column)
-      ]
-    }
   }
 
   ## convert factors to integers
@@ -291,7 +271,6 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
     survey$participants <- survey$participants[!(get(columns[["id"]]) %in% missing.age.id)]
   }
 
-
   if (missing.contact.age == "ignore" &&
     nrow(survey$contacts[is.na(get(columns[["contact.age"]])) |
       get(columns[["contact.age"]]) < min(age.limits)]) > 0) {
@@ -305,6 +284,27 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
       get(columns[["contact.age"]]) >= min(age.limits), ]
   }
 
+  ## check if any filters have been requested
+  if (!missing(filter)) {
+    missing_columns <- list()
+    for (table in surveys) {
+      if (nrow(survey[[table]]) > 0) {
+        missing_columns <-
+          c(missing_columns, list(setdiff(names(filter), colnames(survey[[table]]))))
+        ## filter contact data
+        for (column in names(filter)) {
+          if (column %in% colnames(survey[[table]])) {
+            survey[[table]] <- survey[[table]][get(column) == filter[[column]]]
+          }
+        }
+      }
+    }
+    missing_all <- do.call(intersect, missing_columns)
+    if (length(missing_all) > 0) {
+      warning("filter column(s) ", toString(missing_all), " not found")
+    }
+  }
+
   # adjust age.group.brakes to the lower and upper ages in the survey
   survey$participants[, lower.age.limit := reduce_agegroups(
     get(columns[["participant.age"]]),

diff --git a/man/check.Rd b/man/check.Rd
diff --git a/man/contact_matrix.Rd b/man/contact_matrix.Rd
diff --git a/tests/testthat/test-matrix.r b/tests/testthat/test-matrix.r
@@ -12,7 +12,7 @@ polymod10 <- get_survey(polymod)
 polymod11 <- get_survey(polymod)
 
 polymod2$participants$added_weight <- 0.5
-polymod2$contacts$cnt_age <- factor(polymod2$contacts$cnt_age)
+polymod2$contacts$cnt_age_exact <- factor(polymod2$contacts$cnt_age_exact)
 polymod2$participants$part_age[1] <- "3-5"
 polymod3$participants$dayofweek <- NULL
 polymod3$participants$year <- NULL
@@ -25,16 +25,15 @@ polymod7$participants$country <- NULL
 polymod8$contacts$cnt_age_exact <- NA_real_
 polymod8$contacts$cnt_age_est_min <- NA_real_
 polymod8$contacts$cnt_age_est_max <- NA_real_
-polymod8$contacts$cnt_age <- NA_real_
-polymod8$contacts[polymod$contacts$part_id == 10, "cnt_age"] <- 10
-polymod8$contacts[polymod$contacts$part_id == 20, "cnt_age"] <- 20
+polymod8$contacts[polymod$contacts$part_id == 10, "cnt_age_exact"] <- 10
+polymod8$contacts[polymod$contacts$part_id == 20, "cnt_age_exact"] <- 20
 polymod9$participants$part_age_est_min <- 1
 polymod9$participants$part_age_est_max <- 15
-polymod9$participants$part_age <- NULL
+polymod9$participants$part_age_exact <- NULL
 polymod9$participants$part_age_est_min <- 1
 polymod9$participants$part_age_est_max <- 15
 nn <- nrow(polymod9$participants)
-polymod9$participants$part_age <- ifelse(runif(nn) > 0.7, 20, NA)
+polymod9$participants$part_age_exact <- ifelse(runif(nn) > 0.7, 20, NA)
 polymod10$participants$added_weight <-
   ifelse(polymod10$participants$hh_size > 1, 2, 1)
 polymod10$participants$added_weight2 <- 0.3