Skip to content

Commit

Permalink
don't automatically take mean for ranges (#139)
Browse files Browse the repository at this point in the history
  • Loading branch information
sbfnk authored Sep 30, 2024
1 parent c6dec25 commit 3b87bad
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 80 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* The speed of loading surveys has been increased.
* An error has been fixed causing NA contact matrices if any 5-year age band in the population data was missing.
* Results of function calls accessing Zenodo repository are now cached for speedup and to avoid multiple web requests
* A bug was fixed where ages given as ranges had been set to the average of estimated ones

# socialmixr 0.3.2

Expand Down
11 changes: 6 additions & 5 deletions R/check.r
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ check <- function(x, ...) UseMethod("check")
#' @param x A [survey()] object
#' @param columns deprecated argument, ignored
#' @param id.column the column in both the `participants` and `contacts` data frames that links contacts to participants
#' @param participant.age.column the column in the `participants` data frame containing participants' age
#' @param participant.age.column the column in the `participants` data frame containing participants' age; if this does not exist, at least columns "..._exact", "..._est_min" and "..._est_max" must (see the `estimated.participant.age` option in [contact_matrix()])
#' @param country.column the column in the `participants` data frame containing the country in which the participant was queried
#' @param year.column the column in the `participants` data frame containing the year in which the participant was queried
#' @param contact.age.column the column in the `contacts` data frame containing the age of contacts; if this does not exist, at least columns "..._exact", "..._est_min" and "..._est_max" must (see the `estimated.contact.age` option in [contact_matrix()])
Expand Down Expand Up @@ -46,15 +46,16 @@ check.survey <- function(x, columns, id.column = "part_id", participant.age.colu
}

if (!(participant.age.column %in% colnames(x$participants))) {
exact.column <- paste(participant.age.column, "exact", sep = "_")
min.column <- paste(participant.age.column, "est_min", sep = "_")
max.column <- paste(participant.age.column, "est_max", sep = "_")

if (!(min.column %in% colnames(x$participant) &&
max.column %in% colnames(x$participant))) {
if (!((exact.column %in% colnames(x$participants)) ||
(min.column %in% colnames(x$participants) && max.column %in% colnames(x$participants)))) {
warning(
"participant age column '", participant.age.column,
"' or columns to estimate participant age ('", min.column, "' and '",
max.column, "') do not exist in the participant data frame"
"' or columns to estimate participant age ('", exact.column, "' or '",
min.column, "' and '", max.column, "') do not exist in the participant data frame"
)
success <- FALSE
}
Expand Down
22 changes: 10 additions & 12 deletions R/clean.r
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,14 @@ clean.survey <- function(x, country.column = "country", participant.age.column =

if (nrow(x$participants) > 0 &&
participant.age.column %in% colnames(x$participants) &&
!is.numeric(x$participants[, get(participant.age.column)])) {
(!is.numeric(x$participants[, get(participant.age.column)]) ||
anyNA(x$participants[, get(participant.age.column)]))
) {
## set any entries not containing numbers to NA
x$participants <- x$participants[,
paste(participant.age.column) := fifelse(
grepl("[0-9]", get(participant.age.column)),
get(participant.age.column),
as.character(get(participant.age.column)),
NA_character_
)
]
Expand Down Expand Up @@ -89,18 +91,14 @@ clean.survey <- function(x, country.column = "country", participant.age.column =
seconds_in_year,
]
}
# include mean, though it would be better not to assign an average to an "exact age column"
x$participants <- x$participants[,
paste(participant.age.column) := (..low + ..high) / 2
]
# include included min and max age
x$participants <- x$participants[,
paste0(participant.age.column,'_est_min') := ..low
]

x$participants <- x$participants[,
paste0(participant.age.column,'_est_max') := ..high
paste(participant.age.column, "exact", sep = "_") := suppressWarnings(
as.integer(get(participant.age.column))
)
]

x$participants[, paste(participant.age.column) := NULL]

x$participants[, ..high := NULL]
x$participants[, ..low := NULL]
x$participants[, ..age.unit := NULL]
Expand Down
110 changes: 55 additions & 55 deletions R/contact_matrix.r
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' @param symmetric whether to make matrix symmetric, such that \eqn{c_{ij}N_i = c_{ji}N_j}.
#' @param split whether to split the number of contacts and assortativity
#' @param sample.participants whether to sample participants randomly (with replacement); done multiple times this can be used to assess uncertainty in the generated contact matrices. See the "Bootstrapping" section in the vignette for how to do this..
#' @param estimated.participant.age if set to "mean" (default), people whose ages are given as a range (in columns named "..._est_min" and "..._est_max") but not exactly (if the "part_age" column contains NA) will have their age set to the mid-point of the range; if set to "sample", the age will be sampled from the range; if set to "missing", age ranges will be treated as missing
#' @param estimated.participant.age if set to "mean" (default), people whose ages are given as a range (in columns named "..._est_min" and "..._est_max") but not exactly (in a column named "..._exact") will have their age set to the mid-point of the range; if set to "sample", the age will be sampled from the range; if set to "missing", age ranges will be treated as missing
#' @param estimated.contact.age if set to "mean" (default), contacts whose ages are given as a range (in columns named "..._est_min" and "..._est_max") but not exactly (in a column named "..._exact") will have their age set to the mid-point of the range; if set to "sample", the age will be sampled from the range; if set to "missing", age ranges will be treated as missing
#' @param missing.participant.age if set to "remove" (default), participants without age information are removed; if set to "keep", participants with missing age are kept and treated as a separate age group
#' @param missing.contact.age if set to "remove" (default), participants that have contacts without age information are removed; if set to "sample", contacts without age information are sampled from all the contacts of participants of the same age group; if set to "keep", contacts with missing age are kept and treated as a separate age group; if set to "ignore", contact with missing age are ignored in the contact analysis
Expand Down Expand Up @@ -128,62 +128,24 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
}

## check maximum participant age in the data
part_exact.column <- paste(columns[["participant.age"]], "exact", sep = "_")
part_min.column <- paste(columns[["participant.age"]], "est_min", sep = "_")
part_max.column <- paste(columns[["participant.age"]], "est_max", sep = "_")

if (!(columns[["participant.age"]] %in% colnames(survey$participants))) {
if (part_exact.column %in% colnames(survey$participants)) {
survey$participants[,
paste(columns[["participant.age"]]) := as.integer(get(part_exact.column))
]
} else if (!(columns[["participant.age"]] %in% colnames(survey$participants))) {
survey$participants[, paste(columns[["participant.age"]]) := NA_integer_]
}

if (part_max.column %in% colnames(survey$participants)) {
max.age <- max(
c(
survey$participants[, get(columns[["participant.age"]])],
survey$participants[, get(part_max.column)]
),
na.rm = TRUE
) + 1
} else {
max.age <- max(
survey$participants[, get(columns[["participant.age"]])], na.rm = TRUE
) + 1
}

if (missing(age.limits)) {
all.ages <-
unique(as.integer(survey$participants[, get(columns[["participant.age"]])]))
all.ages <- all.ages[!is.na(all.ages)]
all.ages <- sort(all.ages)
age.limits <- union(0, all.ages)
}

## check if any filters have been requested
if (!missing(filter)) {
missing_columns <- list()
for (table in surveys) {
if (nrow(survey[[table]]) > 0) {
missing_columns <-
c(missing_columns, list(setdiff(names(filter), colnames(survey[[table]]))))
## filter contact data
for (column in names(filter)) {
if (column %in% colnames(survey[[table]])) {
survey[[table]] <- survey[[table]][get(column) == filter[[column]]]
}
}
}
}
missing_all <- do.call(intersect, missing_columns)
if (length(missing_all) > 0) {
warning("filter column(s) ", toString(missing_all), " not found")
}
}

## sample estimated participant ages
if (part_min.column %in% colnames(survey$participants) &&
part_max.column %in% colnames(survey$participants)) {
if (estimated.participant.age == "mean") {
survey$participants[
is.na(get(columns[["participant.age"]])) &
is.na(get(part_exact.column)) &
!is.na(get(part_min.column)) & !is.na(get(part_max.column)),
paste(columns[["participant.age"]]) :=
as.integer(rowMeans(.SD)),
Expand All @@ -205,6 +167,27 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
# note: do nothing when "missing" is specified
}

if (part_max.column %in% colnames(survey$participants)) {
max.age <- max(
c(
survey$participants[, get(part_exact.column)],
survey$participants[, get(part_max.column)]
),
na.rm = TRUE
) + 1
} else {
max.age <- max(
survey$participants[, get(columns[["participant.age"]])], na.rm = TRUE
) + 1
}

if (missing(age.limits)) {
all.ages <-
unique(as.integer(survey$participants[, get(columns[["participant.age"]])]))
all.ages <- all.ages[!is.na(all.ages)]
all.ages <- sort(all.ages)
age.limits <- union(0, all.ages)
}

if (missing.participant.age == "remove" &&
nrow(survey$participants[is.na(get(columns[["participant.age"]])) |
Expand All @@ -225,15 +208,12 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
max.column <- paste(columns[["contact.age"]], "est_max", sep = "_")

## set contact age if it's not in the data
if (!(columns[["contact.age"]] %in% colnames(survey$contacts))) {
if (exact.column %in% colnames(survey$contacts)) {
survey$contacts[,
paste(columns[["contact.age"]]) := as.integer(get(exact.column))
]
} else {
survey$contacts[, paste(columns[["contact.age"]]) := NA_integer_]

if (exact.column %in% colnames(survey$contacts)) {
survey$contacts[
!is.na(get(exact.column)),
paste(columns[["contact.age"]]) := get(exact.column)
]
}
}

## convert factors to integers
Expand Down Expand Up @@ -291,7 +271,6 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
survey$participants <- survey$participants[!(get(columns[["id"]]) %in% missing.age.id)]
}


if (missing.contact.age == "ignore" &&
nrow(survey$contacts[is.na(get(columns[["contact.age"]])) |
get(columns[["contact.age"]]) < min(age.limits)]) > 0) {
Expand All @@ -305,6 +284,27 @@ contact_matrix <- function(survey, countries = NULL, survey.pop, age.limits, fil
get(columns[["contact.age"]]) >= min(age.limits), ]
}

## check if any filters have been requested
if (!missing(filter)) {
missing_columns <- list()
for (table in surveys) {
if (nrow(survey[[table]]) > 0) {
missing_columns <-
c(missing_columns, list(setdiff(names(filter), colnames(survey[[table]]))))
## filter contact data
for (column in names(filter)) {
if (column %in% colnames(survey[[table]])) {
survey[[table]] <- survey[[table]][get(column) == filter[[column]]]
}
}
}
}
missing_all <- do.call(intersect, missing_columns)
if (length(missing_all) > 0) {
warning("filter column(s) ", toString(missing_all), " not found")
}
}

# adjust age.group.brakes to the lower and upper ages in the survey
survey$participants[, lower.age.limit := reduce_agegroups(
get(columns[["participant.age"]]),
Expand Down
2 changes: 1 addition & 1 deletion man/check.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/contact_matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 5 additions & 6 deletions tests/testthat/test-matrix.r
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ polymod10 <- get_survey(polymod)
polymod11 <- get_survey(polymod)

polymod2$participants$added_weight <- 0.5
polymod2$contacts$cnt_age <- factor(polymod2$contacts$cnt_age)
polymod2$contacts$cnt_age_exact <- factor(polymod2$contacts$cnt_age_exact)
polymod2$participants$part_age[1] <- "3-5"
polymod3$participants$dayofweek <- NULL
polymod3$participants$year <- NULL
Expand All @@ -25,16 +25,15 @@ polymod7$participants$country <- NULL
polymod8$contacts$cnt_age_exact <- NA_real_
polymod8$contacts$cnt_age_est_min <- NA_real_
polymod8$contacts$cnt_age_est_max <- NA_real_
polymod8$contacts$cnt_age <- NA_real_
polymod8$contacts[polymod$contacts$part_id == 10, "cnt_age"] <- 10
polymod8$contacts[polymod$contacts$part_id == 20, "cnt_age"] <- 20
polymod8$contacts[polymod$contacts$part_id == 10, "cnt_age_exact"] <- 10
polymod8$contacts[polymod$contacts$part_id == 20, "cnt_age_exact"] <- 20
polymod9$participants$part_age_est_min <- 1
polymod9$participants$part_age_est_max <- 15
polymod9$participants$part_age <- NULL
polymod9$participants$part_age_exact <- NULL
polymod9$participants$part_age_est_min <- 1
polymod9$participants$part_age_est_max <- 15
nn <- nrow(polymod9$participants)
polymod9$participants$part_age <- ifelse(runif(nn) > 0.7, 20, NA)
polymod9$participants$part_age_exact <- ifelse(runif(nn) > 0.7, 20, NA)
polymod10$participants$added_weight <-
ifelse(polymod10$participants$hh_size > 1, 2, 1)
polymod10$participants$added_weight2 <- 0.3
Expand Down

0 comments on commit 3b87bad

Please sign in to comment.