From ccb3230bd6cd91106531568a7a1bd7fdda8850d7 Mon Sep 17 00:00:00 2001 From: gabriellemackinnon Date: Mon, 8 Jul 2024 10:20:08 -0400 Subject: [PATCH] fix identify_scales --- R/iidda/NAMESPACE | 2 +- R/iidda/R/data_prep_tools.R | 97 ++++++++++--------- ...ime_scales.Rd => normalize_time_scales.Rd} | 17 ++-- 3 files changed, 63 insertions(+), 53 deletions(-) rename R/iidda/man/{filter_out_time_scales.Rd => normalize_time_scales.Rd} (67%) diff --git a/R/iidda/NAMESPACE b/R/iidda/NAMESPACE index d69d5d43..533209d7 100644 --- a/R/iidda/NAMESPACE +++ b/R/iidda/NAMESPACE @@ -34,7 +34,6 @@ export(factor_time_scale) export(failed_prep_script_outcomes) export(fill_and_wrap) export(fill_re_template) -export(filter_out_time_scales) export(find_unaccounted_cases) export(fix_csv) export(flatten_disease_hierarchy) @@ -83,6 +82,7 @@ export(melt_tracking_table_keys) export(memoise_remove_age) export(nlist) export(normalize_diseases) +export(normalize_time_scales) export(open_locally) export(or_pattern) export(pipeline_exploration_starter) diff --git a/R/iidda/R/data_prep_tools.R b/R/iidda/R/data_prep_tools.R index 1916327e..4bb47829 100644 --- a/R/iidda/R/data_prep_tools.R +++ b/R/iidda/R/data_prep_tools.R @@ -505,6 +505,7 @@ identify_time_scales = function(data){ %>% mutate(time_scale = ifelse(period_end_date == as.Date(period_start_date) + 6, "wk", "2wk")) %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 14, "mo", time_scale)) %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 40, "qr", time_scale)) + %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 260, "3qr", time_scale)) %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 300, "yr", time_scale)) ) } @@ -869,38 +870,44 @@ filter_out_time_scales_old = function(data } -#' Filter out Time Scales +#' Normalize Time Scales #' #' Choose a single best `time_scale` for each year in a dataset, grouped by #' nesting disease. This best `time_scale` is defined as the longest #' of the shortest time scales in each location and sub-disease. #' -#' @param data A tidy data set with a `time_scale` column. +#' @param data A tidy data set with a `time_scale` and `year` column #' @param initial_group Character vector naming columns for defining #' the initial grouping used to compute the shortest time scales. #' @param final_group Character vector naming columns for defining the final #' grouping used to compute the longest of the shortest time scales. +#' @param get_implied_zeros Add zeros that are implied by a '0' reported at a coarser timescale. #' @param aggregate_if_unavailable If a location is not reporting for the determined #' 'best timescale', but is reporting at a finer timescale, aggregate this finer #' timescale to the 'best timescale' #' -#' @return A data set only containing records with the best time scale. +#' @return A data set only containing records with the optimal time scale. #' #' @importFrom lubridate year #' @export -filter_out_time_scales = function(data - , initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease") +normalize_time_scales = function(data + , initial_group = c("year", "iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease") , final_group = c("basal_disease") + , get_implied_zeros = TRUE , aggregate_if_unavailable = TRUE ) { + if(get_implied_zeros) data = get_implied_zeros(data) + if (length(unique(data$time_scale)) == 1L) return(data) + if (!"year" %in% colnames(data)) {stop("The column 'year' does not exist in the dataset.")} + new_data = (data # remove '_unaccounted' cases when deciding best time_scale |> factor_time_scale() |> filter(!grepl("_unaccounted$", disease)) - |> mutate(year = year(period_end_date)) + # |> mutate(year = year(period_end_date)) |> group_by(across(all_of(c("year", initial_group)))) |> mutate(shortest_time_scale = time_scale_chooser(time_scale, which.min)) |> ungroup() @@ -920,7 +927,7 @@ filter_out_time_scales = function(data |> rbind(new_data) ) - if (isTRUE(aggregate_if_unavailable)) { + if(aggregate_if_unavailable) { # coarse scales to aggregate to scales = (all_new_data |> select(period_start_date, period_end_date, disease, nesting_disease, basal_disease) @@ -933,7 +940,7 @@ filter_out_time_scales = function(data # available at a finer timescale data_to_aggregate = (data |> factor_time_scale() - |> mutate(year = year(period_end_date)) + # |> mutate(year = year(period_end_date)) |> left_join(select(all_new_data, "year","disease", "nesting_disease", "basal_disease", "time_scale") |> unique(), by = c("year", "disease", "nesting_disease", "basal_disease"), suffix = c('_old', '_new')) @@ -968,11 +975,12 @@ filter_out_time_scales = function(data period_start_date, period_end_date, .keep_all = TRUE) # add back days_this_period and period_mid_date for the coarser start and end dates + # FIXME: apparently using iidda analysis functions will cause issues. oops |> mutate(days_this_period = iidda.analysis::num_days(period_start_date, period_end_date)) |> mutate(period_mid_date = iidda.analysis::mid_dates(period_start_date, period_end_date, days_this_period)) |> select(-time_scale_old) - |> mutate(record_origin = 'derived') + |> mutate(record_origin = 'derived-aggregated-timescales') ) final = (all_new_data @@ -998,50 +1006,49 @@ filter_out_time_scales = function(data get_implied_zeros = function(data){ starting_data = (data - |> mutate(year = year(as.Date(period_end_date))) - |> factor_time_scale() - - |> group_by(iso_3166_2, disease, year, original_dataset_id) - |> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE)) - |> ungroup() - - |> group_by(disease, year, original_dataset_id) - |> mutate(finest_timescale = min(time_scale)) - |> ungroup() + |> mutate(year = year(as.Date(period_end_date))) + |> factor_time_scale() + + |> group_by(iso_3166_2, disease, year, original_dataset_id) + |> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE)) + |> ungroup() + + |> group_by(disease, year, original_dataset_id) + |> mutate(finest_timescale = min(time_scale)) + |> ungroup() ) scales = (starting_data - |> filter(time_scale == finest_timescale) - |> distinct(disease, nesting_disease, basal_disease, - year, time_scale, period_start_date, period_end_date, - period_mid_date, days_this_period, original_dataset_id) + |> filter(time_scale == finest_timescale) + |> distinct(disease, nesting_disease, basal_disease, + year, time_scale, period_start_date, period_end_date, + period_mid_date, days_this_period, original_dataset_id) ) # records for which all_zero = true and finest_timescale isn't available get_new_zeros = (starting_data - |> filter(time_scale > finest_timescale, all_zero) - - # filter for timescales that are not in the original data - |> anti_join(starting_data - , by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale', - 'disease', 'nesting_disease', 'basal_disease', 'dataset_id' - )) # nesting/basal too? see if that changes result! - - |> select(-period_start_date, -period_end_date, -period_mid_date, - -days_this_period) + |> filter(time_scale > finest_timescale, all_zero) + + # filter for timescales that are not in the original data + |> anti_join(starting_data + , by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale', + 'disease', 'nesting_disease', 'basal_disease', 'dataset_id' + )) # nesting/basal too? see if that changes result! + + |> select(-period_start_date, -period_end_date, -period_mid_date, + -days_this_period) ) # for rows in get_new_records, find the periods (i.e. start and end dates) # for the finest_timescale for a given year, disease, and original_dataset_id - new_zeros = - (get_new_zeros - |> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale', - 'nesting_disease', 'basal_disease', 'original_dataset_id' - ), relationship = "many-to-many") - |> select(-time_scale, -year, -all_zero) - |> rename(time_scale = finest_timescale) - - |> mutate(record_origin = 'derived') + new_zeros = (get_new_zeros + |> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale', + 'nesting_disease', 'basal_disease', 'original_dataset_id'), + relationship = "many-to-many") + |> select(-time_scale, -year, -all_zero) + |> rename(time_scale = finest_timescale) + + |> mutate(record_origin = 'derived-implied-zero') ) # join back to original data @@ -1104,12 +1111,12 @@ find_unaccounted_cases = function(data){ %>% mutate(original_dataset_id = '') %>% mutate(historical_disease = '') - %>% mutate(record_origin = 'derived') + %>% mutate(record_origin = 'derived-unaccounted-cases') ) (data - |> mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical')) - |> rbind(unaccounted_data) + %>% mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical')) + %>% rbind(unaccounted_data) ) } diff --git a/R/iidda/man/filter_out_time_scales.Rd b/R/iidda/man/normalize_time_scales.Rd similarity index 67% rename from R/iidda/man/filter_out_time_scales.Rd rename to R/iidda/man/normalize_time_scales.Rd index 5b444de4..4bc5e90a 100644 --- a/R/iidda/man/filter_out_time_scales.Rd +++ b/R/iidda/man/normalize_time_scales.Rd @@ -1,19 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data_prep_tools.R -\name{filter_out_time_scales} -\alias{filter_out_time_scales} -\title{Filter out Time Scales} +\name{normalize_time_scales} +\alias{normalize_time_scales} +\title{Normalize Time Scales} \usage{ -filter_out_time_scales( +normalize_time_scales( data, - initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease", + initial_group = c("year", "iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease"), final_group = c("basal_disease"), + get_implied_zeros = TRUE, aggregate_if_unavailable = TRUE ) } \arguments{ -\item{data}{A tidy data set with a `time_scale` column.} +\item{data}{A tidy data set with a `time_scale` and `year` column} \item{initial_group}{Character vector naming columns for defining the initial grouping used to compute the shortest time scales.} @@ -21,12 +22,14 @@ the initial grouping used to compute the shortest time scales.} \item{final_group}{Character vector naming columns for defining the final grouping used to compute the longest of the shortest time scales.} +\item{get_implied_zeros}{Add zeros that are implied by a '0' reported at a coarser timescale.} + \item{aggregate_if_unavailable}{If a location is not reporting for the determined 'best timescale', but is reporting at a finer timescale, aggregate this finer timescale to the 'best timescale'} } \value{ -A data set only containing records with the best time scale. +A data set only containing records with the optimal time scale. } \description{ Choose a single best `time_scale` for each year in a dataset, grouped by