diff --git a/R/iidda/DESCRIPTION b/R/iidda/DESCRIPTION index 29da3828..938ef3cc 100644 --- a/R/iidda/DESCRIPTION +++ b/R/iidda/DESCRIPTION @@ -29,4 +29,4 @@ Imports: readr Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3.9000 +RoxygenNote: 7.3.2 diff --git a/R/iidda/NAMESPACE b/R/iidda/NAMESPACE index 1add868b..d69d5d43 100644 --- a/R/iidda/NAMESPACE +++ b/R/iidda/NAMESPACE @@ -30,10 +30,12 @@ export(extract_all_between_paren) export(extract_between_paren) export(extract_char_or_blank) export(extract_or_blank) +export(factor_time_scale) export(failed_prep_script_outcomes) export(fill_and_wrap) export(fill_re_template) export(filter_out_time_scales) +export(find_unaccounted_cases) export(fix_csv) export(flatten_disease_hierarchy) export(freq_to_by) @@ -43,6 +45,7 @@ export(get_canmod_digitization_metadata) export(get_dataset_path) export(get_elements) export(get_firsts) +export(get_implied_zeros) export(get_items) export(get_lookup_table) export(get_main_script) diff --git a/R/iidda/R/data_prep_tools.R b/R/iidda/R/data_prep_tools.R index a04e8cb7..1916327e 100644 --- a/R/iidda/R/data_prep_tools.R +++ b/R/iidda/R/data_prep_tools.R @@ -505,7 +505,7 @@ identify_time_scales = function(data){ %>% mutate(time_scale = ifelse(period_end_date == as.Date(period_start_date) + 6, "wk", "2wk")) %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 14, "mo", time_scale)) %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 40, "qr", time_scale)) - %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 100, "yr", time_scale)) + %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 300, "yr", time_scale)) ) } canada_province_scale_finder = function(data) { @@ -803,7 +803,25 @@ time_scale_chooser = function(time_scale, which_fun) { r } -#' Filter out Time Scales +#' Factor Time Scale +#' +#' @param data A tidy data set with a `time_scale` column. +#' +#' @return A data set with a factored time_scale column. +#' +#' @export +factor_time_scale = function(data){ + if (is.factor(data$time_scale)) { + return(data) + } + time_scale_map = c(wk = "wk", yr = "yr", mo = "mo", `2wk` = "2wk", mt = "mo", `two-wks` = "2wk", qrtr = "qr", qr = "qr") + data$time_scale = time_scale_map[as.character(data$time_scale)] + order = c("wk", "2wk", "mo", "qr", "yr") + + return(mutate(data, time_scale = factor(data$time_scale, levels = order, ordered = TRUE))) +} + +#' Filter out Time Scales OLD #' #' Choose a single best `time_scale` for each year in a dataset, grouped by #' nesting disease. This best `time_scale` is defined as the longest @@ -820,8 +838,8 @@ time_scale_chooser = function(time_scale, which_fun) { #' @return A data set only containing records with the best time scale. #' #' @importFrom lubridate year -#' @export -filter_out_time_scales = function(data +#' @noRd +filter_out_time_scales_old = function(data , initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease") , final_group = c("basal_disease") , cleanup = TRUE @@ -849,3 +867,249 @@ filter_out_time_scales = function(data } new_data } + + +#' Filter out Time Scales +#' +#' Choose a single best `time_scale` for each year in a dataset, grouped by +#' nesting disease. This best `time_scale` is defined as the longest +#' of the shortest time scales in each location and sub-disease. +#' +#' @param data A tidy data set with a `time_scale` column. +#' @param initial_group Character vector naming columns for defining +#' the initial grouping used to compute the shortest time scales. +#' @param final_group Character vector naming columns for defining the final +#' grouping used to compute the longest of the shortest time scales. +#' @param aggregate_if_unavailable If a location is not reporting for the determined +#' 'best timescale', but is reporting at a finer timescale, aggregate this finer +#' timescale to the 'best timescale' +#' +#' @return A data set only containing records with the best time scale. +#' +#' @importFrom lubridate year +#' @export +filter_out_time_scales = function(data + , initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease") + , final_group = c("basal_disease") + , aggregate_if_unavailable = TRUE +) { + + if (length(unique(data$time_scale)) == 1L) return(data) + + new_data = (data + # remove '_unaccounted' cases when deciding best time_scale + |> factor_time_scale() + |> filter(!grepl("_unaccounted$", disease)) + |> mutate(year = year(period_end_date)) + |> group_by(across(all_of(c("year", initial_group)))) + |> mutate(shortest_time_scale = time_scale_chooser(time_scale, which.min)) + |> ungroup() + |> group_by(across(all_of(c("year", final_group)))) + |> mutate(best_time_scale = time_scale_chooser(shortest_time_scale, which.max)) + |> ungroup() + |> filter(as.character(time_scale) == best_time_scale) + |> select(-best_time_scale, -shortest_time_scale) + ) + + # adding "unaccounted" data back, at the best_time_scale + all_new_data = (data + |> filter(grepl("_unaccounted$", disease)) + |> mutate(year = year(period_end_date)) + |> semi_join(select(new_data, "year", "time_scale", "disease", "nesting_disease", "basal_disease") |> unique(), + by = c("year", "time_scale", final_group)) + |> rbind(new_data) + ) + + if (isTRUE(aggregate_if_unavailable)) { + # coarse scales to aggregate to + scales = (all_new_data + |> select(period_start_date, period_end_date, disease, nesting_disease, basal_disease) + |> unique() + |> rename(coarser_start_date = period_start_date, + coarser_end_date = period_end_date) + ) + + # data which isn't available at 'best_time_scale' for the year, but is + # available at a finer timescale + data_to_aggregate = (data + |> factor_time_scale() + |> mutate(year = year(period_end_date)) + |> left_join(select(all_new_data, "year","disease", "nesting_disease", "basal_disease", "time_scale") |> unique(), + by = c("year", "disease", "nesting_disease", "basal_disease"), + suffix = c('_old', '_new')) + |> filter(time_scale_old < time_scale_new) + |> mutate(period_start_date = as.Date(period_start_date), + period_end_date = as.Date(period_end_date)) + + # keep only data which isn't available at the 'best time scale' (which is now the timescale in all_new_data) + |> anti_join(select(all_new_data,"iso_3166_2", "year","disease", "nesting_disease", "basal_disease", "time_scale") |> unique() + , by = c('time_scale_new' = 'time_scale', 'disease', 'year', 'nesting_disease','basal_disease', 'iso_3166_2')) + ) + + aggregated_unavailable_data = (scales + |> inner_join(data_to_aggregate, by = c("disease", "nesting_disease", "basal_disease"), relationship = 'many-to-many') + |> filter(period_end_date > coarser_start_date & period_end_date <= coarser_end_date) + |> select(names(data_to_aggregate), coarser_start_date, coarser_end_date) + + |> group_by(iso_3166, iso_3166_2, disease, nesting_disease, basal_disease, coarser_start_date, coarser_end_date) + |> mutate(cases_coarse_period = sum(as.numeric(cases_this_period))) + |> mutate(population = round(mean(as.numeric(population))), + population_reporting = round(mean(as.numeric(population_reporting)))) + |> ungroup() + + |> select(-cases_this_period, -period_start_date, -period_end_date, + -days_this_period, -period_mid_date) + |> rename(time_scale = time_scale_new, + cases_this_period = cases_coarse_period, + period_start_date = coarser_start_date, + period_end_date = coarser_end_date) + + |> distinct(iso_3166, iso_3166_2, disease, nesting_disease, basal_disease, + period_start_date, period_end_date, .keep_all = TRUE) + + # add back days_this_period and period_mid_date for the coarser start and end dates + |> mutate(days_this_period = iidda.analysis::num_days(period_start_date, period_end_date)) + |> mutate(period_mid_date = iidda.analysis::mid_dates(period_start_date, period_end_date, days_this_period)) + |> select(-time_scale_old) + + |> mutate(record_origin = 'derived') + ) + + final = (all_new_data + |> mutate(record_origin = ifelse("record_origin" %in% names(all_new_data), record_origin, 'historical')) + |> rbind(aggregated_unavailable_data) + ) + + return(final) + } else{ + return(all_new_data) + } +} + +#' Get Implied Zeros +#' +#' Add zeros to data set that are implied by a '0' reported at a coarser timescale. +#' +#' @param data A tidy data set +#' +#' @return A tidy data set with inferred 0s +#' +#' @export +get_implied_zeros = function(data){ + + starting_data = (data + |> mutate(year = year(as.Date(period_end_date))) + |> factor_time_scale() + + |> group_by(iso_3166_2, disease, year, original_dataset_id) + |> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE)) + |> ungroup() + + |> group_by(disease, year, original_dataset_id) + |> mutate(finest_timescale = min(time_scale)) + |> ungroup() + ) + + scales = (starting_data + |> filter(time_scale == finest_timescale) + |> distinct(disease, nesting_disease, basal_disease, + year, time_scale, period_start_date, period_end_date, + period_mid_date, days_this_period, original_dataset_id) + ) + + # records for which all_zero = true and finest_timescale isn't available + get_new_zeros = (starting_data + |> filter(time_scale > finest_timescale, all_zero) + + # filter for timescales that are not in the original data + |> anti_join(starting_data + , by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale', + 'disease', 'nesting_disease', 'basal_disease', 'dataset_id' + )) # nesting/basal too? see if that changes result! + + |> select(-period_start_date, -period_end_date, -period_mid_date, + -days_this_period) + ) + + # for rows in get_new_records, find the periods (i.e. start and end dates) + # for the finest_timescale for a given year, disease, and original_dataset_id + new_zeros = + (get_new_zeros + |> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale', + 'nesting_disease', 'basal_disease', 'original_dataset_id' + ), relationship = "many-to-many") + |> select(-time_scale, -year, -all_zero) + |> rename(time_scale = finest_timescale) + + |> mutate(record_origin = 'derived') + ) + + # join back to original data + (data + |> mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical')) + |> rbind(new_zeros) + ) +} + + +#' Find Unaccounted Cases +#' +#' Make new records for instances when the sum of leaf diseases is less than +#' the reported total for their basal disease. The difference between these +#' counts gets disease name 'basal_disease'_unaccounted'. +#' +#' +#' @param data A tidy data set with a `basal_disease` column. +#' +#' @return A data set containing records that are the difference between a +#' reported total for a basal_disease and the sum of their leaf diseases +#' +#' @export +find_unaccounted_cases = function(data){ + + # check if sum of leaf diseases = reported sum of basal disease + sum_of_leaf = ( + data + %>% filter(!disease %in% unique(nesting_disease)) + %>% filter(disease != basal_disease) + %>% group_by(iso_3166, iso_3166_2, period_start_date, period_end_date, nesting_disease) + %>% summarise(cases_this_period = sum(as.numeric(cases_this_period))) + ) + + reported_totals = ( + data + %>% filter(nesting_disease == '') + %>% filter(disease %in% sum_of_leaf$nesting_disease) + %>% select(-nesting_disease) + %>% rename(nesting_disease = disease) + ) + + # if sum of leaf diseases is < reported sum of basal disease, + # make new sub-disease called 'disease-name'_unaccounted, which contains + # the difference between sum of leaf diseases and the reported sum of the disease + unaccounted_data = + (inner_join(sum_of_leaf, reported_totals, by = + c('iso_3166', 'iso_3166_2', 'period_start_date', 'period_end_date', 'nesting_disease'), + suffix = c('_sum', '_reported')) + + %>% mutate(cases_this_period_reported = as.numeric(cases_this_period_reported), + cases_this_period_sum = as.numeric(cases_this_period_sum)) + + %>% filter(cases_this_period_sum < cases_this_period_reported) + %>% mutate(diff = cases_this_period_reported - cases_this_period_sum) + %>% rename(cases_this_period = diff) + %>% mutate(disease = paste(nesting_disease, 'unaccounted', sep = '_')) + + %>% select(-cases_this_period_reported, -cases_this_period_sum) + + %>% mutate(original_dataset_id = '') + %>% mutate(historical_disease = '') + %>% mutate(record_origin = 'derived') + ) + + (data + |> mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical')) + |> rbind(unaccounted_data) + ) +} + diff --git a/R/iidda/man/factor_time_scale.Rd b/R/iidda/man/factor_time_scale.Rd new file mode 100644 index 00000000..22e7784f --- /dev/null +++ b/R/iidda/man/factor_time_scale.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_prep_tools.R +\name{factor_time_scale} +\alias{factor_time_scale} +\title{Factor Time Scale} +\usage{ +factor_time_scale(data) +} +\arguments{ +\item{data}{A tidy data set with a `time_scale` column.} +} +\value{ +A data set with a factored time_scale column. +} +\description{ +Factor Time Scale +} diff --git a/R/iidda/man/filter_out_time_scales.Rd b/R/iidda/man/filter_out_time_scales.Rd index 20583649..5b444de4 100644 --- a/R/iidda/man/filter_out_time_scales.Rd +++ b/R/iidda/man/filter_out_time_scales.Rd @@ -6,9 +6,10 @@ \usage{ filter_out_time_scales( data, - initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease"), + initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease", + "basal_disease"), final_group = c("basal_disease"), - cleanup = TRUE + aggregate_if_unavailable = TRUE ) } \arguments{ @@ -20,8 +21,9 @@ the initial grouping used to compute the shortest time scales.} \item{final_group}{Character vector naming columns for defining the final grouping used to compute the longest of the shortest time scales.} -\item{cleanup}{Should intermediate columns be removed before returning the -output} +\item{aggregate_if_unavailable}{If a location is not reporting for the determined +'best timescale', but is reporting at a finer timescale, aggregate this finer +timescale to the 'best timescale'} } \value{ A data set only containing records with the best time scale. diff --git a/R/iidda/man/find_unaccounted_cases.Rd b/R/iidda/man/find_unaccounted_cases.Rd new file mode 100644 index 00000000..d7ca81e5 --- /dev/null +++ b/R/iidda/man/find_unaccounted_cases.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_prep_tools.R +\name{find_unaccounted_cases} +\alias{find_unaccounted_cases} +\title{Find Unaccounted Cases} +\usage{ +find_unaccounted_cases(data) +} +\arguments{ +\item{data}{A tidy data set with a `basal_disease` column.} +} +\value{ +A data set containing records that are the difference between a +reported total for a basal_disease and the sum of their leaf diseases +} +\description{ +Make new records for instances when the sum of leaf diseases is less than +the reported total for their basal disease. The difference between these +counts gets disease name 'basal_disease'_unaccounted'. +} diff --git a/R/iidda/man/get_implied_zeros.Rd b/R/iidda/man/get_implied_zeros.Rd new file mode 100644 index 00000000..d6452474 --- /dev/null +++ b/R/iidda/man/get_implied_zeros.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_prep_tools.R +\name{get_implied_zeros} +\alias{get_implied_zeros} +\title{Get Implied Zeros} +\usage{ +get_implied_zeros(data) +} +\arguments{ +\item{data}{A tidy data set} +} +\value{ +A tidy data set with inferred 0s +} +\description{ +Add zeros to data set that are implied by a '0' reported at a coarser timescale. +}