Skip to content

Commit

Permalink
new prep tools
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriellemackinnon committed Jul 4, 2024
1 parent 8bc5f91 commit 82e1ab6
Show file tree
Hide file tree
Showing 7 changed files with 332 additions and 9 deletions.
2 changes: 1 addition & 1 deletion R/iidda/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ Imports:
readr
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.3.9000
RoxygenNote: 7.3.2
3 changes: 3 additions & 0 deletions R/iidda/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ export(extract_all_between_paren)
export(extract_between_paren)
export(extract_char_or_blank)
export(extract_or_blank)
export(factor_time_scale)
export(failed_prep_script_outcomes)
export(fill_and_wrap)
export(fill_re_template)
export(filter_out_time_scales)
export(find_unaccounted_cases)
export(fix_csv)
export(flatten_disease_hierarchy)
export(freq_to_by)
Expand All @@ -43,6 +45,7 @@ export(get_canmod_digitization_metadata)
export(get_dataset_path)
export(get_elements)
export(get_firsts)
export(get_implied_zeros)
export(get_items)
export(get_lookup_table)
export(get_main_script)
Expand Down
272 changes: 268 additions & 4 deletions R/iidda/R/data_prep_tools.R
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ identify_time_scales = function(data){
%>% mutate(time_scale = ifelse(period_end_date == as.Date(period_start_date) + 6, "wk", "2wk"))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 14, "mo", time_scale))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 40, "qr", time_scale))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 100, "yr", time_scale))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 300, "yr", time_scale))
)
}
canada_province_scale_finder = function(data) {
Expand Down Expand Up @@ -803,7 +803,25 @@ time_scale_chooser = function(time_scale, which_fun) {
r
}

#' Filter out Time Scales
#' Factor Time Scale
#'
#' @param data A tidy data set with a `time_scale` column.
#'
#' @return A data set with a factored time_scale column.
#'
#' @export
factor_time_scale = function(data){
if (is.factor(data$time_scale)) {
return(data)
}
time_scale_map = c(wk = "wk", yr = "yr", mo = "mo", `2wk` = "2wk", mt = "mo", `two-wks` = "2wk", qrtr = "qr", qr = "qr")
data$time_scale = time_scale_map[as.character(data$time_scale)]
order = c("wk", "2wk", "mo", "qr", "yr")

return(mutate(data, time_scale = factor(data$time_scale, levels = order, ordered = TRUE)))
}

#' Filter out Time Scales OLD
#'
#' Choose a single best `time_scale` for each year in a dataset, grouped by
#' nesting disease. This best `time_scale` is defined as the longest
Expand All @@ -820,8 +838,8 @@ time_scale_chooser = function(time_scale, which_fun) {
#' @return A data set only containing records with the best time scale.
#'
#' @importFrom lubridate year
#' @export
filter_out_time_scales = function(data
#' @noRd
filter_out_time_scales_old = function(data
, initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease")
, final_group = c("basal_disease")
, cleanup = TRUE
Expand Down Expand Up @@ -849,3 +867,249 @@ filter_out_time_scales = function(data
}
new_data
}


#' Filter out Time Scales
#'
#' Choose a single best `time_scale` for each year in a dataset, grouped by
#' nesting disease. This best `time_scale` is defined as the longest
#' of the shortest time scales in each location and sub-disease.
#'
#' @param data A tidy data set with a `time_scale` column.
#' @param initial_group Character vector naming columns for defining
#' the initial grouping used to compute the shortest time scales.
#' @param final_group Character vector naming columns for defining the final
#' grouping used to compute the longest of the shortest time scales.
#' @param aggregate_if_unavailable If a location is not reporting for the determined
#' 'best timescale', but is reporting at a finer timescale, aggregate this finer
#' timescale to the 'best timescale'
#'
#' @return A data set only containing records with the best time scale.
#'
#' @importFrom lubridate year
#' @export
filter_out_time_scales = function(data
, initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease")
, final_group = c("basal_disease")
, aggregate_if_unavailable = TRUE
) {

if (length(unique(data$time_scale)) == 1L) return(data)

new_data = (data
# remove '_unaccounted' cases when deciding best time_scale
|> factor_time_scale()
|> filter(!grepl("_unaccounted$", disease))
|> mutate(year = year(period_end_date))
|> group_by(across(all_of(c("year", initial_group))))
|> mutate(shortest_time_scale = time_scale_chooser(time_scale, which.min))
|> ungroup()
|> group_by(across(all_of(c("year", final_group))))
|> mutate(best_time_scale = time_scale_chooser(shortest_time_scale, which.max))
|> ungroup()
|> filter(as.character(time_scale) == best_time_scale)
|> select(-best_time_scale, -shortest_time_scale)
)

# adding "unaccounted" data back, at the best_time_scale
all_new_data = (data
|> filter(grepl("_unaccounted$", disease))
|> mutate(year = year(period_end_date))
|> semi_join(select(new_data, "year", "time_scale", "disease", "nesting_disease", "basal_disease") |> unique(),
by = c("year", "time_scale", final_group))
|> rbind(new_data)
)

if (isTRUE(aggregate_if_unavailable)) {
# coarse scales to aggregate to
scales = (all_new_data
|> select(period_start_date, period_end_date, disease, nesting_disease, basal_disease)
|> unique()
|> rename(coarser_start_date = period_start_date,
coarser_end_date = period_end_date)
)

# data which isn't available at 'best_time_scale' for the year, but is
# available at a finer timescale
data_to_aggregate = (data
|> factor_time_scale()
|> mutate(year = year(period_end_date))
|> left_join(select(all_new_data, "year","disease", "nesting_disease", "basal_disease", "time_scale") |> unique(),
by = c("year", "disease", "nesting_disease", "basal_disease"),
suffix = c('_old', '_new'))
|> filter(time_scale_old < time_scale_new)
|> mutate(period_start_date = as.Date(period_start_date),
period_end_date = as.Date(period_end_date))

# keep only data which isn't available at the 'best time scale' (which is now the timescale in all_new_data)
|> anti_join(select(all_new_data,"iso_3166_2", "year","disease", "nesting_disease", "basal_disease", "time_scale") |> unique()
, by = c('time_scale_new' = 'time_scale', 'disease', 'year', 'nesting_disease','basal_disease', 'iso_3166_2'))
)

aggregated_unavailable_data = (scales
|> inner_join(data_to_aggregate, by = c("disease", "nesting_disease", "basal_disease"), relationship = 'many-to-many')
|> filter(period_end_date > coarser_start_date & period_end_date <= coarser_end_date)
|> select(names(data_to_aggregate), coarser_start_date, coarser_end_date)

|> group_by(iso_3166, iso_3166_2, disease, nesting_disease, basal_disease, coarser_start_date, coarser_end_date)
|> mutate(cases_coarse_period = sum(as.numeric(cases_this_period)))
|> mutate(population = round(mean(as.numeric(population))),
population_reporting = round(mean(as.numeric(population_reporting))))
|> ungroup()

|> select(-cases_this_period, -period_start_date, -period_end_date,
-days_this_period, -period_mid_date)
|> rename(time_scale = time_scale_new,
cases_this_period = cases_coarse_period,
period_start_date = coarser_start_date,
period_end_date = coarser_end_date)

|> distinct(iso_3166, iso_3166_2, disease, nesting_disease, basal_disease,
period_start_date, period_end_date, .keep_all = TRUE)

# add back days_this_period and period_mid_date for the coarser start and end dates
|> mutate(days_this_period = iidda.analysis::num_days(period_start_date, period_end_date))
|> mutate(period_mid_date = iidda.analysis::mid_dates(period_start_date, period_end_date, days_this_period))
|> select(-time_scale_old)

|> mutate(record_origin = 'derived')
)

final = (all_new_data
|> mutate(record_origin = ifelse("record_origin" %in% names(all_new_data), record_origin, 'historical'))
|> rbind(aggregated_unavailable_data)
)

return(final)
} else{
return(all_new_data)
}
}

#' Get Implied Zeros
#'
#' Add zeros to data set that are implied by a '0' reported at a coarser timescale.
#'
#' @param data A tidy data set
#'
#' @return A tidy data set with inferred 0s
#'
#' @export
get_implied_zeros = function(data){

starting_data = (data
|> mutate(year = year(as.Date(period_end_date)))
|> factor_time_scale()

|> group_by(iso_3166_2, disease, year, original_dataset_id)
|> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE))
|> ungroup()

|> group_by(disease, year, original_dataset_id)
|> mutate(finest_timescale = min(time_scale))
|> ungroup()
)

scales = (starting_data
|> filter(time_scale == finest_timescale)
|> distinct(disease, nesting_disease, basal_disease,
year, time_scale, period_start_date, period_end_date,
period_mid_date, days_this_period, original_dataset_id)
)

# records for which all_zero = true and finest_timescale isn't available
get_new_zeros = (starting_data
|> filter(time_scale > finest_timescale, all_zero)

# filter for timescales that are not in the original data
|> anti_join(starting_data
, by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale',
'disease', 'nesting_disease', 'basal_disease', 'dataset_id'
)) # nesting/basal too? see if that changes result!

|> select(-period_start_date, -period_end_date, -period_mid_date,
-days_this_period)
)

# for rows in get_new_records, find the periods (i.e. start and end dates)
# for the finest_timescale for a given year, disease, and original_dataset_id
new_zeros =
(get_new_zeros
|> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale',
'nesting_disease', 'basal_disease', 'original_dataset_id'
), relationship = "many-to-many")
|> select(-time_scale, -year, -all_zero)
|> rename(time_scale = finest_timescale)

|> mutate(record_origin = 'derived')
)

# join back to original data
(data
|> mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical'))
|> rbind(new_zeros)
)
}


#' Find Unaccounted Cases
#'
#' Make new records for instances when the sum of leaf diseases is less than
#' the reported total for their basal disease. The difference between these
#' counts gets disease name 'basal_disease'_unaccounted'.
#'
#'
#' @param data A tidy data set with a `basal_disease` column.
#'
#' @return A data set containing records that are the difference between a
#' reported total for a basal_disease and the sum of their leaf diseases
#'
#' @export
find_unaccounted_cases = function(data){

# check if sum of leaf diseases = reported sum of basal disease
sum_of_leaf = (
data
%>% filter(!disease %in% unique(nesting_disease))
%>% filter(disease != basal_disease)
%>% group_by(iso_3166, iso_3166_2, period_start_date, period_end_date, nesting_disease)
%>% summarise(cases_this_period = sum(as.numeric(cases_this_period)))
)

reported_totals = (
data
%>% filter(nesting_disease == '')
%>% filter(disease %in% sum_of_leaf$nesting_disease)
%>% select(-nesting_disease)
%>% rename(nesting_disease = disease)
)

# if sum of leaf diseases is < reported sum of basal disease,
# make new sub-disease called 'disease-name'_unaccounted, which contains
# the difference between sum of leaf diseases and the reported sum of the disease
unaccounted_data =
(inner_join(sum_of_leaf, reported_totals, by =
c('iso_3166', 'iso_3166_2', 'period_start_date', 'period_end_date', 'nesting_disease'),
suffix = c('_sum', '_reported'))

%>% mutate(cases_this_period_reported = as.numeric(cases_this_period_reported),
cases_this_period_sum = as.numeric(cases_this_period_sum))

%>% filter(cases_this_period_sum < cases_this_period_reported)
%>% mutate(diff = cases_this_period_reported - cases_this_period_sum)
%>% rename(cases_this_period = diff)
%>% mutate(disease = paste(nesting_disease, 'unaccounted', sep = '_'))

%>% select(-cases_this_period_reported, -cases_this_period_sum)

%>% mutate(original_dataset_id = '')
%>% mutate(historical_disease = '')
%>% mutate(record_origin = 'derived')
)

(data
|> mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical'))
|> rbind(unaccounted_data)
)
}

17 changes: 17 additions & 0 deletions R/iidda/man/factor_time_scale.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 6 additions & 4 deletions R/iidda/man/filter_out_time_scales.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions R/iidda/man/find_unaccounted_cases.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 82e1ab6

Please sign in to comment.