Skip to content

Commit

Permalink
fix identify_scales
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriellemackinnon committed Jul 8, 2024
1 parent a4b9106 commit ccb3230
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 53 deletions.
2 changes: 1 addition & 1 deletion R/iidda/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ export(factor_time_scale)
export(failed_prep_script_outcomes)
export(fill_and_wrap)
export(fill_re_template)
export(filter_out_time_scales)
export(find_unaccounted_cases)
export(fix_csv)
export(flatten_disease_hierarchy)
Expand Down Expand Up @@ -83,6 +82,7 @@ export(melt_tracking_table_keys)
export(memoise_remove_age)
export(nlist)
export(normalize_diseases)
export(normalize_time_scales)
export(open_locally)
export(or_pattern)
export(pipeline_exploration_starter)
Expand Down
97 changes: 52 additions & 45 deletions R/iidda/R/data_prep_tools.R
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ identify_time_scales = function(data){
%>% mutate(time_scale = ifelse(period_end_date == as.Date(period_start_date) + 6, "wk", "2wk"))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 14, "mo", time_scale))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 40, "qr", time_scale))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 260, "3qr", time_scale))
%>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 300, "yr", time_scale))
)
}
Expand Down Expand Up @@ -869,38 +870,44 @@ filter_out_time_scales_old = function(data
}


#' Filter out Time Scales
#' Normalize Time Scales
#'
#' Choose a single best `time_scale` for each year in a dataset, grouped by
#' nesting disease. This best `time_scale` is defined as the longest
#' of the shortest time scales in each location and sub-disease.
#'
#' @param data A tidy data set with a `time_scale` column.
#' @param data A tidy data set with a `time_scale` and `year` column
#' @param initial_group Character vector naming columns for defining
#' the initial grouping used to compute the shortest time scales.
#' @param final_group Character vector naming columns for defining the final
#' grouping used to compute the longest of the shortest time scales.
#' @param get_implied_zeros Add zeros that are implied by a '0' reported at a coarser timescale.
#' @param aggregate_if_unavailable If a location is not reporting for the determined
#' 'best timescale', but is reporting at a finer timescale, aggregate this finer
#' timescale to the 'best timescale'
#'
#' @return A data set only containing records with the best time scale.
#' @return A data set only containing records with the optimal time scale.
#'
#' @importFrom lubridate year
#' @export
filter_out_time_scales = function(data
, initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease")
normalize_time_scales = function(data
, initial_group = c("year", "iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease")
, final_group = c("basal_disease")
, get_implied_zeros = TRUE
, aggregate_if_unavailable = TRUE
) {

if(get_implied_zeros) data = get_implied_zeros(data)

if (length(unique(data$time_scale)) == 1L) return(data)

if (!"year" %in% colnames(data)) {stop("The column 'year' does not exist in the dataset.")}

new_data = (data
# remove '_unaccounted' cases when deciding best time_scale
|> factor_time_scale()
|> filter(!grepl("_unaccounted$", disease))
|> mutate(year = year(period_end_date))
# |> mutate(year = year(period_end_date))
|> group_by(across(all_of(c("year", initial_group))))
|> mutate(shortest_time_scale = time_scale_chooser(time_scale, which.min))
|> ungroup()
Expand All @@ -920,7 +927,7 @@ filter_out_time_scales = function(data
|> rbind(new_data)
)

if (isTRUE(aggregate_if_unavailable)) {
if(aggregate_if_unavailable) {
# coarse scales to aggregate to
scales = (all_new_data
|> select(period_start_date, period_end_date, disease, nesting_disease, basal_disease)
Expand All @@ -933,7 +940,7 @@ filter_out_time_scales = function(data
# available at a finer timescale
data_to_aggregate = (data
|> factor_time_scale()
|> mutate(year = year(period_end_date))
# |> mutate(year = year(period_end_date))
|> left_join(select(all_new_data, "year","disease", "nesting_disease", "basal_disease", "time_scale") |> unique(),
by = c("year", "disease", "nesting_disease", "basal_disease"),
suffix = c('_old', '_new'))
Expand Down Expand Up @@ -968,11 +975,12 @@ filter_out_time_scales = function(data
period_start_date, period_end_date, .keep_all = TRUE)

# add back days_this_period and period_mid_date for the coarser start and end dates
# FIXME: apparently using iidda analysis functions will cause issues. oops
|> mutate(days_this_period = iidda.analysis::num_days(period_start_date, period_end_date))
|> mutate(period_mid_date = iidda.analysis::mid_dates(period_start_date, period_end_date, days_this_period))
|> select(-time_scale_old)

|> mutate(record_origin = 'derived')
|> mutate(record_origin = 'derived-aggregated-timescales')
)

final = (all_new_data
Expand All @@ -998,50 +1006,49 @@ filter_out_time_scales = function(data
get_implied_zeros = function(data){

starting_data = (data
|> mutate(year = year(as.Date(period_end_date)))
|> factor_time_scale()
|> group_by(iso_3166_2, disease, year, original_dataset_id)
|> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE))
|> ungroup()
|> group_by(disease, year, original_dataset_id)
|> mutate(finest_timescale = min(time_scale))
|> ungroup()
|> mutate(year = year(as.Date(period_end_date)))
|> factor_time_scale()

|> group_by(iso_3166_2, disease, year, original_dataset_id)
|> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE))
|> ungroup()

|> group_by(disease, year, original_dataset_id)
|> mutate(finest_timescale = min(time_scale))
|> ungroup()
)

scales = (starting_data
|> filter(time_scale == finest_timescale)
|> distinct(disease, nesting_disease, basal_disease,
year, time_scale, period_start_date, period_end_date,
period_mid_date, days_this_period, original_dataset_id)
|> filter(time_scale == finest_timescale)
|> distinct(disease, nesting_disease, basal_disease,
year, time_scale, period_start_date, period_end_date,
period_mid_date, days_this_period, original_dataset_id)
)

# records for which all_zero = true and finest_timescale isn't available
get_new_zeros = (starting_data
|> filter(time_scale > finest_timescale, all_zero)
# filter for timescales that are not in the original data
|> anti_join(starting_data
, by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale',
'disease', 'nesting_disease', 'basal_disease', 'dataset_id'
)) # nesting/basal too? see if that changes result!
|> select(-period_start_date, -period_end_date, -period_mid_date,
-days_this_period)
|> filter(time_scale > finest_timescale, all_zero)

# filter for timescales that are not in the original data
|> anti_join(starting_data
, by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale',
'disease', 'nesting_disease', 'basal_disease', 'dataset_id'
)) # nesting/basal too? see if that changes result!

|> select(-period_start_date, -period_end_date, -period_mid_date,
-days_this_period)
)

# for rows in get_new_records, find the periods (i.e. start and end dates)
# for the finest_timescale for a given year, disease, and original_dataset_id
new_zeros =
(get_new_zeros
|> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale',
'nesting_disease', 'basal_disease', 'original_dataset_id'
), relationship = "many-to-many")
|> select(-time_scale, -year, -all_zero)
|> rename(time_scale = finest_timescale)

|> mutate(record_origin = 'derived')
new_zeros = (get_new_zeros
|> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale',
'nesting_disease', 'basal_disease', 'original_dataset_id'),
relationship = "many-to-many")
|> select(-time_scale, -year, -all_zero)
|> rename(time_scale = finest_timescale)

|> mutate(record_origin = 'derived-implied-zero')
)

# join back to original data
Expand Down Expand Up @@ -1104,12 +1111,12 @@ find_unaccounted_cases = function(data){

%>% mutate(original_dataset_id = '')
%>% mutate(historical_disease = '')
%>% mutate(record_origin = 'derived')
%>% mutate(record_origin = 'derived-unaccounted-cases')
)

(data
|> mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical'))
|> rbind(unaccounted_data)
%>% mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical'))
%>% rbind(unaccounted_data)
)
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit ccb3230

Please sign in to comment.