From ccb3230bd6cd91106531568a7a1bd7fdda8850d7 Mon Sep 17 00:00:00 2001
From: gabriellemackinnon <gabrielle.mackinnon@mail.mcgill.ca>
Date: Mon, 8 Jul 2024 10:20:08 -0400
Subject: [PATCH] fix identify_scales

---
 R/iidda/NAMESPACE                             |  2 +-
 R/iidda/R/data_prep_tools.R                   | 97 ++++++++++---------
 ...ime_scales.Rd => normalize_time_scales.Rd} | 17 ++--
 3 files changed, 63 insertions(+), 53 deletions(-)
 rename R/iidda/man/{filter_out_time_scales.Rd => normalize_time_scales.Rd} (67%)

diff --git a/R/iidda/NAMESPACE b/R/iidda/NAMESPACE
index d69d5d43..533209d7 100644
--- a/R/iidda/NAMESPACE
+++ b/R/iidda/NAMESPACE
@@ -34,7 +34,6 @@ export(factor_time_scale)
 export(failed_prep_script_outcomes)
 export(fill_and_wrap)
 export(fill_re_template)
-export(filter_out_time_scales)
 export(find_unaccounted_cases)
 export(fix_csv)
 export(flatten_disease_hierarchy)
@@ -83,6 +82,7 @@ export(melt_tracking_table_keys)
 export(memoise_remove_age)
 export(nlist)
 export(normalize_diseases)
+export(normalize_time_scales)
 export(open_locally)
 export(or_pattern)
 export(pipeline_exploration_starter)
diff --git a/R/iidda/R/data_prep_tools.R b/R/iidda/R/data_prep_tools.R
index 1916327e..4bb47829 100644
--- a/R/iidda/R/data_prep_tools.R
+++ b/R/iidda/R/data_prep_tools.R
@@ -505,6 +505,7 @@ identify_time_scales = function(data){
    %>% mutate(time_scale = ifelse(period_end_date == as.Date(period_start_date) + 6, "wk", "2wk"))
    %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 14, "mo", time_scale))
    %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 40, "qr", time_scale))
+   %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 260, "3qr", time_scale))
    %>% mutate(time_scale = ifelse(as.Date(period_end_date)-as.Date(period_start_date) > 300, "yr", time_scale))
   )
 }
@@ -869,38 +870,44 @@ filter_out_time_scales_old = function(data
 }
 
 
-#' Filter out Time Scales
+#' Normalize Time Scales
 #'
 #' Choose a single best `time_scale` for each year in a dataset, grouped by
 #' nesting disease. This best `time_scale` is defined as the longest
 #' of the shortest time scales in each location and sub-disease.
 #'
-#' @param data A tidy data set with a `time_scale` column.
+#' @param data A tidy data set with a `time_scale` and `year` column
 #' @param initial_group Character vector naming columns for defining
 #' the initial grouping used to compute the shortest time scales.
 #' @param final_group Character vector naming columns for defining the final
 #' grouping used to compute the longest of the shortest time scales.
+#' @param get_implied_zeros Add zeros that are implied by a '0' reported at a coarser timescale.
 #' @param aggregate_if_unavailable If a location is not reporting for the determined
 #' 'best timescale', but is reporting at a finer timescale, aggregate this finer 
 #' timescale to the 'best timescale'
 #'
-#' @return A data set only containing records with the best time scale.
+#' @return A data set only containing records with the optimal time scale.
 #'
 #' @importFrom lubridate year
 #' @export
-filter_out_time_scales = function(data
-    , initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease")
+normalize_time_scales = function(data
+    , initial_group = c("year", "iso_3166", "iso_3166_2", "disease", "nesting_disease", "basal_disease")
     , final_group = c("basal_disease")
+    , get_implied_zeros = TRUE
     , aggregate_if_unavailable = TRUE
 ) {
   
+  if(get_implied_zeros) data = get_implied_zeros(data)
+  
   if (length(unique(data$time_scale)) == 1L) return(data)
   
+  if (!"year" %in% colnames(data)) {stop("The column 'year' does not exist in the dataset.")}
+  
   new_data = (data
     # remove '_unaccounted' cases when deciding best time_scale
     |> factor_time_scale()
     |> filter(!grepl("_unaccounted$", disease))
-    |> mutate(year = year(period_end_date))
+   # |> mutate(year = year(period_end_date))
     |> group_by(across(all_of(c("year", initial_group))))
     |> mutate(shortest_time_scale = time_scale_chooser(time_scale, which.min))
     |> ungroup()
@@ -920,7 +927,7 @@ filter_out_time_scales = function(data
     |> rbind(new_data)
   )
   
-  if (isTRUE(aggregate_if_unavailable)) {
+  if(aggregate_if_unavailable) {
     # coarse scales to aggregate to
     scales = (all_new_data
     |> select(period_start_date, period_end_date, disease, nesting_disease, basal_disease)
@@ -933,7 +940,7 @@ filter_out_time_scales = function(data
     # available at a finer timescale
     data_to_aggregate = (data 
      |> factor_time_scale()
-     |> mutate(year = year(period_end_date))
+   #  |> mutate(year = year(period_end_date))
      |> left_join(select(all_new_data, "year","disease", "nesting_disease", "basal_disease", "time_scale") |> unique(),
                   by = c("year", "disease", "nesting_disease", "basal_disease"),
                   suffix = c('_old', '_new'))
@@ -968,11 +975,12 @@ filter_out_time_scales = function(data
                  period_start_date, period_end_date, .keep_all = TRUE)
      
      # add back days_this_period and period_mid_date for the coarser start and end dates
+     # FIXME: apparently using iidda analysis functions will cause issues. oops
      |> mutate(days_this_period = iidda.analysis::num_days(period_start_date, period_end_date))
      |> mutate(period_mid_date = iidda.analysis::mid_dates(period_start_date, period_end_date, days_this_period))
      |> select(-time_scale_old)
      
-     |> mutate(record_origin = 'derived')
+     |> mutate(record_origin = 'derived-aggregated-timescales')
     )
     
     final = (all_new_data
@@ -998,50 +1006,49 @@ filter_out_time_scales = function(data
 get_implied_zeros = function(data){
   
   starting_data = (data
-                   |> mutate(year = year(as.Date(period_end_date)))
-                   |> factor_time_scale()
-                   
-                   |> group_by(iso_3166_2, disease, year, original_dataset_id)
-                   |> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE))
-                   |> ungroup()
-                   
-                   |> group_by(disease, year, original_dataset_id)
-                   |> mutate(finest_timescale = min(time_scale))
-                   |> ungroup()
+   |> mutate(year = year(as.Date(period_end_date)))
+   |> factor_time_scale()
+   
+   |> group_by(iso_3166_2, disease, year, original_dataset_id)
+   |> mutate(all_zero = ifelse(sum(as.numeric(cases_this_period)) == 0, TRUE, FALSE))
+   |> ungroup()
+   
+   |> group_by(disease, year, original_dataset_id)
+   |> mutate(finest_timescale = min(time_scale))
+   |> ungroup()
   )
   
   scales = (starting_data
-            |> filter(time_scale == finest_timescale)
-            |> distinct(disease, nesting_disease, basal_disease,
-                        year, time_scale, period_start_date, period_end_date, 
-                        period_mid_date, days_this_period, original_dataset_id)
+  |> filter(time_scale == finest_timescale)
+  |> distinct(disease, nesting_disease, basal_disease,
+              year, time_scale, period_start_date, period_end_date, 
+              period_mid_date, days_this_period, original_dataset_id)
   )
   
   # records for which all_zero = true and finest_timescale isn't available
   get_new_zeros = (starting_data                
-                   |> filter(time_scale > finest_timescale, all_zero)
-                   
-                   # filter for timescales that are not in the original data
-                   |> anti_join(starting_data
-                                , by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale',
-                                         'disease', 'nesting_disease', 'basal_disease', 'dataset_id'
-                                )) # nesting/basal too? see if that changes result!
-                   
-                   |> select(-period_start_date, -period_end_date, -period_mid_date,
-                             -days_this_period)
+ |> filter(time_scale > finest_timescale, all_zero)
+ 
+ # filter for timescales that are not in the original data
+ |> anti_join(starting_data
+              , by = c('iso_3166_2', 'year', 'finest_timescale' = 'time_scale',
+                       'disease', 'nesting_disease', 'basal_disease', 'dataset_id'
+              )) # nesting/basal too? see if that changes result!
+ 
+ |> select(-period_start_date, -period_end_date, -period_mid_date,
+           -days_this_period)
   ) 
   
   # for rows in get_new_records, find the periods (i.e. start and end dates)
   # for the finest_timescale for a given year, disease, and original_dataset_id
-  new_zeros = 
-    (get_new_zeros
-     |> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale',
-                                 'nesting_disease', 'basal_disease', 'original_dataset_id'
-     ), relationship = "many-to-many")
-     |> select(-time_scale, -year, -all_zero)
-     |> rename(time_scale = finest_timescale)
-     
-     |> mutate(record_origin = 'derived')
+  new_zeros = (get_new_zeros
+   |> left_join(scales, by = c('disease', 'year', 'finest_timescale' = 'time_scale',
+                               'nesting_disease', 'basal_disease', 'original_dataset_id'), 
+                relationship = "many-to-many")
+   |> select(-time_scale, -year, -all_zero)
+   |> rename(time_scale = finest_timescale)
+   
+   |> mutate(record_origin = 'derived-implied-zero')
     )
   
   # join back to original data
@@ -1104,12 +1111,12 @@ find_unaccounted_cases = function(data){
      
      %>% mutate(original_dataset_id = '')
      %>% mutate(historical_disease = '')
-     %>% mutate(record_origin = 'derived')
+     %>% mutate(record_origin = 'derived-unaccounted-cases')
     ) 
   
   (data
-    |> mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical'))
-    |> rbind(unaccounted_data)
+    %>% mutate(record_origin = ifelse("record_origin" %in% names(data), record_origin, 'historical'))
+    %>% rbind(unaccounted_data)
   )
 }
 
diff --git a/R/iidda/man/filter_out_time_scales.Rd b/R/iidda/man/normalize_time_scales.Rd
similarity index 67%
rename from R/iidda/man/filter_out_time_scales.Rd
rename to R/iidda/man/normalize_time_scales.Rd
index 5b444de4..4bc5e90a 100644
--- a/R/iidda/man/filter_out_time_scales.Rd
+++ b/R/iidda/man/normalize_time_scales.Rd
@@ -1,19 +1,20 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/data_prep_tools.R
-\name{filter_out_time_scales}
-\alias{filter_out_time_scales}
-\title{Filter out Time Scales}
+\name{normalize_time_scales}
+\alias{normalize_time_scales}
+\title{Normalize Time Scales}
 \usage{
-filter_out_time_scales(
+normalize_time_scales(
   data,
-  initial_group = c("iso_3166", "iso_3166_2", "disease", "nesting_disease",
+  initial_group = c("year", "iso_3166", "iso_3166_2", "disease", "nesting_disease",
     "basal_disease"),
   final_group = c("basal_disease"),
+  get_implied_zeros = TRUE,
   aggregate_if_unavailable = TRUE
 )
 }
 \arguments{
-\item{data}{A tidy data set with a `time_scale` column.}
+\item{data}{A tidy data set with a `time_scale` and `year` column}
 
 \item{initial_group}{Character vector naming columns for defining
 the initial grouping used to compute the shortest time scales.}
@@ -21,12 +22,14 @@ the initial grouping used to compute the shortest time scales.}
 \item{final_group}{Character vector naming columns for defining the final
 grouping used to compute the longest of the shortest time scales.}
 
+\item{get_implied_zeros}{Add zeros that are implied by a '0' reported at a coarser timescale.}
+
 \item{aggregate_if_unavailable}{If a location is not reporting for the determined
 'best timescale', but is reporting at a finer timescale, aggregate this finer 
 timescale to the 'best timescale'}
 }
 \value{
-A data set only containing records with the best time scale.
+A data set only containing records with the optimal time scale.
 }
 \description{
 Choose a single best `time_scale` for each year in a dataset, grouped by