-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_wastewater_and_case_data.R
74 lines (59 loc) · 3.84 KB
/
clean_wastewater_and_case_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
library(tidyverse)
library(janitor)
library(here)
library(lubridate)
library(rio)
# Load daily case numbers.
# Keep only the 7-day rolling average of all cases (confirmed and probable),
# exposed as `cases`, alongside the parsed `date`.
cases <- here("data", "raw", "cases-by-day.csv") %>%
  read_csv() %>%
  clean_names() %>%
  select(date = date_of_interest, cases = all_case_count_7day_avg) %>%
  # Raw dates are written month/day/year.
  mutate(date = as.Date(date, format = "%m/%d/%Y"))
# Load wastewater data.
# Parse the sample date and keep one row per sample with short column names:
# plant name, estimated population served, annotation, and the N1 gene-target
# concentration (stored as `load`).
wastewater <- here("data", "raw", "SARS-CoV-2_concentrations_measured_in_NYC_Wastewater.csv") %>%
  read_csv() %>%
  clean_names() %>%
  mutate(date = as.Date(sample_date, format = "%m/%d/%Y")) %>%
  select(
    date,
    plant_name = wrrf_name,
    pop = population_served_estimated,
    annotation,
    load = concentration_sars_co_v_2_gene_target_n1_copies_l
  )
# There are hundreds of null values in the data; the annotations explain why.
# List each distinct annotation so they can be inspected.
annotations <- distinct(wastewater, annotation)
# Based on conversations with Dave Larsen at Syracuse, null values are handled
# in two ways:
#   1. Rows whose annotation says the concentration is below the method limit
#      of detection get a load of 15 (midway between 0 and the lowest value in
#      the data), whether or not a value was reported. The four annotation
#      spellings that count are listed verbatim in the %in% set below; the last
#      one is the variant that is missing the " - " separator.
#   2. Any other annotated row with a null load gets 0.
# Rows without an annotation keep their load unchanged.
wastewater_clean <- wastewater %>%
  mutate(
    load = case_when(
      # Un-annotated rows aren't the problem; handle them first so they pass
      # through untouched and never reach the rules below.
      is.na(annotation) ~ load,
      annotation %in% c(
        "Concentration below Method Limit of Detection",
        "Concentration below Method Limit of Detection - No signal in 1 out of 3 RT-qPCR wells, result is obtained by averaging signal from the two remaining RT-qPCR wells",
        "Concentration below Method Limit of Detection - No signal is 2 out of 3 RT-qPCR wells, result in obtained by averaging signal from the remaining RT-qPCR well",
        "Concentration below Method Limit of DetectionNo signal is 2 out of 3 RT-qPCR wells, result in obtained by averaging signal from the remaining RT-qPCR well"
      ) ~ 15,
      is.na(load) ~ 0,
      TRUE ~ load
    )
  )
# With the clean data, sum the load and the population served across all
# samples for each date, then divide total load by total population.
# NOTE(review): this is sum(load) / sum(pop), not sum(load * pop) / sum(pop);
# confirm this is the intended "weighted" metric.
wastewater_daily_avg <- wastewater_clean %>%
  group_by(date) %>%
  summarise(
    total_pop = sum(pop),
    total_load = sum(load),
    weighted_avg = total_load / total_pop
  )
# Same daily aggregation as above, restricted to samples dated on or after
# 2022-01-01, for Datawrapper.
# NOTE(review): there is no upper bound, so dates after 2022 are included if
# present in the data; confirm whether "just 2022" should also cap the range.
wastewater_daily_avg_2022 <- wastewater_clean %>%
  filter(date >= as.Date("2022-01-01")) %>%
  group_by(date) %>%
  summarise(
    total_pop = sum(pop),
    total_load = sum(load),
    weighted_avg = total_load / total_pop
  )
# Case counts dated on or after 2022-01-01, for Datawrapper.
# NOTE(review): there is no upper bound, so dates after 2022 are included if
# present in the data; confirm whether "just 2022" should also cap the range.
cases_2022 <- filter(cases, date >= as.Date("2022-01-01"))
# Export the data for DataWrapper.
# Output paths are built with here(), the same way the raw inputs are read, so
# the files land in <project root>/data/processed regardless of the working
# directory the script is run from.
processed_dir <- here("data", "processed")
# Create the output folder if it does not exist yet; no-op when it already does.
dir.create(processed_dir, recursive = TRUE, showWarnings = FALSE)
# NOTE: the "wasterwater" spelling in the file names is kept as-is so anything
# that already points at these files keeps working.
export(wastewater_daily_avg, file.path(processed_dir, "wasterwater_daily_avg.csv"))
export(wastewater_daily_avg_2022, file.path(processed_dir, "wasterwater_daily_avg_2022.csv"))
export(cases, file.path(processed_dir, "cases.csv"))
export(cases_2022, file.path(processed_dir, "cases_2022.csv"))