-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.R
110 lines (90 loc) · 2.99 KB
/
analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# This script will load the processed data and run the model.
# Prior to this, we aggregated 2013-2019 death index data into daily death counts for each county, loaded in historical daily max heat index data to determine heat-event days between 2013-2019, merged these with population and mass shootings data, subtracted mass shooting deaths, and filtered for the 41 most populous counties.
# Library
library(tidyverse)
library(mgcv)
# Load processed data
df_processed <- read.csv("data/processed.csv") %>%
# only using april to september in this setting
filter(month >= 4 & month <= 9)
# County names
counties <- df_processed$county %>%
unique()
# Prepare variables to store output data
output <- NULL
# Loop through the counties
for(i in 1:length(counties)) {
c <- counties[i]
print(paste0(i, " ", c, "..."))
# filter data to that county
df_county <- df_processed %>%
filter(county == c)
# model
model <- gam(
deaths ~
is_heat_event +
year +
as.factor(month) +
as.factor(is_weekend) +
offset(log(population)),
family = nb(link = "log"),
data = df_county
)
# create new data for daily predicted values
new_df_county <- df_county %>%
# suppose there were no heat days
mutate(is_heat_event = F)
# predict daily deaths had there been no heat-event days
ginv <- model$family$linkinv
prs <- predict(
model,
newdata = new_df_county,
type = "link",
se.fit = T,
interval = "predictions"
)
new_df_county$pred <- ginv(prs[[1]])
new_df_county$lo <- ginv(prs[[1]] - 1.96 * prs[[2]])
new_df_county$up <- ginv(prs[[1]] + 1.96 * prs[[2]])
# original distribution of heat-event days
new_df_county$is_heat_event <- df_county$is_heat_event
# subset of heat-event days and non heat-event days
new_df_county_heat_days <- subset(
new_df_county,
is_heat_event == T
)
new_df_county_non_heat_days <- subset(
new_df_county,
is_heat_event == F
)
# actual and predicted deaths on heat-event event days
deaths.heatdays <- sum(new_df_county_heat_days$deaths)
deaths.predicted <- sum(new_df_county_heat_days$pred)
# relative risk
rr <- deaths.heatdays / deaths.predicted
rr.lo <- exp(log(rr - 1.96 * sqrt(1 / deaths.heatdays + 1 / deaths.predicted)))
rr.up <- exp(log(rr + 1.96 * sqrt(1 / deaths.heatdays + 1 / deaths.predicted)))
# excess deaths
excess <- sum(new_df_county_heat_days$deaths - new_df_county_heat_days$pred)
excess_up <- sum(new_df_county_heat_days$deaths - new_df_county_heat_days$lo)
excess_lo <- sum(new_df_county_heat_days$deaths - new_df_county_heat_days$up)
# average population
avg_population <- mean(df_county$population)
# output row
county_output <- data.frame(
county = c,
avg_population,
deaths.heatdays,
deaths.predicted,
rr,
rr.lo,
rr.up,
excess,
excess_up,
excess_lo
)
output <- rbind(output, county_output)
}
# Write output as CSV
output %>%
write.csv("data/output.csv", row.names = F)