# demo analysis_stats_RQ1 & RQ2.R
# Packages used below (tidyverse pieces for wrangling and plotting)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
# library(R.utils)
`%notin%` <- Negate(`%in%`)
# Metadata of compounds that appear in at least 2 samples HPLCTOFMS ===================
stats_rq1 <- gc_hplc %>%
  select(File, collapsed_compound, product_cat, Values) %>%
  # some samples contain duplicate entries of the same compound with different
  # values, so we summarise those entries by taking their mean
  group_by(File, product_cat, collapsed_compound) %>%
  summarise(Values = mean(Values), .groups = "drop") %>%
  pivot_wider(names_from = collapsed_compound, values_from = Values) %>%
  column_to_rownames(var = "File")
select_col <- c()
for (c in 2:ncol(stats_rq1)) {
  # count the non-NA measurements of this compound in each product category
  non_na <- c()
  for (prod in unique(stats_rq1$product_cat)) {
    non_na <- c(non_na, sum(!is.na(stats_rq1[which(stats_rq1$product_cat == prod), c])))
  }
  # skip compounds that have more than two non-NA values in more than two
  # product categories; keep the rest
  if (sum(non_na > 2) > 2) {
    next
  }
  select_col <- c(select_col, c)
}
new_stats <- stats_rq1[, select_col]
new_stats$product_cat <- stats_rq1$product_cat
# Fill in missing values with a random draw between the two lowest measured
# values, as a stand-in for the limit of detection (LOD)
for (r in 1:nrow(new_stats)) {
  missing <- which(is.na(new_stats[r, ]))
  new_stats[r, missing] <- runif(length(missing),
                                 min = sort(gc_hplc$Values)[1],
                                 max = sort(gc_hplc$Values)[2])
}
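# Sanity check (added): after the LOD imputation no missing values should
# remain; stopifnot() halts with an error if any NA survives
stopifnot(!anyNA(new_stats))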
# stats_rq1 <- stats_rq1 %>%
# rownames_to_column(., var = "product_cat")
### Examining for normal distribution ----------------
## Histogram
# Create empty list to collect one histogram per plastic type
histlist <- list()
# For loop creating histograms of each plastic type
i <- 1
for (plastic in unique(transpose_df$plastic_type)) {
  grouped_plastic <- which(transpose_df$plastic_type == plastic)
  histlist[[i]] <- ggplot() +
    geom_histogram(mapping = aes(x = as.vector(t(transpose_df[grouped_plastic, 3:ncol(transpose_df)])))) +
    labs(x = plastic, title = plastic)
  i <- i + 1
}
# Arrange all histograms in one grid to view them together
gridExtra::grid.arrange(grobs = histlist)
## QQ Plot
# Create empty list to collect the QQ plot data
qqlist <- list()
# For loop creating QQ plots of each product category
i <- 1
# Set up a 3x3 plotting grid so all QQ plots appear together
par(mfrow = c(3, 3))
for (product in unique(transpose_df$product_cat)) {
  grouped_product <- which(transpose_df$product_cat == product)
  qqlist[[i]] <- qqnorm(as.vector(t(transpose_df[grouped_product, 3:ncol(transpose_df)])), main = product, xlab = product, col = 'steelblue')
  qqline(as.vector(t(transpose_df[grouped_product, 3:ncol(transpose_df)])), col = 'red')
  i <- i + 1
}
### Conclusion = they are not normally distributed
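# Formal complement (an added sketch): Shapiro-Wilk normality test per product
# category, using the same slicing as the QQ loop above; p < 0.05 rejects
# normality. shapiro.test() accepts between 3 and 5000 observations, so the
# vector is truncated if needed.
for (product in unique(transpose_df$product_cat)) {
  values <- as.vector(t(transpose_df[which(transpose_df$product_cat == product), 3:ncol(transpose_df)]))
  values <- values[!is.na(values)]
  if (length(values) >= 3) {
    print(paste0(product, ": p = ", shapiro.test(head(values, 5000))$p.value))
  }
}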
### Examining for equality of variance -----------------------------
# Levene's test is robust for non-normally distributed data (car::leveneTest());
# variances differ significantly if p-value < 0.05
library(car)
# compute all product category pairs once
cat_pairs <- utils::combn(unique(transpose_df$product_cat), 2)
results <- list()
for (col in 1:ncol(cat_pairs)) {
  # extract one pair of product categories
  p_1 <- cat_pairs[1, col]
  p_2 <- cat_pairs[2, col]
  idx1 <- which(transpose_df$product_cat == p_1)
  idx2 <- which(transpose_df$product_cat == p_2)
  V1 <- as.vector(t(transpose_df[idx1, 3:ncol(transpose_df)]))
  V2 <- as.vector(t(transpose_df[idx2, 3:ncol(transpose_df)]))
  datat <- c(V1, V2)
  grouped <- as.factor(c(rep(p_1, times = length(V1)), rep(p_2, times = length(V2))))
  non_norm <- data.frame(datat, grouped)
  results[[paste0(p_1, p_2)]] <- car::leveneTest(datat ~ grouped, data = non_norm)
}
# -> p-value is 0.4096, which is greater than the significance level of 0.05,
# so we conclude there is no significant difference between the variances
# Fligner-Killeen test of homogeneity of group variances (fligner.test()),
# significant if p-value < 0.05; note this uses datat/grouped from the last
# pair of the loop above
fligner.test(datat ~ grouped, data = non_norm)
# p-value is less than the significance level, so there is a significant
# difference between the variances
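# Pairwise Fligner-Killeen (an added sketch): reuse cat_pairs from the Levene
# loop above so every product category pair gets its own p-value, not just the
# last one
fligner_results <- list()
for (col in 1:ncol(cat_pairs)) {
  p_1 <- cat_pairs[1, col]
  p_2 <- cat_pairs[2, col]
  V1 <- as.vector(t(transpose_df[which(transpose_df$product_cat == p_1), 3:ncol(transpose_df)]))
  V2 <- as.vector(t(transpose_df[which(transpose_df$product_cat == p_2), 3:ncol(transpose_df)]))
  non_norm <- data.frame(datat = c(V1, V2),
                         grouped = as.factor(rep(c(p_1, p_2), times = c(length(V1), length(V2)))))
  fligner_results[[paste0(p_1, "&", p_2)]] <- fligner.test(datat ~ grouped, data = non_norm)
}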
# Wilcoxon rank-sum test on the non-normal samples -----------
# Examines whether the median value differs between two distributions.
# -> a p-value smaller than the significance level means the two samples
# differ in distribution and median
summary_table_of_tests <- data.frame(comp = character(), pair = character(), wilcox_test = numeric(), ks_test = numeric())
# product category pairs for new_stats, computed once
stats_pairs <- utils::combn(unique(new_stats$product_cat), 2)
# looping through a subset of the compound columns
for (comp in colnames(new_stats[, 3:100])) {
  # looping through the combinations of product categories
  for (col in 1:ncol(stats_pairs)) {
    # extract one pair of product categories
    p_1 <- stats_pairs[1, col]
    p_2 <- stats_pairs[2, col]
    # calculating the p-value between each product category pair
    wilcox_test <- wilcox.test(new_stats[which(new_stats$product_cat == p_1), comp],
                               new_stats[which(new_stats$product_cat == p_2), comp])$p.value
    ks_test <- ks.test(new_stats[which(new_stats$product_cat == p_1), comp],
                       new_stats[which(new_stats$product_cat == p_2), comp])$p.value
    # assigning row information; list() keeps the p-values numeric instead of
    # coercing the whole row to character
    summary_table_of_tests[nrow(summary_table_of_tests) + 1, ] <- list(comp, paste0(p_1, "&", p_2), wilcox_test, ks_test)
  }
}
summary_table_of_tests$adjusted_pvalue_holm <- stats::p.adjust(summary_table_of_tests$wilcox_test, method = "holm")
summary_table_of_tests$adjusted_pvalue_hochberg <- stats::p.adjust(summary_table_of_tests$wilcox_test, method = "hochberg")
summary_table_of_tests$adjusted_pvalue_hommel <- stats::p.adjust(summary_table_of_tests$wilcox_test, method = "hommel")
summary_table_of_tests$adjusted_pvalue_bonferroni <- stats::p.adjust(summary_table_of_tests$wilcox_test, method = "bonferroni")
summary_table_of_tests$adjusted_pvalue_BH <- stats::p.adjust(summary_table_of_tests$wilcox_test, method = "BH")
summary_table_of_tests$adjusted_pvalue_BY <- stats::p.adjust(summary_table_of_tests$wilcox_test, method = "BY")
summary_table_of_tests$adjusted_pvalue_fdr <- stats::p.adjust(summary_table_of_tests$wilcox_test, method = "fdr")
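# Quick comparison (an added sketch): how many pairwise tests stay below 0.05
# under each of the correction methods computed above
adj_cols <- grep("^adjusted_pvalue_", names(summary_table_of_tests), value = TRUE)
sapply(summary_table_of_tests[adj_cols], function(p) sum(p < 0.05, na.rm = TRUE))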
alpha <- summary_table_of_tests %>%
  filter(adjusted_pvalue_holm < 0.05) %>%
  arrange(adjusted_pvalue_holm)
summary_p0.05 <- summary_table_of_tests %>% filter(wilcox_test < 0.05 & ks_test < 0.05)
unique(summary_p0.05$comp)
top <- summary_p0.05 %>% arrange(wilcox_test, ks_test)
top_wil <- summary_p0.05 %>% arrange(wilcox_test)
top_ks <- summary_p0.05 %>% arrange(ks_test)
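# Visual follow-up (an added sketch, assuming `top` has at least one row and
# its first `comp` entry names a column of new_stats): boxplot of the most
# significant compound across product categories
if (nrow(top) > 0) {
  top_comp <- top$comp[1]
  boxplot(new_stats[[top_comp]] ~ new_stats$product_cat,
          main = top_comp, xlab = "Product category", ylab = "Value")
}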
# 5 assumptions of multiple linear regression --------------------
transpose_df <- transpose_df %>%
  mutate(plastic_type = factor(x = plastic_type, levels = unique(plastic_type))) %>%
  mutate(File = factor(x = File, levels = unique(File)))
# convert the factor columns to their underlying integer codes
must_convert <- sapply(transpose_df, is.factor)
new_pt <- sapply(transpose_df[, must_convert], unclass)
# Using the variable contributions (var$contrib) from the earlier FactoMineR PCA
out <- cbind(new_pt, var$contrib)
rownames(out) <- NULL
out <- as.data.frame(out) %>%
  column_to_rownames(var = "File")
# Linear relationship (predicting the plastic type)
linearity <- stats::lm(plastic_type ~ ., data = out)
summary(linearity)
# -> no linearity between predictor and response variables
# Multicollinearity
library(car)
vif(linearity)
# all VIFs are between 1 and 5, which indicates moderate correlation between
# variables, but not severe enough to be a problem
# check correlation between predictor variables
cor_matrix <- out[, 2:ncol(out)]
cor(cor_matrix)
# create heat map to visualise the correlations
library(reshape2)
cor_mat <- round(cor(cor_matrix), 3)
melt_cor_mat <- melt(cor_mat)
# ggplot(data = melt_cor_mat, aes(x = Var1, y = Var2, fill = value)) +
#   geom_tile()
# install ggcorrplot only if it is not already available
if (!requireNamespace("ggcorrplot", quietly = TRUE)) install.packages("ggcorrplot")
library(ggcorrplot)
ggcorrplot::ggcorrplot(cor(cor_matrix))
# Independence
# The Durbin-Watson test is used to detect autocorrelation in residuals.
# The statistic ranges from 0 to 4: 2 means no autocorrelation, below 2 is
# positive autocorrelation and above 2 is negative autocorrelation.
library(car)
durbinWatsonTest(linearity)
# The statistic is 0.46 and the p-value is 0, so we reject the null hypothesis
# and conclude the residuals are autocorrelated; the alternative hypothesis is
# true (the true autocorrelation is greater than 0).
# Homoscedasticity
plot(fitted(linearity), resid(linearity), xlab = 'Fitted Values', ylab = 'Residuals')
abline(0, 0)
# the residuals seem to have roughly equal variance throughout the plot.
# Formally test for heteroscedasticity using the Breusch-Pagan test
if (!requireNamespace("lmtest", quietly = TRUE)) install.packages("lmtest")
library(zoo)
library(lmtest)
bptest(linearity)
# since the p-value is greater than 0.05, we do not reject the null hypothesis
# and conclude that homoscedasticity holds
# Multivariate Normality
# Use a QQ plot to determine whether the residuals of the model follow a normal
# distribution; if the points form a straight line the assumption is met.
# qqnorm() expects a numeric vector rather than a data frame like out, so we
# plot the model residuals directly.
qqnorm(resid(linearity), col = 'steelblue')
qqline(resid(linearity), col = 'red')
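# Formal complement (an added sketch): Shapiro-Wilk test on the model
# residuals; p < 0.05 would reject the normality assumption
shapiro.test(resid(linearity))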