Pooling paper.Rmd

---
title: 'Pooling: Paper'
author: "Xianbin Cheng"
date: "February 15, 2019"
output: html_document
---

# Objective

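Clean up the single-kernel data for the pooling paper and compare the observed aflatoxin concentrations with a simulated PERT + Gamma distribution.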

```{r, warning = FALSE, message = FALSE}
library(tidyverse)
source("STD_simulation.R")
library(mc2d)
library(MASS)
```

```{r}
# Print the R version, platform, and attached/loaded packages for this run.
sessionInfo()
```

```{r}
# Load the kernel-level measurements; the first CSV column becomes the row names.
df <- read.csv(
  file = "Texas Conc and Class 04_12_2018.csv",
  header = TRUE,
  row.names = 1
)

# Show the column types of the raw table.
str(df)
```

```{r}
# Split Kernel_ID on "-" into a numeric plate (first piece) and kernel
# (second piece), sort by both, and drop plates 102 and 902.
df1 <- df %>%
  mutate(
    plate = as.numeric(str_split(string = Kernel_ID, pattern = "-", simplify = TRUE)[, 1]),
    kernel = as.numeric(str_split(string = Kernel_ID, pattern = "-", simplify = TRUE)[, 2])
  ) %>%
  arrange(plate, kernel) %>%
  dplyr::filter(!(plate %in% c(102, 902)))

# Fix the class level order as H, M, L (any other value becomes NA).
df1$AF_class <- factor(df1$AF_class, levels = c("H", "M", "L"))

str(df1)
```

```{r}
# Number of kernels in each aflatoxin class
summary(df1$AF_class)

# Concentration summary for class "L", then for classes "M" and "H" together.
# which() drops rows whose class is NA, as subset() does.
with(df1, summary(AF.ppb.[which(AF_class == "L")]))
with(df1, summary(AF.ppb.[AF_class %in% c("M", "H")]))
```

```{r}
##### How many NAs?
# Count the missing values in one mycotoxin concentration column.
#
# Args:
#   data: a data frame (or named list) holding the concentration column.
#   type: a single string; "AF" inspects column "AF.ppb.", "FM" inspects
#         column "FM.ppm.".
# Returns:
#   The number of NA entries in the selected column (integer).
# Errors:
#   Stops with an explicit message when `type` is not "AF"/"FM" or when the
#   selected column is absent, instead of failing on an undefined variable or
#   silently reporting zero missing values.
count_na <- function(data, type) {
  if (!is.character(type) || length(type) != 1L || is.na(type)) {
    stop("`type` must be a single string: \"AF\" or \"FM\"", call. = FALSE)
  }

  a <- switch(type,
    AF = "AF.ppb.",
    FM = "FM.ppm.",
    stop("`type` must be \"AF\" or \"FM\", not \"", type, "\"", call. = FALSE)
  )

  # data[[a]] would be NULL for a missing column and sum(is.na(NULL)) is 0,
  # which would look like "no missing values" -- fail loudly instead.
  if (!a %in% names(data)) {
    stop("column \"", a, "\" is missing from `data`", call. = FALSE)
  }

  sum(is.na(data[[a]]))
}

# Number of missing aflatoxin readings on each plate (named by plate number)
unlist(lapply(split(df1, df1$plate), count_na, type = "AF"))
```

```{r}
# Total number of simulated kernels and the fraction that are contaminated
n <- 1000000
prev <- 0.04

set.seed(123)
# Healthy kernels are drawn from a PERT distribution on [0, 19.99];
# contaminated kernels come from rgamma_lim() with a lower bound of 20 and no
# upper bound (rgamma_lim is presumably defined in STD_simulation.R -- confirm).
neg <- rpert(n = n * (1 - prev), min = 0, mode = 0.7, max = 19.99, shape = 80)
pos <- rgamma_lim(n = n * prev, alpha = 2, mode = 40000, lb = 20, ub = NULL)

# Stack both samples, label each row by its source, and tag the model used.
sim_data <- data.frame(
  conc = c(neg, pos),
  class = rep(c("neg", "pos"), times = c(length(neg), length(pos)))
) %>%
  mutate(func = "PERT")
```

```{r}
# Histogram of the simulated concentrations on a log10 x-axis, filled by
# simulated class ("neg" / "pos").
ggplot(sim_data, aes(x = conc, fill = class)) +
  geom_histogram(bins = 100) +
  scale_x_log10(breaks = c(0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6)) +
  scale_fill_discrete(h = c(190, 15)) +
  labs(x = "AF concentration") +
  theme_bw()
```

```{r}
# Observed data: 0/25/50/75/100% quantiles of measured aflatoxin, NAs ignored
quantile(df1$AF.ppb., na.rm = TRUE)

# One-column data frames of observed concentrations for class "L" and class "H".
# NOTE(review): class "M" is in neither subset, whereas the earlier summary
# groups "M" with "H" -- confirm this is intended.
neg_exp <- df1[which(df1$AF_class == "L"), "AF.ppb.", drop = FALSE]
pos_exp <- df1[which(df1$AF_class == "H"), "AF.ppb.", drop = FALSE]

# Simulated data (PERT + Gamma): the same quantiles
quantile(sim_data$conc)
```

```{r}
# Real data VS PERT + Gamma
# Two-sided Wilcoxon rank-sum test: all observed concentrations vs. the full
# simulated sample.
wilcox.test(x = df1$AF.ppb., y = sim_data$conc, alternative = "two.sided")

# Two-sample Kolmogorov-Smirnov tests, in order: class "L" kernels vs. the
# simulated negatives, class "H" kernels vs. the simulated positives, and all
# observed concentrations vs. the full simulated sample.
ks.test(x = neg_exp$AF.ppb., y = neg)
ks.test(x = pos_exp$AF.ppb., y = pos)
ks.test(x = df1$AF.ppb., y = sim_data$conc)
```

```{r}
# Histogram of observed single-kernel aflatoxin on a log10 x-axis, filled by
# AF_class, with the legend placed above the panel.
# NOTE(review): only two legend labels are supplied while AF_class is declared
# with three levels (H, M, L) -- confirm that just two levels occur in df1.
AF_hist <- ggplot(df1, aes(x = AF.ppb., fill = AF_class)) +
  geom_histogram(bins = 50) +
  scale_x_log10(breaks = c(0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5)) +
  scale_fill_discrete(
    name = "Single Kernel Aflatoxin Level",
    labels = c("[AF]>=20", "[AF]<20")
  ) +
  labs(x = "Aflatoxin concentration (ppb)", y = "Number of Kernels") +
  theme_classic() +
  theme(legend.position = "top")
```

```{r}
# Overlay of the observed data (histogram scaled to density) and the simulated
# sample (kernel density estimate, translucent grey fill) on a shared log10
# x-axis. Each layer brings its own data set, so ggplot() starts empty.
combined <- ggplot() +
  geom_histogram(
    mapping = aes(x = AF.ppb., y = ..density..),
    data = df1,
    binwidth = 0.1
  ) +
  geom_density(
    mapping = aes(x = conc),
    data = sim_data,
    fill = "grey",
    alpha = 0.3
  ) +
  scale_x_log10(breaks = c(0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5)) +
  labs(x = "Aflatoxin concentration (ppb)", y = "Density") +
  # scale_y_sqrt() +
  theme_classic() +
  theme(legend.position = "top")
```

```{r}
# Display the two figures built above; ggplot objects render when auto-printed.
AF_hist
combined
```

```{r, eval = FALSE}
# Write both figures to "plot.pdf", one page per figure.
pdf("plot.pdf")
# Call print() explicitly: a ggplot object only draws when it is printed, and
# auto-printing does not happen when this code is source()'d or run inside a
# function or loop, which would leave the PDF empty.
print(AF_hist)
print(combined)
# Close the device so the file is flushed to disk.
dev.off()
```

```{r, echo= FALSE}
# Print the session details again at the end of the document; the chunk option
# echo = FALSE hides this code and shows only its output.
sessionInfo()
```