-
Notifications
You must be signed in to change notification settings - Fork 0
/
sampling.Rmd
73 lines (58 loc) · 2.51 KB
/
sampling.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Creating artificial ground truth data
Here we will sample some metagenomes with known composition. Those will consist of 90% of a representative food-free sample containing bacteria and human reads
and of 10% of staggered abundance food reads.
## Building up a representative sample
First we will calculate the mean abundances from the a healthy cohort taken from the iHMP project.
```{r}
library(data.table)
present <- list.files("simulated/data/background_genomes", ".fna.gz") |> strsplit(".fna.gz", fixed=T) |> unlist() |> as.numeric()
counts <- fread("ihmp/data/S_counts.csv")[grepl("k__Bacteria|s__Homo sapiens", lineage)]
mock <- counts[, .(reads=mean(new_est_reads)), by=c("name", "taxonomy_id", "lineage", "taxid_lineage")][reads > 100 & taxonomy_id %in% present]
mock[, "relative_abundance" := reads / sum(reads)]
mock[, "type" := "background"]
mock[, uniqueN(name)]
```
This gives us our reference sample. Now we will start generating random samples by adding in food reads to the data.
First we read in our manifest of covered foods.
```{r}
matches <- fread("db/data/matches.csv")
duped <- matches[duplicated(id), unique(id)]
duped <- matches[id %in% duped, unique(matched_taxid)]
manifest <- fread("db/data/manifest.csv")
foods <- fread("db/data/dbs/food_matches.csv") |> unique(by = c("matched_taxid", "rank"))
foods <- foods[manifest, on="matched_taxid"][rank == "species"& (!matched_taxid %in% duped)]
foods
```
```{r}
set.seed(42)
FOOD_FRAC <- 0.1
N_FOODS <- 10
DEPTH <- 1e7
N_NEGATIVE <- 4
mock_sample <- copy(mock)[, .(id=taxonomy_id, relative_abundance=reads / sum(reads), sample_id="negative", file=paste0(taxonomy_id, ".fna.gz"), type)]
samples <- list()
for (i in 1:16) {
food_sample <- foods[sample.int(nrow(foods), N_FOODS), ] |> copy()
food_sample[, "taxon" := .SD[[rank]], by="matched_taxid"]
food_sample <- food_sample[, .(id=matched_taxid, reads=4^(1:N_FOODS), file=basename(filename))]
food_sample[, "relative_abundance" := reads / sum(reads) * FOOD_FRAC]
food_sample[, "type" := "food"]
sa <- rbind(
mock_sample[, .(id, relative_abundance = relative_abundance * (1 - FOOD_FRAC), file, type)],
food_sample[, .(id, relative_abundance, file, type)]
)
sa[, "sample_id" := paste0("positive_", i)]
samples[[i]] <- sa
}
for (i in 1:N_NEGATIVE) {
neg <- copy(mock_sample)
neg$sample_id <- paste0("negative_", i)
samples[[length(samples) + i]] <- neg
}
samples <- rbindlist(samples, use.names = T)
samples[, "depth" := DEPTH]
```
```{r}
head(samples)
fwrite(samples, "simulated/data/manifest.csv")
```