-
Notifications
You must be signed in to change notification settings - Fork 2
/
HeatTree_Final_SUS.Rmd
148 lines (119 loc) · 5.5 KB
/
HeatTree_Final_SUS.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
---
title: "HeatTreee Visualizations"
output: html_document
authors: Jane Lucas and Neil Infante
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
##Load packages
library(tidyverse)
library(metacoder)
```
```{r cars}
#Begin by setting your working directory to the folder with each of your files
#setwd()
#Load your taxa table
otu <- read_csv('otu.csv')
#Because our data did not have a label for the first column, we are naming it otu_id
names(otu)[1] <- 'otu_id'
##Some of our samples had a - in their file name. This can sometimes cause issues with R so we have changed the
#sample names to no longer have the - but not have a _
names(otu) <- sapply(names(otu), function(x) gsub('-', '_', as.character(x)))
# Import metadata
hmp_samples <- read_csv('Tara_oceans_mapping_file.csv')
#name the first column sample_id
names(hmp_samples)[1] <-'sample_id'
##Some of our samples had a - in their file name. This can sometimes cause issues with R so we have changed the
#sample names to no longer have the - but not have a _
hmp_samples <- hmp_samples %>% mutate(sample_id = gsub('-','_', sample_id))
hmp_samples <- hmp_samples %>% group_by(Depth)
hmp_samples <- hmp_samples %>% group_by(Province, add=T)
hmp_samples <- hmp_samples %>% group_by(Size_fraction, add=T)
##Load taxonomic data file
taxa <- read_csv('taxa.csv')
#taxa <- as.data.frame(taxa)
#rename the first column as otu_id
names(taxa)[1] <- 'otu_id'
#visualizae the taxa file
str(taxa)
##Because the code needs the taxonomic data to be in a specific format, we are adding in additional
#information to each of out taxonomic columns
taxa <- taxa
taxa$Domain <- paste0("k__", taxa$Domain)
taxa$Phylum <- paste0("p__", taxa$Phylum)
taxa$Class <- paste0("c__", taxa$Class)
taxa$Order <- paste0("o__", taxa$Order)
taxa$Family <- paste0("f__", taxa$Family)
taxa$Genus <- paste0("g__", taxa$Genus)
##We are merging together all of our taxonomic groups into one continuous string
taxa <- taxa %>% mutate(lineage = paste("r__Root", Domain, Phylum, Class, Order, Family, Genus, sep=";"))
#This column is called lineage
taxa <- taxa %>% select(otu_id, lineage)
#View the first bit of the OTU table to make sure the columns look correct
otu[1:5, 1:3]
```
```{r}
##Join the otu and taxonomic files together
to_mc <- inner_join(taxa, otu)
#View joined files
to_mc[1:5, 1:4]
```
```{r}
##Create an object that is complementary with the metacoder program.
obj <- parse_tax_data(to_mc,
class_cols = "lineage", # the column that contains taxonomic information
class_sep = ";", # The character used to separate taxa in the classification
class_regex = "^(.+)__(.+)$", # Regex identifying where the data for each taxon is
class_key = c(tax_rank = "info", # A key describing each regex capture group
tax_name = "taxon_name"))
#View this object
print(obj)
```
```{r}
obj$data$tax_data <- zero_low_counts(obj, data = "tax_data", min_count = 5)
no_reads <- rowSums(obj$data$tax_data[, hmp_samples$sample_id]) == 0
sum(no_reads)
obj <- filter_obs(obj, data = "tax_data", ! no_reads, drop_taxa = TRUE)
print(obj)
```
```{r}
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data",cols = hmp_samples$sample_id)
obj$data$tax_occ <- calc_n_samples(obj, "tax_abund", groups = hmp_samples$Depth)
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = DCM, #This is where you determine which factor you want to display
node_size_axis_label = "OTU count",
node_label_size_range = c(0.02, 0.02),
node_label_max = 50,
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford") # The layout algorithm that initializes node locations
```
```{r}
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method = "fdr")
range(obj$data$diff_table$wilcox_p_value, finite = TRUE)
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund",
cols = hmp_samples$sample_id, # What columns of sample data to use
groups = hmp_samples$Depth) # What category each sample is assigned to
print(obj$data$diff_table)
```
```{r}
set.seed(1)
heat_tree_matrix(obj,
dataset = "diff_table",
node_size = n_obs, # n_obs is a function that calculates, in this case, the number of OTUs per taxon
node_label = taxon_names,
node_color = log2_median_ratio, # A column from obj$data$diff_table
node_color_range = diverging_palette(), # The built-in palette for diverging data
node_color_trans = "linear", # The default is scaled by circle area
node_color_interval = c(-3, 3), # The range of log2_median_ratio to display
edge_color_interval = c(-3, 3), # The range of log2_median_ratio to display
node_size_axis_label = "Number of OTUs",
node_color_axis_label = "Log2 ratio median proportions",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford") # The layout algorithm that initializes node locations
```