The DspikeIn package was developed to facilitate:
- Verifying the phylogenetic distances of ASVs/OTUs resulting from spiked species.
- Preprocessing data.
- Calculating the spike-in scaling factor.
- Converting relative abundance to absolute abundance.
- Estimating acceptable spiked species retrieval %
- Data transformation, Differential abundance and visualization.
Tetragenococcus halophilus and Dekkera bruxellensis were selected as taxa to spike into gut microbiome samples based on our previous studies WalkerLab.
If you encounter issues installing the package due to missing dependencies, follow these steps to install all required packages first:
To install the required packages, use the following script:
# Install CRAN packages
install.packages(c("stats", "dplyr", "ggplot2", "flextable","ggpubr", "randomForest", "ggridges", "ggalluvial","tibble", "matrixStats", "RColorBrewer", "ape", "rlang", "scales", "magrittr", "phangorn"))
# Load CRAN packages
lapply(c("stats", "dplyr", "ggplot2", "flextable","ggpubr","randomForest", "ggridges", "ggalluvial","tibble", "matrixStats", "RColorBrewer", "ape", "rlang", "scales", "magrittr", "phangorn"), library, character.only = TRUE)
# Install BiocManager if not installed
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# Install Bioconductor packages
BiocManager::install(c("phyloseq", "msa", "DESeq2","ggtree", "edgeR", "Biostrings", "DECIPHER", "microbiome"))
# Load Bioconductor packages
lapply(c("phyloseq", "msa", "DESeq2", "edgeR", "Biostrings","ggtree", "DECIPHER", "microbiome"), library, character.only = TRUE)
# Install remotes if not installed
install.packages("remotes")
# Install GitHub packages
remotes::install_github("mikemc/speedyseq")
remotes::install_github("microsud/microbiomeutilities")
# Load GitHub packages
library(speedyseq)
library(microbiomeutilities)
# Installation
#Instructions for how to install the DspikeIn package.
# Using devtools
install.packages("devtools")
devtools::install_github("mghotbi/DspikeIn")
library(DspikeIn)
# Or using remotes
install.packages("remotes")
remotes::install_github("mghotbi/DspikeIn")
library(DspikeIn)
DspikeIn builds on the excellent phyloseq package.
# Make a new directory and set it as your working directory
create_directory("DspikeIn_16S_OTU", set_working_dir = TRUE)
getwd()
# Therefore, please start by creating a phyloseq object and follow the instructions.
# To create your phyloseq object, please refer to the phyloseq tutorial (https://joey711.github.io/phyloseq).
# The phyloseq object needs to include OTU/ASV, Taxa, phylogenetic tree, DNA reference,
# and metadata containing spiked species volume, starting from 0 (no spike species added) to 4 (4 μl of spike cell added).
# Note: DspikeIn requires 'spiked.volume'; any other format is not readable."¯\\_(ツ)_/¯ ¯\\_(ツ)_/¯ ¯\\_(ツ)_/¯ ¯\\_(ツ)_/¯"
# We are going to work with a subset of the dataset for both ASVs and OTUs
# approaches to accelerate this workshop.
physeq_16SOTU <-readRDS("physeq_16SOTU.rds")
physeq_ITSOTU <-readRDS("physeq_ITSOTU.rds")
physeq_16SOTU <- tidy_phyloseq(physeq_16SOTU)
# Ensure your metadata contains spiked volumes:
physeq_16SOTU@sam_data$spiked.volume
# Required Information
# Please note that the Spike cell numbers, species name, and selected hashcodes are customizable and can be tailored to the specific needs of individual studies.
# Moreover, to proceed with the DspikeIn package, you only need to select one method to specify your spiked species: either by hashcodes or species name.
library(phyloseq)
# 16S rRNA
# presence of 'spiked.volume' column in metadata
spiked_cells <-1847
species_name <- spiked_species <- c("Tetragenococcus_halophilus", "Tetragenococcus_sp")
merged_spiked_species<-"Tetragenococcus_halophilus"
Tetra <- subset_taxa(physeq_16SOTU,Species=="Tetragenococcus_halophilus" | Species=="Tetragenococcus_sp")
hashcodes <- row.names(phyloseq::tax_table(Tetra))
# ITS rDNA
# presence of 'spiked.volume' column in metadata
spiked_cells <- 733
species_name <- spiked_species<-merged_spiked_species<-"Dekkera_bruxellensis"
Dekkera <- subset_taxa(physeq_ITSOTU, Species=="Dekkera_bruxellensis")
hashcodes <- row.names(phyloseq::tax_table(Dekkera))
# Define the list of spiked-in species
spiked_species <- c("Pseudomonas aeruginosa", "Escherichia coli", "Clostridium difficile")
# Define the corresponding copy numbers for each spiked-in species
spiked_cells_list <- c(10000, 20000, 15000) # or equal number of copies-> spiked_cells_list <- c(200, 200, 200)
This step will be helpful for handling ASVs with/without Gene Copy Number Correction This section demonstrates how to use various functions from the package to plot and analyze phylogenetic trees.
# In case there are several OTUs/ASVs resulting from the spiked species, you may want to check the phylogenetic distances.
# We first read DNA sequences from a FASTA file, to perform multiple sequence alignment and compute a distance matrix using the maximum likelihood method, then we construct a phylogenetic tree
# Use the Neighbor-Joining method based on a Jukes-Cantor distance matrix and plot the tree with bootstrap values.
# we compare the Sanger read of Tetragenococcus halophilus with the FASTA sequence of Tetragenococcus halophilus from our phyloseq object.
# Load required libraries
library(Biostrings)
library(msa)
library(phangorn)
library(ape)
library(speedyseq)
library(ggtree)
# Subset the phyloseq object to include only Tetragenococcus species first
Tetra <- subset_taxa(Tetra, !is.na(taxa_names(Tetra)) & taxa_names(Tetra) != "")
tree <- phy_tree(Tetra)
ref_sequences_Tetra <- refseq(Tetra)
writeXStringSet(ref_sequences_Tetra, "ref_sequences_Tetra.fasta")
# postitive control
Tetra_control_sequences <- Biostrings::readDNAStringSet("~/Tetra_Ju.fasta")
# combine the Tetragenococcus FASTA files (from your dataset and the Sanger fasta of Tetragenococcus, positive control)
combined_sequences <- c(ref_sequences_Tetra, Tetra_control_sequences)
writeXStringSet(combined_sequences, filepath = "~/combined_fasta_file")
combined_sequences <- Biostrings::readDNAStringSet("~/combined_fasta_file")
# Plot Neighbor-Joining tree with bootstrap values to compare Tetragenococcus in your dataset with your positive control
fasta_path <- "~/combined_fasta_file"
plot_tree_nj(fasta_path, output_file = "neighbor_joining_tree_with_bootstrap.png")
# Plot phylogenetic tree
plot_tree_custom(Tetra, output_prefix = "p0", width = 18, height = 18, layout = "circular")
# Plot the tree with glommed OTUs at 0.2 resolution/ or modify it
plot_glommed_tree(Tetra, resolution = 0.2, output_prefix = "top", width = 18, height = 18)
# Plot the phylogenetic tree with multiple sequence alignment
plot_tree_with_alignment(Tetra, output_prefix = "tree_alignment", width = 15, height = 15)
# Plot phylogenetic tree with bootstrap values and cophenetic distances
Bootstrap_phy_tree_with_cophenetic(Tetra, output_file = "tree_with_bootstrap_and_cophenetic.png", bootstrap_replicates = 500)
Figure 1.
Neighbor Joining Tree with Bootstrap | Tetra Plot with Bootstrap | Cophenetic tree with Bootstrap |
---|---|---|
## Aligned Sequences
The result of the aligned sequences is shown below:
DNAStringSet object of length 5:
width seq names
[1] 292 -------------------TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGC...CTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGTAGCAAACAGG-------------------- 2ddb215ff668b6a24...
[2] 292 -------------------TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGC...CTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGTAGCAAACAGG-------------------- Tetragenococcus h...
[3] 292 -------------------TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGC...CTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTAGGTAGCAAACAGG-------------------- 65ab824f29da71010...
[4] 292 -------------------TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGC...CTGGTCTGTAACTGACGCTGAGACTCGAAAGCGTGGGTAGCAAACAGG-------------------- e49935179f23c00fb...
[5] 292 -------------------TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGC...CTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAACAGG-------------------- 0350f990080b4757a...
Retrieved spiked species are correlated with abundance, richness, and beta diversity, and their recovery can therefore vary depending on the system under study. Our results indicate that the acceptable range of retrieved spiked species can be expanded to 35% in our model system. This contrasts with the findings of Roa et al., 2021, who reported an acceptable range of 0.1% to 10%. We selected the OTU approach using VSEARCH with de novo robust clustering algorithms at a 97% similarity threshold to minimize potential errors, following the methods outlined by Westcott and Schloss (2015).
ASVs Or OTUs | Acceptable range |
---|---|
Subset the part of the data which is spiked. Keep solely spiked samples using the spiked.volume
column.
# Subset spiked samples (264 samples are spiked)
spiked_16S_OTU <- subset_samples(physeq16S_OTU, spiked.volume %in% c("2", "1"))
spiked_16S_OTU <- tidy_phyloseq(spiked_16S_OTU)
# Summarize the initial statistics for ASVs/OTUs
initial_stat_ASV <- summ_phyloseq_ASV_OTUID(spiked_16S_OTU)
# Summarize the initial statistics sample-wise
initial_stat_sampleWise <- summ_phyloseq_sampleID(spiked_16S_OTU)
# Summarize the count data
summ_count_phyloseq(physeq_16S_OTU)
# Check the summary statistics
# Ensure the input is in dataframe format for this function
calculate_summary_stats_table(initial_stat_sampleWise)
Check if transformation is required for spike volume variation.
# Adjust abundance by one-third
readAdj16S <- adjust_abundance_one_third(spiked_16S_OTU, factor = 3)
summ_count_phyloseq(readAdj16S)
# Random subsampling with reduction factor foe count and taxa
red16S <- random_subsample_WithReductionFactor(spiked_16S_OTU, reduction_factor = 3)
summ_count_phyloseq(red16S)
If the spiked species appear in several OTUs/ASVs, check their phylogenetic distances and compare them to the reference sequences of your positive control.
# Modify the threshold of acceptable spiked species % as needed.
# For detailed guidance on acceptable thresholds (passed_range),
# please refer to the instructions in our upcoming paper.
# Merge the spiked species
# merge_method = "max": Selects the maximum abundance among OTUs/ASVs of the spiked species, ensuring the most abundant ASV is retained.
# merge_method = "sum": Sums the abundances of OTUs/ASVs of the spiked species, providing a cumulative total.
species_name <- "Tetragenococcus_halophilus"
# Merge using "sum" or "max" method
Spiked_16S_sum_scaled <- Pre_processing_species(
spiked_16S_OTU,
species_name,
merge_method = "sum",
output_file = "merged_physeq_sum.rds")
# Merge hashcodes using "sum" or "max" method
Spiked_16S_sum_scaled <- Pre_processing_hashcodes(
spiked_16S_OTU,
hashcodes,
merge_method = "sum",
output_prefix = "merged_physeq_sum")
# Summarize count
summ_count_phyloseq(Spiked_16S_sum_scaled)
# Tidy phyloseq object
Spiked_16S_OTU_scaled <- tidy_phyloseq(Spiked_16S_sum_scaled)
# Now calculate the spiked species retrieval percentage.
# Customize the passed_range and merged_spiked_species/merged_spiked_hashcodes based on your preferences.
# passed_range = "c(0.1, 11)": threshold of acceptable spiked species %
# Select either merged_spiked_species or merged_spiked_hashcodes
merged_spiked_species <- c("Tetragenococcus_halophilus")
result <- calculate_spike_percentage(
Spiked_16S_sum_scaled,
merged_spiked_species,
passed_range = c(0.1, 11))
calculate_summary_stats_table(result)
# Define your merged_spiked_hashcodes
merged_Tetra <- subset_taxa(
Spiked_16S_OTU_scaled,
Species == "Tetragenococcus_halophilus")
merged_spiked_hashcodes <- row.names(tax_table(merged_Tetra))
result <- calculate_spike_percentage(
Spiked_16S_OTU_scaled,
merged_spiked_hashcodes,
passed_range = c(0.1, 11))
calculate_summary_stats_table(result)
# If you decide to remove the failed reads and go forward with passed reads, here is what you need to do
# You can also go forward with the original file and remove the failed reads
# after converting relative to absolute abundance
# Filter to get only the samples that passed
passed_samples <- result$Sample[result$Result == "passed"]
# Subset the original phyloseq object to keep only the samples that passed
passed_physeq <- prune_samples(
passed_samples,
Spiked_16S_OTU_scaled)
To estimate scaling factors, ensure you have the merged_spiked_species
data, which contains the merged species derived from the spiking process.
As we have already merged either hashcodes or spiked species and are aware of the contents of the taxa table, we can proceed from here with merged_spiked_species.
# Define the merged spiked species
merged_spiked_species <- c("Tetragenococcus_halophilus")
# Calculate spikeIn factors
result <- calculate_spikeIn_factors(Spiked_16S_OTU_scaled, spiked_cells, merged_spiked_species)
# Check the outputs
scaling_factors <- result$scaling_factors
physeq_no_spiked <- result$physeq_no_spiked
spiked_16S_total_reads <- result$spiked_16S_total_reads
spiked_species_reads <- result$spiked_species_reads
# Convert relative counts data to absolute counts
absolute <- convert_to_absolute_counts(Spiked_16S_OTU_scaled, scaling_factors)
absolute_counts <- absolute$absolute_counts
physeq_absolute_abundance_16S_OTU <- absolute$physeq_obj
# summary statistics
post_eval_summary <- calculate_summary_stats_table(absolute_counts)
print(post_eval_summary)
# Define the parameters once.
merged_spiked_species <- c("Tetragenococcus_halophilus")
max_passed_range <- 35
# Subset the phyloseq object to exclude blanks
physeq_absolute_abundance_16S_OTU_perc <- subset_samples(physeq_absolute_abundance_16S_OTU, sample.or.blank != "blank")
# Generate the spike success report and summary statistics
summary_stats <- conclusion(physeq_absolute_abundance_16S_OTU_perc, merged_spiked_species, max_passed_range)
print(summary_stats)
Here is an example of a success or failure report:
#Save your file for later. Please stay tuned for the rest: Comparisons and several visualization methods to show how important it is to convert relative to absolute abundance in the context of microbial ecology.
physeq_absolute_16S_OTU <- tidy_phyloseq(physeq_absolute_abundance_16S_OTU_perc)
saveRDS(physeq_absolute_16S_OTU, "physeq_absolute_16S_OTU.rds")
# Bolstad, B.M., Irizarry, R.A., Åstrand, M. and Speed, T.P., 2003. A comparison of normalization methods for high-density oligonucleotide array data based on variance and bias. Bioinformatics, 19(2), pp.185-193.
# Gagnon-Bartsch, J.A. and Speed, T.P., 2012. Using control genes to correct for unwanted variation in microarray data. Biostatistics, 13(3), pp.539-552.
# Risso, D., Ngai, J., Speed, T.P. and Dudoit, S., 2014. Normalization of RNA-seq data using factor analysis of control genes or samples. Nature biotechnology, 32(9), pp.896-902.
# Gagnon-Bartsch, J.A., Jacob, L. and Speed, T.P., 2013. Removing unwanted variation from high dimensional data with negative controls. Berkeley: Tech Reports from Dep Stat Univ California, pp.1-112.
# Load required libraries
library(phyloseq)
library(DESeq2)
library(edgeR)
library(BiocGenerics)
#ps is a phyloseq object without spiked species counts
ps <- physeq_absolute_16S_OTU
ps <- remove_zero_negative_count_samples(physeq_absolute_abundance_16S_OTU)
ps <- convert_categorical_to_factors(physeq_absolute_abundance_16S_OTU)
# group_var <- "Animal.ecomode"
# Normalization Methods:
# result_DESeq <- normalization_set(ps, method = "DESeq", groups = "group_var")
# result_TMM <- normalization_set(ps, method = "TMM", groups = "group_var")
# result_CLR <- normalization_set(ps, method = "clr")
# result_UQ <- normalization_set(ps, method = "UQ", groups = group_var)
# result_med <- normalization_set(ps, method = "med", groups = group_var)
# result_Poisson <- normalization_set(ps, method = "Poisson", groups = "group_var")
# result_UQ <- normalization_set(ps, method = "UQ", groups = "group_var")
# result_med <- normalization_set(ps, method = "med", groups = "group_var")
# result_rle <- normalization_set(ps, method = "rle")
# result_css <- normalization_set(ps, method = "CSS")
# result_tss <- normalization_set(ps, method = "tss")
# result_rar <- normalization_set(ps, method = "rar")
# Customized filtering and transformations
# Proportion adjustment
normalized_physeq <- proportion_adj(ps, output_file = "proportion_adjusted_physeq.rds")
summ_count_phyloseq(normalized_16S)
# Relativize and filter taxa based on selected thresholds
FT_physeq <- relativized_filtered_taxa(
ps,
threshold_percentage = 0.0001,
threshold_mean_abundance = 0.0001,
threshold_count = 5,
threshold_relative_abundance = 0.0001)
summ_count_phyloseq(FT_physeq)
# Adjust prevalence based on the minimum reads
physeq_min <- adjusted_prevalence(ps, method = "min")
# taxa barplot
#abundance_type = "absolute"/"relative"
bp_ab <- taxa_barplot(physeq_absolute_16S_OTU, target_glom = "Genus", treatment_variable = "Host.genus", abundance_type = "absolute", x_angle = 90, fill_variable = "Genus", facet_variable = "Diet", top_n_taxa = 20)
print(bp_ab$barplot)
# original relative count -> spiked_16S_OTU
bp_rel <- taxa_barplot(spiked_16S_OTU, target_glom = "Genus", treatment_variable = "Host.genus", abundance_type = "relative", x_angle = 90, fill_variable = "Genus", facet_variable = "Diet", top_n_taxa = 20)
print(bp_rel$barplot)
Absolute Abundance | Relative Abundance |
---|---|
# simple barplot of taxonomy abundance
# Plot relativized abundance
plot <- plotbar_abundance(physeq_absolute_16S_OTU, level = "Family", group = "Env.broad.scale.x", top = 10, x_size = 10, y_size = 10, legend_key_size = 2, legend_text_size = 14, legend_nrow = 10, relativize = TRUE, output_prefix = "relativized_abundance_plot")
print(plot)
# Plot non-relativized (absolute) abundance
plot_absolute <- plotbar_abundance(spiked_16S_OTU, level = "Family", group = "Env.broad.scale.x", top = 10, x_size = 10, y_size = 10, legend_key_size = 2, legend_text_size = 14, legend_nrow = 10, relativize = FALSE, output_prefix = "non_relativized_abundance_plot")
print(plot_absolute)
# Check abundance distribution via Ridge Plots before and after converting to absolute abundance
ridgeP_before <- ridge_plot_it(spiked_16S_OTU, taxrank = "Family", top_n = 10)
ridgeP_after <- ridge_plot_it(physeq_absolute_16S_OTU, taxrank = "Family", top_n = 10)
Absolute Abundance | Relative Abundance |
---|---|
# core_microbiome
custom_detections <- 10^seq(log10(3e-1), log10(0.5), length = 5)
PCM_rel <- plot_core_microbiome_custom(spiked_16S_OTU, detections = custom_detections, taxrank = "Family", output_core_rds = "core_microbiome.rds", output_core_csv = "core_microbiome.csv")
PCM_Abs <- plot_core_microbiome_custom(physeq_absolute_16S_OTU, detections = custom_detections, taxrank = "Family", output_core_rds = "core_microbiome.rds", output_core_csv = "core_microbiome.csv")
# core.microbiome is automatically saved in your working directory so yoou can go ahead and barplot it
core.microbiome <- readRDS("core.microbiome.rds")
Absolute Abundance | Relative Abundance |
---|---|
# shift to long-format data frame and plot the abundance of taxa across the factor of your interest
# Generate alluvial plot
ps_physeq_absolute_16S_OTU <- psmelt(physeq_absolute_16S_OTU)
# Define total reads for relative abundance calculation
total_reads <- sum(ps_physeq_absolute_16S_OTU$Abundance)
# Generate alluvial plot for absolute abundance
alluvial_plot_abs <- alluvial_plot(data = ps_physeq_absolute_16S_OTU,
axes = c("Host.genus", "Ecoregion.III"),
abundance_threshold = 1000, fill_variable = "Family",
silent = TRUE, abundance_type = "absolute",
top_taxa = 10, facet_vars = c("Diet"))
Absolute Abundance | Relative Abundance |
---|---|
# selecting the most important ASVs/OTUs through RandomForest classification
# Salamander_absolute= subset of our phyloseq object
rf_physeq <- RandomForest_selected_ASVs(ps_physeq_absolute_16S_OTU, response_var = "Host_Species", na_vars = c("Habitat","Diet", "Ecoregion_III", "Host_genus", "Animal_type"))
RP=ridge_plot_it(rf_physeq)
RP+facet_wrap(~Diet)
#detect common ASVs/OTUs
# The input is the list of phyloseq objects
results <- detect_common_asvs_taxa(list(rf_physeq, FTspiked_16S , core.microbiome),
output_common_asvs_rds = "common_asvs.rds",
output_common_taxa_rds = "common_taxa.rds")
common_asvs_phyloseq <- results$common_asvs_phyloseq
common_taxa_phyloseq <- results$common_taxa_phyloseq
plotbar_abundance(common_taxa_phyloseq, level = "Family", group = "Env.broad.scale", top = 10, return = TRUE)