Merge pull request #4 from leahkemp/dev

Dev
leahkemp · Jan 27, 2022 · 45d2f76 · 45d2f76
2 parents c6e9fd2 + ee44c32
commit 45d2f76
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 24 deletions.
diff --git a/diff_expression/diff_expression.Rmd b/diff_expression/diff_expression.Rmd
@@ -121,9 +121,13 @@ mirna_smrnaseq_data <- mirna_smrnaseq_data %>%
   # make all count data numeric
   dplyr::mutate_all(as.numeric)
 
-  # replace "." with "-" so RNA names are the same
+# replace "." with "-" so RNA names are the same
 base::row.names(mirna_smrnaseq_data) <- base::gsub("\\.", "\\-", base::row.names(mirna_smrnaseq_data))
 
+# remove weird tRNA row
+trna_excerpt_data <- trna_excerpt_data %>%
+  dplyr::filter(!(rownames(trna_excerpt_data) == "tRNA"))
+
 # create a list defining which code chunks to analyse (that are set to TRUE for both the datasets and contrasts to analyse) based on the yaml user configuration file
 to_eval_chunk <- config[c("mirna_smrnaseq",
                           "mirna_excerpt",
@@ -132,6 +136,9 @@ to_eval_chunk <- config[c("mirna_smrnaseq",
                           "circrna_excerpt",
                           "gencode_excerpt")] %>%
   rep(times = 6)
+  
+# make everything true before conditionally making false
+to_eval_chunk <- base::replace(to_eval_chunk, 1:base::length(to_eval_chunk), TRUE)
 
 names(to_eval_chunk)[1:6] <- paste(names(to_eval_chunk)[1:6], "1", sep = "_")
 names(to_eval_chunk)[7:12] <- paste(names(to_eval_chunk)[7:12], "2", sep = "_")
@@ -140,7 +147,7 @@ names(to_eval_chunk)[19:24] <- paste(names(to_eval_chunk)[19:24], "4", sep = "_"
 names(to_eval_chunk)[25:30] <- paste(names(to_eval_chunk)[25:30], "5", sep = "_")
 names(to_eval_chunk)[31:36] <- paste(names(to_eval_chunk)[31:36], "6", sep = "_")
 
-n_contrasts <- length(config$contrasts)
+n_contrasts <- base::length(config$contrasts)
 
 to_eval_chunk <- if(n_contrasts == 1) {
   replace(to_eval_chunk, 7:36, FALSE)
@@ -249,7 +256,7 @@ ordered_treatments <- treatments %>%
 
 # make sure the samples (columns) in all the datasets are in the same order for specifying the treatments downstream
 # (that depends on the columns being in the correct order)  (loop over all count datasets)
-count_datasets<- base::lapply(count_datasets, function(x) {
+count_datasets <- base::lapply(count_datasets, function(x) {
   
   x[ , gtools::mixedsort(names(x))]
   
@@ -279,7 +286,7 @@ plotSA_interactive <- function(data) {
     y[allzero] <- NA
   }
   
-  if (length(data$df.prior) > 1L) {
+  if (base::length(data$df.prior) > 1L) {
     df2 <- base::max(data$df.prior)
     s2 <- data$sigma^2/fit$s2.prior
     pdn <- stats::pf(s2, df1 = data$df.residual, df2 = df2)
@@ -432,7 +439,7 @@ base::cat(base::paste0("- ", config$contrasts, "\n"))
 
 ```{r, results = "asis"}
 # print the number of samples analysed
-base::cat(base::paste0("Total number of samples: ", length(unique(metadata$sample))))
+base::cat(base::paste0("Total number of samples: ", base::length(base::unique(metadata$sample))))
 ```
 
 Number of samples in each treatment group:
@@ -449,7 +456,7 @@ base::cat(base::paste0("- ", n_samples_by_treatment$treatment, ": ", n_samples_b
 ## limma/voom
 
 ```{r, results = "asis"}
-base::cat(base::paste0("The data was filtered using the [filterByExpr](https://rdrr.io/bioc/edgeR/man/filterByExpr.html) function. The filtering keeps RNA's that have count-per-million (CPM) above min.count (", config$min_count, ")", " in n samples (", length(unique(metadata$sample)), "). In addition, each kept RNA is required to have at least min.total.count reads (", config$min_total_count, ") across all the samples. From a statistical point of view, removing low count RNA's allows the mean-variance relationship in the data to be estimated with greater reliability ([Law et al., (2018)](https://f1000research.com/articles/5-1408))."))
+base::cat(base::paste0("The data was filtered using the [filterByExpr](https://rdrr.io/bioc/edgeR/man/filterByExpr.html) function. The filtering keeps RNA's that have count-per-million (CPM) above min.count (", config$min_count, ")", " in n samples (", base::length(base::unique(metadata$sample)), "). In addition, each kept RNA is required to have at least min.total.count reads (", config$min_total_count, ") across all the samples. From a statistical point of view, removing low count RNA's allows the mean-variance relationship in the data to be estimated with greater reliability ([Law et al., (2018)](https://f1000research.com/articles/5-1408))."))
 ```
 
 ```{r, results = "asis"}

diff --git a/heatmaps/heatmaps.Rmd b/heatmaps/heatmaps.Rmd
@@ -59,9 +59,18 @@ config <- yaml::yaml.load_file("../config/config.yaml")
 metadata <- utils::read.csv(file.path(config$metadata))
 
 # read in differential expression results
-diff_expr_results <- utils::read.table("../diff_expression/diff_expr_results/diff_expr_results.tsv",
+diff_expr_results_model_1 <- utils::read.table("../diff_expression/diff_expr_results/diff_expr_results_model_1.tsv",
                                        header = TRUE)
 
+diff_expr_results_model_2 <- utils::read.table("../diff_expression/diff_expr_results/diff_expr_results_model_2.tsv",
+                                       header = TRUE)
+
+diff_expr_results <- rbind(diff_expr_results_model_1, diff_expr_results_model_2)
+
+# write differential expression results to file
+diff_expr_results %>%
+  utils::write.table(file = "../diff_expression/diff_expr_results/diff_expr_results.tsv", row.names=FALSE, sep="\t")
+
 # load all the count datasets
 mirna_excerpt_data <- utils::read.table(base::file.path(config$excerpt_merged_results_dir,
                                                         "exceRpt_miRNA_ReadCounts.txt"),
@@ -122,6 +131,10 @@ mirna_smrnaseq_data <- mirna_smrnaseq_data %>%
   # replace "." with "-" so RNA names are the same
 base::row.names(mirna_smrnaseq_data) <- base::gsub("\\.", "\\-", base::row.names(mirna_smrnaseq_data))
 
+# remove weird tRNA row
+trna_excerpt_data <- trna_excerpt_data %>%
+  dplyr::filter(!(rownames(trna_excerpt_data) == "tRNA"))
+
 # create a vector defining the count datasets to analyse (that are set to TRUE) based on the yaml user configuration file
 to_analyse <- config[c("mirna_smrnaseq",
                        "mirna_excerpt",
@@ -205,17 +218,26 @@ log_counts_per_million <- base::lapply(log_counts_per_million, function(x)
 )
 
 # setup a string that scales the widths of the subplots for each heatmap based on the number of treatment groups
-num_treatments <- base::length(base::unique(metadata$treatment))
-widths <- 1/num_treatments
-all_widths <- base::rep(widths, num_treatments)
+all_widths <- metadata %>%
+  group_by(treatment) %>%
+  dplyr::summarise(n_samples = n()) %>%
+  dplyr::mutate(n_samples = n_samples/base::length(metadata$treatment)) %>%
+  dplyr::pull(n_samples)
 
 # set heatmap figure heights to automatically scale to how many rows are present in the data
-fig_height_mirna_smrnaseq <- base::nrow(log_counts_per_million$mirna_smrnaseq) / 2
-fig_height_mirna_excerpt <- base::nrow(log_counts_per_million$mirna_excerpt) / 2
-fig_height_pirna_excerpt <- base::nrow(log_counts_per_million$pirna_excerpt) / 2
-fig_height_trna_excerpt <- base::nrow(log_counts_per_million$trna_excerpt) / 2
-fig_height_circrna_excerpt <- base::nrow(log_counts_per_million$circrna_excerpt) / 2
-fig_height_gencode_excerpt <- base::nrow(log_counts_per_million$gencode_excerpt) / 2
+fig_height_mirna_smrnaseq <- base::nrow(log_counts_per_million$mirna_smrnaseq) / 6
+fig_height_mirna_excerpt <- base::nrow(log_counts_per_million$mirna_excerpt) / 6
+fig_height_pirna_excerpt <- base::nrow(log_counts_per_million$pirna_excerpt) / 6
+fig_height_trna_excerpt <- base::nrow(log_counts_per_million$trna_excerpt) / 6
+fig_height_circrna_excerpt <- base::nrow(log_counts_per_million$circrna_excerpt) / 6
+fig_height_gencode_excerpt <- base::nrow(log_counts_per_million$gencode_excerpt) / 6
+
+fig_height_mirna_smrnaseq_free <- base::nrow(log_counts_per_million$mirna_smrnaseq) / 4
+fig_height_mirna_excerpt_free <- base::nrow(log_counts_per_million$mirna_excerpt) / 4
+fig_height_pirna_excerpt_free <- base::nrow(log_counts_per_million$pirna_excerpt) / 4
+fig_height_trna_excerpt_free <- base::nrow(log_counts_per_million$trna_excerpt) / 4
+fig_height_circrna_excerpt_free <- base::nrow(log_counts_per_million$circrna_excerpt) / 4
+fig_height_gencode_excerpt_free <- base::nrow(log_counts_per_million$gencode_excerpt) / 4
 
 # need to set the figure height to NOT ZERO when the figure height ends up being less than 1 so rmarkdown doesn't error out on knitting
 fig_height_mirna_smrnaseq[fig_height_mirna_smrnaseq <1 ] <- 1
@@ -225,6 +247,13 @@ fig_height_trna_excerpt[fig_height_trna_excerpt <1 ] <- 1
 fig_height_circrna_excerpt[fig_height_circrna_excerpt <1 ] <- 1
 fig_height_gencode_excerpt[fig_height_gencode_excerpt <1 ] <- 1
 
+fig_height_mirna_smrnaseq_free[fig_height_mirna_smrnaseq_free <1 ] <- 1
+fig_height_mirna_excerpt_free[fig_height_mirna_excerpt_free <1 ] <- 1
+fig_height_pirna_excerpt_free[fig_height_pirna_excerpt_free <1 ] <- 1
+fig_height_trna_excerpt_free[fig_height_trna_excerpt_free <1 ] <- 1
+fig_height_circrna_excerpt_free[fig_height_circrna_excerpt_free <1 ] <- 1
+fig_height_gencode_excerpt_free[fig_height_gencode_excerpt_free <1 ] <- 1
+
 # also need to set the figure height to NOT ZERO when there are no rows of data so rmarkdown doesn't error out on knitting
 fig_height_mirna_smrnaseq[is.na(fig_height_mirna_smrnaseq[1])] <- 1
 fig_height_mirna_excerpt[is.na(fig_height_mirna_excerpt[1])] <- 1
@@ -233,6 +262,13 @@ fig_height_trna_excerpt[is.na(fig_height_trna_excerpt[1])] <- 1
 fig_height_circrna_excerpt[is.na(fig_height_circrna_excerpt[1])] <- 1
 fig_height_gencode_excerpt[is.na(fig_height_gencode_excerpt[1])] <- 1
 
+fig_height_mirna_smrnaseq_free[is.na(fig_height_mirna_smrnaseq_free[1])] <- 1
+fig_height_mirna_excerpt_free[is.na(fig_height_mirna_excerpt_free[1])] <- 1
+fig_height_pirna_excerpt_free[is.na(fig_height_pirna_excerpt_free[1])] <- 1
+fig_height_trna_excerpt_free[is.na(fig_height_trna_excerpt_free[1])] <- 1
+fig_height_circrna_excerpt_free[is.na(fig_height_circrna_excerpt_free[1])] <- 1
+fig_height_gencode_excerpt_free[is.na(fig_height_gencode_excerpt_free[1])] <- 1
+
 # specify treatments by creating a string of conditions that match the order of the columns/samples in the count data
 # get the treatments and samples names from the metadata file
 treatments <- metadata %>%
@@ -259,11 +295,18 @@ base::cat(base::paste0(" - mirna smrnaseq: ", config$mirna_smrnaseq, "\n",
                        " - gencode excerpt: ", config$gencode_excerpt, "\n"))
 ```
 
-Treatment comparisons: 
+Treatment comparisons (model 1): 
 
 ```{r, results = "asis"}
 # print the treatment group comparisons the user has chosen to analyse
-base::cat(base::paste0("- ", config$contrasts, "\n"))
+base::cat(base::paste0("- ", config$contrasts_model_1, "\n"))
+```
+
+Treatment comparisons (model 2): 
+
+```{r, results = "asis"}
+# print the treatment group comparisons the user has chosen to analyse
+base::cat(base::paste0("- ", config$contrasts_model_2, "\n"))
 ```
 
 ```{r, results = "asis"}
@@ -911,7 +954,7 @@ if (all(is.na(lcpm_gencode_excerpt)) == FALSE & nrow(lcpm_gencode_excerpt) > 3 &
 base::cat("#### miRNA's (smrnaseq)")
 ```
 
-```{r, eval = config$mirna_smrnaseq, fig.height = fig_height_mirna_smrnaseq, out.width = "100%"}
+```{r, eval = config$mirna_smrnaseq, fig.height = fig_height_mirna_smrnaseq_free, out.width = "100%"}
 # extract datasets
 lcpm_mirna_smrnaseq <- log_counts_per_million$mirna_smrnaseq
 
@@ -975,7 +1018,7 @@ if (all(is.na(lcpm_mirna_smrnaseq)) == FALSE & nrow(lcpm_mirna_smrnaseq) > 3 & "
 base::cat("#### miRNA's (excerpt)")
 ```
 
-```{r, eval = config$mirna_excerpt, fig.height = fig_height_mirna_excerpt, out.width = "100%"}
+```{r, eval = config$mirna_excerpt, fig.height = fig_height_mirna_excerpt_free, out.width = "100%"}
 # extract datasets
 lcpm_mirna_excerpt <- log_counts_per_million$mirna_excerpt
 
@@ -1039,7 +1082,7 @@ if (all(is.na(lcpm_mirna_excerpt)) == FALSE & nrow(lcpm_mirna_excerpt) > 3 & "mi
 base::cat("#### piRNA's (excerpt)")
 ```
 
-```{r, eval = config$pirna_excerpt, fig.height = fig_height_pirna_excerpt, out.width = "100%"}
+```{r, eval = config$pirna_excerpt, fig.height = fig_height_pirna_excerpt_free, out.width = "100%"}
 # extract datasets
 lcpm_pirna_excerpt <- log_counts_per_million$pirna_excerpt
 
@@ -1103,7 +1146,7 @@ if (all(is.na(lcpm_pirna_excerpt)) == FALSE & nrow(lcpm_pirna_excerpt) > 3 & "pi
 base::cat("#### tRNA's (excerpt)")
 ```
 
-```{r, eval = config$trna_excerpt, fig.height = fig_height_trna_excerpt, out.width = "100%"}
+```{r, eval = config$trna_excerpt, fig.height = fig_height_trna_excerpt_free, out.width = "100%"}
 # extract datasets
 lcpm_trna_excerpt <- log_counts_per_million$trna_excerpt
 
@@ -1167,7 +1210,7 @@ if (all(is.na(lcpm_trna_excerpt)) == FALSE & nrow(lcpm_trna_excerpt) > 3 & "trna
 base::cat("#### circRNA's (excerpt)")
 ```
 
-```{r, eval = config$circrna_excerpt, fig.height = fig_height_circrna_excerpt, out.width = "100%"}
+```{r, eval = config$circrna_excerpt, fig.height = fig_height_circrna_excerpt_free, out.width = "100%"}
 # extract datasets
 lcpm_circrna_excerpt <- log_counts_per_million$circrna_excerpt
 
@@ -1231,7 +1274,7 @@ if (all(is.na(lcpm_circrna_excerpt)) == FALSE & nrow(lcpm_circrna_excerpt) > 3 &
 base::cat("#### gencode's (excerpt)")
 ```
 
-```{r, eval = config$gencode_excerpt, fig.height = fig_height_gencode_excerpt, out.width = "100%"}
+```{r, eval = config$gencode_excerpt, fig.height = fig_height_gencode_excerpt_free, out.width = "100%"}
 # extract datasets
 lcpm_gencode_excerpt <- log_counts_per_million$gencode_excerpt
 

diff --git a/prepare_counts/prepare_counts.R b/prepare_counts/prepare_counts.R
@@ -69,6 +69,10 @@ mirna_smrnaseq_data <- mirna_smrnaseq_data %>%
 # replace "." with "-" so RNA names are the same
 base::row.names(mirna_smrnaseq_data) <- base::gsub("\\.", "\\-", base::row.names(mirna_smrnaseq_data))
 
+# remove weird tRNA row
+trna_excerpt_data <- trna_excerpt_data %>%
+  dplyr::filter(!(rownames(trna_excerpt_data) == "tRNA"))
+
 # create a vector defining the count datasets to analyse (that are set to TRUE) based on the yaml user configuration file
 to_analyse <- config[c("mirna_smrnaseq",
                        "mirna_excerpt",