#643 Part 1: Upstream changes required for transcriptomic overview fi…

…gure (#669) * Make GSVA directories more flexible; rerun * Make GSVA module doc more general too * Fix case where not using CIBERSORT * Rerun transcriptome dimension reduction * Add upstream steps for txomic overview figure * Ignore the immune-deconv specifically for figures * Use --vanilla Fig 1 and Fig 2 * Let method 2 be NULL * Only use xCell for figure generation * Add comment about not using --method
AlexsLemonade · Apr 4, 2020 · 203319b · 203319b
1 parent e31fed6
commit 203319b
Show file tree

Hide file tree

Showing 47 changed files with 451 additions and 427 deletions.
diff --git a/analyses/collapse-rnaseq/02-analyze-drops.nb.html b/analyses/collapse-rnaseq/02-analyze-drops.nb.html
diff --git a/analyses/gene-set-enrichment-analysis/01-conduct-gsea-analysis.R b/analyses/gene-set-enrichment-analysis/01-conduct-gsea-analysis.R
@@ -1,5 +1,5 @@
 ################################################################################
-# This script conducts gene set enrichment analysis, specifically using the GSVA method [1] for scoring hallmark human pathway enrichment from RNA-Seq results. 
+# This script conducts gene set enrichment analysis, specifically using the GSVA method [1] for scoring hallmark human pathway enrichment from RNA-Seq results.
 #
 # The GSVA scores (i.e., enrichment scores) are calculated to produce a **Gaussian distribution of scores** "under the null hypothesis of no change in the pathway activity throughout the sample population."
 # The authors claim a benefit to this approach:
@@ -14,9 +14,9 @@
 #
 # ####### USAGE, assumed to be run from top-level of project:
 # Rscript --vanilla 'analyses/gene-set-enrichment-analysis/01-conduct-gsea-analysis.R --input <expression input file> --output <output file for writing scores>
-#     --input : The name of the input expression data file to use for calculating scores. This file is assumed to live in the project's `data/` directory.
-#     --output: The name of the TSV-formatted output file of GSVA scores, to be saved in the `results/` directory of this analysis.
-# 
+#     --input_file: The name of the input expression data file to use for calculating scores.
+#     --output_file: The name of the TSV-formatted output file of GSVA scores.
+#
 # Reference:
 # 1. Sonja Hänzelmann, Robert Castelo, and Justin Guinney. 2013. “GSVA: Gene Set Variation Analysis for Microarray and RNA-Seq Data.” BMC Bioinformatics 14 (1): 7. https://doi.org/10.1186/1471-2105-14-7.
 ################################################################################
@@ -53,39 +53,38 @@ library(GSVA)    ## Performs GSEA analysis
 ## Define arguments
 option_list <- list(
   optparse::make_option(
-    c("--input"),
+    c("--input_file"),
     type = "character",
     default = NA,
     help = "The input file of expression data from which scores will be calculated."
   ),
-    optparse::make_option(
-    c("--output"),
+  optparse::make_option(
+    c("--output_file"),
     type = "character",
-    default = NA, 
+    default = NA,
     help = "The output file for writing GSVA scores in TSV format."
   )
 )
 
 ## Read in arguments
 opt_parser <- optparse::OptionParser(option_list = option_list)
 opt <- optparse::parse_args(opt_parser)
-if (is.na(opt$input)) stop("\n\nERROR: You must provide an input file with expression data with the flag --input, assumed to be in the `data/` directory of the OpenPBTA repository..") 
-if (is.na(opt$output)) stop("\n\nERROR: You must provide an output file for saving GSVA scores with the flag --output, assumed to be placed in the `results/` directory of this analysis.") 
+if (is.na(opt$input_file)) stop("\n\nERROR: You must provide an input file with expression data with the flag --input, assumed to be in the `data/` directory of the OpenPBTA repository..")
+if (is.na(opt$output_file)) stop("\n\nERROR: You must provide an output file for saving GSVA scores with the flag --output, assumed to be placed in the `results/` directory of this analysis.")
 
 
 #### Set Up paths and file names --------------------------------------------------------------------
 
-## Define directories
-data_dir    <- file.path("..", "..", "data") 
-results_dir <- "results"
-if (!dir.exists(results_dir)) dir.create(results_dir)
+# If the output directory does not exist, create it
+output_dir <- dirname(opt$output_file)
+if (!dir.exists(output_dir)) {
+  dir.create(output_dir, recursive = TRUE)
+}
 
-## Ensure the input file exists in `data/` and specify input/output files 
-expression_data_file <- file.path(data_dir, basename(opt$input))
+## Ensure the input file exists in `data/` and specify input/output files
+expression_data_file <- opt$input_file
 if (!file.exists(expression_data_file)) stop("\n\nERROR: Provided input file does not exist.")
-scores_output_file <- file.path(results_dir, basename(opt$output))
-
-
+scores_output_file <- file.path(output_dir, basename(opt$output_file))
 
 #### Load input files --------------------------------------------------------------------
 expression_data <- as.data.frame( readr::read_rds(expression_data_file) )
@@ -114,14 +113,14 @@ gsea_scores <- GSVA::gsva(expression_data_log2_matrix,
 ### Clean scoring into tidy format
 gsea_scores_df <- as.data.frame(gsea_scores) %>%
                     rownames_to_column(var = "hallmark_name")
-  
+
 #first/last_bs needed for use in gather (we are not on tidyr1.0)
 first_bs <- head(colnames(gsea_scores), n=1)
 last_bs  <- tail(colnames(gsea_scores), n=1)
-  
+
 gsea_scores_df_tidy <- gsea_scores_df %>%
                         tidyr::gather(Kids_First_Biospecimen_ID, gsea_score, !!first_bs : !!last_bs) %>%
-                        dplyr::select(Kids_First_Biospecimen_ID, hallmark_name, gsea_score) 
+                        dplyr::select(Kids_First_Biospecimen_ID, hallmark_name, gsea_score)
 
 
 #### Export GSEA scores to TSV --------------------------------------------------------------------

diff --git a/analyses/gene-set-enrichment-analysis/README.md b/analyses/gene-set-enrichment-analysis/README.md
@@ -19,19 +19,17 @@ bash analyses/gene-set-enrichment-analysis/run-gsea.sh
 
 ## Folder Content
 
-+ `01-conduct-gsea-analysis.R` performs the GSVA analysis using RSEM FPKM expression data for both stranded and polyA data. Results are saved in `results/` TSV files.
++ `01-conduct-gsea-analysis.R` performs the GSVA analysis using RSEM FPKM expression data for both stranded and polyA data. Results are saved in `results/` TSV files when run via `run-gsea.sh`.
 
 + `02-model-gsea.Rmd` performs ANOVA and Tukey tests on GSVA scores to evaluate, for each hallmark pathway, differences in GSVA across groups (e.g. short histology or disease type).
 
 + FORTHCOMING: 	`03-visualize-gsea.Rmd` will create heatmaps and boxplots to visualize scores.
 
-+ `results/gsva_scores_stranded.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` (data release v12)
-	+ File created with: `Rscript --vanilla 01-conduct-gsea-analysis.R --input pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds --output gsva_scores_stranded.tsv`
-+ `results/gsva_scores_polya.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` (data release v12)
-	+ File created with: `Rscript --vanilla 01-conduct-gsea-analysis.R --input pbta-gene-expression-rsem-fpkm-collapsed.polya.rds --output gsva_scores_stranded.tsv`
++ `results/gsva_scores_stranded.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` with `Rscript --vanilla 01-conduct-gsea-analysis.R`
 
++ `results/gsva_scores_polya.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` with with `Rscript --vanilla 01-conduct-gsea-analysis.R`
 
 + **Eight** files named as `results/gsva_<tukey/anova>_<stranded/polya>_<integrated_diagnosis/short_histology>.tsv` represent results from modeling
 	+ Files created with: `Rscript --vanilla 02-model-gsea.R`
 	+ Assumes `results/gsva_scores_stranded.tsv` and `results/gsva_scores_polya.tsv` exist
-
+
diff --git a/analyses/gene-set-enrichment-analysis/run-gsea.sh b/analyses/gene-set-enrichment-analysis/run-gsea.sh
@@ -4,8 +4,8 @@
 # Run the GSEA pipeline:
 ## 1. `01-conduct-gsea-analysis.R` to calculate scores
 ## 2. `02-exploratory-gsea.Rmd` to explore the scores, lightly for now
-# 
-# Usage: bash run-gsea.sh 
+#
+# Usage: bash run-gsea.sh
 #
 # Takes one environment variable, `OPENPBTA_TESTING`, which is 1 for running
 # samples in CI for testing, 0 for running the full dataset (Default)
@@ -26,18 +26,20 @@ script_directory="$(perl -e 'use File::Basename;
   print dirname(abs_path(@ARGV[0]));' -- "$0")"
 cd "$script_directory" || exit
 
+DATA_DIR="../../data"
+RESULTS_DIR="results"
 
 ######## Calculate scores from polyA expression data ############
-INPUT_FILE="pbta-gene-expression-rsem-fpkm-collapsed.polya.rds"
-OUTPUT_FILE="gsva_scores_polya.tsv"
+INPUT_FILE="${DATA_DIR}/pbta-gene-expression-rsem-fpkm-collapsed.polya.rds"
+OUTPUT_FILE="${RESULTS_DIR}/gsva_scores_polya.tsv"
 Rscript --vanilla 01-conduct-gsea-analysis.R --input ${INPUT_FILE} --output ${OUTPUT_FILE}
 
 
 ######## Calculate scores from stranded expression data ############
-INPUT_FILE="pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"
-OUTPUT_FILE="gsva_scores_stranded.tsv"
+INPUT_FILE="${DATA_DIR}/pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"
+OUTPUT_FILE="${RESULTS_DIR}/gsva_scores_stranded.tsv"
 Rscript --vanilla 01-conduct-gsea-analysis.R --input ${INPUT_FILE} --output ${OUTPUT_FILE}
 
 
 ######## Model GSVA scores ############
-Rscript -e "rmarkdown::render('02-model-gsea.Rmd', clean = TRUE, params=list(is_ci = ${IS_CI}))" 
+Rscript -e "rmarkdown::render('02-model-gsea.Rmd', clean = TRUE, params=list(is_ci = ${IS_CI}))"
diff --git a/analyses/immune-deconv/.gitignore b/analyses/immune-deconv/.gitignore
@@ -1,3 +1,7 @@
+# CIBERSORT related files
 LM22.txt
 CIBERSORT.R
 CIBERSORT-Results.txt
+
+# output data specifically for making figures
+results/deconv-output-for-figures.RData
diff --git a/analyses/immune-deconv/01-immune-deconv.R b/analyses/immune-deconv/01-immune-deconv.R
@@ -1,6 +1,6 @@
 # Author: Komal S. Rathi
 # Date: 11/11/2019
-# Function: 
+# Function:
 # Script to perform immune characterization using xCell etc.
 
 # load libraries
@@ -17,9 +17,9 @@ option_list <- list(
               help = "Clinical file (.TSV)"),
   make_option(c("-m", "--method"), type = "character",
               help = "Deconvolution Method"),
-  make_option(c("-b", "--cibersortbin"), type = "character", 
+  make_option(c("-b", "--cibersortbin"), type = "character",
               help = "Path to Cibersort binary (CIBERSORT.R)"),
-  make_option(c("-g", "--cibersortgenemat"), type = "character", 
+  make_option(c("-g", "--cibersortgenemat"), type = "character",
               help = "Path to Cibersort signature matrix (LM22.txt)"),
   make_option(c("-o","--outputfile"), type = "character",
               help = "Deconv Output (.RData)")
@@ -41,19 +41,21 @@ polya <- opt$polyaexprs
 stranded <- opt$strandedexprs
 clin.file <- opt$clin
 deconv.method <- opt$method
-cibersort_bin <- opt$cibersortbin 
-cibersort_mat <- opt$cibersortgenemat 
+cibersort_bin <- opt$cibersortbin
+cibersort_mat <- opt$cibersortgenemat
 output.file <- opt$outputfile
 
 #### Check model parameter - must be in deconvolution_methods (immunedeconv accepted options)
-if (!(deconv.method %in% deconvolution_methods )){
+if (!is.null(deconv.method)){
+  if (!(deconv.method %in% deconvolution_methods)) {
     stop( paste(c("Specified method not available. Must be one of the following: ", deconvolution_methods), collapse=" ") )
+  }
 }
 
 
 # if cibersort_bin and cibersort_mat are defined
 # then, set path to cibersort binary and matrix
-if(cibersort_bin != "NA" & cibersort_mat != "NA"){
+if(!(is.null(cibersort_bin)) & !(is.null(cibersort_mat))){
   set_cibersort_binary(cibersort_bin)
   set_cibersort_mat(cibersort_mat)
 }
@@ -67,37 +69,36 @@ clin <- read.delim(clin.file, stringsAsFactors = F)
 
 # function to run immunedeconv
 deconv <- function(expr.input, method) {
-  
+
   # get data
   expr.input <- get(expr.input)
-  
+
   # subset clinical
-  clin.sub  <- clin %>% 
+  clin.sub  <- clin %>%
     filter(Kids_First_Biospecimen_ID %in% colnames(expr.input)) %>%
     dplyr::select(Kids_First_Biospecimen_ID, broad_histology, short_histology, molecular_subtype)
-  
+
   # deconvolute using specified method
   res <- deconvolute(gene_expression = as.matrix(expr.input), method = method)
-  res$method <- names(grep(method, deconvolution_methods, value = TRUE)) # assign method name 
-  
+  res$method <- names(grep(method, deconvolution_methods, value = TRUE)) # assign method name
+
   # merge output with clinical data
   res <- res %>%
     gather(sample, fraction, -c(cell_type, method)) %>%
     as.data.frame() %>%
     inner_join(clin.sub, by = c("sample" = "Kids_First_Biospecimen_ID"))
-  
+
   return(res)
 }
 
 # Deconvolute using xCell and the second chosen method
-# these two methods have the max number of immune cell types
-deconv.method <- c("xcell", deconv.method)
+deconv.method <- unique(c("xcell", deconv.method))
 expr.input <- c("polya", "stranded") # datasets
 combo <- expand.grid(expr.input, deconv.method, stringsAsFactors = F) # combination of dataset and methods
 deconv.res <- apply(combo, 1, FUN = function(x) deconv(expr.input = x[1], method = x[2]))
 deconv.res <- do.call(rbind.data.frame, deconv.res)
 
-# save output to RData object 
+# save output to RData object
 print("Writing output to file..")
 save(deconv.res, file = output.file)
 print("Done!")