Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Commit

Permalink
#643 Part 1: Upstream changes required for transcriptomic overview fi…
Browse files Browse the repository at this point in the history
…gure (#669)

* Make GSVA directories more flexible; rerun

* Make GSVA module doc more general too

* Fix case where not using CIBERSORT

* Rerun transcriptome dimension reduction

* Add upstream steps for txomic overview figure

* Ignore the immune-deconv specifically for figures

* Use --vanilla Fig 1 and Fig 2

* Let method 2 be NULL

* Only use xCell for figure generation

* Add comment about not using --method
  • Loading branch information
jaclyn-taroni authored Apr 4, 2020
1 parent e31fed6 commit 203319b
Show file tree
Hide file tree
Showing 47 changed files with 451 additions and 427 deletions.
24 changes: 12 additions & 12 deletions analyses/collapse-rnaseq/02-analyze-drops.nb.html

Large diffs are not rendered by default.

43 changes: 21 additions & 22 deletions analyses/gene-set-enrichment-analysis/01-conduct-gsea-analysis.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
################################################################################
# This script conducts gene set enrichment analysis, specifically using the GSVA method [1] for scoring hallmark human pathway enrichment from RNA-Seq results.
# This script conducts gene set enrichment analysis, specifically using the GSVA method [1] for scoring hallmark human pathway enrichment from RNA-Seq results.
#
# The GSVA scores (i.e., enrichment scores) are calculated to produce a **Gaussian distribution of scores** "under the null hypothesis of no change in the pathway activity throughout the sample population."
# The authors claim a benefit to this approach:
Expand All @@ -14,9 +14,9 @@
#
# ####### USAGE, assumed to be run from top-level of project:
# Rscript --vanilla 'analyses/gene-set-enrichment-analysis/01-conduct-gsea-analysis.R --input <expression input file> --output <output file for writing scores>
# --input : The name of the input expression data file to use for calculating scores. This file is assumed to live in the project's `data/` directory.
# --output: The name of the TSV-formatted output file of GSVA scores, to be saved in the `results/` directory of this analysis.
#
# --input_file: The name of the input expression data file to use for calculating scores.
# --output_file: The name of the TSV-formatted output file of GSVA scores.
#
# Reference:
# 1. Sonja Hänzelmann, Robert Castelo, and Justin Guinney. 2013. “GSVA: Gene Set Variation Analysis for Microarray and RNA-Seq Data.” BMC Bioinformatics 14 (1): 7. https://doi.org/10.1186/1471-2105-14-7.
################################################################################
Expand Down Expand Up @@ -53,39 +53,38 @@ library(GSVA) ## Performs GSEA analysis
## Define arguments
option_list <- list(
optparse::make_option(
c("--input"),
c("--input_file"),
type = "character",
default = NA,
help = "The input file of expression data from which scores will be calculated."
),
optparse::make_option(
c("--output"),
optparse::make_option(
c("--output_file"),
type = "character",
default = NA,
default = NA,
help = "The output file for writing GSVA scores in TSV format."
)
)

## Read in arguments
opt_parser <- optparse::OptionParser(option_list = option_list)
opt <- optparse::parse_args(opt_parser)
if (is.na(opt$input)) stop("\n\nERROR: You must provide an input file with expression data with the flag --input, assumed to be in the `data/` directory of the OpenPBTA repository..")
if (is.na(opt$output)) stop("\n\nERROR: You must provide an output file for saving GSVA scores with the flag --output, assumed to be placed in the `results/` directory of this analysis.")
if (is.na(opt$input_file)) stop("\n\nERROR: You must provide an input file with expression data with the flag --input, assumed to be in the `data/` directory of the OpenPBTA repository..")
if (is.na(opt$output_file)) stop("\n\nERROR: You must provide an output file for saving GSVA scores with the flag --output, assumed to be placed in the `results/` directory of this analysis.")


#### Set Up paths and file names --------------------------------------------------------------------

## Define directories
data_dir <- file.path("..", "..", "data")
results_dir <- "results"
if (!dir.exists(results_dir)) dir.create(results_dir)
# If the output directory does not exist, create it
output_dir <- dirname(opt$output_file)
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}

## Ensure the input file exists in `data/` and specify input/output files
expression_data_file <- file.path(data_dir, basename(opt$input))
## Ensure the input file exists in `data/` and specify input/output files
expression_data_file <- opt$input_file
if (!file.exists(expression_data_file)) stop("\n\nERROR: Provided input file does not exist.")
scores_output_file <- file.path(results_dir, basename(opt$output))


scores_output_file <- file.path(output_dir, basename(opt$output_file))

#### Load input files --------------------------------------------------------------------
expression_data <- as.data.frame( readr::read_rds(expression_data_file) )
Expand Down Expand Up @@ -114,14 +113,14 @@ gsea_scores <- GSVA::gsva(expression_data_log2_matrix,
### Clean scoring into tidy format
gsea_scores_df <- as.data.frame(gsea_scores) %>%
rownames_to_column(var = "hallmark_name")

#first/last_bs needed for use in gather (we are not on tidyr1.0)
first_bs <- head(colnames(gsea_scores), n=1)
last_bs <- tail(colnames(gsea_scores), n=1)

gsea_scores_df_tidy <- gsea_scores_df %>%
tidyr::gather(Kids_First_Biospecimen_ID, gsea_score, !!first_bs : !!last_bs) %>%
dplyr::select(Kids_First_Biospecimen_ID, hallmark_name, gsea_score)
dplyr::select(Kids_First_Biospecimen_ID, hallmark_name, gsea_score)


#### Export GSEA scores to TSV --------------------------------------------------------------------
Expand Down
10 changes: 4 additions & 6 deletions analyses/gene-set-enrichment-analysis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,17 @@ bash analyses/gene-set-enrichment-analysis/run-gsea.sh

## Folder Content

+ `01-conduct-gsea-analysis.R` performs the GSVA analysis using RSEM FPKM expression data for both stranded and polyA data. Results are saved in `results/` TSV files.
+ `01-conduct-gsea-analysis.R` performs the GSVA analysis using RSEM FPKM expression data for both stranded and polyA data. Results are saved in `results/` TSV files when run via `run-gsea.sh`.

+ `02-model-gsea.Rmd` performs ANOVA and Tukey tests on GSVA scores to evaluate, for each hallmark pathway, differences in GSVA across groups (e.g. short histology or disease type).

+ FORTHCOMING: `03-visualize-gsea.Rmd` will create heatmaps and boxplots to visualize scores.

+ `results/gsva_scores_stranded.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` (data release v12)
+ File created with: `Rscript --vanilla 01-conduct-gsea-analysis.R --input pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds --output gsva_scores_stranded.tsv`
+ `results/gsva_scores_polya.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` (data release v12)
+ File created with: `Rscript --vanilla 01-conduct-gsea-analysis.R --input pbta-gene-expression-rsem-fpkm-collapsed.polya.rds --output gsva_scores_stranded.tsv`
+ `results/gsva_scores_stranded.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` with `Rscript --vanilla 01-conduct-gsea-analysis.R`

+ `results/gsva_scores_polya.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` with with `Rscript --vanilla 01-conduct-gsea-analysis.R`

+ **Eight** files named as `results/gsva_<tukey/anova>_<stranded/polya>_<integrated_diagnosis/short_histology>.tsv` represent results from modeling
+ Files created with: `Rscript --vanilla 02-model-gsea.R`
+ Assumes `results/gsva_scores_stranded.tsv` and `results/gsva_scores_polya.tsv` exist


16 changes: 9 additions & 7 deletions analyses/gene-set-enrichment-analysis/run-gsea.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
# Run the GSEA pipeline:
## 1. `01-conduct-gsea-analysis.R` to calculate scores
## 2. `02-exploratory-gsea.Rmd` to explore the scores, lightly for now
#
# Usage: bash run-gsea.sh
#
# Usage: bash run-gsea.sh
#
# Takes one environment variable, `OPENPBTA_TESTING`, which is 1 for running
# samples in CI for testing, 0 for running the full dataset (Default)
Expand All @@ -26,18 +26,20 @@ script_directory="$(perl -e 'use File::Basename;
print dirname(abs_path(@ARGV[0]));' -- "$0")"
cd "$script_directory" || exit

DATA_DIR="../../data"
RESULTS_DIR="results"

######## Calculate scores from polyA expression data ############
INPUT_FILE="pbta-gene-expression-rsem-fpkm-collapsed.polya.rds"
OUTPUT_FILE="gsva_scores_polya.tsv"
INPUT_FILE="${DATA_DIR}/pbta-gene-expression-rsem-fpkm-collapsed.polya.rds"
OUTPUT_FILE="${RESULTS_DIR}/gsva_scores_polya.tsv"
Rscript --vanilla 01-conduct-gsea-analysis.R --input ${INPUT_FILE} --output ${OUTPUT_FILE}


######## Calculate scores from stranded expression data ############
INPUT_FILE="pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"
OUTPUT_FILE="gsva_scores_stranded.tsv"
INPUT_FILE="${DATA_DIR}/pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"
OUTPUT_FILE="${RESULTS_DIR}/gsva_scores_stranded.tsv"
Rscript --vanilla 01-conduct-gsea-analysis.R --input ${INPUT_FILE} --output ${OUTPUT_FILE}


######## Model GSVA scores ############
Rscript -e "rmarkdown::render('02-model-gsea.Rmd', clean = TRUE, params=list(is_ci = ${IS_CI}))"
Rscript -e "rmarkdown::render('02-model-gsea.Rmd', clean = TRUE, params=list(is_ci = ${IS_CI}))"
4 changes: 4 additions & 0 deletions analyses/immune-deconv/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# CIBERSORT related files
LM22.txt
CIBERSORT.R
CIBERSORT-Results.txt

# output data specifically for making figures
results/deconv-output-for-figures.RData
35 changes: 18 additions & 17 deletions analyses/immune-deconv/01-immune-deconv.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Author: Komal S. Rathi
# Date: 11/11/2019
# Function:
# Function:
# Script to perform immune characterization using xCell etc.

# load libraries
Expand All @@ -17,9 +17,9 @@ option_list <- list(
help = "Clinical file (.TSV)"),
make_option(c("-m", "--method"), type = "character",
help = "Deconvolution Method"),
make_option(c("-b", "--cibersortbin"), type = "character",
make_option(c("-b", "--cibersortbin"), type = "character",
help = "Path to Cibersort binary (CIBERSORT.R)"),
make_option(c("-g", "--cibersortgenemat"), type = "character",
make_option(c("-g", "--cibersortgenemat"), type = "character",
help = "Path to Cibersort signature matrix (LM22.txt)"),
make_option(c("-o","--outputfile"), type = "character",
help = "Deconv Output (.RData)")
Expand All @@ -41,19 +41,21 @@ polya <- opt$polyaexprs
stranded <- opt$strandedexprs
clin.file <- opt$clin
deconv.method <- opt$method
cibersort_bin <- opt$cibersortbin
cibersort_mat <- opt$cibersortgenemat
cibersort_bin <- opt$cibersortbin
cibersort_mat <- opt$cibersortgenemat
output.file <- opt$outputfile

#### Check model parameter - must be in deconvolution_methods (immunedeconv accepted options)
if (!(deconv.method %in% deconvolution_methods )){
if (!is.null(deconv.method)){
if (!(deconv.method %in% deconvolution_methods)) {
stop( paste(c("Specified method not available. Must be one of the following: ", deconvolution_methods), collapse=" ") )
}
}


# if cibersort_bin and cibersort_mat are defined
# then, set path to cibersort binary and matrix
if(cibersort_bin != "NA" & cibersort_mat != "NA"){
if(!(is.null(cibersort_bin)) & !(is.null(cibersort_mat))){
set_cibersort_binary(cibersort_bin)
set_cibersort_mat(cibersort_mat)
}
Expand All @@ -67,37 +69,36 @@ clin <- read.delim(clin.file, stringsAsFactors = F)

# function to run immunedeconv
deconv <- function(expr.input, method) {

# get data
expr.input <- get(expr.input)

# subset clinical
clin.sub <- clin %>%
clin.sub <- clin %>%
filter(Kids_First_Biospecimen_ID %in% colnames(expr.input)) %>%
dplyr::select(Kids_First_Biospecimen_ID, broad_histology, short_histology, molecular_subtype)

# deconvolute using specified method
res <- deconvolute(gene_expression = as.matrix(expr.input), method = method)
res$method <- names(grep(method, deconvolution_methods, value = TRUE)) # assign method name
res$method <- names(grep(method, deconvolution_methods, value = TRUE)) # assign method name

# merge output with clinical data
res <- res %>%
gather(sample, fraction, -c(cell_type, method)) %>%
as.data.frame() %>%
inner_join(clin.sub, by = c("sample" = "Kids_First_Biospecimen_ID"))

return(res)
}

# Deconvolute using xCell and the second chosen method
# these two methods have the max number of immune cell types
deconv.method <- c("xcell", deconv.method)
deconv.method <- unique(c("xcell", deconv.method))
expr.input <- c("polya", "stranded") # datasets
combo <- expand.grid(expr.input, deconv.method, stringsAsFactors = F) # combination of dataset and methods
deconv.res <- apply(combo, 1, FUN = function(x) deconv(expr.input = x[1], method = x[2]))
deconv.res <- do.call(rbind.data.frame, deconv.res)

# save output to RData object
# save output to RData object
print("Writing output to file..")
save(deconv.res, file = output.file)
print("Done!")
Loading

0 comments on commit 203319b

Please sign in to comment.