Skip to content

Commit

Permalink
Merge pull request #38 from PediatricOpenTargets/fusion_filtering_sta…
Browse files Browse the repository at this point in the history
…ggered_3

Fusion filtering staggered PR3
  • Loading branch information
kgaonkar6 authored Jul 14, 2021
2 parents 84502bf + 751785d commit 0641db3
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 118 deletions.
3 changes: 1 addition & 2 deletions analyses/fusion_filtering/00-normal-matrix-generation.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,5 @@ normal_specimen <- histology_df %>% filter(gtex_group == specimenType) %>%
expressionMatrix <- readRDS(expressionMatrix)
normal_expression_matrix <- expressionMatrix %>% select(rownames(normal_specimen))

saveRDS(normal_expression_matrix,outputFile)


saveRDS(normal_expression_matrix,outputFile)
221 changes: 115 additions & 106 deletions analyses/fusion_filtering/03-Calc-zscore-annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# GeneSymbol( rownames) to be used to normalize samples
# expressionMatrix : expression matrix (FPKM for samples that need to be zscore normalized)
# saveZscoredMatrix : path to save zscored matrix from GTEx normalization
# outputfile : Output file is a standardized fusion call set with standard
# outputFile : Output file is a standardized fusion call set with standard
# column headers with additional columns below from expression annotation
# zscore_Gene1A : zscore from GTEx/cohort normalization for Gene1A
# zscore_Gene1B : zscore from GTEx/cohort normalization for Gene1B
Expand All @@ -43,15 +43,19 @@ suppressPackageStartupMessages(library("reshape2"))
option_list <- list(
make_option(c("-S", "--standardFusionCalls"),type="character",
help="Standardized fusion calls (.RDS) "),
make_option(c("-c", "--zscoreFilter"), type="integer",default=2,
make_option(c("-z", "--zscoreFilter"), type="integer",default=2,
help="zscore value to use as threshold for annotation of differential expression"),
make_option(c("-e","--expressionMatrix"),type="character",
help="expression matrix (FPKM for samples that need to be zscore normalized .RDS)"),
make_option(c("-c","--clinicalFile"),type="character",
help="clinical file for all samples (.tsv)"),
make_option(c("-y", "--cohortInterest"),type="character",
help="cohorts of interest for the filtering"),
make_option(c("-s","--saveZscoredMatrix"),type="character",
help="path to save zscored matrix from GTEx normalization (.RDS)"),
make_option(c("-n","--normalExpressionMatrix"),type="character",
help="normalization FPKM expression data to compare (.rds)"),
make_option(c("-o","--outputfile"),type="character",
help="normalization TPM expression data to compare (.rds) - please make sure it matches the cohort"),
make_option(c("-o","--outputFile"),type="character",
help="Standardized fusion calls with additional annotation from GTEx zscore comparison (.TSV)")
)

Expand All @@ -60,15 +64,29 @@ standardFusionCalls<-opt$standardFusionCalls
expressionMatrix<-opt$expressionMatrix
zscoreFilter<- opt$zscoreFilter
saveZscoredMatrix<-opt$saveZscoredMatrix
gtexMatrix<-opt$normalExpressionMatrix
clinicalFile <- opt$clinicalFile
cohortInterest<-unlist(strsplit(opt$cohortInterest,","))
gtexMatrix<-unlist(strsplit(opt$normalExpressionMatrix,","))

print(gtexMatrix)

ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter=zscoreFilter,saveZscoredMatrix=saveZscoredMatrix,normData=normData,expressionMatrix=expressionMatrix){
# load standardaized fusion calls cohort
standardFusionCalls<-readRDS(standardFusionCalls)

# load expression Matrix for cohort
expressionMatrix<-readRDS(expressionMatrix)

# load GTEx norm data for each cohort
normData <-lapply(gtexMatrix, readRDS)

ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,
normData=normData, zscoreFilter=zscoreFilter,
saveZscoredMatrix=saveZscoredMatrix,cohort_BSids= cohort_BSids,
expressionMatrix=expressionMatrix){
# @param standardFusionCalls : Annotates standardizes fusion calls from callers [STARfusion| Arriba] or QC filtered fusion
# @param zscoreFilter : Zscore value to use as threshold for annotation of differential expression
# @param normData: normalizing expression dataset to calculate zscore
# @param expressionMatrix: Expression matrix associated with the fusion calls
# @param cohort_BSids: biospecimen ID for cohort of interest to filter/subset expression matrix
# @param saveZscoredMatrix: File to save zscored matrix calculated for the normalized data and expression matrix
# @results : expression_annotated_fusions is a standardized fusion call set with standard
# column headers with additional columns below from expression annotation
Expand All @@ -81,57 +99,6 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
# note_expression_Gene2A : differentially expressed or no change in expression for Gene2A
# note_expression_Gene2B : differentially expressed or no change in expression for Gene2B

# expressionMatrix collapsed at gene level like Gtex max rowMean
expressionMatrixMatched <- expressionMatrix %>%
unique() %>%
# means for each row per each gene_id
dplyr::mutate(means = rowMeans(select(.,-GeneSymbol,-gene_id,-EnsembleID))) %>%
# arrange descending mean
arrange(desc(means)) %>%
# to keep only first occurence ie. max rowMean per GeneSymbol
distinct(GeneSymbol, .keep_all = TRUE) %>%
ungroup() %>%
dplyr::select(-means,-gene_id,-EnsembleID) %>%
dplyr::filter(GeneSymbol %in% normData$GeneSymbol) %>%
tibble::column_to_rownames("GeneSymbol")

# remove 0s
expressionMatrixMatched<-expressionMatrixMatched[rowMeans(expressionMatrixMatched)!=0,]
# log2 transformation
expressionMatrixMatched<-log2(expressionMatrixMatched+1)



# gene matched
# get log transformed GTEx matrix
normData <- normData %>%
tibble::column_to_rownames("GeneSymbol") %>%
as.matrix()
# rearrange to order with expressionMatrix
normData <- normData[rownames(expressionMatrixMatched), ]
# log2 transformation
normData <- log2(normData + 1)

#normData mean and sd
normData_means <- rowMeans(normData, na.rm = TRUE)
normData_sd <- apply(normData, 1, sd, na.rm = TRUE)
# subtract mean
expressionMatrixzscored <- sweep(expressionMatrixMatched, 1, normData_means, FUN = "-")
# divide by SD remove NAs and Inf values from zscore for genes with 0 in normData
expressionMatrixzscored <- sweep(expressionMatrixzscored, 1,normData_sd, FUN = "/") %>% mutate(GeneSymbol=rownames(.)) %>% na_if(Inf) %>% na.omit()


# To save GTEx/cohort scored matrix
if(!missing(saveZscoredMatrix)){
saveRDS(expressionMatrixzscored,saveZscoredMatrix)
}

# get long format to compare to expression
expression_long_df <- expressionMatrixzscored %>%
# Get the data into long format
reshape2::melt(variable.name = "Sample",
value.name = "zscore_value")

# fusion calls
fusion_sample_gene_df <- standardFusionCalls %>%
# We want to keep track of the gene symbols for each sample-fusion pair
Expand All @@ -144,55 +111,97 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
dplyr::arrange(Sample, FusionName) %>%
# Retain only distinct rows
dplyr::distinct()

# check columns for Gene1A,Gene2A,Gene1B and Gene2B
cols<-c(Gene1A=NA, Gene1B=NA, Gene2A=NA, Gene2B=NA)

expression_annotated_fusions <- fusion_sample_gene_df %>%
# join the filtered expression values to the data frame keeping track of symbols
# for each sample-fusion name pair
dplyr::left_join(expression_long_df, by = c("Sample", "GeneSymbol")) %>%
# for each sample-fusion name pair, are all genes under the expression threshold?
dplyr::group_by(FusionName, Sample) %>%
dplyr::select(FusionName, Sample,zscore_value,gene_position) %>%
# cast to keep zscore and gene position
reshape2::dcast(FusionName+Sample ~gene_position,value.var = "zscore_value") %>%
# incase Gene2A/B dont exist like in STARfusion calls
add_column(!!!cols[setdiff(names(cols),names(.))]) %>%
# get annotation from z score
dplyr::mutate(note_expression_Gene1A = ifelse((Gene1A>zscoreFilter| Gene1A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene1B = ifelse((Gene1B>zscoreFilter| Gene1B< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2A = ifelse((Gene2A>zscoreFilter| Gene2A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2B = ifelse((Gene2B>zscoreFilter| Gene2B< -zscoreFilter),"differentially expressed","no change")) %>%
dplyr::rename(zscore_Gene1A=Gene1A,
zscore_Gene1B=Gene1B,
zscore_Gene2A=Gene2A,
zscore_Gene2B=Gene2B) %>%
# unique FusionName-Sample rows
# use this to filter the QC filtered fusion data frame
dplyr::inner_join(standardFusionCalls, by = c("FusionName", "Sample")) %>%
dplyr::distinct()


# filter the fusion results to contain only samples in the cohort of interest
fusion_sample_gene_df_matched <- fusion_sample_gene_df %>% filter(Sample %in% cohort_BSids)
fusion_sample_list <- fusion_sample_gene_df_matched$Sample %>% unique()

expressionMatrixMatched <- expressionMatrix %>%
select(cohort_BSids) %>%
select(fusion_sample_list)


# remove 0s
expressionMatrixMatched<-expressionMatrixMatched[rowMeans(expressionMatrixMatched)!=0,]
# log2 transformation
expressionMatrixMatched<-log2(expressionMatrixMatched+1)

# convert the expression data to data matrix
normData_matched <- normData_matched %>% data.frame() %>%
as.matrix()

# rearrange to order with expressionMatrix
normData_matched <- normData_matched[rownames(expressionMatrixMatched), ]
# log2 transformation
normData_matched <- log2(normData_matched + 1)
#normData_matched mean and sd
normData_means <- rowMeans(normData_matched, na.rm = TRUE)
normData_sd <- apply(normData_matched, 1, sd, na.rm = TRUE)

# subtract mean
expressionMatrixzscored <- sweep(expressionMatrixMatched, 1, normData_means, FUN = "-")
# divide by SD remove NAs and Inf values from zscore for genes with 0 in normData
expressionMatrixzscored <- sweep(expressionMatrixzscored, 1,normData_sd, FUN = "/") %>% mutate(GeneSymbol=rownames(.)) %>% na_if(Inf) %>% na.omit()


# To save GTEx/cohort scored matrix
if(!missing(saveZscoredMatrix)){
saveRDS(expressionMatrixzscored,saveZscoredMatrix)
}
# get long format to compare to expression
expression_long_df <- expressionMatrixzscored %>%
# Get the data into long format
reshape2::melt(variable.name = "Sample",
value.name = "zscore_value")

# check columns for Gene1A,Gene2A,Gene1B and Gene2B
cols<-c(Gene1A=NA, Gene1B=NA, Gene2A=NA, Gene2B=NA)

expression_annotated_fusions <- data.frame()
for (i in 1:length(fusion_sample_list)){
matched_sample <- fusion_sample_list[i]
fusion_sample_matched <- fusion_sample_gene_df_matched %>% filter(Sample == matched_sample)
expression_sample_matched <- expression_long_df %>% filter(Sample == matched_sample)
expression_annotated_fusion_individual <- fusion_sample_matched %>%
# join the filtered expression values to the data frame keeping track of symbols
# for each sample-fusion name pair
dplyr::left_join(expression_sample_matched, by = c("Sample", "GeneSymbol")) %>%
# for each sample-fusion name pair, are all genes under the expression threshold?
dplyr::group_by(FusionName, Sample) %>%
dplyr::select(FusionName, Sample,zscore_value,gene_position) %>%
# cast to keep zscore and gene position
reshape2::dcast(FusionName+Sample ~gene_position,value.var = "zscore_value") %>%
# incase Gene2A/B dont exist like in STARfusion calls
add_column(!!!cols[setdiff(names(cols),names(.))]) %>%
# get annotation from z score
dplyr::mutate(note_expression_Gene1A = ifelse((Gene1A>zscoreFilter| Gene1A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene1B = ifelse((Gene1B>zscoreFilter| Gene1B< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2A = ifelse((Gene2A>zscoreFilter| Gene2A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2B = ifelse((Gene2B>zscoreFilter| Gene2B< -zscoreFilter),"differentially expressed","no change")) %>%
dplyr::rename(zscore_Gene1A=Gene1A,
zscore_Gene1B=Gene1B,
zscore_Gene2A=Gene2A,
zscore_Gene2B=Gene2B) %>%
# unique FusionName-Sample rows
# use this to filter the QC filtered fusion data frame
dplyr::inner_join(standardFusionCalls, by = c("FusionName", "Sample")) %>%
dplyr::distinct()
expression_annotated_fusions <- rbind(expression_annotated_fusions, expression_annotated_fusion_individual)
}
return (expression_annotated_fusions)
}

# example run using GTEx
# load GTEx data
normData<-read_tsv(gtexMatrix) %>% rename(GeneSymbol=gene_symbol)

# load standardaized fusion calls cohort
standardFusionCalls<-readRDS(standardFusionCalls)

# load expression Matrix for cohort
expressionMatrix<-readRDS(expressionMatrix)
expressionMatrix <- cbind(expressionMatrix, colsplit(expressionMatrix$gene_id, pattern = '_', names = c("EnsembleID","GeneSymbol")))

# for cohort level run the expressionMatrix divided by broad_histology will be provided to the function as normData and expressionMatrix

GTExZscoredAnnotation_filtered_fusions<- ZscoredAnnotation(standardFusionCalls ,zscoreFilter,normData=normData,expressionMatrix = expressionMatrix)
saveRDS(GTExZscoredAnnotation_filtered_fusions,paste0(opt$outputfile,"_GTExComparison_annotated.RDS"))




GTExZscoredAnnotation_filtered_fusions <- data.frame()
for (j in (1:length(cohortInterest))) {
cohort_BSids <- read.delim(clinicalFile, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
# uses each value x in cohortInterest
filter(cohort == cohortInterest[j]) %>% filter(experimental_strategy == "RNA-Seq") %>%
filter(sample_type == "Tumor") %>% pull("Kids_First_Biospecimen_ID")

# use index to find the matched normal data
normData_matched <- normData[[j]]
GTExZscoredAnnotation_filtered_fusions_individual <-ZscoredAnnotation(standardFusionCalls,zscoreFilter,normData=normData_matched,cohort_BSids=cohort_BSids, expressionMatrix = expressionMatrix)
GTExZscoredAnnotation_filtered_fusions <- rbind(GTExZscoredAnnotation_filtered_fusions, GTExZscoredAnnotation_filtered_fusions_individual)
}

saveRDS(GTExZscoredAnnotation_filtered_fusions,paste0(opt$outputFile,"_GTExComparison_annotated.RDS"))
14 changes: 4 additions & 10 deletions analyses/fusion_filtering/run_fusion_merged.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ Rscript 00-normal-matrix-generation.R --expressionMatrix $rna_expression_file \
--specimenType "Brain" \
--outputFile $normal_expression_brain


# Run Fusion standardization for arriba caller
Rscript 01-fusion-standardization.R --fusionfile $arriba_file \
--caller "arriba" \
Expand All @@ -96,17 +95,12 @@ Rscript 02-fusion-filtering.R --standardFusionFiles $standard_starfusion_file,$s
--outputFile "${scratch_path}/standardFusionExp" \
--readthroughFilter

# Fusion zscore annotation for filtered fusion for polya
Rscript 03-Calc-zscore-annotate.R --standardFusionCalls "${scratch_path}/standardFusionPolyaExp_QC_expression_filtered_annotated.RDS" \
--expressionMatrix $polya_expression_file \
--normalExpressionMatrix $normal_expression_file \
--outputfile "${scratch_path}/standardFusionPolyaExp_QC_expression"

# Fusion zscore annotation for filtered fusion for stranded
Rscript 03-Calc-zscore-annotate.R --standardFusionCalls "${scratch_path}/standardFusionStrandedExp_QC_expression_filtered_annotated.RDS" \
--expressionMatrix $stranded_expression_file \
# Fusion zscore annotation for filtered fusion for the combined RNA expression file
Rscript 03-Calc-zscore-annotate.R --standardFusionCalls "${scratch_path}/standardFusionExp_QC_expression_filtered_annotated.RDS" \
--expressionMatrix $rna_expression_file \
--normalExpressionMatrix $normal_expression_file \
--outputfile "${scratch_path}/standardFusionStrandedExp_QC_expression"
--outputFile "${scratch_path}/standardFusionExp_QC_expression"

# Project specific filtering
Rscript -e "rmarkdown::render('04-project-specific-filtering.Rmd',params=list(base_run = $RUN_FOR_SUBTYPING))"
Expand Down

0 comments on commit 0641db3

Please sign in to comment.