This repository has been archived by the owner on Jun 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 67
/
05-recurrent-fusions-per-histology.R
185 lines (146 loc) · 8.67 KB
/
05-recurrent-fusions-per-histology.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# K. S. Gaonkar 2019
# Identify recurrent fusion and genes per broad histology
#
# Sample selection criteria : removed cell-lines to only keep tumor samples
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("tidyverse"))
suppressPackageStartupMessages(library("reshape2"))
option_list <- list(
make_option(c("-S", "--standardFusionCalls"),type="character",
help="Standardized fusion calls (.tsv) "),
make_option(c("-c","--clinicalFile"),type="character",
help="clinical file for all samples (.tsv)"),
make_option(c("-o","--outputfolder"),type="character",
help="output folder for results "),
make_option(c("-i","--independentSpecimensFile"),type="character",
help="independent specimens WGS/WXS to match to rnaseq")
)
opt <- parse_args(OptionParser(option_list=option_list))
standardFusionCalls<-opt$standardFusionCalls
clinicalFile<-opt$clinicalFile
outputfolder<-opt$outputfolder
independentSpecimensFile<-opt$independentSpecimensFile
# input data
standardFusionCalls <- read_tsv(standardFusionCalls) %>% as.data.frame()
clinical<-read_tsv(clinicalFile)
# gather RNA-seq from WGS/WXS samples in independent-specimens.wgswxs.primary-plus.tsv
independentSpecimens<-read_tsv(independentSpecimensFile) %>% as.data.frame()
sampleIDMatchedIndependent<-clinical %>% dplyr::filter(Kids_First_Biospecimen_ID %in% independentSpecimens$Kids_First_Biospecimen_ID) %>% dplyr::select(sample_id) %>% as.vector()
# PNOC sampleIDs have .WXS which would need to .RNA-Seq ; Panel not in independent sample list ; so using patient ID to match
clinical_pnoc<-clinical %>%
dplyr::filter(cohort=="PNOC003",experimental_strategy=="RNA-Seq",tumor_descriptor=="Initial CNS Tumor") %>%
dplyr::select(Kids_First_Participant_ID,Kids_First_Biospecimen_ID)
clinical_rna<-clinical %>%
# select WGS/WXS matched sampleIDs + experimental_strategy=="RNA-Seq" +composition == "Solid Tissue"
dplyr::filter(sample_id %in% sampleIDMatchedIndependent$sample_id,
experimental_strategy == "RNA-Seq",
composition == "Solid Tissue") %>%
dplyr::group_by(Kids_First_Participant_ID) %>%
# sample 1 of BS_IDs for multiple sampleID found in matching with WGS/WXS
dplyr::summarize(Kids_First_Biospecimen_ID = sample(Kids_First_Biospecimen_ID, 1))
# match RNASeq only files
clinical_wgs<-clinical %>%
dplyr::filter(experimental_strategy == "WGS" | experimental_strategy == "WXS",sample_type=="Tumor")
# remove samples which have WGS/WXS because those would have been captured from the independent-wgswxs-sample
clinical_rna_v2<-clinical %>%
dplyr::filter(experimental_strategy == "RNA-Seq",!Kids_First_Participant_ID %in% clinical_wgs$Kids_First_Participant_ID)
clinical_rna_intial<-clinical_rna_v2 %>%
dplyr::filter(composition=="Solid Tissue",tumor_descriptor=="Initial CNS Tumor") %>%
select("Kids_First_Participant_ID","Kids_First_Biospecimen_ID")
clinical_rna_non_initial<- clinical_rna_v2 %>%
dplyr::filter(!Kids_First_Participant_ID %in% clinical_rna_intial$Kids_First_Participant_ID ) %>%
dplyr::filter(experimental_strategy=="RNA-Seq" ,composition=="Solid Tissue") %>%
as.data.frame() %>%
dplyr::group_by(Kids_First_Participant_ID) %>%
# random sample 1 of BS_IDs for multiple sampleID found in matching with WGS/WXS
dplyr::summarize(Kids_First_Biospecimen_ID = sample(Kids_First_Biospecimen_ID, 1))
# bind WGS/WXS matched RNA-Seq + RNAseq only samples initial tumor samples + RNAseq only recurrent/progressive samples + RNASeq matched to PNOC samples
clinical_rna<-rbind(clinical_rna,clinical_rna_intial,clinical_rna_non_initial,clinical_pnoc) %>% unique()
clinical_rna<-clinical_rna %>% left_join(clinical,by=c("Kids_First_Participant_ID","Kids_First_Biospecimen_ID"))
# Putative Driver Fusions annotated with broad_histology
standardFusionCalls<-standardFusionCalls %>%
dplyr::filter( Sample%in% clinical_rna$Kids_First_Biospecimen_ID) %>%
left_join(clinical,by=c("Sample"="Kids_First_Biospecimen_ID","Kids_First_Participant_ID")) %>%
dplyr::filter(!is.na(broad_histology)) %>% as.data.frame()
#remove fusions found in benign tumors as internal false positive control
# KCNH1--AL590132.1 and AL590132.1--KCNH1 come up as recurrent in multiple histologies but using the following filter helps remove such fusions.
# get the names of fusions that are present in benign tumors
fusions_in_benign <- standardFusionCalls %>%
dplyr::filter(broad_histology == "Benign tumor") %>%
unique() %>%
dplyr::pull(FusionName)
# remove fusions found in benign tumors as internal false positive control
standardFusionCalls <- standardFusionCalls %>%
dplyr::filter(!(FusionName %in% fusions_in_benign)) %>%
# keep only inframe fusions
dplyr::filter(Fusion_Type %in% c("in-frame"))
# running this to remove GeneA==GeneB which might be non-canonical transcripts/ false positives from arriba documentation
standardFusionCalls<-standardFusionCalls %>%
dplyr::mutate(removal = dplyr::case_when(
Gene1A == Gene2A | Gene1A == Gene2B | Gene2A == Gene1B | Gene1A == Gene1B | Gene2A == Gene2B ~ TRUE,
TRUE ~ FALSE
)) %>%
# remove the ones you want to remove by negating removal
dplyr::filter(!removal) %>%
# get rid of removal column
dplyr::select(-removal)
# gather recurrent fusion per patient per broad_histology
rec_fusions <- standardFusionCalls %>%
dplyr::select("FusionName","broad_histology","Kids_First_Participant_ID")%>%
unique() %>%
dplyr::group_by(FusionName, broad_histology) %>%
dplyr::summarize(count = n())
#find rec fusions per PATIENT per broad_histology
rec_fusions<-rec_fusions[rec_fusions$count>3,]
rec_fusions<-rec_fusions[order(rec_fusions$count,decreasing = TRUE),]
write.table(rec_fusions,file.path(outputfolder,"pbta-fusion-recurrent-fusion-byhistology.tsv"),quote = FALSE,row.names = FALSE,sep="\t")
# binary matrix for recurrent fusions found in SAMPLE per broad_histology
rec_fusions_mat<-rec_fusions %>%
# to add sample with rec fusion
left_join(standardFusionCalls, by=c("FusionName","broad_histology")) %>%
select("FusionName","broad_histology","Sample") %>%
# to add all rna Sample for binary matrix
full_join(clinical_rna,by=c("Sample"="Kids_First_Biospecimen_ID","broad_histology"="broad_histology"))
# adding a full join to recurrent fusion adds NAs to FusionName column that don't have recurrent fusions. Changing NA to No_rec_fusions so in matrix format it columns specifies that instead of NA
rec_fusions_mat[is.na(rec_fusions_mat$FusionName),"FusionName"]<-"No_rec_fusion"
# binary matrix
rec_fusions_mat<-dcast(rec_fusions_mat,Sample~FusionName,value.var = "Sample",fun.aggregate = function(x){as.integer(length(x) > 0)},drop = FALSE)
write.table(rec_fusions_mat,file.path(outputfolder,"pbta-fusion-recurrent-fusion-bysample.tsv"),quote = FALSE,row.names = FALSE,sep="\t")
# gene1A recurrent
rec_gene1A <- standardFusionCalls %>%
dplyr::select("Gene1A","broad_histology","Kids_First_Participant_ID")%>%
unique() %>%
dplyr::rename("Gene"="Gene1A")
# gene1B recurrent
rec_gene1B <- standardFusionCalls %>%
dplyr::select("Gene1B","broad_histology","Kids_First_Participant_ID")%>%
unique() %>%
dplyr::rename("Gene"="Gene1B")
# merge and then summarize
rec_gene<-rbind(rec_gene1A,rec_gene1B) %>%
unique() %>%
dplyr::group_by(Gene,broad_histology) %>%
dplyr::summarize(count = n())
#find rec fused genes per PATIENT per broad_histology
rec_gene<-rec_gene[rec_gene$count>3,]
rec_gene<-rec_gene[order(rec_gene$count,decreasing = TRUE),]
write.table(rec_gene,file.path(outputfolder,"pbta-fusion-recurrently-fused-genes-byhistology.tsv"),quote = FALSE,row.names = FALSE,sep="\t")
# binary matrix for recurrently fused genes found in SAMPLE per broad_histology
rec_geneA_mat<-rec_gene %>%
# to add sample with rec fusion gene A (5' gene)
left_join(standardFusionCalls, by=c("Gene"="Gene1A","broad_histology")) %>%
select("Gene","broad_histology","Sample") %>%
filter(!is.na(Sample))
rec_geneB_mat<-rec_gene %>%
# to add sample with rec fusion gene B (3' gene)
left_join(standardFusionCalls, by=c("Gene"="Gene1B","broad_histology")) %>%
select("Gene","broad_histology","Sample") %>%
filter(!is.na(Sample))
rec_gene_mat<-rbind(rec_geneA_mat,rec_geneB_mat) %>% unique() %>%
# to add all rna Sample for binary matrix
full_join(clinical_rna,by=c("Sample"="Kids_First_Biospecimen_ID","broad_histology"="broad_histology")) %>%
unique()
rec_gene_mat[is.na(rec_gene_mat$Gene),"Gene"]<-"No_rec_fused_gene"
# binary matrix
rec_gene_mat<-dcast(rec_gene_mat,Sample~Gene,value.var = "Sample",fun.aggregate = function(x){as.integer(length(x) > 0)},drop = FALSE)
write.table(rec_gene_mat,file.path(outputfolder,"pbta-fusion-recurrently-fused-genes-bysample.tsv"),quote = FALSE,row.names = FALSE,sep="\t")