forked from AlexsLemonade/OpenPBTA-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
00-tp53-nf1-alterations.R
134 lines (114 loc) · 5.4 KB
/
00-tp53-nf1-alterations.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Author: Krutika Gaonkar
#
# Read in consensus snv calls to gather alterations in TP53 and NF1
# to evaluate classifier
# @params snvConsensus multi-caller consensus snv calls
# @params cnvConsensus multi-caller consensus cnv calls
# @params histologyFile histology file: pbta-histologies.tsv
# @params outputFolder output folder for alteration file
# @params gencode cds bed file from gencode
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("tidyverse"))
suppressPackageStartupMessages(library("readr"))
suppressPackageStartupMessages(library("GenomicRanges"))
#### Source functions ----------------------------------------------------------
# We can use functions from the `snv-callers` module of the OpenPBTA project
# TODO: if a common util folder is established, use that instead
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
source(file.path(root_dir,
"analyses",
"snv-callers",
"util",
"tmb_functions.R"))
#### Parse command line options ------------------------------------------------
option_list <- list(
make_option(c("-s", "--snvConsensus"),type="character",
help="Consensus snv calls (.tsv) "),
make_option(c("-c","--cnvConsensus"),type="character",
help="consensus cnv calls (.tsv) "),
make_option(c("-h","--histologyFile"),type="character",
help="histology file for all samples (.tsv)"),
make_option(c("-o","--outputFolder"),type="character",
help="output folder for results "),
make_option(c("-g","--gencode"),type="character",
help="cds gencode bed file")
)
opt <- parse_args(OptionParser(option_list=option_list,add_help_option = FALSE))
snvConsensusFile <- opt$snvConsensus
histologyFile <- opt$histologyFile
outputFolder <- opt$outputFolder
gencodeBed <- opt$gencode
cnvConsesusFile <- opt$cnvConsensus
#### Generate files with TP53, NF1 mutations -----------------------------------
# read in consensus SNV files
consensus_snv <- data.table::fread(snvConsensusFile,
select = c("Chromosome",
"Start_Position",
"End_Position",
"Strand",
"Variant_Classification",
"Tumor_Sample_Barcode",
"Hugo_Symbol"),
data.table = FALSE)
# read in consensus CNV file
cnvConsesus <- data.table::fread( cnvConsesusFile,
select=c("gene_symbol",
"biospecimen_id",
"status"),
)
# gencode cds region BED file
gencode_cds <- read_tsv(gencodeBed, col_names = FALSE)
# histology file
histology <- read_tsv(histologyFile)
# filter the MAF data.frame to only include entries that fall within the
# CDS bed file regions
coding_consensus_snv <- snv_ranges_filter(maf_df = consensus_snv,
keep_ranges = gencode_cds)
# subset to TP53, removing silent mutations and mutations in introns
tp53_coding <- coding_consensus_snv %>%
filter(Hugo_Symbol == "TP53") %>%
filter(!(Variant_Classification %in% c("Silent", "Intron")))
# subset to TP53 cnv loss and format to tp53_coding file format
tp53_loss<-cnvConsesus %>%
filter(gene_symbol=="TP53",
status=="loss") %>%
rename("biospecimen_id"="Tumor_Sample_Barcode",
"status"="Variant_Classification",
"gene_symbol"="Hugo_Symbol")
# subset to NF1, removing silent mutations, mutations in introns, and missense
# mutations -- we exclude missense mutations because they are not annotated
# with OncoKB
# https://github.com/AlexsLemonade/OpenPBTA-analysis/pull/381#issuecomment-570748578
nf1_coding <- coding_consensus_snv %>%
filter(Hugo_Symbol == "NF1") %>%
filter(!(Variant_Classification %in% c("Silent",
"Intron",
"Missense_Mutation")))
# subset to NF1 loss and format to nf1_coding file format
nf1_loss<-cnvConsesus %>%
filter(gene_symbol=="NF1",
status=="loss") %>%
rename("biospecimen_id"="Tumor_Sample_Barcode",
"status"="Variant_Classification",
"gene_symbol"="Hugo_Symbol")
# include only the relevant columns from the MAF file and merge cnv loss dataframes as well
tp53_nf1_coding <- tp53_coding %>%
bind_rows(tp53_loss,nf1_coding,nf1_loss)
# biospecimen IDs for tumor or cell line DNA-seq
bs_ids <- histology %>%
filter(sample_type != "Normal",
experimental_strategy != "RNA-Seq") %>%
pull(Kids_First_Biospecimen_ID)
# all BS ids that are not in the data frame that contain the TP53 and NF1
# coding mutations should be labeled as not having either
bs_ids_without_mut <- setdiff(bs_ids,
unique(tp53_nf1_coding$Tumor_Sample_Barcode))
# add the TP53 and NF1 wildtype samples into the data.frame
tp53_nf1_coding <- bind_rows(tp53_nf1_coding,
data.frame(
Tumor_Sample_Barcode = bs_ids_without_mut,
Hugo_Symbol = "No_TP53_NF1_alt")
)
# save TP53 and NF1 SNV alterations
write_tsv(tp53_nf1_coding,
file.path(outputFolder, "TP53_NF1_snv_alteration.tsv"))