This repository has been archived by the owner on Jun 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 67
/
03-calculate_tmb.R
304 lines (271 loc) · 10.9 KB
/
03-calculate_tmb.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# Calculate TMB for a given db_file using Strelka and Mutect data
#
# C. Savonen for ALSF - CCDL
#
# 2019
#
# Option descriptions
#
# --db_file : Path to sqlite database file made from 01-setup_db.py
# --metadata : Relative file path to MAF file to be analyzed. Can be .gz compressed.
# Assumes file path is given from top directory of 'OpenPBTA-analysis'.
# --all_bed_wgs : File path that specifies the BED regions file to be used for the
# denominator for all mutations TMB for WGS samples.
# --all_bed_wxs : File path that specifies the BED regions file to be used for the
# denominator for all mutations TMB for WXS samples.
# --coding_bed_wgs : File path that specifies the BED regions file to be used for the
# denominator for coding only TMB for WGS samples.
# --coding_bed_wxs : File path that specifies the BED regions file to be used for the
# denominator for coding only TMB for WXS samples.
# --overwrite : If specified, will overwrite any files of the same name. Default is FALSE.
# --tcga: If TRUE, will skip PBTA metadata specific steps
#
# Command line example:
#
# Rscript analyses/snv-callers/scripts/03-calculate_tmb.R \
# --db_file scratch/testing_snv_db.sqlite \
# --output analyses/snv-callers/results/consensus \
# --metadata data/pbta-histologies.tsv \
# --all_bed_wgs scratch/intersect_strelka_mutect_WGS.bed \
# --all_bed_wxs data/WXS.hg38.100bp_padded.bed \
# --coding_bed_wgs scratch/intersect_cds_lancet_strelka_mutect_WGS.bed \
# --coding_bed_wxs scratch/intersect_cds_WXS.bed \
# --overwrite
################################ Initial Set Up ################################
# Establish base dir
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
# Import special functions
source(file.path(root_dir, "analyses", "snv-callers", "util", "tmb_functions.R"))
source(file.path(root_dir, "analyses", "snv-callers", "util", "split_mnv.R"))
# Magrittr pipe
`%>%` <- dplyr::`%>%`
# Load library:
library(optparse)
################################ Set up options ################################
# Set up optparse options
option_list <- list(
make_option(
opt_str = c("-d", "--db_file"), type = "character",
default = NULL, help = "Path to sqlite database file made from 01-setup_db.py",
metavar = "character"
),
make_option(
opt_str = c("-o", "--output"), type = "character",
default = NULL, help = "Path to folder where you would like the
output TMB file from this script to be stored.",
metavar = "character"
),
make_option(
opt_str = "--metadata", type = "character", default = "none",
help = "Relative file path (assuming from top directory of
'OpenPBTA-analysis') to MAF file to be analyzed. Can be .gz compressed.",
metavar = "character"
),
make_option(
opt_str = "--all_bed_wgs", type = "character", default = "none",
help = "File path that specifies the BED regions file to be used for the
denominator for all mutations TMB for WGS samples.",
metavar = "character"
),
make_option(
opt_str = "--all_bed_wxs", type = "character", default = "none",
help = "File path that specifies the BED regions file to be used for the
denominator for all mutations TMB for WXS samples.",
metavar = "character"
),
make_option(
opt_str = "--coding_bed_wgs", type = "character", default = "none",
help = "File path that specifies the BED regions file to be used for the
denominator for coding only TMB for WXS samples. 'OpenPBTA-analysis'",
metavar = "character"
),
make_option(
opt_str = "--coding_bed_wxs", type = "character", default = "none",
help = "File path that specifies the BED regions file to be used for the
denominator for coding only TMB for WXS samples. 'OpenPBTA-analysis'",
metavar = "character"
),
make_option(
opt_str = "--overwrite", action = "store_true",
default = FALSE, help = "If TRUE, will overwrite any files of
the same name. Default is FALSE",
metavar = "character"
),
make_option(
opt_str = "--tcga", action = "store_true",
default = FALSE, help = "If TRUE, will skip PBTA metadata specific steps",
metavar = "character"
)
)
# Parse options
opt <- parse_args(OptionParser(option_list = option_list))
# Make everything relative to root path
opt$metadata <- file.path(root_dir, opt$metadata)
opt$db_file <- file.path(root_dir, opt$db_file)
opt$all_bed_wgs <- file.path(root_dir, opt$all_bed_wgs)
opt$all_bed_wxs <- file.path(root_dir, opt$all_bed_wxs)
opt$coding_bed_wgs <- file.path(root_dir, opt$coding_bed_wgs)
opt$coding_bed_wxs <- file.path(root_dir, opt$coding_bed_wxs)
########### Check that the files we need are in the paths specified ############
needed_files <- c(
opt$metadata, opt$db_file, opt$all_bed_wgs, opt$all_bed_wxs,
opt$coding_bed_wgs, opt$coding_bed_wxs
)
# Get list of which files were found
files_found <- file.exists(needed_files)
# Report error if any of them aren't found
if (!all(files_found)) {
stop(paste("\n Could not find needed file(s):",
needed_files[which(!files_found)],
"Check your options and set up.",
sep = "\n"
))
}
############################### Set Up Output #####################################
# Set and make the plots directory
opt$output <- file.path(root_dir, opt$output)
# Make output folder
if (!dir.exists(opt$output)) {
dir.create(opt$output, recursive = TRUE)
}
# Get data name
data_name <- ifelse(opt$tcga, "tcga", "pbta")
# Declare output file based on data_name
tmb_coding_file <- file.path(
opt$output,
paste0(data_name, "-snv-mutation-tmb-coding.tsv")
)
tmb_all_file <- file.path(
opt$output,
paste0(data_name, "-snv-mutation-tmb-all.tsv")
)
# Don't bother if both files exist already and overwrite is FALSE
if (all(file.exists(c(tmb_coding_file, tmb_all_file)), !opt$overwrite)) {
stop(paste0(tmb_coding_file, tmb_all_file, "both exist and --overwrite is not
being used. Use --overwrite if you would like to overwrite these files."))
}
######################## Obtain Mutect Strelka mutations #######################
# Start up connection
con <- DBI::dbConnect(RSQLite::SQLite(), opt$db_file)
# Designate caller tables from SQL file
strelka <- dplyr::tbl(con, "strelka")
mutect <- dplyr::tbl(con, "mutect")
# Specify the columns to join by
join_cols <- c(
"Chromosome",
"Start_Position",
"Reference_Allele",
"Allele",
"Tumor_Sample_Barcode"
)
# Create the consensus for non-MNVs
strelka_mutect_maf_df <- strelka %>%
# We'll keep the Strelka2 columns and drop Mutect2 columns
dplyr::inner_join(mutect %>%
dplyr::select(join_cols),
by = join_cols,
copy = TRUE) %>%
as.data.frame()
# Get Multi-nucleotide calls from mutect as SNVs
split_mutect_df <- split_mnv(mutect) %>%
dplyr::select(join_cols)
# join MNV calls with strelka
strelka_mutect_mnv <- strelka %>%
dplyr::inner_join(split_mutect_df,
by = join_cols,
copy = TRUE
) %>%
as.data.frame()
if (opt$tcga) {
strelka_mutect_mnv <- strelka_mutect_mnv %>%
# In the TCGA MAF files, the Tumor_Sample_Barcode has the biospecimen
# information but only the first 12 characters are needed to match the metadata
dplyr::mutate(Tumor_Sample_Barcode = substr(Tumor_Sample_Barcode, 0, 12))
}
# Merge in the MNVs
strelka_mutect_maf_df <- strelka_mutect_maf_df %>%
dplyr::union(strelka_mutect_mnv,
by = join_cols
)
########################### Set up metadata columns ############################
# Print progress message
message("Setting up metadata...")
# Have to handle TCGA and PBTA metadata differently
if (opt$tcga) {
# Format two fields of metadata for use with functions
metadata <- readr::read_tsv(opt$metadata) %>%
dplyr::mutate(
experimental_strategy = "WXS", # This field doesn't exist for this data, but all is WXS
short_histology = Primary_diagnosis
) # This field is named differently
# Manifest files only have first 12 letters of the barcode so we gotta chop the end off
strelka_mutect_maf_df <- strelka_mutect_maf_df %>%
dplyr::mutate(Tumor_Sample_Barcode = substr(Tumor_Sample_Barcode, 0, 12))
} else { # pbta data
# Isolate metadata to only the samples that are in the datasets
metadata <- readr::read_tsv(opt$metadata) %>%
dplyr::filter(Kids_First_Biospecimen_ID %in% strelka_mutect_maf_df$Tumor_Sample_Barcode) %>%
dplyr::distinct(Kids_First_Biospecimen_ID, .keep_all = TRUE) %>%
dplyr::rename(Tumor_Sample_Barcode = Kids_First_Biospecimen_ID)
# Make sure that we have metadata for all these samples.
if (!all(unique(strelka_mutect_maf_df$Tumor_Sample_Barcode) %in% metadata$Tumor_Sample_Barcode)) {
stop("There are samples in this MAF file that are not in the metadata.")
}
}
# Add in metadata
strelka_mutect_maf_df <- strelka_mutect_maf_df %>%
dplyr::inner_join(metadata %>%
dplyr::select(
Tumor_Sample_Barcode,
experimental_strategy,
short_histology
),
by = "Tumor_Sample_Barcode")
############################# Coding TMB file ##################################
# If the file exists or the overwrite option is not being used, run TMB calculations
if (file.exists(tmb_coding_file) && !opt$overwrite) {
# Stop if this file exists and overwrite is set to FALSE
warning(cat(
"The 'coding only' Tumor Mutation Burden file already exists: \n",
tmb_coding_file, "\n",
"Use --overwrite if you want to overwrite it."
))
} else {
# Print out warning if this file is going to be overwritten
if (file.exists(tmb_coding_file)) {
warning("Overwriting existing 'coding only' TMB file.")
}
# Print out progress message
message(paste("Calculating 'coding only' TMB..."))
# Calculate coding only TMBs and write to file
tmb_coding_df <- calculate_tmb(strelka_mutect_maf_df,
bed_wgs = opt$coding_bed_wgs,
bed_wxs = opt$coding_bed_wxs
)
readr::write_tsv(tmb_coding_df, tmb_coding_file)
# Print out completion message
message(paste("TMB 'coding only' calculations saved to:", tmb_coding_file))
}
########################### All mutations TMB file #############################
# If the file exists or the overwrite option is not being used, run TMB calculations
if (file.exists(tmb_all_file) && !opt$overwrite) {
# Stop if this file exists and overwrite is set to FALSE
warning(cat(
"The 'all mutations' Tumor Mutation Burden file already exists: \n",
tmb_all_file, "\n",
"Use --overwrite if you want to overwrite it."
))
} else {
# Print out warning if this file is going to be overwritten
if (file.exists(tmb_coding_file)) {
warning("Overwriting existing 'all mutations' TMB file.")
}
# Calculate TMBs and write to TMB file
tmb_all_df <- calculate_tmb(strelka_mutect_maf_df,
bed_wgs = opt$all_bed_wgs,
bed_wxs = opt$all_bed_wxs
)
readr::write_tsv(tmb_all_df, tmb_all_file)
# Print out completion message
message(paste("TMB 'all' calculations saved to:", tmb_all_file))
}