This repository has been archived by the owner on Jun 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 67
/
tmb_functions.R
148 lines (133 loc) · 5.37 KB
/
tmb_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Functions for calculating tumor mutation burden
#
# C. Savonen for ALSF - CCDL
#
# 2019
#
################################################################################
########################### Setting Up Functions ###############################
################################################################################
maf_to_granges <- function(maf_df) {
# Turn MAF data.frame into a GRanges object. All of the original data.frame will
# be stored in the `mcols` slot of the GRanges object. This original data
# frame in the `mcols` slots can be extracted later using
# @elementMetadata@listData.
#
# Args:
# maf_df: A MAF formatted data.frame with `Chromosome`, `Start_Position`,
# `End_Position`
#
# Returns:
# A Genomic Ranges formatted object.
#
# Create the GRanges object
GenomicRanges::GRanges(
seqnames = maf_df$Chromosome,
ranges = IRanges::IRanges(
start = maf_df$Start_Position,
end = maf_df$End_Position
),
mcols = maf_df
)
}
snv_ranges_filter <- function(maf_df,
keep_ranges = NULL,
bp_window = 0) {
# Given a MAF formatted data.frame and a BED regions data.frame; filter out
# any variants of the MAF df that are not within the BED regions.
#
# Args:
# maf_df: maf data that has been turned into a data.frame. Can be a maf object
# that is subsetted using `@data`.
# keep_ranges: BED ranges data.frame with columns: chromosome, start, end
# positions in that order or a Genomic Ranges object. If data.frame,
# is given, it will be converted to GRanges object
# bp_window: how many base pairs away can it be from the BED region to still
# be included? Default is 0 bp. This argument gets forwarded
# to GenomicRanges::findOverlaps's `maxgap` argument.
#
# Returns:
# The same MAF formatted data.frame with the mutations that lie outside
# the supplied BED regions filtered out.
# Turn the MAF sample mutations into a GRanges object
maf_granges <- maf_to_granges(maf_df)
# If ranges is given as a data.frame, convert
if (is.data.frame(keep_ranges)) {
# Turn the bed regions df into a GRanges object
keep_ranges <- GenomicRanges::GRanges(
seqnames = keep_ranges$X1,
ranges = IRanges::IRanges(
start = keep_ranges$X2,
end = keep_ranges$X3
)
)
}
# Find the overlap of the BED regions and the mutations This outputs a
# special GenomicRanges object that contains indices of each of these
# ranges that overlap
overlap <- GenomicRanges::findOverlaps(
maf_granges,
keep_ranges,
maxgap = bp_window
)
# Calculate of ratio of variants in this BED using the @from slot which
# indicates the indices of the ranges in `maf_ranges` that have overlaps
# with `keep_ranges`
ratio <- length(overlap@from) / nrow(maf_df)
# What fraction of mutations are in these bed regions?
cat(
"Ratio of variants in this BED:", ratio, "\n",
"Ratio of variants being filtered out:", 1 - ratio, "\n"
)
# Only keep those in the BED regions that overlap the `wxs_bed_granges`
filt_maf_df <- maf_df[unique(overlap@from), ]
# Return this matrix with the WXS mutations filtered but WGS the same
return(filt_maf_df)
}
calculate_tmb <- function(tumor_sample_barcode = NULL,
maf_df,
bed_ranges) {
# Calculate Tumor Mutational Burden a given sample in `Tumor_Sample_Barcode`
# given a target region BED frame. This function uses `snv_ranges_filter` to
# filter out SNVs outside those target BED ranges and uses the total size in
# bp of those BED ranges for the TMB denominator.
#
# TMB = # variants / size of the genome or exome surveyed / Mb
#
# Args:
# tumor_sample_barcode: A string corresponding to a sample
# id in the `Tumor_Sample_Barcode` MAF file in maf_df.
# maf_df: maf data.frame that has been turned into a data.frame, has had
# the experimental_stategy column added from the metadata (can be
# done with the `set_up_maf` function) and has WXS mutations filtered
# using `wxs_bed_filter` function (If the situation calls for it).
# bed_ranges: A GenomicRanges made from the BED file of the target regions to use for TMB
# calculations.
#
# Returns:
# A calculated total region size, and TMB for the given Tumor_Sample_Barcode,
# returned as a single row data.frame. `experimental_strategy`, `short_histology`
# columns are carried along.
# Sum up genome sizes
bed_size <- as.numeric(sum(bed_ranges@ranges@width))
# Filter to only the sample's mutations
sample_maf_df <- maf_df %>%
dplyr::filter(Tumor_Sample_Barcode == tumor_sample_barcode)
# Filter out mutations that are outside of these coding regions.
filt_maf_df <- snv_ranges_filter(sample_maf_df, keep_ranges = bed_ranges)
# Make a genome size variable
tmb <- sample_maf_df %>%
dplyr::group_by(
#TODO: Make this column passing stuff more flexible with some tidyeval maybe
Tumor_Sample_Barcode = tumor_sample_barcode,
experimental_strategy,
short_histology
) %>%
# Count number of mutations for that sample
dplyr::summarize(
mutation_count = dplyr::n(),
region_size = bed_size,
tmb = mutation_count / (region_size / 1000000)
)
return(tmb)
}