This repository has been archived by the owner on Jun 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 67
/
bed_to_segfile.R
179 lines (153 loc) · 5.68 KB
/
bed_to_segfile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# bed_to_segfile.R
#
# Josh Shapiro for CCDL 2020
#
# Purpose: Convert the bed file output from the CNV consensus workflow to a seg file
#
# Option descriptions
# -i, --cnv_file : path to the cnv consensus file
# -o, --output_file : path for output file
# --segmean-method : method for combining seg.mean values. Default `weight_mean` for the length-weighted mean
#
# example invocation:
# Rscript scripts/bed_to_segfile.R \
# -i results/cnv_consensus.tsv \
# -o results/pbta-cnv-consensus.seg
# Magrittr pipe
`%>%` <- dplyr::`%>%`
# Parse command line options
library(optparse)
option_list <- list(
make_option(
c("-i", "--cnv_file"),
type = "character",
default = NULL,
help = "consensus file tsv",
),
make_option(
c("-o", "--output_file"),
type = "character",
default = NULL,
help = "output file"
),
make_option(
c("-n", "--neutral_file"),
type = "character",
default = NULL,
help = "neutral (uncalled) regions for all samples"
),
make_option(
c("-u", "--uncalled_file"),
type = "character",
default = NULL,
help = "file of uncalled (excluded) samples"
),
make_option(
c("--segmean_method"),
type = "character",
default = "weight_mean",
help = "Method for combining segment mean values.
One of ['weight_mean', 'weight_median', 'median_interpolate'],
corresponding to weighted mean, weighted median, or weighted
median with interpolation. Default is weighted mean."
)
)
opts <- parse_args(OptionParser(option_list = option_list))
# Check options:
if (! (opts$segmean_method %in% c('weight_mean', 'weight_median', 'median_interpolate'))){
stop("--segmean_method must be one of ['weight_mean', 'weight_median', 'median_interpolate']")
}
# Read the cnv output table
cnvs <- readr::read_tsv(opts$cnv_file)
# Columns *_CNV have the format START:END:COPY_NUMBER:SEG.MEAN,START:END:COPY_NUMBER:SEG.MEAN
# This function converts ones such string into a data frane,
# which we can then make a column within the cnv data frame
segstrings_to_df <- function(segment_string){
segment_vector <- stringr::str_split(segment_string, ",")[[1]]
seg_matrix <- stringr::str_split(segment_vector, ":", simplify = TRUE)
# account for rows with no calls
if(dim(seg_matrix)[2] != 4){seg_matrix <- matrix(ncol = 4)}
colnames(seg_matrix) <- c("start", "end", "copies", "segmean")
suppressWarnings(class(seg_matrix) <- "numeric")
seg_df <- as.data.frame(seg_matrix) %>%
dplyr::mutate(length = abs(end - start))
return(seg_df)
}
## Functions to calculate weighted means and medians
## from data frames created with segstrings_to_df()
# weighted mean segmean
seg_wmean <- function(seg_df){
wmean <- weighted.mean(seg_df$segmean, seg_df$length)
}
# weighted median segmean
seg_wmedian <- function(seg_df){
wmed <- matrixStats::weightedMedian(seg_df$segmean, seg_df$length,
interpolate = FALSE)
}
## weighted median with interpolation (probably not used)
seg_wmedian_interpolate <- function(seg_df){
wmed <- matrixStats::weightedMedian(seg_df$segmean, seg_df$length,
interpolate = TRUE)
}
# weighted median copy number
copies_wmedian <- function(seg_df){
wmed <- matrixStats::weightedMedian(seg_df$copies, seg_df$length,
interpolate = FALSE)
}
segmean_function <- if (opts$segmean_method == "weight_mean"){
seg_wmean
} else if (opts$segmean_method == "weight_median"){
seg_wmedian
} else if (opts$segmean_method == "median_interpolate"){
seg_wmedian_interpolate
}
# Calculate summary stats from merged CNV calls. \
cnvs <- cnvs %>%
dplyr::mutate(cnvkit_df = purrr::map(cnvkit_CNVs, segstrings_to_df),
freec_df = purrr::map(freec_CNVs, segstrings_to_df),
segmean = purrr::map_dbl(cnvkit_df, segmean_function),
cnvkit_cn = purrr::map_dbl(cnvkit_df, copies_wmedian),
freec_cn = purrr::map_dbl(freec_df, copies_wmedian),
copynum = ifelse(is.finite(cnvkit_cn), # use cnvkit if available
cnvkit_cn, freec_cn), #otherwise use freec
num.mark = NA)
# read in the neutral regions
neutral <- readr::read_tsv(
opts$neutral_file,
col_names = c("chrom", "start", "end", "specimen")
)
# read in uncalled specimens
uncalled <- readr::read_tsv(opts$uncalled_file)
# exclude specimens that were uncalled by two or more callers
# as these can not have any CN calls by definition, so we don't want to
# call them as unchanged, just missing (they won't appear in the cnv file)
exclude_list <- uncalled %>%
dplyr::group_by(sample)%>%
dplyr::tally() %>%
dplyr::filter(n > 1) %>%
dplyr::pull(sample)
# filter out excluded samples from the neutral data and rearrange
out_neutral <- neutral %>%
dplyr::filter(!(specimen %in% exclude_list)) %>%
dplyr::select(ID = specimen,
chrom = chrom,
loc.start = start,
loc.end = end) %>%
dplyr::mutate(num.mark = NA,
seg.mean = NA,
copy.num = 2)
# unset X and Y copy numbers
out_neutral$copy.num[out_neutral$chrom %in% c("chrX", "chrY")] <- NA
# .seg output format from CNVkit is:
# ID chrom loc.start loc.end num.mark seg.mean copy.num
out_cnvs <- cnvs %>%
dplyr::select(ID = Biospecimen,
chrom = chrom,
loc.start = start,
loc.end = end,
num.mark = num.mark,
seg.mean = segmean,
copy.num = copynum)
out_table <- dplyr::bind_rows(out_neutral, out_cnvs) %>%
dplyr::arrange(ID, chrom, loc.start)
readr::write_tsv(out_table, opts$output_file)