diff --git a/.circleci/config.yml b/.circleci/config.yml
index 247300eab1..5987209a20 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -82,7 +82,7 @@ jobs:
- run:
name: Focal CN Preparation
- command: ./scripts/run_in_ci.sh bash analyses/focal-cn-file-preparation/run-prepare-cn.sh
+ command: OPENPBTA_FILT=1 ./scripts/run_in_ci.sh bash analyses/focal-cn-file-preparation/run-prepare-cn.sh
- run:
name: Interaction plot
diff --git a/analyses/focal-cn-file-preparation/.gitignore b/analyses/focal-cn-file-preparation/.gitignore
new file mode 100644
index 0000000000..f55cc6487f
--- /dev/null
+++ b/analyses/focal-cn-file-preparation/.gitignore
@@ -0,0 +1,2 @@
+# AnnotationDbi object is too large to be committed
+annotation_files/txdb_from_gencode.v27.gtf.db
diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd
new file mode 100644
index 0000000000..99b12a7a96
--- /dev/null
+++ b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd
@@ -0,0 +1,60 @@
+---
+title: "Add ploidy column, status to CNVkit output"
+output: html_notebook
+author: J. Taroni for ALSF CCDL
+date: 2019
+---
+
+The `pbta-histologies.tsv` file contains a `tumor_ploidy` column, which is tumor ploidy as inferred by ControlFreeC.
+The copy number information should be interpreted in the light of this information (see: [current version of Data Formats section of README](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/390f1e08e481da5ec0b2c62d886d5fd298bbf017#data-formats)).
+
+This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly.
+
+```{r}
+library(dplyr)
+```
+
+### Read in data
+
+```{r}
+cnvkit_file <- file.path("..", "..", "data", "pbta-cnv-cnvkit.seg.gz")
+cnvkit_df <- readr::read_tsv(cnvkit_file)
+```
+
+```{r}
+histologies_file <- file.path("..", "..", "data", "pbta-histologies.tsv")
+histologies_df <- readr::read_tsv(histologies_file)
+```
+
+### Add inferred ploidy information to CNVkit results
+
+```{r}
+add_ploidy_df <- histologies_df %>%
+ select(Kids_First_Biospecimen_ID, tumor_ploidy) %>%
+ inner_join(cnvkit_df, by = c("Kids_First_Biospecimen_ID" = "ID")) %>%
+ select(-tumor_ploidy, everything())
+```
+
+### Add status column
+
+This is intended to mirror the information contained in the ControlFreeC output.
+
+```{r}
+add_ploidy_df <- add_ploidy_df %>%
+ mutate(status = case_when(
+ # when the copy number is less than inferred ploidy, mark this as a loss
+ copy.num < tumor_ploidy ~ "loss",
+ # if copy number is higher than ploidy, mark as a gain
+ copy.num > tumor_ploidy ~ "gain",
+ copy.num == tumor_ploidy ~ "neutral"
+ ))
+
+head(add_ploidy_df, 10)
+```
+
+### Write to `scratch`
+
+```{r}
+output_file <- file.path("..", "..", "scratch", "cnvkit_with_status.tsv")
+readr::write_tsv(add_ploidy_df, output_file)
+```
diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html
new file mode 100644
index 0000000000..ba1a252f8e
--- /dev/null
+++ b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html
@@ -0,0 +1,1932 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
The pbta-histologies.tsv
file contains a tumor_ploidy
column, which is tumor ploidy as inferred by ControlFreeC. The copy number information should be interpreted in the light of this information (see: current version of Data Formats section of README).
+
This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly.
+
+
+
+
library(dplyr)
+
+
+
+Attaching package: 'dplyr'
+
+
+
The following objects are masked from 'package:stats':
+
+ filter, lag
+
+
+
The following objects are masked from 'package:base':
+
+ intersect, setdiff, setequal, union
+
+
+
+
+
Read in data
+
+
+
+
cnvkit_file <- file.path("..", "..", "data", "pbta-cnv-cnvkit.seg.gz")
+cnvkit_df <- readr::read_tsv(cnvkit_file)
+
+
+
Parsed with column specification:
+cols(
+ ID = col_character(),
+ chrom = col_character(),
+ loc.start = col_double(),
+ loc.end = col_double(),
+ num.mark = col_double(),
+ seg.mean = col_double(),
+ copy.num = col_double()
+)
+
+
+
+
+
+
+
histologies_file <- file.path("..", "..", "data", "pbta-histologies.tsv")
+histologies_df <- readr::read_tsv(histologies_file)
+
+
+
Parsed with column specification:
+cols(
+ .default = col_character(),
+ OS_days = col_double(),
+ age_last_update_days = col_double(),
+ normal_fraction = col_double(),
+ tumor_fraction = col_double(),
+ tumor_ploidy = col_double()
+)
+
+
+
See spec(...) for full column specifications.
+
+
+
+
+
+
+
Add status column
+
This is intended to mirror the information contained in the ControlFreeC output.
+
+
+
+
add_ploidy_df <- add_ploidy_df %>%
+ mutate(status = case_when(
+ # when the copy number is less than inferred ploidy, mark this as a loss
+ copy.num < tumor_ploidy ~ "loss",
+ # if copy number is higher than ploidy, mark as a gain
+ copy.num > tumor_ploidy ~ "gain",
+ copy.num == tumor_ploidy ~ "neutral"
+ ))
+
+head(add_ploidy_df, 10)
+
+
+
+
+
+
+
+
+
Write to scratch
+
+
+
+
output_file <- file.path("..", "..", "scratch", "cnvkit_with_status.tsv")
+readr::write_tsv(add_ploidy_df, output_file)
+
+
+
+
+
LS0tCnRpdGxlOiAiQWRkIHBsb2lkeSBjb2x1bW4sIHN0YXR1cyB0byBDTlZraXQgb3V0cHV0IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKYXV0aG9yOiBKLiBUYXJvbmkgZm9yIEFMU0YgQ0NETApkYXRlOiAyMDE5Ci0tLQoKVGhlIGBwYnRhLWhpc3RvbG9naWVzLnRzdmAgZmlsZSBjb250YWlucyBhIGB0dW1vcl9wbG9pZHlgIGNvbHVtbiwgd2hpY2ggaXMgdHVtb3IgcGxvaWR5IGFzIGluZmVycmVkIGJ5IENvbnRyb2xGcmVlQy4KVGhlIGNvcHkgbnVtYmVyIGluZm9ybWF0aW9uIHNob3VsZCBiZSBpbnRlcnByZXRlZCBpbiB0aGUgbGlnaHQgb2YgdGhpcyBpbmZvcm1hdGlvbiAoc2VlOiBbY3VycmVudCB2ZXJzaW9uIG9mIERhdGEgRm9ybWF0cyBzZWN0aW9uIG9mIFJFQURNRV0oaHR0cHM6Ly9naXRodWIuY29tL0FsZXhzTGVtb25hZGUvT3BlblBCVEEtYW5hbHlzaXMvdHJlZS8zOTBmMWUwOGU0ODFkYTVlYzBiMmM2MmQ4ODZkNWZkMjk4YmJmMDE3I2RhdGEtZm9ybWF0cykpLgoKVGhpcyBub3RlYm9vayBhZGRzIHBsb2lkeSBpbmZvcm1hdGlvbiB0byB0aGUgQ05Wa2l0IHJlc3VsdHMgYW5kIGFkZHMgYSBzdGF0dXMgY29sdW1uIHRoYXQgZGVmaW5lcyBnYWluIGFuZCBsb3NzIGJyb2FkbHkuCgpgYGB7cn0KbGlicmFyeShkcGx5cikKYGBgCgojIyMgUmVhZCBpbiBkYXRhCgpgYGB7cn0KY252a2l0X2ZpbGUgPC0gZmlsZS5wYXRoKCIuLiIsICIuLiIsICJkYXRhIiwgInBidGEtY252LWNudmtpdC5zZWcuZ3oiKQpjbnZraXRfZGYgPC0gcmVhZHI6OnJlYWRfdHN2KGNudmtpdF9maWxlKQpgYGAKCmBgYHtyfQpoaXN0b2xvZ2llc19maWxlIDwtIGZpbGUucGF0aCgiLi4iLCAiLi4iLCAiZGF0YSIsICJwYnRhLWhpc3RvbG9naWVzLnRzdiIpCmhpc3RvbG9naWVzX2RmIDwtIHJlYWRyOjpyZWFkX3RzdihoaXN0b2xvZ2llc19maWxlKQpgYGAKCiMjIyBBZGQgaW5mZXJyZWQgcGxvaWR5IGluZm9ybWF0aW9uIHRvIENOVmtpdCByZXN1bHRzCgpgYGB7cn0KYWRkX3Bsb2lkeV9kZiA8LSBoaXN0b2xvZ2llc19kZiAlPiUKICBzZWxlY3QoS2lkc19GaXJzdF9CaW9zcGVjaW1lbl9JRCwgdHVtb3JfcGxvaWR5KSAlPiUKICBpbm5lcl9qb2luKGNudmtpdF9kZiwgYnkgPSBjKCJLaWRzX0ZpcnN0X0Jpb3NwZWNpbWVuX0lEIiA9ICJJRCIpKSAlPiUKICBzZWxlY3QoLXR1bW9yX3Bsb2lkeSwgZXZlcnl0aGluZygpKQpgYGAKCiMjIyBBZGQgc3RhdHVzIGNvbHVtbgoKVGhpcyBpcyBpbnRlbmRlZCB0byBtaXJyb3IgdGhlIGluZm9ybWF0aW9uIGNvbnRhaW5lZCBpbiB0aGUgQ29udHJvbEZyZWVDIG91dHB1dC4KCmBgYHtyfQphZGRfcGxvaWR5X2RmIDwtIGFkZF9wbG9pZHlfZGYgJT4lCiAgbXV0YXRlKHN0YXR1cyA9IGNhc2Vfd2hlbigKICAgICMgd2hlbiB0aGUgY29weSBudW1iZXIgaXMgbGVzcyB0aGFuIGluZmVycmVkIHBsb2lkeSwgbWFyayB0aGlzIGFzIGEgbG9zcwogICAgY29weS5udW0gPCB0dW1vcl9wbG9pZHkgfiAibG9zcyIsCiAgICAjIGlmIGNvcHkgbnVtYmVyIGlzIGhpZ2hlciB0aGFuIHBsb2lkeSwgbWFyayBhcyBhIGdhaW4KICAgIGNvcHkubnVtID4gdHVtb3JfcGxvaWR5IH4gImdhaW4iLAogICAgY29weS5udW0gPT0gdHVtb3JfcGxvaWR5IH4gIm5ldXRyYWwiCiAgKSkKCmhlYWQoYWRkX3Bsb2lkeV9kZiwgMTApCmBgYAoKIyMjIFdyaXRlIHRvIGBzY3JhdGNoYAoKYGBge3J9Cm91dHB1dF9maWxlIDwtIGZpbGUucGF0aCgiLi4iLCAiLi4iLCAic2NyYXRjaCIsICJjbnZraXRfd2l0aF9zdGF0dXMudHN2IikKcmVhZHI6OndyaXRlX3RzdihhZGRfcGxvaWR5X2RmLCBvdXRwdXRfZmlsZSkKYGBgCg==
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R
index e7d12beb8c..59b4964228 100644
--- a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R
+++ b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R
@@ -29,7 +29,7 @@ if (!("IRanges" %in% installed.packages())) {
BiocManager::install("IRanges", update = FALSE)
}
-# Install annotatr
+# Install annotatr
if (!("annotatr" %in% installed.packages())) {
BiocManager::install("annotatr", update = FALSE)
}
@@ -55,10 +55,43 @@ if (!("AnnotationDbi" %in% installed.packages())) {
# Declare command line options
option_list <- list(
optparse::make_option(
- c("-f", "--seg_file"),
+ c("--cnv_file"),
type = "character",
default = NULL,
- help = "file path to SEG file that contains cnv information"
+ help = "file path to file that contains CNV information"
+ ),
+ optparse::make_option(
+ c("--gtf_file"),
+ type = "character",
+ default = NULL,
+ help = "file path to human genome GTF file"
+ ),
+ optparse::make_option(
+ c("--filename_lead"),
+ type = "character",
+ default = "annotated_cn",
+ help = "used in file names"
+ ),
+ optparse::make_option(
+ c("--chrom_filter"),
+ type = "integer",
+ default = 0,
+ help = "integer (0/1) that will be converted into a logical that indicates
+ whether or not filtering to exons on chr 1:22 will be performed."
+ ),
+ optparse::make_option(
+ c("--controlfreec"),
+ type = "logical",
+ action = "store_true",
+ default = FALSE,
+ help = "flag used to indicate if the CNV file is the output of ControlFreeC"
+ ),
+ optparse::make_option(
+ c("--cnvkit"),
+ type = "logical",
+ action = "store_true",
+ default = FALSE,
+ help = "flag used to indicate if the CNV file is the output of CNVkit"
)
)
@@ -66,6 +99,19 @@ option_list <- list(
opt_parser <- optparse::OptionParser(option_list = option_list)
opt <- optparse::parse_args(opt_parser)
+# error handling related to specifying the CNV method
+if (all(opt$controlfreec, opt$cnvkit)) {
+ stop("--controlfreec and --cnvkit are mutually exclusive")
+}
+
+if (!any(opt$controlfreec, opt$cnvkit)) {
+ stop("You must specify the CNV file format by using --controlfreec or
+ --cnvkit")
+}
+
+# convert the chromosome filter command line
+chrom_filter <- as.logical(opt$chrom_filter)
+
#### Directories and Files -----------------------------------------------------
# Detect the ".git" folder -- this will in the project root directory.
@@ -73,7 +119,7 @@ opt <- optparse::parse_args(opt_parser)
# matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
-# Set path to results directory
+# Set path to results directory
results_dir <-
file.path(root_dir, "analyses", "focal-cn-file-preparation", "results")
@@ -81,61 +127,119 @@ if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
-seg_df <-
- data.table::fread(
- opt$seg_file,
- data.table = FALSE,
- stringsAsFactors = FALSE
- )
-
-#### Format seg file and overlap with hg38 genome annotations ------------------
+# this is the output of GenomicFeatures::makeTxDbFromGFF
+# TODO: possibly update this when the GTF file gets included in the data
+# download; may also remove the --gtf_file option and hardcode it?
+annotation_directory <- file.path(root_dir,
+ "analyses",
+ "focal-cn-file-preparation",
+ "annotation_files")
+annotation_file <- file.path(annotation_directory,
+ "txdb_from_gencode.v27.gtf.db")
+
+#### Format CNV file and overlap with hg38 genome annotations ------------------
+
+# we want to standardize the formats between the two methods here and drop
+# columns we won't need to
+if (opt$cnvkit) {
+ cnv_df <- readr::read_tsv(opt$cnv_file) %>%
+ dplyr::rename(chr = chrom, start = loc.start, end = loc.end,
+ copy_number = copy.num) %>%
+ dplyr::select(-num.mark, -seg.mean) %>%
+ dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
+}
-# Exclude the X and Y chromosomes, exclude `copy.num` == 2, and rearrange
-# ID column to be the last column
-seg_no_xy <- seg_df %>%
- dplyr::filter(!(chrom %in% c("chrX", "chrY")), (copy.num != 2)) %>%
- dplyr::select(-ID, dplyr::everything())
+if (opt$controlfreec) {
+ # TODO: filter based on the p-values rather than just dropping them?
+ cnv_df <- readr::read_tsv(opt$cnv_file) %>%
+ dplyr::rename(copy_number = copy.number) %>%
+ dplyr::mutate(chr = paste0("chr", chr)) %>%
+ dplyr::select(-segment_genotype, -uncertainty, -WilcoxonRankSumTestPvalue,
+ -KolmogorovSmirnovPvalue) %>%
+ dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
+}
-# Make seg data.frame a GRanges object
-seg_gr <- seg_no_xy %>%
+# Remove neutral copy number calls and mark possible amplifcation
+cnv_df <- cnv_df %>%
+ dplyr::filter(status != "neutral") %>%
+ dplyr::mutate(status = dplyr::case_when(
+ copy_number > (2 * tumor_ploidy) ~ "amplification",
+ TRUE ~ status
+ ))
+
+# Addressing autosomes first
+# Exclude the X and Y chromosomes
+cnv_no_xy <- cnv_df %>%
+ dplyr::filter(!(chr %in% c("chrX", "chrY")))
+
+# Make seg data.frame a GRanges object
+cnv_no_xy_gr <- cnv_no_xy %>%
GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE,
starts.in.df.are.0based = FALSE)
-# Define the annotations for the hg38 genome
-txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
-genes <- GenomicFeatures::genes(txdb)
+if (!file.exists(annotation_file)) {
+ # Define the annotations for the hg38 genome
+ txdb <- GenomicFeatures::makeTxDbFromGFF(
+ file = opt$gtf_file,
+ format = "gtf"
+ )
+ # can do this even if the directory exists
+ dir.create(annotation_directory, showWarnings = FALSE)
+ # write this to file to save time next time
+ AnnotationDbi::saveDb(txdb, annotation_file)
+} else {
+ txdb <- AnnotationDbi::loadDb(annotation_file)
+}
-# Create a data.frame with the overlaps between the seg file and hg38 genome
-# annotations
-overlaps <- IRanges::mergeByOverlaps(seg_gr, genes)
+# if the chromosome filter is on, then we will only look at exons on chr 1:22
+if (chrom_filter) {
+ # we'll only look at exons on chromosomes 1:22 -- this will get rid of things
+ # like alternates
+ message("Filtering to exons on chromosomes 1:22...")
+ chroms <- paste0("chr", 1:22)
+ chrom_filter <- list(tx_chrom = chroms)
+ tx_exons <- GenomicFeatures::exons(txdb,
+ filter = chrom_filter,
+ columns = "gene_id")
+} else {
+ # extract the exons but include ensembl gene identifiers
+ tx_exons <- GenomicFeatures::exons(txdb, columns = "gene_id")
+}
-overlapSymbols <-
- AnnotationDbi::mapIds(
- org.Hs.eg.db::org.Hs.eg.db,
- keys = overlaps$gene_id,
- column = "SYMBOL",
- keytype = "ENTREZID"
- )
+# Create a data.frame with the overlaps between the seg file and hg38 genome
+# annotations
+overlaps <- IRanges::mergeByOverlaps(cnv_no_xy_gr, tx_exons)
-overlaps$gene_id <- overlapSymbols
+# remove what we no longer need from the environment
+rm(tx_exons, cnv_no_xy_gr, cnv_no_xy)
# Create a data.frame with selected columns from the `overlaps` object
-cn_short <- data.frame(
- gene_symbol = overlaps$gene_id,
- biospecimen_id = overlaps$ID,
- copy_number = overlaps$copy.num) %>%
- dplyr::filter(!(is.na(gene_symbol)))
-
-# Create a `label` column with values specifying the type of CN aberration and
-# save as a data.frame with `gene_symbol`, `biospecimen_id`, and `copy_number`
-annotated_cn <- cn_short %>%
+annotated_cn <- data.frame(biospecimen_id = overlaps$Kids_First_Biospecimen_ID,
+ status = overlaps$status,
+ copy_number = overlaps$copy_number,
+ ploidy = overlaps$tumor_ploidy,
+ ensembl = unlist(overlaps$gene_id),
+ stringsAsFactors = FALSE) %>%
dplyr::distinct() %>%
- dplyr::mutate(label = dplyr::case_when(
- copy_number == 3 | copy_number == 4 ~ "Gain",
- copy_number == 0 ~ "Hom_Deletion",
- copy_number == 1 ~ "Hem_Deletion",
- copy_number >= 5 ~ "Amplification")) %>%
- dplyr::select(gene_symbol, biospecimen_id, label, copy_number)
+ # Discard the gene version information in order to get gene symbols and
+ # cytoband mappings
+ dplyr::mutate(ensembl = gsub("\\..*", "", ensembl))
+
+annotated_cn <- annotated_cn %>%
+ dplyr::mutate(
+ gene_symbol = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
+ keys = ensembl,
+ column = "SYMBOL",
+ keytype = "ENSEMBL"),
+ cytoband = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
+ keys = ensembl,
+ column = "MAP",
+ keytype = "ENSEMBL")
+ ) %>%
+ dplyr::filter(!is.na(gene_symbol))
+
+# Output file name
+output_file <- paste0(opt$filename_lead, "_autosomes.tsv")
# Save final data.frame to a tsv file
-readr::write_tsv(annotated_cn, file.path(results_dir, "annotated_cn.tsv"))
+readr::write_tsv(annotated_cn, file.path(results_dir, output_file))
diff --git a/analyses/focal-cn-file-preparation/results/annotated_cn.tsv b/analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz
similarity index 75%
rename from analyses/focal-cn-file-preparation/results/annotated_cn.tsv
rename to analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz
index 870ed90f18..4028bd1f36 100644
Binary files a/analyses/focal-cn-file-preparation/results/annotated_cn.tsv and b/analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz differ
diff --git a/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz b/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz
new file mode 100644
index 0000000000..36a4437fd8
Binary files /dev/null and b/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz differ
diff --git a/analyses/focal-cn-file-preparation/run-prepare-cn.sh b/analyses/focal-cn-file-preparation/run-prepare-cn.sh
index 45d1bdc887..01b52aec53 100644
--- a/analyses/focal-cn-file-preparation/run-prepare-cn.sh
+++ b/analyses/focal-cn-file-preparation/run-prepare-cn.sh
@@ -3,6 +3,13 @@
#
# Usage: bash run-prepare-cn.sh
+set -e
+set -o pipefail
+
+# by default we will not filter to exons on chr 1:22, but this will save us
+# time in CI
+CHROMFILT=${OPENPBTA_FILT:-0}
+
# This script should always run as if it were being called from
# the directory it lives in.
script_directory="$(perl -e 'use File::Basename;
@@ -10,5 +17,27 @@ script_directory="$(perl -e 'use File::Basename;
print dirname(abs_path(@ARGV[0]));' -- "$0")"
cd "$script_directory" || exit
+# Prep the CNVkit data
+Rscript --vanilla -e "rmarkdown::render('00-add-ploidy-cnvkit.Rmd', clean = TRUE)"
+
+# Run annotation step for CNVkit
+# TODO: update once GTF file is included with download
Rscript --vanilla 01-prepare-cn-file.R \
- --seg_file ../../data/pbta-cnv-cnvkit.seg.gz
+ --cnv_file ../../scratch/cnvkit_with_status.tsv \
+ --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
+ --filename_lead "cnvkit_annotated_cn" \
+ --chrom_filter $CHROMFILT \
+ --cnvkit
+
+# Run annotation step for ControlFreeC
+# TODO: update once GTF file is included with download
+Rscript --vanilla 01-prepare-cn-file.R \
+ --cnv_file ../../data/pbta-cnv-controlfreec.tsv.gz \
+ --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
+ --filename_lead "controlfreec_annotated_cn" \
+ --chrom_filter $CHROMFILT \
+ --controlfreec
+
+# gzip the two files in the results folder, overwriting without prompt
+gzip -f results/cnvkit_annotated_cn_autosomes.tsv
+gzip -f results/controlfreec_annotated_cn_autosomes.tsv