diff --git a/.circleci/config.yml b/.circleci/config.yml index 247300eab1..5987209a20 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -82,7 +82,7 @@ jobs: - run: name: Focal CN Preparation - command: ./scripts/run_in_ci.sh bash analyses/focal-cn-file-preparation/run-prepare-cn.sh + command: OPENPBTA_FILT=1 ./scripts/run_in_ci.sh bash analyses/focal-cn-file-preparation/run-prepare-cn.sh - run: name: Interaction plot diff --git a/analyses/focal-cn-file-preparation/.gitignore b/analyses/focal-cn-file-preparation/.gitignore new file mode 100644 index 0000000000..f55cc6487f --- /dev/null +++ b/analyses/focal-cn-file-preparation/.gitignore @@ -0,0 +1,2 @@ +# AnnotationDbi object is too large to be committed +annotation_files/txdb_from_gencode.v27.gtf.db diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd new file mode 100644 index 0000000000..99b12a7a96 --- /dev/null +++ b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd @@ -0,0 +1,60 @@ +--- +title: "Add ploidy column, status to CNVkit output" +output: html_notebook +author: J. Taroni for ALSF CCDL +date: 2019 +--- + +The `pbta-histologies.tsv` file contains a `tumor_ploidy` column, which is tumor ploidy as inferred by ControlFreeC. +The copy number information should be interpreted in the light of this information (see: [current version of Data Formats section of README](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/390f1e08e481da5ec0b2c62d886d5fd298bbf017#data-formats)). + +This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly. + +```{r} +library(dplyr) +``` + +### Read in data + +```{r} +cnvkit_file <- file.path("..", "..", "data", "pbta-cnv-cnvkit.seg.gz") +cnvkit_df <- readr::read_tsv(cnvkit_file) +``` + +```{r} +histologies_file <- file.path("..", "..", "data", "pbta-histologies.tsv") +histologies_df <- readr::read_tsv(histologies_file) +``` + +### Add inferred ploidy information to CNVkit results + +```{r} +add_ploidy_df <- histologies_df %>% + select(Kids_First_Biospecimen_ID, tumor_ploidy) %>% + inner_join(cnvkit_df, by = c("Kids_First_Biospecimen_ID" = "ID")) %>% + select(-tumor_ploidy, everything()) +``` + +### Add status column + +This is intended to mirror the information contained in the ControlFreeC output. + +```{r} +add_ploidy_df <- add_ploidy_df %>% + mutate(status = case_when( + # when the copy number is less than inferred ploidy, mark this as a loss + copy.num < tumor_ploidy ~ "loss", + # if copy number is higher than ploidy, mark as a gain + copy.num > tumor_ploidy ~ "gain", + copy.num == tumor_ploidy ~ "neutral" + )) + +head(add_ploidy_df, 10) +``` + +### Write to `scratch` + +```{r} +output_file <- file.path("..", "..", "scratch", "cnvkit_with_status.tsv") +readr::write_tsv(add_ploidy_df, output_file) +``` diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html new file mode 100644 index 0000000000..ba1a252f8e --- /dev/null +++ b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html @@ -0,0 +1,1932 @@ + + + + + + + + + + + + + + + +Add ploidy column, status to CNVkit output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + +

The pbta-histologies.tsv file contains a tumor_ploidy column, which is tumor ploidy as inferred by ControlFreeC. The copy number information should be interpreted in the light of this information (see: current version of Data Formats section of README).

+

This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly.

+ + + +
library(dplyr)
+ + +

+Attaching package: 'dplyr'
+ + +
The following objects are masked from 'package:stats':
+
+    filter, lag
+ + +
The following objects are masked from 'package:base':
+
+    intersect, setdiff, setequal, union
+ + + +
+

Read in data

+ + + +
cnvkit_file <- file.path("..", "..", "data", "pbta-cnv-cnvkit.seg.gz")
+cnvkit_df <- readr::read_tsv(cnvkit_file)
+ + +
Parsed with column specification:
+cols(
+  ID = col_character(),
+  chrom = col_character(),
+  loc.start = col_double(),
+  loc.end = col_double(),
+  num.mark = col_double(),
+  seg.mean = col_double(),
+  copy.num = col_double()
+)
+ + + + + + +
histologies_file <- file.path("..", "..", "data", "pbta-histologies.tsv")
+histologies_df <- readr::read_tsv(histologies_file)
+ + +
Parsed with column specification:
+cols(
+  .default = col_character(),
+  OS_days = col_double(),
+  age_last_update_days = col_double(),
+  normal_fraction = col_double(),
+  tumor_fraction = col_double(),
+  tumor_ploidy = col_double()
+)
+ + +
See spec(...) for full column specifications.
+ + + +
+
+

Add inferred ploidy information to CNVkit results

+ + + +
add_ploidy_df <- histologies_df %>%
+  select(Kids_First_Biospecimen_ID, tumor_ploidy) %>%
+  inner_join(cnvkit_df, by = c("Kids_First_Biospecimen_ID" = "ID")) %>%
+  select(-tumor_ploidy, everything())
+ + + +
+
+

Add status column

+

This is intended to mirror the information contained in the ControlFreeC output.

+ + + +
add_ploidy_df <- add_ploidy_df %>%
+  mutate(status = case_when(
+    # when the copy number is less than inferred ploidy, mark this as a loss
+    copy.num < tumor_ploidy ~ "loss",
+    # if copy number is higher than ploidy, mark as a gain
+    copy.num > tumor_ploidy ~ "gain",
+    copy.num == tumor_ploidy ~ "neutral"
+  ))
+
+head(add_ploidy_df, 10)
+ +
+ +
+ + +
+
+

Write to scratch

+ + + +
output_file <- file.path("..", "..", "scratch", "cnvkit_with_status.tsv")
+readr::write_tsv(add_ploidy_df, output_file)
+ + +
+ +
LS0tCnRpdGxlOiAiQWRkIHBsb2lkeSBjb2x1bW4sIHN0YXR1cyB0byBDTlZraXQgb3V0cHV0IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKYXV0aG9yOiBKLiBUYXJvbmkgZm9yIEFMU0YgQ0NETApkYXRlOiAyMDE5Ci0tLQoKVGhlIGBwYnRhLWhpc3RvbG9naWVzLnRzdmAgZmlsZSBjb250YWlucyBhIGB0dW1vcl9wbG9pZHlgIGNvbHVtbiwgd2hpY2ggaXMgdHVtb3IgcGxvaWR5IGFzIGluZmVycmVkIGJ5IENvbnRyb2xGcmVlQy4KVGhlIGNvcHkgbnVtYmVyIGluZm9ybWF0aW9uIHNob3VsZCBiZSBpbnRlcnByZXRlZCBpbiB0aGUgbGlnaHQgb2YgdGhpcyBpbmZvcm1hdGlvbiAoc2VlOiBbY3VycmVudCB2ZXJzaW9uIG9mIERhdGEgRm9ybWF0cyBzZWN0aW9uIG9mIFJFQURNRV0oaHR0cHM6Ly9naXRodWIuY29tL0FsZXhzTGVtb25hZGUvT3BlblBCVEEtYW5hbHlzaXMvdHJlZS8zOTBmMWUwOGU0ODFkYTVlYzBiMmM2MmQ4ODZkNWZkMjk4YmJmMDE3I2RhdGEtZm9ybWF0cykpLgoKVGhpcyBub3RlYm9vayBhZGRzIHBsb2lkeSBpbmZvcm1hdGlvbiB0byB0aGUgQ05Wa2l0IHJlc3VsdHMgYW5kIGFkZHMgYSBzdGF0dXMgY29sdW1uIHRoYXQgZGVmaW5lcyBnYWluIGFuZCBsb3NzIGJyb2FkbHkuCgpgYGB7cn0KbGlicmFyeShkcGx5cikKYGBgCgojIyMgUmVhZCBpbiBkYXRhCgpgYGB7cn0KY252a2l0X2ZpbGUgPC0gZmlsZS5wYXRoKCIuLiIsICIuLiIsICJkYXRhIiwgInBidGEtY252LWNudmtpdC5zZWcuZ3oiKQpjbnZraXRfZGYgPC0gcmVhZHI6OnJlYWRfdHN2KGNudmtpdF9maWxlKQpgYGAKCmBgYHtyfQpoaXN0b2xvZ2llc19maWxlIDwtIGZpbGUucGF0aCgiLi4iLCAiLi4iLCAiZGF0YSIsICJwYnRhLWhpc3RvbG9naWVzLnRzdiIpCmhpc3RvbG9naWVzX2RmIDwtIHJlYWRyOjpyZWFkX3RzdihoaXN0b2xvZ2llc19maWxlKQpgYGAKCiMjIyBBZGQgaW5mZXJyZWQgcGxvaWR5IGluZm9ybWF0aW9uIHRvIENOVmtpdCByZXN1bHRzCgpgYGB7cn0KYWRkX3Bsb2lkeV9kZiA8LSBoaXN0b2xvZ2llc19kZiAlPiUKICBzZWxlY3QoS2lkc19GaXJzdF9CaW9zcGVjaW1lbl9JRCwgdHVtb3JfcGxvaWR5KSAlPiUKICBpbm5lcl9qb2luKGNudmtpdF9kZiwgYnkgPSBjKCJLaWRzX0ZpcnN0X0Jpb3NwZWNpbWVuX0lEIiA9ICJJRCIpKSAlPiUKICBzZWxlY3QoLXR1bW9yX3Bsb2lkeSwgZXZlcnl0aGluZygpKQpgYGAKCiMjIyBBZGQgc3RhdHVzIGNvbHVtbgoKVGhpcyBpcyBpbnRlbmRlZCB0byBtaXJyb3IgdGhlIGluZm9ybWF0aW9uIGNvbnRhaW5lZCBpbiB0aGUgQ29udHJvbEZyZWVDIG91dHB1dC4KCmBgYHtyfQphZGRfcGxvaWR5X2RmIDwtIGFkZF9wbG9pZHlfZGYgJT4lCiAgbXV0YXRlKHN0YXR1cyA9IGNhc2Vfd2hlbigKICAgICMgd2hlbiB0aGUgY29weSBudW1iZXIgaXMgbGVzcyB0aGFuIGluZmVycmVkIHBsb2lkeSwgbWFyayB0aGlzIGFzIGEgbG9zcwogICAgY29weS5udW0gPCB0dW1vcl9wbG9pZHkgfiAibG9zcyIsCiAgICAjIGlmIGNvcHkgbnVtYmVyIGlzIGhpZ2hlciB0aGFuIHBsb2lkeSwgbWFyayBhcyBhIGdhaW4KICAgIGNvcHkubnVtID4gdHVtb3JfcGxvaWR5IH4gImdhaW4iLAogICAgY29weS5udW0gPT0gdHVtb3JfcGxvaWR5IH4gIm5ldXRyYWwiCiAgKSkKCmhlYWQoYWRkX3Bsb2lkeV9kZiwgMTApCmBgYAoKIyMjIFdyaXRlIHRvIGBzY3JhdGNoYAoKYGBge3J9Cm91dHB1dF9maWxlIDwtIGZpbGUucGF0aCgiLi4iLCAiLi4iLCAic2NyYXRjaCIsICJjbnZraXRfd2l0aF9zdGF0dXMudHN2IikKcmVhZHI6OndyaXRlX3RzdihhZGRfcGxvaWR5X2RmLCBvdXRwdXRfZmlsZSkKYGBgCg==
+ + + +
+ + + + + + + + diff --git a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R index e7d12beb8c..59b4964228 100644 --- a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R +++ b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R @@ -29,7 +29,7 @@ if (!("IRanges" %in% installed.packages())) { BiocManager::install("IRanges", update = FALSE) } -# Install annotatr +# Install annotatr if (!("annotatr" %in% installed.packages())) { BiocManager::install("annotatr", update = FALSE) } @@ -55,10 +55,43 @@ if (!("AnnotationDbi" %in% installed.packages())) { # Declare command line options option_list <- list( optparse::make_option( - c("-f", "--seg_file"), + c("--cnv_file"), type = "character", default = NULL, - help = "file path to SEG file that contains cnv information" + help = "file path to file that contains CNV information" + ), + optparse::make_option( + c("--gtf_file"), + type = "character", + default = NULL, + help = "file path to human genome GTF file" + ), + optparse::make_option( + c("--filename_lead"), + type = "character", + default = "annotated_cn", + help = "used in file names" + ), + optparse::make_option( + c("--chrom_filter"), + type = "integer", + default = 0, + help = "integer (0/1) that will be converted into a logical that indicates + whether or not filtering to exons on chr 1:22 will be performed." + ), + optparse::make_option( + c("--controlfreec"), + type = "logical", + action = "store_true", + default = FALSE, + help = "flag used to indicate if the CNV file is the output of ControlFreeC" + ), + optparse::make_option( + c("--cnvkit"), + type = "logical", + action = "store_true", + default = FALSE, + help = "flag used to indicate if the CNV file is the output of CNVkit" ) ) @@ -66,6 +99,19 @@ option_list <- list( opt_parser <- optparse::OptionParser(option_list = option_list) opt <- optparse::parse_args(opt_parser) +# error handling related to specifying the CNV method +if (all(opt$controlfreec, opt$cnvkit)) { + stop("--controlfreec and --cnvkit are mutually exclusive") +} + +if (!any(opt$controlfreec, opt$cnvkit)) { + stop("You must specify the CNV file format by using --controlfreec or + --cnvkit") +} + +# convert the chromosome filter command line +chrom_filter <- as.logical(opt$chrom_filter) + #### Directories and Files ----------------------------------------------------- # Detect the ".git" folder -- this will in the project root directory. @@ -73,7 +119,7 @@ opt <- optparse::parse_args(opt_parser) # matter where this is called from root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) -# Set path to results directory +# Set path to results directory results_dir <- file.path(root_dir, "analyses", "focal-cn-file-preparation", "results") @@ -81,61 +127,119 @@ if (!dir.exists(results_dir)) { dir.create(results_dir) } -seg_df <- - data.table::fread( - opt$seg_file, - data.table = FALSE, - stringsAsFactors = FALSE - ) - -#### Format seg file and overlap with hg38 genome annotations ------------------ +# this is the output of GenomicFeatures::makeTxDbFromGFF +# TODO: possibly update this when the GTF file gets included in the data +# download; may also remove the --gtf_file option and hardcode it? +annotation_directory <- file.path(root_dir, + "analyses", + "focal-cn-file-preparation", + "annotation_files") +annotation_file <- file.path(annotation_directory, + "txdb_from_gencode.v27.gtf.db") + +#### Format CNV file and overlap with hg38 genome annotations ------------------ + +# we want to standardize the formats between the two methods here and drop +# columns we won't need to +if (opt$cnvkit) { + cnv_df <- readr::read_tsv(opt$cnv_file) %>% + dplyr::rename(chr = chrom, start = loc.start, end = loc.end, + copy_number = copy.num) %>% + dplyr::select(-num.mark, -seg.mean) %>% + dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything()) +} -# Exclude the X and Y chromosomes, exclude `copy.num` == 2, and rearrange -# ID column to be the last column -seg_no_xy <- seg_df %>% - dplyr::filter(!(chrom %in% c("chrX", "chrY")), (copy.num != 2)) %>% - dplyr::select(-ID, dplyr::everything()) +if (opt$controlfreec) { + # TODO: filter based on the p-values rather than just dropping them? + cnv_df <- readr::read_tsv(opt$cnv_file) %>% + dplyr::rename(copy_number = copy.number) %>% + dplyr::mutate(chr = paste0("chr", chr)) %>% + dplyr::select(-segment_genotype, -uncertainty, -WilcoxonRankSumTestPvalue, + -KolmogorovSmirnovPvalue) %>% + dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything()) +} -# Make seg data.frame a GRanges object -seg_gr <- seg_no_xy %>% +# Remove neutral copy number calls and mark possible amplifcation +cnv_df <- cnv_df %>% + dplyr::filter(status != "neutral") %>% + dplyr::mutate(status = dplyr::case_when( + copy_number > (2 * tumor_ploidy) ~ "amplification", + TRUE ~ status + )) + +# Addressing autosomes first +# Exclude the X and Y chromosomes +cnv_no_xy <- cnv_df %>% + dplyr::filter(!(chr %in% c("chrX", "chrY"))) + +# Make seg data.frame a GRanges object +cnv_no_xy_gr <- cnv_no_xy %>% GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE, starts.in.df.are.0based = FALSE) -# Define the annotations for the hg38 genome -txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene -genes <- GenomicFeatures::genes(txdb) +if (!file.exists(annotation_file)) { + # Define the annotations for the hg38 genome + txdb <- GenomicFeatures::makeTxDbFromGFF( + file = opt$gtf_file, + format = "gtf" + ) + # can do this even if the directory exists + dir.create(annotation_directory, showWarnings = FALSE) + # write this to file to save time next time + AnnotationDbi::saveDb(txdb, annotation_file) +} else { + txdb <- AnnotationDbi::loadDb(annotation_file) +} -# Create a data.frame with the overlaps between the seg file and hg38 genome -# annotations -overlaps <- IRanges::mergeByOverlaps(seg_gr, genes) +# if the chromosome filter is on, then we will only look at exons on chr 1:22 +if (chrom_filter) { + # we'll only look at exons on chromosomes 1:22 -- this will get rid of things + # like alternates + message("Filtering to exons on chromosomes 1:22...") + chroms <- paste0("chr", 1:22) + chrom_filter <- list(tx_chrom = chroms) + tx_exons <- GenomicFeatures::exons(txdb, + filter = chrom_filter, + columns = "gene_id") +} else { + # extract the exons but include ensembl gene identifiers + tx_exons <- GenomicFeatures::exons(txdb, columns = "gene_id") +} -overlapSymbols <- - AnnotationDbi::mapIds( - org.Hs.eg.db::org.Hs.eg.db, - keys = overlaps$gene_id, - column = "SYMBOL", - keytype = "ENTREZID" - ) +# Create a data.frame with the overlaps between the seg file and hg38 genome +# annotations +overlaps <- IRanges::mergeByOverlaps(cnv_no_xy_gr, tx_exons) -overlaps$gene_id <- overlapSymbols +# remove what we no longer need from the environment +rm(tx_exons, cnv_no_xy_gr, cnv_no_xy) # Create a data.frame with selected columns from the `overlaps` object -cn_short <- data.frame( - gene_symbol = overlaps$gene_id, - biospecimen_id = overlaps$ID, - copy_number = overlaps$copy.num) %>% - dplyr::filter(!(is.na(gene_symbol))) - -# Create a `label` column with values specifying the type of CN aberration and -# save as a data.frame with `gene_symbol`, `biospecimen_id`, and `copy_number` -annotated_cn <- cn_short %>% +annotated_cn <- data.frame(biospecimen_id = overlaps$Kids_First_Biospecimen_ID, + status = overlaps$status, + copy_number = overlaps$copy_number, + ploidy = overlaps$tumor_ploidy, + ensembl = unlist(overlaps$gene_id), + stringsAsFactors = FALSE) %>% dplyr::distinct() %>% - dplyr::mutate(label = dplyr::case_when( - copy_number == 3 | copy_number == 4 ~ "Gain", - copy_number == 0 ~ "Hom_Deletion", - copy_number == 1 ~ "Hem_Deletion", - copy_number >= 5 ~ "Amplification")) %>% - dplyr::select(gene_symbol, biospecimen_id, label, copy_number) + # Discard the gene version information in order to get gene symbols and + # cytoband mappings + dplyr::mutate(ensembl = gsub("\\..*", "", ensembl)) + +annotated_cn <- annotated_cn %>% + dplyr::mutate( + gene_symbol = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, + keys = ensembl, + column = "SYMBOL", + keytype = "ENSEMBL"), + cytoband = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, + keys = ensembl, + column = "MAP", + keytype = "ENSEMBL") + ) %>% + dplyr::filter(!is.na(gene_symbol)) + +# Output file name +output_file <- paste0(opt$filename_lead, "_autosomes.tsv") # Save final data.frame to a tsv file -readr::write_tsv(annotated_cn, file.path(results_dir, "annotated_cn.tsv")) +readr::write_tsv(annotated_cn, file.path(results_dir, output_file)) diff --git a/analyses/focal-cn-file-preparation/results/annotated_cn.tsv b/analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz similarity index 75% rename from analyses/focal-cn-file-preparation/results/annotated_cn.tsv rename to analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz index 870ed90f18..4028bd1f36 100644 Binary files a/analyses/focal-cn-file-preparation/results/annotated_cn.tsv and b/analyses/focal-cn-file-preparation/results/cnvkit_annotated_cn_autosomes.tsv.gz differ diff --git a/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz b/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz new file mode 100644 index 0000000000..36a4437fd8 Binary files /dev/null and b/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz differ diff --git a/analyses/focal-cn-file-preparation/run-prepare-cn.sh b/analyses/focal-cn-file-preparation/run-prepare-cn.sh index 45d1bdc887..01b52aec53 100644 --- a/analyses/focal-cn-file-preparation/run-prepare-cn.sh +++ b/analyses/focal-cn-file-preparation/run-prepare-cn.sh @@ -3,6 +3,13 @@ # # Usage: bash run-prepare-cn.sh +set -e +set -o pipefail + +# by default we will not filter to exons on chr 1:22, but this will save us +# time in CI +CHROMFILT=${OPENPBTA_FILT:-0} + # This script should always run as if it were being called from # the directory it lives in. script_directory="$(perl -e 'use File::Basename; @@ -10,5 +17,27 @@ script_directory="$(perl -e 'use File::Basename; print dirname(abs_path(@ARGV[0]));' -- "$0")" cd "$script_directory" || exit +# Prep the CNVkit data +Rscript --vanilla -e "rmarkdown::render('00-add-ploidy-cnvkit.Rmd', clean = TRUE)" + +# Run annotation step for CNVkit +# TODO: update once GTF file is included with download Rscript --vanilla 01-prepare-cn-file.R \ - --seg_file ../../data/pbta-cnv-cnvkit.seg.gz + --cnv_file ../../scratch/cnvkit_with_status.tsv \ + --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \ + --filename_lead "cnvkit_annotated_cn" \ + --chrom_filter $CHROMFILT \ + --cnvkit + +# Run annotation step for ControlFreeC +# TODO: update once GTF file is included with download +Rscript --vanilla 01-prepare-cn-file.R \ + --cnv_file ../../data/pbta-cnv-controlfreec.tsv.gz \ + --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \ + --filename_lead "controlfreec_annotated_cn" \ + --chrom_filter $CHROMFILT \ + --controlfreec + +# gzip the two files in the results folder, overwriting without prompt +gzip -f results/cnvkit_annotated_cn_autosomes.tsv +gzip -f results/controlfreec_annotated_cn_autosomes.tsv