Update CNV segment to gene mapping: support both formats, use GTF, et…

…c. (#253) * Add chromosome 1:22 filtering step * Add notebook for including status in CNVkit * WIP update CN file prep * Remove outdated file * Use GTF file + exons; add cytoband; support both methods * Update module shell script and rerun * Add TODO notes * Remove chromosome filter; fixes to shell script * Add -f to gzip step * Add steps for saving annotation db Ignore due to file size * Fix how results are compressed * Add chromosome filtering option * Revert "Add steps for saving annotation db" This reverts commit 36cbb2b. * Revert "Revert "Add steps for saving annotation db"" This reverts commit b0f3615.
AlexsLemonade · Nov 9, 2019 · 713d2b8 · 713d2b8
1 parent 390f1e0
commit 713d2b8
Show file tree

Hide file tree

Showing 8 changed files with 2,178 additions and 51 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -82,7 +82,7 @@ jobs:
 
       - run:
           name: Focal CN Preparation
-          command: ./scripts/run_in_ci.sh bash analyses/focal-cn-file-preparation/run-prepare-cn.sh
+          command: OPENPBTA_FILT=1 ./scripts/run_in_ci.sh bash analyses/focal-cn-file-preparation/run-prepare-cn.sh
 
       - run:
           name: Interaction plot

diff --git a/analyses/focal-cn-file-preparation/.gitignore b/analyses/focal-cn-file-preparation/.gitignore
@@ -0,0 +1,2 @@
+# AnnotationDbi object is too large to be committed
+annotation_files/txdb_from_gencode.v27.gtf.db
diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd
@@ -0,0 +1,60 @@
+---
+title: "Add ploidy column, status to CNVkit output"
+output: html_notebook
+author: J. Taroni for ALSF CCDL
+date: 2019
+---
+
+The `pbta-histologies.tsv` file contains a `tumor_ploidy` column, which is tumor ploidy as inferred by ControlFreeC.
+The copy number information should be interpreted in the light of this information (see: [current version of Data Formats section of README](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/390f1e08e481da5ec0b2c62d886d5fd298bbf017#data-formats)).
+
+This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly.
+
+```{r}
+library(dplyr)
+```
+
+### Read in data
+
+```{r}
+cnvkit_file <- file.path("..", "..", "data", "pbta-cnv-cnvkit.seg.gz")
+cnvkit_df <- readr::read_tsv(cnvkit_file)
+```
+
+```{r}
+histologies_file <- file.path("..", "..", "data", "pbta-histologies.tsv")
+histologies_df <- readr::read_tsv(histologies_file)
+```
+
+### Add inferred ploidy information to CNVkit results
+
+```{r}
+add_ploidy_df <- histologies_df %>%
+  select(Kids_First_Biospecimen_ID, tumor_ploidy) %>%
+  inner_join(cnvkit_df, by = c("Kids_First_Biospecimen_ID" = "ID")) %>%
+  select(-tumor_ploidy, everything())
+```
+
+### Add status column
+
+This is intended to mirror the information contained in the ControlFreeC output.
+
+```{r}
+add_ploidy_df <- add_ploidy_df %>%
+  mutate(status = case_when(
+    # when the copy number is less than inferred ploidy, mark this as a loss
+    copy.num < tumor_ploidy ~ "loss",
+    # if copy number is higher than ploidy, mark as a gain
+    copy.num > tumor_ploidy ~ "gain",
+    copy.num == tumor_ploidy ~ "neutral"
+  ))
+
+head(add_ploidy_df, 10)
+```
+
+### Write to `scratch`
+
+```{r}
+output_file <- file.path("..", "..", "scratch", "cnvkit_with_status.tsv")
+readr::write_tsv(add_ploidy_df, output_file)
+```
diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html
diff --git a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R
@@ -29,7 +29,7 @@ if (!("IRanges" %in% installed.packages())) {
   BiocManager::install("IRanges", update = FALSE)
 }
 
-# Install annotatr 
+# Install annotatr
 if (!("annotatr" %in% installed.packages())) {
   BiocManager::install("annotatr", update = FALSE)
 }
@@ -55,87 +55,191 @@ if (!("AnnotationDbi" %in% installed.packages())) {
 # Declare command line options
 option_list <- list(
   optparse::make_option(
-    c("-f", "--seg_file"),
+    c("--cnv_file"),
     type = "character",
     default = NULL,
-    help = "file path to SEG file that contains cnv information"
+    help = "file path to file that contains CNV information"
+  ),
+  optparse::make_option(
+    c("--gtf_file"),
+    type = "character",
+    default = NULL,
+    help = "file path to human genome GTF file"
+  ),
+  optparse::make_option(
+    c("--filename_lead"),
+    type = "character",
+    default = "annotated_cn",
+    help = "used in file names"
+  ),
+  optparse::make_option(
+    c("--chrom_filter"),
+    type = "integer",
+    default = 0,
+    help = "integer (0/1) that will be converted into a logical that indicates
+            whether or not filtering to exons on chr 1:22 will be performed."
+  ),
+  optparse::make_option(
+    c("--controlfreec"),
+    type = "logical",
+    action = "store_true",
+    default = FALSE,
+    help = "flag used to indicate if the CNV file is the output of ControlFreeC"
+  ),
+  optparse::make_option(
+    c("--cnvkit"),
+    type = "logical",
+    action = "store_true",
+    default = FALSE,
+    help = "flag used to indicate if the CNV file is the output of CNVkit"
   )
 )
 
 # Read the arguments passed
 opt_parser <- optparse::OptionParser(option_list = option_list)
 opt <- optparse::parse_args(opt_parser)
 
+# error handling related to specifying the CNV method
+if (all(opt$controlfreec, opt$cnvkit)) {
+  stop("--controlfreec and --cnvkit are mutually exclusive")
+}
+
+if (!any(opt$controlfreec, opt$cnvkit)) {
+  stop("You must specify the CNV file format by using --controlfreec or
+       --cnvkit")
+}
+
+# convert the chromosome filter command line
+chrom_filter <- as.logical(opt$chrom_filter)
+
 #### Directories and Files -----------------------------------------------------
 
 # Detect the ".git" folder -- this will in the project root directory.
 # Use this as the root directory to ensure proper sourcing of functions no
 # matter where this is called from
 root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
 
-# Set path to results directory 
+# Set path to results directory
 results_dir <-
   file.path(root_dir, "analyses", "focal-cn-file-preparation", "results")
 
 if (!dir.exists(results_dir)) {
   dir.create(results_dir)
 }
 
-seg_df <-
-  data.table::fread(
-    opt$seg_file,
-    data.table = FALSE,
-    stringsAsFactors = FALSE
-  )
-
-#### Format seg file and overlap with hg38 genome annotations ------------------
+# this is the output of GenomicFeatures::makeTxDbFromGFF
+# TODO: possibly update this when the GTF file gets included in the data
+#       download; may also remove the --gtf_file option and hardcode it?
+annotation_directory <- file.path(root_dir,
+                                  "analyses",
+                                  "focal-cn-file-preparation",
+                                  "annotation_files")
+annotation_file <- file.path(annotation_directory,
+                             "txdb_from_gencode.v27.gtf.db")
+
+#### Format CNV file and overlap with hg38 genome annotations ------------------
+
+# we want to standardize the formats between the two methods here and drop
+# columns we won't need to
+if (opt$cnvkit) {
+  cnv_df <- readr::read_tsv(opt$cnv_file) %>%
+    dplyr::rename(chr = chrom, start = loc.start, end = loc.end,
+                  copy_number = copy.num) %>%
+    dplyr::select(-num.mark, -seg.mean) %>%
+    dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
+}
 
-# Exclude the X and Y chromosomes, exclude `copy.num` == 2, and rearrange 
-# ID column to be the last column
-seg_no_xy <- seg_df %>%
-  dplyr::filter(!(chrom %in% c("chrX", "chrY")), (copy.num != 2)) %>%
-  dplyr::select(-ID, dplyr::everything()) 
+if (opt$controlfreec) {
+  # TODO: filter based on the p-values rather than just dropping them?
+  cnv_df <- readr::read_tsv(opt$cnv_file) %>%
+    dplyr::rename(copy_number = copy.number) %>%
+    dplyr::mutate(chr = paste0("chr", chr)) %>%
+    dplyr::select(-segment_genotype, -uncertainty, -WilcoxonRankSumTestPvalue,
+                  -KolmogorovSmirnovPvalue) %>%
+    dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
+}
 
-# Make seg data.frame a GRanges object 
-seg_gr <- seg_no_xy %>%
+# Remove neutral copy number calls and mark possible amplifcation
+cnv_df <- cnv_df %>%
+  dplyr::filter(status != "neutral") %>%
+  dplyr::mutate(status = dplyr::case_when(
+    copy_number > (2 * tumor_ploidy) ~ "amplification",
+    TRUE ~ status
+  ))
+
+# Addressing autosomes first
+# Exclude the X and Y chromosomes
+cnv_no_xy <- cnv_df %>%
+  dplyr::filter(!(chr %in% c("chrX", "chrY")))
+
+# Make seg data.frame a GRanges object
+cnv_no_xy_gr <- cnv_no_xy %>%
   GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE,
                                           starts.in.df.are.0based = FALSE)
 
-# Define the annotations for the hg38 genome
-txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
-genes <- GenomicFeatures::genes(txdb)
+if (!file.exists(annotation_file)) {
+  # Define the annotations for the hg38 genome
+  txdb <- GenomicFeatures::makeTxDbFromGFF(
+    file = opt$gtf_file,
+    format = "gtf"
+  )
+  # can do this even if the directory exists
+  dir.create(annotation_directory, showWarnings = FALSE)
+  # write this to file to save time next time
+  AnnotationDbi::saveDb(txdb, annotation_file)
+} else {
+  txdb <- AnnotationDbi::loadDb(annotation_file)
+}
 
-# Create a data.frame with the overlaps between the seg file and hg38 genome 
-# annotations 
-overlaps <- IRanges::mergeByOverlaps(seg_gr, genes)
+# if the chromosome filter is on, then we will only look at exons on chr 1:22
+if (chrom_filter) {
+  # we'll only look at exons on chromosomes 1:22 -- this will get rid of things
+  # like alternates
+  message("Filtering to exons on chromosomes 1:22...")
+  chroms <- paste0("chr", 1:22)
+  chrom_filter <- list(tx_chrom = chroms)
+  tx_exons <- GenomicFeatures::exons(txdb,
+                                     filter = chrom_filter,
+                                     columns = "gene_id")
+} else {
+  # extract the exons but include ensembl gene identifiers
+  tx_exons <- GenomicFeatures::exons(txdb, columns = "gene_id")
+}
 
-overlapSymbols <-
-  AnnotationDbi::mapIds(
-    org.Hs.eg.db::org.Hs.eg.db,
-    keys = overlaps$gene_id,
-    column = "SYMBOL",
-    keytype = "ENTREZID"
-  )
+# Create a data.frame with the overlaps between the seg file and hg38 genome
+# annotations
+overlaps <- IRanges::mergeByOverlaps(cnv_no_xy_gr, tx_exons)
 
-overlaps$gene_id <- overlapSymbols
+# remove what we no longer need from the environment
+rm(tx_exons, cnv_no_xy_gr, cnv_no_xy)
 
 # Create a data.frame with selected columns from the `overlaps` object
-cn_short <- data.frame(
-  gene_symbol = overlaps$gene_id,
-  biospecimen_id = overlaps$ID,
-  copy_number = overlaps$copy.num) %>%
-  dplyr::filter(!(is.na(gene_symbol)))
-
-# Create a `label` column with values specifying the type of CN aberration and
-# save as a data.frame with `gene_symbol`, `biospecimen_id`, and `copy_number`
-annotated_cn <- cn_short %>%
+annotated_cn <- data.frame(biospecimen_id = overlaps$Kids_First_Biospecimen_ID,
+                           status = overlaps$status,
+                           copy_number = overlaps$copy_number,
+                           ploidy = overlaps$tumor_ploidy,
+                           ensembl = unlist(overlaps$gene_id),
+                           stringsAsFactors = FALSE) %>%
   dplyr::distinct() %>%
-  dplyr::mutate(label = dplyr::case_when(
-    copy_number == 3 | copy_number == 4  ~ "Gain",
-    copy_number == 0 ~ "Hom_Deletion",
-    copy_number == 1 ~ "Hem_Deletion",
-    copy_number >= 5 ~ "Amplification")) %>%
-  dplyr::select(gene_symbol, biospecimen_id, label, copy_number)
+  # Discard the gene version information in order to get gene symbols and
+  # cytoband mappings
+  dplyr::mutate(ensembl = gsub("\\..*", "", ensembl))
+
+annotated_cn <- annotated_cn %>%
+  dplyr::mutate(
+    gene_symbol = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
+                                        keys = ensembl,
+                                        column = "SYMBOL",
+                                        keytype = "ENSEMBL"),
+    cytoband = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
+                                     keys = ensembl,
+                                     column = "MAP",
+                                     keytype = "ENSEMBL")
+  ) %>%
+  dplyr::filter(!is.na(gene_symbol))
+
+# Output file name
+output_file <- paste0(opt$filename_lead, "_autosomes.tsv")
 
 # Save final data.frame to a tsv file
-readr::write_tsv(annotated_cn, file.path(results_dir, "annotated_cn.tsv"))
+readr::write_tsv(annotated_cn, file.path(results_dir, output_file))
diff --git a/...file-preparation/results/annotated_cn.tsv → ...ults/cnvkit_annotated_cn_autosomes.tsv.gz b/...file-preparation/results/annotated_cn.tsv → ...ults/cnvkit_annotated_cn_autosomes.tsv.gz
diff --git a/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz b/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz
diff --git a/analyses/focal-cn-file-preparation/run-prepare-cn.sh b/analyses/focal-cn-file-preparation/run-prepare-cn.sh
@@ -3,12 +3,41 @@
 #
 # Usage: bash run-prepare-cn.sh
 
+set -e
+set -o pipefail
+
+# by default we will not filter to exons on chr 1:22, but this will save us
+# time in CI
+CHROMFILT=${OPENPBTA_FILT:-0}
+
 # This script should always run as if it were being called from
 # the directory it lives in.
 script_directory="$(perl -e 'use File::Basename;
   use Cwd "abs_path";
   print dirname(abs_path(@ARGV[0]));' -- "$0")"
 cd "$script_directory" || exit
 
+# Prep the CNVkit data
+Rscript --vanilla -e "rmarkdown::render('00-add-ploidy-cnvkit.Rmd', clean = TRUE)"
+
+# Run annotation step for CNVkit
+# TODO: update once GTF file is included with download
 Rscript --vanilla 01-prepare-cn-file.R \
-	--seg_file ../../data/pbta-cnv-cnvkit.seg.gz
+  --cnv_file ../../scratch/cnvkit_with_status.tsv \
+  --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
+  --filename_lead "cnvkit_annotated_cn" \
+  --chrom_filter $CHROMFILT \
+  --cnvkit
+
+# Run annotation step for ControlFreeC
+# TODO: update once GTF file is included with download
+Rscript --vanilla 01-prepare-cn-file.R \
+  --cnv_file ../../data/pbta-cnv-controlfreec.tsv.gz \
+  --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
+  --filename_lead "controlfreec_annotated_cn" \
+  --chrom_filter $CHROMFILT \
+  --controlfreec
+
+# gzip the two files in the results folder, overwriting without prompt
+gzip -f results/cnvkit_annotated_cn_autosomes.tsv
+gzip -f results/controlfreec_annotated_cn_autosomes.tsv