AlexsLemonade · jaclyn-taroni · Nov 9, 2019 · Nov 5, 2019 · Nov 8, 2019 · Nov 8, 2019
diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.Rmd
@@ -0,0 +1,60 @@
+---
+title: "Add ploidy column, status to CNVkit output"
+output: html_notebook
+author: J. Taroni for ALSF CCDL
+date: 2019
+---
+
+The `pbta-histologies.tsv` file contains a `tumor_ploidy` column, which is tumor ploidy as inferred by ControlFreeC.
+The copy number information should be interpreted in the light of this information (see: [current version of Data Formats section of README](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/390f1e08e481da5ec0b2c62d886d5fd298bbf017#data-formats)).
+
+This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly.
+
+```{r}
+library(dplyr)
+```
+
+### Read in data
+
+```{r}
+cnvkit_file <- file.path("..", "..", "data", "pbta-cnv-cnvkit.seg.gz")
+cnvkit_df <- readr::read_tsv(cnvkit_file)
+```
+
+```{r}
+histologies_file <- file.path("..", "..", "data", "pbta-histologies.tsv")
+histologies_df <- readr::read_tsv(histologies_file)
+```
+
+### Add inferred ploidy information to CNVkit results
+
+```{r}
+add_ploidy_df <- histologies_df %>%
+  select(Kids_First_Biospecimen_ID, tumor_ploidy) %>%
+  inner_join(cnvkit_df, by = c("Kids_First_Biospecimen_ID" = "ID")) %>%
+  select(-tumor_ploidy, everything())
+```
+
+### Add status column
+
+This is intended to mirror the information contained in the ControlFreeC output.
+
+```{r}
+add_ploidy_df <- add_ploidy_df %>%
+  mutate(status = case_when(
+    # when the copy number is less than inferred ploidy, mark this as a loss
+    copy.num < tumor_ploidy ~ "loss",
+    # if copy number is higher than ploidy, mark as a gain
+    copy.num > tumor_ploidy ~ "gain",
+    copy.num == tumor_ploidy ~ "neutral"
+  ))
+
+head(add_ploidy_df, 10)
+```
+
+### Write to `scratch`
+
+```{r}
+output_file <- file.path("..", "..", "scratch", "cnvkit_with_status.tsv")
+readr::write_tsv(add_ploidy_df, output_file)
+```
diff --git a/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html b/analyses/focal-cn-file-preparation/00-add-ploidy-cnvkit.nb.html
diff --git a/analyses/focal-cn-file-preparation/01-prepare-cn-file.R b/analyses/focal-cn-file-preparation/01-prepare-cn-file.R
@@ -29,7 +29,7 @@ if (!("IRanges" %in% installed.packages())) {
   BiocManager::install("IRanges", update = FALSE)
 }
 
-# Install annotatr 
+# Install annotatr
 if (!("annotatr" %in% installed.packages())) {
   BiocManager::install("annotatr", update = FALSE)
 }
@@ -55,87 +55,159 @@ if (!("AnnotationDbi" %in% installed.packages())) {
 # Declare command line options
 option_list <- list(
   optparse::make_option(
-    c("-f", "--seg_file"),
+    c("--cnv_file"),
     type = "character",
     default = NULL,
-    help = "file path to SEG file that contains cnv information"
+    help = "file path to file that contains CNV information"
+  ),
+  optparse::make_option(
+    c("--gtf_file"),
+    type = "character",
+    default = NULL,
+    help = "file path to human genome GTF file"
+  ),
+  optparse::make_option(
+    c("--filename_lead"),
+    type = "character",
+    default = "annotated_cn",
+    help = "used in file names"
+  ),
+  optparse::make_option(
+    c("--controlfreec"),
+    type = "logical",
+    action = "store_true",
+    default = FALSE,
+    help = "flag used to indicate if the CNV file is the output of ControlFreeC"
+  ),
+  optparse::make_option(
+    c("--cnvkit"),
+    type = "logical",
+    action = "store_true",
+    default = FALSE,
+    help = "flag used to indicate if the CNV file is the output of CNVkit"
   )
 )
 
 # Read the arguments passed
 opt_parser <- optparse::OptionParser(option_list = option_list)
 opt <- optparse::parse_args(opt_parser)
 
+# error handling related to specifying the CNV method
+if (all(opt$controlfreec, opt$cnvkit)) {
+  stop("--controlfreec and --cnvkit are mutually exclusive")
+}
+
+if (!any(opt$controlfreec, opt$cnvkit)) {
+  stop("You must specify the CNV file format by using --controlfreec or
+       --cnvkit")
+}
+
 #### Directories and Files -----------------------------------------------------
 
 # Detect the ".git" folder -- this will in the project root directory.
 # Use this as the root directory to ensure proper sourcing of functions no
 # matter where this is called from
 root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
 
-# Set path to results directory 
+# Set path to results directory
 results_dir <-
   file.path(root_dir, "analyses", "focal-cn-file-preparation", "results")
 
 if (!dir.exists(results_dir)) {
   dir.create(results_dir)
 }
 
-seg_df <-
-  data.table::fread(
-    opt$seg_file,
-    data.table = FALSE,
-    stringsAsFactors = FALSE
-  )
+#### Format CNV file and overlap with hg38 genome annotations ------------------
 
-#### Format seg file and overlap with hg38 genome annotations ------------------
+# we want to standardize the formats between the two methods here and drop
+# columns we won't need to
+if (opt$cnvkit) {
+  cnv_df <- readr::read_tsv(opt$cnv_file) %>%
+    dplyr::rename(chr = chrom, start = loc.start, end = loc.end,
+                  copy_number = copy.num) %>%
+    dplyr::select(-num.mark, -seg.mean) %>%
+    dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
+}
 
-# Exclude the X and Y chromosomes, exclude `copy.num` == 2, and rearrange 
-# ID column to be the last column
-seg_no_xy <- seg_df %>%
-  dplyr::filter(!(chrom %in% c("chrX", "chrY")), (copy.num != 2)) %>%
-  dplyr::select(-ID, dplyr::everything()) 
+if (opt$controlfreec) {
+  # TODO: filter based on the p-values rather than just dropping them?
+  cnv_df <- readr::read_tsv(opt$cnv_file) %>%
+    dplyr::rename(copy_number = copy.number) %>%
+    dplyr::mutate(chr = paste0("chr", chr)) %>%
+    dplyr::select(-segment_genotype, -uncertainty, -WilcoxonRankSumTestPvalue,
+                  -KolmogorovSmirnovPvalue) %>%
+    dplyr::select(-Kids_First_Biospecimen_ID, dplyr::everything())
+}
 
-# Make seg data.frame a GRanges object 
-seg_gr <- seg_no_xy %>%
+# Remove neutral copy number calls and mark possible amplifcation
+cnv_df <- cnv_df %>%
+  dplyr::filter(status != "neutral") %>%
+  dplyr::mutate(status = dplyr::case_when(
+    copy_number > (2 * tumor_ploidy) ~ "amplification",
+    TRUE ~ status
+  ))
+
+# Addressing autosomes first
+# Exclude the X and Y chromosomes
+cnv_no_xy <- cnv_df %>%
+  dplyr::filter(!(chr %in% c("chrX", "chrY")))
+
+# Make seg data.frame a GRanges object
+cnv_no_xy_gr <- cnv_no_xy %>%
   GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE,
                                           starts.in.df.are.0based = FALSE)
 
 # Define the annotations for the hg38 genome
-txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
-genes <- GenomicFeatures::genes(txdb)
-
-# Create a data.frame with the overlaps between the seg file and hg38 genome 
-# annotations 
-overlaps <- IRanges::mergeByOverlaps(seg_gr, genes)
-
-overlapSymbols <-
-  AnnotationDbi::mapIds(
-    org.Hs.eg.db::org.Hs.eg.db,
-    keys = overlaps$gene_id,
-    column = "SYMBOL",
-    keytype = "ENTREZID"
-  )
+txdb <- GenomicFeatures::makeTxDbFromGFF(
+  file = opt$gtf_file,
+  format = "gtf"
+)
+
+# we'll only look at genes on chromosomes 1:22 -- this will get rid of things
+# like alternates
+chroms <- paste0("chr", 1:22)
+chrom_filter <- list(tx_chrom = chroms)
+
+# extract the genes using the chromosome filter
+chr_exons <- GenomicFeatures::exons(txdb,
+                                    filter = chrom_filter,
+                                    columns = "gene_id")
+
+# Create a data.frame with the overlaps between the seg file and hg38 genome
+# annotations
+overlaps <- IRanges::mergeByOverlaps(cnv_no_xy_gr, chr_exons)
 
-overlaps$gene_id <- overlapSymbols
+# remove what we no longer need from the environment
+rm(chr_exons, chroms, chrom_filter, cnv_no_xy_gr, cnv_no_xy)
 
 # Create a data.frame with selected columns from the `overlaps` object
-cn_short <- data.frame(
-  gene_symbol = overlaps$gene_id,
-  biospecimen_id = overlaps$ID,
-  copy_number = overlaps$copy.num) %>%
-  dplyr::filter(!(is.na(gene_symbol)))
-
-# Create a `label` column with values specifying the type of CN aberration and
-# save as a data.frame with `gene_symbol`, `biospecimen_id`, and `copy_number`
-annotated_cn <- cn_short %>%
+annotated_cn <- data.frame(ensembl = unlist(overlaps$gene_id),
+                           biospecimen_id = overlaps$Kids_First_Biospecimen_ID,
+                           status = overlaps$status,
+                           copy_number = overlaps$copy_number,
+                           ploidy = overlaps$tumor_ploidy) %>%
   dplyr::distinct() %>%
-  dplyr::mutate(label = dplyr::case_when(
-    copy_number == 3 | copy_number == 4  ~ "Gain",
-    copy_number == 0 ~ "Hom_Deletion",
-    copy_number == 1 ~ "Hem_Deletion",
-    copy_number >= 5 ~ "Amplification")) %>%
-  dplyr::select(gene_symbol, biospecimen_id, label, copy_number)
+  # Discard the gene version information in order to get gene symbols and
+  # cytoband mappings
+  dplyr::mutate(ensembl = gsub("\\..*", "", ensembl))
+
+annotated_cn <- annotated_cn %>%
+  dplyr::mutate(
+    gene_symbol = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
+                                        keys = ensembl,
+                                        column = "SYMBOL",
+                                        keytype = "ENSEMBL"),
+    cytoband = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
+                                     keys = ensembl,
+                                     column = "MAP",
+                                     keytype = "ENSEMBL")
+  ) %>%
+  dplyr::filter(!is.na(gene_symbol)) %>%
+  # put the ensembl identifiers last
+  dplyr::select(-ensembl, dplyr::everything())
+
+# Output file name
+output_file <- paste0(opt$filename_lead, "_autosomes.tsv")
 
 # Save final data.frame to a tsv file
-readr::write_tsv(annotated_cn, file.path(results_dir, "annotated_cn.tsv"))
+readr::write_tsv(annotated_cn, file.path(results_dir, output_file))
diff --git a/...file-preparation/results/annotated_cn.tsv → ...ults/cnvkit_annotated_cn_autosomes.tsv.gz b/...file-preparation/results/annotated_cn.tsv → ...ults/cnvkit_annotated_cn_autosomes.tsv.gz
diff --git a/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz b/analyses/focal-cn-file-preparation/results/controlfreec_annotated_cn_autosomes.tsv.gz
diff --git a/analyses/focal-cn-file-preparation/run-prepare-cn.sh b/analyses/focal-cn-file-preparation/run-prepare-cn.sh
@@ -10,5 +10,24 @@ script_directory="$(perl -e 'use File::Basename;
   print dirname(abs_path(@ARGV[0]));' -- "$0")"
 cd "$script_directory" || exit
 
+# Prep the CNVkit data
+Rscript --vanilla -e "rmarkdown::render('00-add-ploidy-cnvkit.Rmd', clean = TRUE)"
+
+# Run annotation step for CNVkit
+Rscript --vanilla 01-prepare-cn-file.R \
+  --cnv_file ../../scratch/cnvkit_with_status.tsv \
+  # TODO: update once this is included with download
+  --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
+  --filename_lead "cnvkit_annotated_cn" \
+  --cnvkit
+
+# Run annotation step for ControlFreeC
 Rscript --vanilla 01-prepare-cn-file.R \
-	--seg_file ../../data/pbta-cnv-cnvkit.seg.gz
+  --cnv_file ../../data/pbta-cnv-controlfreec.tsv.gz \
+  # TODO: update once this is included with download
+  --gtf_file ../collapse-rnaseq/gencode.v27.primary_assembly.annotation.gtf.gz \
+  --filename_lead "controlfreec_annotated_cn" \
+  --controlfreec
+
+# gzip everything in the results folder
+gzip -r results