AlexsLemonade · jaclyn-taroni · Mar 23, 2020 · Mar 9, 2020 · Mar 9, 2020 · Mar 11, 2020
diff --git a/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.Rmd b/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.Rmd
@@ -34,6 +34,10 @@ library(tidyverse)
 
 ```{r}
 scratch_dir <- file.path("..", "..", "scratch")
+output_dir <- file.path(scratch_dir, "cytoband_status", "segments")
+if(!dir.exists(output_dir)) {
+  dir.create(output_dir, recursive = TRUE)
+}
 ```
 
 ```{r}
@@ -146,3 +150,69 @@ add_status_df %>%
 output_file <- file.path("..", "..", "scratch", "consensus_seg_with_status.tsv")
 write_tsv(add_status_df, output_file)
 ```
+
+### Prepare separate bed files for losses/gains for bedtools coverage function
+
+```{r}
+bed_status_df <- add_status_df %>%
+  select(chrom, loc.start, loc.end, everything()) %>%
+  group_by(Kids_First_Biospecimen_ID) %>%
+  arrange(chrom, loc.start, loc.end)
+
+losses_bed_status_df <- bed_status_df %>%
+  filter(status == "loss")
+
+gains_bed_status_df <- bed_status_df %>%
+  filter(status == "gain")
+
+# make lists of data frames by sample
+bed_status_list <- bed_status_df %>%
+  group_split()
+names(bed_status_list) <- bed_status_df %>%
+  group_keys() %>%
+  pull(Kids_First_Biospecimen_ID)
+
+bed_loss_list <- losses_bed_status_df %>%
+  group_split()
+names(bed_loss_list) <- losses_bed_status_df %>%
+  group_keys() %>%
+  pull(Kids_First_Biospecimen_ID)
+
+bed_gain_list <- gains_bed_status_df %>%
+  group_split()
+names(bed_gain_list) <- gains_bed_status_df %>%
+  group_keys() %>%
+  pull(Kids_First_Biospecimen_ID)
+```
+
+### Write to file
+
+```{r}
+temp <- purrr::imap(bed_status_list,
+                    ~ write_tsv(.x,
+                                file.path(
+                                  output_dir, paste0("consensus_callable.", .y, ".bed")
+                                ),
+                                col_names = FALSE))
+
+temp_loss <- purrr::imap(bed_loss_list,
+                         ~ write_tsv(.x,
+                                     file.path(
+                                       output_dir, paste0("consensus_loss.", .y, ".bed")
+                                     ),
+                                     col_names = FALSE))
+
+temp_gain <- purrr::imap(bed_gain_list,
+                         ~ write_tsv(.x,
+                                     file.path(
+                                       output_dir, paste0("consensus_gain.", .y, ".bed")
+                                     ),
+                                     col_names = FALSE))
+```
+
+## Session Info
+
+```{r}
+sessionInfo()
+```
+
diff --git a/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.nb.html b/analyses/focal-cn-file-preparation/02-add-ploidy-consensus.nb.html
diff --git a/analyses/focal-cn-file-preparation/03-add-cytoband-status-consensus.Rmd b/analyses/focal-cn-file-preparation/03-add-cytoband-status-consensus.Rmd
@@ -0,0 +1,160 @@
+---
+title: "Add dominant status column to consensus SEG file with cytoband field"
+output: 
+  html_notebook:
+    toc: TRUE
+    toc_float: TRUE
+author: Chante Bethell for ALSF CCDL
+date: 2020
+---
+
+This notebook adds dominant status information per cytoband to the consensus SEG files prepared in `run-prepare-cn.sh` using the `bedtools coverage` function (these files are stored in the project's scratch directory as bed files).
+
+## Usage
+
+This notebook is intended to be run from the command line with the following (assumes you are in the root directory of the repository):
+
+```
+Rscript -e "rmarkdown::render('analyses/focal-cn-file-preparation/03-.Rmd', clean = TRUE)"
+```
+
+## Set up
+
+### Libraries and functions
+
+```{r}
+library(tidyverse)
+```
+
+### Files and directories
+
+```{r}
+scratch_dir <- file.path("..", "..", "scratch")
+```
+
+### Read in files 
+
+The files we are reading in here are the bed files prepared in the shell script using `bedtools coverage`.
+
+```{r message = FALSE}
+all_callable_cytoband_status_df <-
+  read_tsv(file.path(scratch_dir, "intersect_with_cytoband_callable.bed"),
+                  col_names = FALSE)
+
+loss_cytoband_status_df <-
+  read_tsv(file.path(scratch_dir, "intersect_with_cytoband_loss.bed"),
+                  col_names = FALSE)
+
+gain_cytoband_status_df <-
+  read_tsv(file.path(scratch_dir, "intersect_with_cytoband_gain.bed"),
+                  col_names = FALSE)
+```
+
+## Wrangle and merge consensus cytoband status data
+
+### Filter each of the cytoband status data.frames
+
+```{r}
+all_callable_cytoband_status_df <-
+  all_callable_cytoband_status_df %>%
+  select(
+    chr = X1,
+    cytoband = X4,
+    callable_fraction = X9,
+    Kids_First_Biospecimen_ID = X10
+  ) %>%
+  filter(!is.na(cytoband))
+
+gain_cytoband_status_df <- gain_cytoband_status_df %>%
+  select(
+    chr = X1,
+    cytoband = X4,
+    gain_fraction = X9,
+    Kids_First_Biospecimen_ID = X10
+  ) %>%
+  filter(!is.na(cytoband))
+
+loss_cytoband_status_df <- loss_cytoband_status_df %>%
+  select(
+    chr = X1,
+    cytoband = X4,
+    loss_fraction = X9,
+    Kids_First_Biospecimen_ID = X10
+  ) %>%
+  filter(!is.na(cytoband))
+```
+
+### Join all data.frames together
+
+```{r}
+final_df <- all_callable_cytoband_status_df %>%
+  left_join(gain_cytoband_status_df,
+            by = c("chr", "cytoband", "Kids_First_Biospecimen_ID")) %>%
+  left_join(loss_cytoband_status_df,
+            by = c("chr", "cytoband", "Kids_First_Biospecimen_ID"))
+```
+
+### Add chromosome arm column
+
+```{r}
+# Add a column that tells us the position of the p or q and then use this to
+# split the cytoband column
+final_df <- final_df %>%
+  mutate(cytoband_with_arm = paste0(gsub("chr", "", chr), cytoband),
+         chromosome_arm = gsub("(p|q).*", "\\1", cytoband_with_arm)) %>%
+  select(-cytoband_with_arm)
+```
+
+### Add `dominant_status` field to final data.frame
+
+```{r}
+# Create a dominant status column
+final_df <- final_df %>%
+  replace_na(list(
+    gain_fraction = 0,
+    loss_fraction = 0
+  )) %>%
+  group_by(chromosome_arm) %>%
+  mutate(
+    dominant_status = case_when(
+      callable_fraction < 0.5 ~ "uncallable",
+      gain_fraction / callable_fraction > 0.5 ~ "gain",
+      loss_fraction / callable_fraction > 0.5 ~ "loss",
+      (gain_fraction + loss_fraction) / callable_fraction > 0.5 ~ "unstable",
+      TRUE ~ "neutral"
+    )
+  )
+```
+
+### Read in and join original UCSC cytoband data
+
+This step is to define any additional uncallable cytoband regions.
+
+```{r message = FALSE}
+ucsc_cytoband_df <- data.table::fread("http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cytoBand.txt.gz", data.table = FALSE)
+
+ucsc_cytoband_df <- ucsc_cytoband_df %>%
+  select(chr = V1, cytoband = V4)
+
+# Join the cytoband data from the original UCSC cytoband file with the data
+# bedtools coverage files wrangled in the steps above
+final_df <- final_df %>%
+  right_join(ucsc_cytoband_df, by = c("chr", "cytoband")) %>%
+  select(Kids_First_Biospecimen_ID, chr, cytoband, dominant_status, everything())
+
+# If there are any NA's in the `dominant_status` column after we merge the
+# original cytoband data, replace with "uncallable"
+final_df$dominant_status <- replace_na(final_df$dominant_status, "uncallable")
+
+# Display final table with `uncallable` value added to `dominant_status` column
+final_df
+
+# Write to file
+write_tsv(final_df, file.path("results", "consensus_seg_with_ucsc_cytoband_status.tsv.gz"))
+```
+## Session Info
+
+```{r}
+sessionInfo()
+```
+
diff --git a/analyses/focal-cn-file-preparation/03-add-cytoband-status-consensus.nb.html b/analyses/focal-cn-file-preparation/03-add-cytoband-status-consensus.nb.html
diff --git a/...-cn-file-preparation/03-prepare-cn-file.R → ...-cn-file-preparation/04-prepare-cn-file.R b/...-cn-file-preparation/03-prepare-cn-file.R → ...-cn-file-preparation/04-prepare-cn-file.R
@@ -11,7 +11,7 @@
 # This script is intended to be run via the command line.
 # This example assumes it is being run from the root of the repository.
 #
-# Rscript --vanilla analyses/oncoprint-landscape/03-prepare-cn-file.R \
+# Rscript --vanilla analyses/focal-cn-file-preparation/03-prepare-cn-file.R \
 #   --cnv_file data/pbta-cnv-controlfreec.tsv.gz \
 #   --gtf_file data/gencode.v27.primary_assembly.annotation.gtf.gz \
 #   --metadata data/pbta-histologies.tsv \

diff --git a/analyses/focal-cn-file-preparation/README.md b/analyses/focal-cn-file-preparation/README.md
@@ -1,6 +1,6 @@
 ## Focal copy number file preparation
 
-**Module authors:** Chante Bethell ([@cbethell](https://github.com/cbethell)) and Jaclyn Taroni ([@jaclyn-taroni](https://github.com/jaclyn-taroni))
+**Module authors:** Chante Bethell ([@cbethell](https://github.com/cbethell)), Joshua Shapiro ([@jashapiro](https://github.com/jashapiro)), and Jaclyn Taroni ([@jaclyn-taroni](https://github.com/jaclyn-taroni))
 
 The copy number data from OpenPBTA are provided as ranges or segments.
 The purpose of this module is to map from those ranges to gene identifiers for consumption by downstream analyses (e.g., OncoPrint plotting).
@@ -12,6 +12,8 @@ To run this analysis _only on consensus SEG file_, use the following (from the r
 ```
 bash analyses/focal-cn-file-preparation/run-prepare-cn.sh
 ```
+**Note**: The `run-bedtools.snakemake` script is implemented in `run-prepare-cn.sh` to run the bedtools coverage steps between the UCSC cytoband file and the samples in the copy number files produced in `02-add-ploidy-consensus.Rmd`.
+This script currently takes a while to run, and therefore slows down the processing speed of the main shell script `run-prepare-cn.sh`.
 
 Running the following from the root directory of the repository will run the steps for the original copy number call files (CNVkit and ControlFreeC) in addition to the consensus SEG file:
 
@@ -28,9 +30,14 @@ RUN_ORIGINAL=1 bash analyses/focal-cn-file-preparation/run-prepare-cn.sh
 
 * `02-add-ploidy-consensus.Rmd` - This is very similar to the CNVkit file prep (`01-add-ploidy-cnvkit.Rmd`).
 However, there are instances in the consensus SEG file where `copy.num` is `NA` which are removed.
-See the notebook for more information.
+See the notebook for more information. This notebook also prepares lists of copy number bed files by sample for use in the implementation of bedtools coverage in `run-bedtools.snakemake`.
 
-* `03-prepare-cn-file.R` - This script performs the ranges to annotation mapping using the GENCODE v27 GTF included via the data download step; it takes the ControlFreeC file or a SEG (e.g., CNVkit, consensus SEG) file prepared with `01-add-ploidy-cnvkit.Rmd` and  `02-add-ploidy-cnvkit.Rmd` as input.
+* `03-add-cytoband-status-consensus.Rmd` - This notebook reads in the bedtools coverage output files and defines the dominant copy number status for each chromosome arm. The output of this notebook is a table with the following columns:
+
+  | `Kids_First_Biospecimen_ID` | chr | cytoband | dominant_status | callable_fraction | gain_fraction | loss_fraction | chromosome_arm |
+  |----------------|--------|-------------|--------|---------|-------------|---------|---------------|
+
+* `04-prepare-cn-file.R` - This script performs the ranges to annotation mapping using the GENCODE v27 GTF included via the data download step; it takes the ControlFreeC file or a SEG (e.g., CNVkit, consensus SEG) file prepared with `01-add-ploidy-cnvkit.Rmd` and  `02-add-ploidy-cnvkit.Rmd` as input.
   **The mapping is limited to _exons_.**
   Mapping to cytobands is performed with the [`org.Hs.eg.db`](https://doi.org/doi:10.18129/B9.bioc.org.Hs.eg.db) package.
   A table with the following columns is returned:
@@ -66,9 +73,43 @@ focal-cn-file-preparation
 ├── 01-add-ploidy-cnvkit.nb.html
 ├── 02-add-ploidy-consensus.Rmd
 ├── 02-add-ploidy-consensus.nb.html
-├── 03-prepare-cn-file.R
+├── 03-add-cytoband-status-consensus.Rmd
+├── 03-add-cytoband-status-consensus.nb.html
+├── 04-prepare-cn-file.R
 ├── README.md
+├── annotation_files
+│   └── txdb_from_gencode.v27.gtf.db
 ├── display-plots.md
+├── gistic-results
+│   └── pbta-cnv-cnvkit-gistic
+│       ├── D.cap1.5.mat
+│       ├── all_data_by_genes.txt
+│       ├── all_lesions.conf_90.txt
+│       ├── all_thresholded.by_genes.txt
+│       ├── amp_genes.conf_90.txt
+│       ├── amp_qplot.pdf
+│       ├── amp_qplot.png
+│       ├── broad_data_by_genes.txt
+│       ├── broad_gistic_plot.pdf
+│       ├── broad_significance_results.txt
+│       ├── broad_values_by_arm.txt
+│       ├── del_genes.conf_90.txt
+│       ├── del_qplot.pdf
+│       ├── del_qplot.png
+│       ├── focal_dat.0.98.mat
+│       ├── focal_data_by_genes.txt
+│       ├── freqarms_vs_ngenes.pdf
+│       ├── gistic_inputs.mat
+│       ├── peak_regs.mat
+│       ├── perm_ads.mat
+│       ├── raw_copy_number.pdf
+│       ├── raw_copy_number.png
+│       ├── regions_track.conf_90.bed
+│       ├── sample_cutoffs.txt
+│       ├── sample_seg_counts.txt
+│       ├── scores.0.98.mat
+│       ├── scores.gistic
+│       └── wide_peak_regs.mat
 ├── plots
 │   ├── cnvkit_annotated_cn_autosomes_polya_loss_cor_plot.png
 │   ├── cnvkit_annotated_cn_autosomes_polya_stacked_plot.png
@@ -111,9 +152,11 @@ focal-cn-file-preparation
 │   ├── cnvkit_annotated_cn_x_and_y.tsv.gz
 │   ├── consensus_seg_annotated_cn_autosomes.tsv.gz
 │   ├── consensus_seg_annotated_cn_x_and_y.tsv.gz
+│   ├── consensus_seg_with_ucsc_cytoband.tsv.gz
 │   ├── controlfreec_annotated_cn_autosomes.tsv.gz
 │   └── controlfreec_annotated_cn_x_and_y.tsv.gz
 ├── rna-expression-validation.R
+├── run-bedtools.snakemake
 ├── run-prepare-cn.sh
 └── util
     └── rna-expression-functions.R

diff --git a/analyses/focal-cn-file-preparation/results/consensus_seg_with_ucsc_cytoband_status.tsv.gz b/analyses/focal-cn-file-preparation/results/consensus_seg_with_ucsc_cytoband_status.tsv.gz