AlexsLemonade · jaclyn-taroni · Oct 22, 2019 · Oct 2, 2019 · Oct 2, 2019 · Oct 7, 2019
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -49,6 +49,10 @@ jobs:
           name: ssGSEA Analysis
           command: OPENPBTA_ANOVAPVALUE=0.25 OPENPBTA_TUKEYPVALUE=0.50 OPENPBTA_PERCKEEP=0.50 ./scripts/run_in_ci.sh bash analyses/ssgsea-hallmark/run-ssgsea-hallmark.sh
 
+      # Run the CNVkit vs. Control-FREEC comparison plotting script in CI
+      - run:
+          name: CNV Caller Comparison
+          command: ./scripts/run_in_ci.sh Rscript analyses/cnv-comparison/01-cnv-comparison-plotting.R
+
         #### Add your analysis here ####
 
   deploy:

diff --git a/Dockerfile b/Dockerfile
@@ -50,6 +50,9 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \
 # maftools for proof of concept in create-subset-files
 RUN R -e "BiocManager::install(c('maftools'), update = FALSE)"
 
+# This is needed for the CNV frequency and proportion aberration plots
+RUN R -e "BiocManager::install(c('GenVisR'), update = FALSE)"
+
 # These packages are for the genomic region analysis for snv-callers
 RUN R -e "BiocManager::install(c('annotatr', 'TxDb.Hsapiens.UCSC.hg38.knownGene', 'org.Hs.eg.db'), update = FALSE)"
 

diff --git a/analyses/cnv-comparison/01-cnv-comparison-plotting.R b/analyses/cnv-comparison/01-cnv-comparison-plotting.R
@@ -0,0 +1,169 @@
+# Plot and compare detected CNV aberrations given CNVkit and Control-FREEC 
+# output
+#
+# Chante Bethell for CCDL 2019
+#
+# Usage:
+# This script is intended to be run via the command line from the top directory 
+# of the repository as follows:
+#
+# Rscript analyses/cnv-comparison/01-cnv-comparison-plotting.R
+#
+
+#### Install packages ----------------------------------------------------------
+
+# Packages this script needs that may not be installed yet
+required_packages <- c("GenVisR", "cowplot")
+
+# Compare against the installed package *names* only. `installed.packages()`
+# returns a character matrix, so testing `%in%` against the whole matrix would
+# also match cells in other columns (e.g. a package listed under Imports).
+missing_packages <-
+  setdiff(required_packages, rownames(installed.packages()))
+
+if (length(missing_packages) > 0) {
+  # Only install BiocManager when it is actually absent
+  if (!("BiocManager" %in% rownames(installed.packages()))) {
+    install.packages("BiocManager")
+  }
+  BiocManager::install(missing_packages)
+}
+
+#### Set up functions ----------------------------------------------------------
+
+# Make the magrittr pipe available without attaching dplyr; the sourced
+# functions below use `%>%`
+`%>%` <- dplyr::`%>%`
+
+# Detect the ".git" folder -- this will be in the project root directory.
+# Use this as the root directory to ensure proper sourcing of functions no
+# matter where this is called from
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+# Source the custom reading, filtering and plotting functions
+source(file.path(root_dir, "analyses", "cnv-comparison", "util",
+                 "cnv-comparison-functions.R"))
+
+#### Set up file paths ---------------------------------------------------------
+
+# All input files are read from the `data` directory at the project root
+input_directory <- file.path(root_dir, "data")
+
+# Create the output directory if it does not exist
+output_directory <- file.path(root_dir, "analyses", "cnv-comparison", "plots")
+
+if (!dir.exists(output_directory)) {
+  dir.create(output_directory, recursive = TRUE)
+}
+
+#### Read in data --------------------------------------------------------------
+
+# Read in metadata
+metadata <-
+  readr::read_tsv(file.path(input_directory, "pbta-histologies.tsv"))
+
+# Read in cnvkit data
+cnvkit <- read_in_cnv(input_directory, "pbta-cnv-cnvkit.seg.gz")
+
+# Read in cnvkit subset data (from the `testing` subdirectory of the input
+# directory)
+cnvkit_subset <-
+  read_in_cnv(input_directory, "testing/pbta-cnv-cnvkit.seg.gz")
+
+# Read in controlfreec data
+controlfreec <-
+  read_in_cnv(input_directory, "pbta-cnv-controlfreec.seg.gz")
+
+# Read in controlfreec subset data (from the `testing` subdirectory of the
+# input directory)
+controlfreec_subset <-
+  read_in_cnv(input_directory, "testing/pbta-cnv-controlfreec.seg.gz")
+
+#### Filter data ---------------------------------------------------------------
+
+# NOTE(review): `cnvkit_subset` and `controlfreec_subset` are filtered here but
+# are not used by any of the plotting calls later in this script -- confirm
+# whether the subsets are still needed.
+
+# Filter the cnvkit data by cutoff segmean in preparation for plotting
+cnvkit_format <- filter_segmean(cnvkit)
+
+# Filter the cnvkit subset data by cutoff segmean in preparation for plotting
+cnvkit_subset <- filter_segmean(cnvkit_subset)
+
+# Filter the controlfreec data by cutoff segmean in preparation for plotting
+controlfreec_format <- filter_segmean(controlfreec)
+
+# Filter the controlfreec subset data for cutoff in preparation for plotting
+controlfreec_subset <- filter_segmean(controlfreec_subset)
+
+#### Plot frequency and proportion using GenVisR -------------------------------
+
+# Run `plot_cnFreq` for cnvkit
+# NOTE(review): an earlier version of this comment said the subset has to be
+# used because the original dataset is too large for the function, but the
+# calls below pass the full filtered data (`cnvkit_format`) -- confirm which
+# input is intended.
+cnvkit_prop_plot <-
+  plot_cnFreq(cnvkit_format, "proportion", "CNVkit proportion")
+cnvkit_freq_plot <-
+  plot_cnFreq(cnvkit_format, "frequency", "CNVkit frequency")
+
+# Run `plot_cnFreq` for controlfreec
+controlfreec_prop_plot <-
+  plot_cnFreq(controlfreec_format,
+              "proportion",
+              "Control-FREEC proportion")
+controlfreec_freq_plot <-
+  plot_cnFreq(controlfreec_format,
+              "frequency",
+              "Control-FREEC frequency")
+
+# Stack the two frequency plots (cnvkit on top) and save them as one PDF
+plot_cowplot(
+  cnvkit_freq_plot,
+  controlfreec_freq_plot,
+  output_directory,
+  "compare_cnv_output_frequency.pdf"
+)
+
+# Stack the two proportion plots (cnvkit on top) and save them as one PDF
+plot_cowplot(
+  cnvkit_prop_plot,
+  controlfreec_prop_plot,
+  output_directory,
+  "compare_cnv_output_proportion.pdf"
+)
+
+#### Plot boxplots using ggplot2 -----------------------------------------------
+
+# Run `plot_boxplot` on cnvkit
+cnvkit_boxplot <- plot_boxplot(cnvkit_format, "CNVkit boxplot")
+
+# Run `plot_boxplot` on controlfreec
+controlfreec_boxplot <-
+  plot_boxplot(controlfreec_format, "Control-FREEC boxplot")
+
+# Save the plot combining the cnvkit and controlfreec boxplots
+plot_cowplot(
+  cnvkit_boxplot,
+  controlfreec_boxplot,
+  output_directory,
+  "compare_cnv_output_boxplot.pdf"
+)
+
+#### Plot barplots using ggplot2 -----------------------------------------------
+
+# Run `plot_histology_barplot` on cnvkit
+cnvkit_annotated_barplot_histology <-
+  plot_histology_barplot(cnvkit_format, metadata, "CNVkit")
+
+# Run `plot_histology_barplot` on controlfreec
+controlfreec_annotated_barplot_histology <-
+  plot_histology_barplot(controlfreec_format,
+               metadata,
+               "Control-FREEC")
+
+# Run `plot_aberration_barplot` on cnvkit
+cnvkit_annotated_barplot_aberration <-
+  plot_aberration_barplot(cnvkit_format, "CNVkit")
+
+# Run `plot_aberration_barplot` on controlfreec
+controlfreec_annotated_barplot_aberration <-
+  plot_aberration_barplot(controlfreec_format,
+                         "Control-FREEC")
+
+# Save the plot combining the cnvkit and controlfreec histology barplots
+plot_cowplot(
+  cnvkit_annotated_barplot_histology,
+  controlfreec_annotated_barplot_histology,
+  output_directory,
+  "compare_cnv_output_barplot_histology.pdf"
+)
+# Save the plot combining the cnvkit and controlfreec aberration barplots
+plot_cowplot(
+  cnvkit_annotated_barplot_aberration,
+  controlfreec_annotated_barplot_aberration,
+  output_directory,
+  "compare_cnv_output_barplot_aberration.pdf"
+)
diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf
diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf
diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_boxplot.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_boxplot.pdf
diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf
diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf
diff --git a/analyses/cnv-comparison/util/cnv-comparison-functions.R b/analyses/cnv-comparison/util/cnv-comparison-functions.R
@@ -0,0 +1,200 @@
+# This script defines filtering and plotting functions to be sourced in 
+# `01-cnv-comparison-plotting.R`
+#
+# Chante Bethell for CCDL 2019
+# 
+# Usage:
+# This script only defines functions; it is meant to be sourced from another
+# script rather than run on its own, for example:
+#
+# source(file.path(root_dir, "analyses", "cnv-comparison", "util",
+#                  "cnv-comparison-functions.R"))
+
+read_in_cnv <- function(input_directory, file_path){
+  # Read a gzipped CNV segment file and keep the six segment columns in a
+  # fixed order.
+  #
+  # Args:
+  #   input_directory: file path of input directory
+  #   file_path: file path of input data, relative to `input_directory`
+  #
+  # Return:
+  #   dataframe: data.frame with the columns chrom, loc.start, loc.end, ID,
+  #              num.mark and seg.mean, in that order
+
+  # Columns are selected by name, so their order in the file does not matter
+  seg_columns <-
+    c("chrom", "loc.start", "loc.end", "ID", "num.mark", "seg.mean")
+
+  # Read in cnv data
+  dataframe <-
+    read.table(gzfile(file.path(input_directory, file_path)), header = TRUE)
+
+  # Fail with an informative message if an expected column is absent
+  missing_columns <- setdiff(seg_columns, colnames(dataframe))
+  if (length(missing_columns) > 0) {
+    stop("Missing column(s) in ", file_path, ": ",
+         paste(missing_columns, collapse = ", "),
+         call. = FALSE)
+  }
+
+  # Keep only the segment columns, in the fixed order above
+  dataframe <- dataframe[seg_columns]
+
+  return(dataframe)
+
+}
+
+filter_segmean <- function(dataframe){
+  # Given the data.frame containing CNV caller output, keep only the segments
+  # called as a loss or a gain and label them 0 or 1, respectively.
+  #
+  # Args:
+  #   dataframe: data.frame containing CNV caller output; its six columns are
+  #              renamed by position, so they must be in the order returned
+  #              by `read_in_cnv`
+  #
+  # Return:
+  #   cnv_matrix: the data.frame given, restricted to segments with a segmean
+  #               below -0.5 (aberration = 0) or above 0.5 (aberration = 1)
+  #
+  # Note: relies on the `%>%` pipe being defined by the script that sources
+  # this file.
+
+  # Rename columns for GenVisR function downstream
+  colnames(dataframe) <-
+    c("chromosome",
+      "start",
+      "end",
+      "sample",
+      "probes",
+      "segmean")
+
+  # `case_when` returns NA for rows that match neither condition, so segments
+  # with a segmean between -0.5 and 0.5 (or a missing segmean) are dropped by
+  # the filter that follows
+  cnv_matrix <- dataframe %>%
+    dplyr::mutate(aberration = dplyr::case_when(segmean < -0.5 ~ 0,
+                                                segmean > 0.5 ~ 1)) %>%
+    dplyr::filter(!is.na(aberration))
+
+  return(cnv_matrix)
+
+}
+
+plot_cnFreq <-
+  function(filtered_dataframe,
+           plot_type,
+           plot_title) {
+    # Given the data.frame filtered for size of aberrations, plot the proportion
+    # or frequency (denoted by the plot_type argument) of aberrations across
+    # chromosomes with `GenVisR::cnFreq`
+    #
+    # Args:
+    #   filtered_dataframe: data.frame filtered for cutoff size of aberrations
+    #   plot_type: the type of plot; a character string, either `proportion`
+    #              or `frequency`
+    #   plot_title: a title string for the plot produced (upper-cased before
+    #               it is passed to GenVisR)
+    #
+    # Return:
+    #   aberration_plot: plot depicting the aberrations detected across
+    #                    chromosomes
+
+    # Reject unsupported plot types before handing off to GenVisR
+    plot_type <- match.arg(plot_type, c("proportion", "frequency"))
+
+    # NOTE(review): the cutoffs below are passed to GenVisR as-is and differ
+    # from the -0.5/0.5 thresholds used in `filter_segmean` -- confirm they
+    # are intended.
+    aberration_plot <- GenVisR::cnFreq(
+      filtered_dataframe,
+      genome = "hg38",
+      CN_low_cutoff = 0,
+      CN_high_cutoff = .2,
+      plot_title = toupper(plot_title),
+      plotType = plot_type
+    )
+
+    return(aberration_plot)
+
+  }
+
+plot_boxplot <- function(dataframe, plot_title) {
+  # Given the data.frame filtered for size of aberrations, draw one boxplot
+  # per chromosome of the transformed segmean values with `geom_boxplot`
+  #
+  # Args:
+  #   dataframe: data.frame filtered for cutoff size of aberrations; must
+  #              contain the columns `chromosome` and `segmean`
+  #   plot_title: a title string for the plot produced (shown in upper case)
+  #
+  # Return:
+  #   a ggplot boxplot with chromosome on the x-axis and
+  #   `log2(segmean) + 1` on the y-axis
+
+  # NOTE(review): `log2()` yields NaN for negative segmean values -- confirm
+  # that this transformation is intended for the filtered input.
+  segmean_mapping <- ggplot2::aes(x = chromosome,
+                                  y = (log2(segmean) + 1))
+
+  # Assemble the plot: boxplot layer, black-and-white theme, then labels
+  segmean_boxplot <- ggplot2::ggplot(dataframe, segmean_mapping) +
+    ggplot2::geom_boxplot() +
+    ggplot2::theme_bw() +
+    ggplot2::labs(y = "Log transformed segmean values",
+                  title = toupper(plot_title))
+
+  return(segmean_boxplot)
+
+}
+
+plot_histology_barplot <- function(dataframe, metadata, plot_title) {
+  # Given the data.frame filtered by cutoff segmean value with
+  # `filter_segmean`, join it with the metadata and draw a barplot of the
+  # aberrations across chromosomes, coloured by `broad_histology`
+  #
+  # Args:
+  #   dataframe: data.frame filtered for cutoff size of aberrations; its
+  #              `sample` column is matched against the metadata
+  #   metadata: the relevant metadata data.frame; must contain the columns
+  #             `Kids_First_Biospecimen_ID` and `broad_histology`
+  #   plot_title: a title string for the plot produced (shown in upper case)
+  #
+  # Return:
+  #   a ggplot barplot with chromosome on the x-axis, annotated by the
+  #   `broad_histology` variable in the metadata
+
+  # Keep only the segments whose sample has a matching metadata row
+  annotated_segments <- dplyr::inner_join(
+    dataframe,
+    metadata,
+    by = c("sample" = "Kids_First_Biospecimen_ID")
+  )
+
+  # `broad_histology` is mapped to both the y-axis and the fill colour
+  histology_mapping <- ggplot2::aes(x = chromosome,
+                                    y = broad_histology,
+                                    fill = broad_histology)
+
+  # The y-axis text is blanked; the fill legend identifies the histologies
+  histology_barplot <- ggplot2::ggplot(annotated_segments,
+                                       histology_mapping) +
+    ggplot2::geom_bar(stat = "identity") +
+    ggplot2::theme_bw() +
+    ggplot2::theme(axis.text.y = ggplot2::element_blank()) +
+    ggplot2::labs(title = toupper(plot_title))
+
+  return(histology_barplot)
+
+}
+
+plot_aberration_barplot <- function(dataframe, plot_title) {
+  # Given the data.frame filtered by cutoff segmean value, draw a barplot of
+  # the aberrations across chromosomes, coloured by the loss/gain label
+  #
+  # Args:
+  #   dataframe: data.frame filtered by cutoff segmean value; must contain
+  #              the columns `chromosome` and `aberration`
+  #   plot_title: a title string for the plot produced (shown in upper case)
+  #
+  # Return:
+  #   a ggplot barplot with chromosome on the x-axis and the `aberration`
+  #   label (as a factor) mapped to both the y-axis and the fill colour
+
+  # Treat the 0/1 aberration label as categorical for the y-axis and fill
+  aberration_mapping <- ggplot2::aes(x = chromosome,
+                                     y = as.factor(aberration),
+                                     fill = as.factor(aberration))
+
+  # Assemble the plot: bar layer, black-and-white theme, then all labels
+  aberration_barplot <- ggplot2::ggplot(dataframe, aberration_mapping) +
+    ggplot2::geom_bar(stat = "identity") +
+    ggplot2::theme_bw() +
+    ggplot2::labs(y = "Proportion of Aberrations",
+                  fill = "Loss(0)/Gain(1)",
+                  title = toupper(plot_title))
+
+  return(aberration_barplot)
+
+}
+
+plot_cowplot <- function(plot_a, plot_b, output_path, plot_name){
+  # Stack two plots in a single column with cowplot and save the combined
+  # figure in the specified directory
+  #
+  # Args:
+  #   plot_a: the first plot, which will be positioned on top
+  #   plot_b: the second plot, which will be positioned below the first
+  #   output_path: the file.path to the output directory
+  #   plot_name: the file name the combined plot should be saved as
+
+  # Arrange the plots one above the other
+  stacked_plots <- cowplot::plot_grid(plot_a, plot_b, ncol = 1)
+
+  # Full path of the file to write
+  output_file <- file.path(output_path, plot_name)
+
+  # Save the combined figure using a fixed 30 x 12 base size
+  cowplot::save_plot(output_file,
+                     stacked_plots,
+                     base_height = 12,
+                     base_width = 30)
+
+}