diff --git a/.circleci/config.yml b/.circleci/config.yml index 8e157e0c3e..d834c91284 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -53,6 +53,10 @@ jobs: name: ssGSEA Analysis command: OPENPBTA_ANOVAPVALUE=0.25 OPENPBTA_TUKEYPVALUE=0.50 OPENPBTA_PERCKEEP=0.50 ./scripts/run_in_ci.sh bash analyses/ssgsea-hallmark/run-ssgsea-hallmark.sh + - run: + name: CNV Caller Comparison + command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd', clean = TRUE)" + #### Add your analysis here #### deploy: diff --git a/Dockerfile b/Dockerfile index c76999804c..bd1f735d19 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,9 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ # maftools for proof of concept in create-subset-files RUN R -e "BiocManager::install(c('maftools'), update = FALSE)" +# This is needed for the CNV frequency and proportion aberration plots +RUN R -e "BiocManager::install(c('GenVisR'), update = FALSE)" + # These packages are for the genomic region analysis for snv-callers RUN R -e "BiocManager::install(c('annotatr', 'TxDb.Hsapiens.UCSC.hg38.knownGene', 'org.Hs.eg.db'), update = FALSE)" diff --git a/analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd b/analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd new file mode 100644 index 0000000000..20d7711550 --- /dev/null +++ b/analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd @@ -0,0 +1,190 @@ +--- +title: "CNV Comparison Plots" +output: + html_notebook: + toc: true + toc_float: true +--- + +This notebook plots and compares detected CNV aberrations given CNVkit and +Control-FREEC output. + +## Output Files + +- `analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf` +- `analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf` +- `analyses/cnv-comparison/plots/compare_cnv_output_violin_plot.pdf` +- `analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf` +- `analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf` + +## Usage + +This script is intended to be run via the command line from the top directory +of the repository as follows: + +``` +Rscript -e "rmarkdown::render('analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd', + clean = TRUE)" +``` + +# Set Up + +```{r} +# This will be needed to create the frequency and proportion aberration plots +if (!("GenVisR" %in% installed.packages())) { + install.packages("BiocManager") + BiocManager::install("GenVisR") +} + +# This will be need to combine plots +if (!("cowplot" %in% installed.packages())) { + install.packages("cowplot") +} + +# Magrittr pipe +`%>%` <- dplyr::`%>%` + +# Source custom functions script +source(file.path("util", "cnv-comparison-functions.R")) +``` + +# Directories and Files + +```{r} +# Path to input directory +input_directory <- file.path("..", "..", "data") + +# Path to output directory +output_directory <- "plots" + +# Create the output directory if it does not exist +if (!dir.exists(output_directory)) { + dir.create(output_directory, recursive = TRUE) +} + +# List of file paths to the CNV data +cnv_list <- + list( + cnvkit = file.path(input_directory, "pbta-cnv-cnvkit.seg.gz"), + controlfreec = file.path(input_directory, "pbta-cnv-controlfreec.seg.gz") + ) +``` + +# Read in data + +```{r} +# Read in list of CNV data using custom `read_in_cnv` function +cnv_data <- lapply(cnv_list, read_in_cnv) + +# Read in metadata +metadata <- + readr::read_tsv(file.path(input_directory, "pbta-histologies.tsv")) +``` + +# Filter data + +```{r} +# Filter CNV data by cutoff segmean using custom `filter_segmean` function +cnv_filtered <- + lapply(cnv_data, filter_segmean, segmean_cutoff = 0.5) + +# Bind rows of dataframes in cnv_filtered for use with ggplots +combined_cnv_filtered <- + dplyr::bind_rows(cnv_filtered, .id = "cnv_caller") +``` + +# GenVisR plots + +```{r, fig.height = 25, fig.width = 40} +# Run `GenVisR::cnFreq` +cnv_proportion_plot <- + lapply( + cnv_filtered, + GenVisR::cnFreq, + genome = "hg38", + CN_low_cutoff = 0, + CN_high_cutoff = .2, + plotType = "proportion" + ) +cnv_frequency_plot <- lapply( + cnv_filtered, + GenVisR::cnFreq, + genome = "hg38", + CN_low_cutoff = 0, + CN_high_cutoff = .2, + plotType = "frequency" +) + +# Plot cowplot of frequency plots and save +plot_cowplot( + cnv_proportion_plot, + output_directory, + "compare_cnv_output_proportion.pdf" +) + +# Plot cowplot of proportion plots and save +plot_cowplot( + cnv_frequency_plot, + output_directory, + "compare_cnv_output_frequency.pdf" +) +``` + +# Violin plots + +These plots represent the size of aberrations. In other words, there is no +differention between gain and loss. +```{r, fig.height = 25, fig.width = 40} +# Run `plot_violin` on CNV data +cnv_violin_plots <- plot_violin(combined_cnv_filtered) + +# Save plot +pdf( + file.path("plots", "compare_cnv_output_violin_plot.pdf"), + height = 12, + width = 30 +) +cnv_violin_plots +dev.off() + +cnv_violin_plots +``` + +# Barplots + +```{r, fig.height = 25, fig.width = 40} +# Run `plot_histology_barplot` +cnv_histology_barplots <- + plot_histology_barplot(combined_cnv_filtered, metadata) + +# Save plot +pdf( + file.path("plots", "compare_cnv_output_barplot_histology.pdf"), + height = 12, + width = 30 +) +cnv_histology_barplots +dev.off() + +# Run `plot_aberration_barplot` +cnv_aberration_barplots <- plot_aberration_barplot(combined_cnv_filtered) + +# Save plot +pdf( + file.path("plots", "compare_cnv_output_barplot_aberration.pdf"), + height = 12, + width = 30 +) +cnv_aberration_barplots +dev.off() + +cnv_histology_barplots +cnv_aberration_barplots +``` + +# Session Info + +```{r} +sessionInfo() +``` + diff --git a/analyses/cnv-comparison/01-cnv-comparison-plotting.nb.html b/analyses/cnv-comparison/01-cnv-comparison-plotting.nb.html new file mode 100644 index 0000000000..47e5718d0d --- /dev/null +++ b/analyses/cnv-comparison/01-cnv-comparison-plotting.nb.html @@ -0,0 +1,3304 @@ + + + + + + + + + + + + + + +CNV Comparison Plots + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + + +

This notebook plots and compares detected CNV aberrations given CNVkit and Control-FREEC output.

+
+

Output Files

+
    +
  • analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf
  • +
  • analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf
  • +
  • analyses/cnv-comparison/plots/compare_cnv_output_violin_plot.pdf
  • +
  • analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf
  • +
  • analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf
  • +
+
+
+

Usage

+

This script is intended to be run via the command line from the top directory of the repository as follows:

+
Rscript -e "rmarkdown::render('analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd', 
+                              clean = TRUE)"
+
+
+

Set Up

+ + + +
# This will be needed to create the frequency and proportion aberration plots 
+if (!("GenVisR" %in% installed.packages())) {
+  install.packages("BiocManager")
+  BiocManager::install("GenVisR")
+}
+
+# This will be need to combine plots 
+if (!("cowplot" %in% installed.packages())) {
+  install.packages("cowplot")
+}
+
+# Magrittr pipe
+`%>%` <- dplyr::`%>%`
+
+# Source custom functions script
+source(file.path("util", "cnv-comparison-functions.R"))
+ + + +
+
+

Directories and Files

+ + + +
# Path to input directory
+input_directory <- file.path("..", "..", "data")
+
+# Path to output directory
+output_directory <- "plots"
+
+# Create the output directory if it does not exist
+if (!dir.exists(output_directory)) {
+  dir.create(output_directory, recursive = TRUE)
+}
+
+# List of file paths to the CNV data 
+cnv_list <-
+  list(
+    cnvkit = file.path(input_directory, "pbta-cnv-cnvkit.seg.gz"),
+    controlfreec = file.path(input_directory, "pbta-cnv-controlfreec.seg.gz")
+  )
+ + + +
+
+

Read in data

+ + + +
# Read in list of CNV data using custom `read_in_cnv` function
+cnv_data <- lapply(cnv_list, read_in_cnv)
+
+# Read in metadata
+metadata <-
+  readr::read_tsv(file.path(input_directory, "pbta-histologies.tsv"))
+ + +
Parsed with column specification:
+cols(
+  .default = col_character(),
+  age_at_diagnosis_days = col_double(),
+  OS_days = col_double()
+)
+See spec(...) for full column specifications.
+ + + +
+
+

Filter data

+ + + +
# Filter CNV data by cutoff segmean using custom `filter_segmean` function
+cnv_filtered <-
+  lapply(cnv_data, filter_segmean, segmean_cutoff = 0.5)
+
+# Bind rows of dataframes in cnv_filtered for use with ggplots
+combined_cnv_filtered <-
+  dplyr::bind_rows(cnv_filtered, .id = "cnv_caller")
+ + + +
+
+

GenVisR plots

+ + + +
# Run `GenVisR::cnFreq` 
+cnv_proportion_plot <-
+  lapply(
+    cnv_filtered,
+    GenVisR::cnFreq,
+    genome = "hg38",
+    CN_low_cutoff = 0,
+    CN_high_cutoff = .2,
+    plotType = "proportion"
+  )
+ + +
Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+ + +
cnv_frequency_plot <- lapply(
+  cnv_filtered,
+  GenVisR::cnFreq,
+  genome = "hg38",
+  CN_low_cutoff = 0,
+  CN_high_cutoff = .2,
+  plotType = "frequency"
+)
+ + +
Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+ + +
# Plot cowplot of frequency plots and save
+plot_cowplot(
+  cnv_proportion_plot,
+  output_directory,
+  "compare_cnv_output_proportion.pdf"
+)
+ + +

+ + +

+# Plot cowplot of proportion plots and save
+plot_cowplot(
+  cnv_frequency_plot,
+  output_directory,
+  "compare_cnv_output_frequency.pdf"
+)
+ + +

+ + + +
+
+

Violin plots

+

These plots represent the size of aberrations. In other words, there is no differention between gain and loss.

+ + + +
# Run `plot_violin` on CNV data
+cnv_violin_plots <- plot_violin(combined_cnv_filtered)
+
+# Save plot
+pdf(
+  file.path("plots", "compare_cnv_output_violin_plot.pdf"),
+  height = 12,
+  width = 30
+)
+cnv_violin_plots
+dev.off()
+ + +
null device 
+          1 
+ + +
cnv_violin_plots
+ + +

+ + + +
+
+

Barplots

+ + + +
# Run `plot_histology_barplot`
+cnv_histology_barplots <-
+  plot_histology_barplot(combined_cnv_filtered, metadata)
+
+# Save plot
+pdf(
+  file.path("plots", "compare_cnv_output_barplot_histology.pdf"),
+  height = 12,
+  width = 30
+)
+cnv_histology_barplots
+dev.off()
+ + +
null device 
+          1 
+ + +
# Run `plot_aberration_barplot`
+cnv_aberration_barplots <- plot_aberration_barplot(combined_cnv_filtered)
+
+# Save plot
+pdf(
+  file.path("plots", "compare_cnv_output_barplot_aberration.pdf"),
+  height = 12,
+  width = 30
+)
+ + +
cnv_aberration_barplots
+dev.off()
+ + +
null device 
+          1 
+ + +
cnv_histology_barplots
+ + +

+ + +
cnv_aberration_barplots
+ + +

+ + + +
+
+

Session Info

+ + + +
sessionInfo()
+ + +
R version 3.6.1 (2019-07-05)
+Platform: x86_64-apple-darwin15.6.0 (64-bit)
+Running under: macOS Mojave 10.14.4
+
+Matrix products: default
+BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
+LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
+
+locale:
+[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+
+attached base packages:
+[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
+
+other attached packages:
+[1] maftools_2.0.15     Biobase_2.45.0      BiocGenerics_0.31.5
+
+loaded via a namespace (and not attached):
+  [1] bitops_1.0-6                matrixStats_0.54.0          bit64_0.9-7                 doParallel_1.0.15           RColorBrewer_1.1-2         
+  [6] progress_1.2.2              httr_1.4.1                  GenomeInfoDb_1.21.1         tools_3.6.1                 backports_1.1.4            
+ [11] R6_2.4.0                    DBI_1.0.0                   lazyeval_0.2.2              colorspace_1.4-1            withr_2.1.2                
+ [16] gridExtra_2.3               tidyselect_0.2.5            prettyunits_1.0.2           curl_4.0                    bit_1.1-14                 
+ [21] compiler_3.6.1              DelayedArray_0.11.4         pkgmaker_0.27               labeling_0.3                rtracklayer_1.45.5         
+ [26] scales_1.0.0                readr_1.3.1                 NMF_0.21.0                  askpass_1.1                 rappdirs_0.3.1             
+ [31] stringr_1.4.0               digest_0.6.20               Rsamtools_2.1.3             rmarkdown_1.14              XVector_0.25.0             
+ [36] lintr_1.0.3                 base64enc_0.1-3             htmltools_0.3.6             pkgconfig_2.0.2             bibtex_0.4.2               
+ [41] dbplyr_1.4.2                BSgenome_1.53.2             rlang_0.4.0                 rstudioapi_0.10             RSQLite_2.1.2              
+ [46] jsonlite_1.6                gtools_3.8.1                BiocParallel_1.19.2         GenVisR_1.17.2              dplyr_0.8.3                
+ [51] VariantAnnotation_1.31.4    RCurl_1.95-4.12             magrittr_1.5                GenomeInfoDbData_1.2.1      wordcloud_2.6              
+ [56] Matrix_1.2-17               Rcpp_1.0.2                  munsell_0.5.0               S4Vectors_0.23.23           viridis_0.5.1              
+ [61] yaml_2.2.0                  stringi_1.4.3               SummarizedExperiment_1.15.6 zlibbioc_1.31.0             plyr_1.8.4                 
+ [66] FField_0.1.0                BiocFileCache_1.9.1         grid_3.6.1                  blob_1.2.0                  crayon_1.3.4               
+ [71] lattice_0.20-38             cowplot_1.0.0               Biostrings_2.53.2           splines_3.6.1               GenomicFeatures_1.37.4     
+ [76] hms_0.5.0                   zeallot_0.1.0               knitr_1.24                  pillar_1.4.2                GenomicRanges_1.37.14      
+ [81] rngtools_1.4                reshape2_1.4.3              codetools_0.2-16            biomaRt_2.41.8              stats4_3.6.1               
+ [86] XML_3.98-1.20               glue_1.3.1                  evaluate_0.14               rex_1.1.2                   BiocManager_1.30.7         
+ [91] data.table_1.12.2           vctrs_0.2.0                 foreach_1.4.7               gtable_0.3.0                openssl_1.4.1              
+ [96] purrr_0.3.2                 assertthat_0.2.1            ggplot2_3.2.1               xfun_0.8                    gridBase_0.4-7             
+[101] xtable_1.8-4                viridisLite_0.3.0           survival_2.44-1.1           tibble_2.1.3                iterators_1.0.12           
+[106] GenomicAlignments_1.21.4    AnnotationDbi_1.47.0        registry_0.5-1              memoise_1.1.0               IRanges_2.19.10            
+[111] cluster_2.1.0              
+ + + + +
+ +
LS0tCnRpdGxlOiAiQ05WIENvbXBhcmlzb24gUGxvdHMiCm91dHB1dDogICAKICBodG1sX25vdGVib29rOiAKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCi0tLQoKVGhpcyBub3RlYm9vayBwbG90cyBhbmQgY29tcGFyZXMgZGV0ZWN0ZWQgQ05WIGFiZXJyYXRpb25zIGdpdmVuIENOVmtpdCBhbmQgCkNvbnRyb2wtRlJFRUMgb3V0cHV0LgoKIyMgT3V0cHV0IEZpbGVzCgotIGBhbmFseXNlcy9jbnYtY29tcGFyaXNvbi9wbG90cy9jb21wYXJlX2Nudl9vdXRwdXRfcHJvcG9ydGlvbi5wZGZgCi0gYGFuYWx5c2VzL2Nudi1jb21wYXJpc29uL3Bsb3RzL2NvbXBhcmVfY252X291dHB1dF9mcmVxdWVuY3kucGRmYAotIGBhbmFseXNlcy9jbnYtY29tcGFyaXNvbi9wbG90cy9jb21wYXJlX2Nudl9vdXRwdXRfdmlvbGluX3Bsb3QucGRmYAotIGBhbmFseXNlcy9jbnYtY29tcGFyaXNvbi9wbG90cy9jb21wYXJlX2Nudl9vdXRwdXRfYmFycGxvdF9oaXN0b2xvZ3kucGRmYAotIGBhbmFseXNlcy9jbnYtY29tcGFyaXNvbi9wbG90cy9jb21wYXJlX2Nudl9vdXRwdXRfYmFycGxvdF9hYmVycmF0aW9uLnBkZmAKCiMjIFVzYWdlCgpUaGlzIHNjcmlwdCBpcyBpbnRlbmRlZCB0byBiZSBydW4gdmlhIHRoZSBjb21tYW5kIGxpbmUgZnJvbSB0aGUgdG9wIGRpcmVjdG9yeQpvZiB0aGUgcmVwb3NpdG9yeSBhcyBmb2xsb3dzOgoKYGBgClJzY3JpcHQgLWUgInJtYXJrZG93bjo6cmVuZGVyKCdhbmFseXNlcy9jbnYtY29tcGFyaXNvbi8wMS1jbnYtY29tcGFyaXNvbi1wbG90dGluZy5SbWQnLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgY2xlYW4gPSBUUlVFKSIKYGBgCgojIFNldCBVcAoKYGBge3J9CiMgVGhpcyB3aWxsIGJlIG5lZWRlZCB0byBjcmVhdGUgdGhlIGZyZXF1ZW5jeSBhbmQgcHJvcG9ydGlvbiBhYmVycmF0aW9uIHBsb3RzIAppZiAoISgiR2VuVmlzUiIgJWluJSBpbnN0YWxsZWQucGFja2FnZXMoKSkpIHsKICBpbnN0YWxsLnBhY2thZ2VzKCJCaW9jTWFuYWdlciIpCiAgQmlvY01hbmFnZXI6Omluc3RhbGwoIkdlblZpc1IiKQp9CgojIFRoaXMgd2lsbCBiZSBuZWVkIHRvIGNvbWJpbmUgcGxvdHMgCmlmICghKCJjb3dwbG90IiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygpKSkgewogIGluc3RhbGwucGFja2FnZXMoImNvd3Bsb3QiKQp9CgojIE1hZ3JpdHRyIHBpcGUKYCU+JWAgPC0gZHBseXI6OmAlPiVgCgojIFNvdXJjZSBjdXN0b20gZnVuY3Rpb25zIHNjcmlwdApzb3VyY2UoZmlsZS5wYXRoKCJ1dGlsIiwgImNudi1jb21wYXJpc29uLWZ1bmN0aW9ucy5SIikpCmBgYAoKIyBEaXJlY3RvcmllcyBhbmQgRmlsZXMKCmBgYHtyfQojIFBhdGggdG8gaW5wdXQgZGlyZWN0b3J5CmlucHV0X2RpcmVjdG9yeSA8LSBmaWxlLnBhdGgoIi4uIiwgIi4uIiwgImRhdGEiKQoKIyBQYXRoIHRvIG91dHB1dCBkaXJlY3RvcnkKb3V0cHV0X2RpcmVjdG9yeSA8LSAicGxvdHMiCgojIENyZWF0ZSB0aGUgb3V0cHV0IGRpcmVjdG9yeSBpZiBpdCBkb2VzIG5vdCBleGlzdAppZiAoIWRpci5leGlzdHMob3V0cHV0X2RpcmVjdG9yeSkpIHsKICBkaXIuY3JlYXRlKG91dHB1dF9kaXJlY3RvcnksIHJlY3Vyc2l2ZSA9IFRSVUUpCn0KCiMgTGlzdCBvZiBmaWxlIHBhdGhzIHRvIHRoZSBDTlYgZGF0YSAKY252X2xpc3QgPC0KICBsaXN0KAogICAgY252a2l0ID0gZmlsZS5wYXRoKGlucHV0X2RpcmVjdG9yeSwgInBidGEtY252LWNudmtpdC5zZWcuZ3oiKSwKICAgIGNvbnRyb2xmcmVlYyA9IGZpbGUucGF0aChpbnB1dF9kaXJlY3RvcnksICJwYnRhLWNudi1jb250cm9sZnJlZWMuc2VnLmd6IikKICApCmBgYAoKIyBSZWFkIGluIGRhdGEgCgpgYGB7cn0KIyBSZWFkIGluIGxpc3Qgb2YgQ05WIGRhdGEgdXNpbmcgY3VzdG9tIGByZWFkX2luX2NudmAgZnVuY3Rpb24KY252X2RhdGEgPC0gbGFwcGx5KGNudl9saXN0LCByZWFkX2luX2NudikKCiMgUmVhZCBpbiBtZXRhZGF0YQptZXRhZGF0YSA8LQogIHJlYWRyOjpyZWFkX3RzdihmaWxlLnBhdGgoaW5wdXRfZGlyZWN0b3J5LCAicGJ0YS1oaXN0b2xvZ2llcy50c3YiKSkKYGBgCgojIEZpbHRlciBkYXRhCgpgYGB7cn0KIyBGaWx0ZXIgQ05WIGRhdGEgYnkgY3V0b2ZmIHNlZ21lYW4gdXNpbmcgY3VzdG9tIGBmaWx0ZXJfc2VnbWVhbmAgZnVuY3Rpb24KY252X2ZpbHRlcmVkIDwtCiAgbGFwcGx5KGNudl9kYXRhLCBmaWx0ZXJfc2VnbWVhbiwgc2VnbWVhbl9jdXRvZmYgPSAwLjUpCgojIEJpbmQgcm93cyBvZiBkYXRhZnJhbWVzIGluIGNudl9maWx0ZXJlZCBmb3IgdXNlIHdpdGggZ2dwbG90cwpjb21iaW5lZF9jbnZfZmlsdGVyZWQgPC0KICBkcGx5cjo6YmluZF9yb3dzKGNudl9maWx0ZXJlZCwgLmlkID0gImNudl9jYWxsZXIiKQpgYGAKCiMgR2VuVmlzUiBwbG90cwoKYGBge3IsIGZpZy5oZWlnaHQgPSAyNSwgZmlnLndpZHRoID0gNDB9CiMgUnVuIGBHZW5WaXNSOjpjbkZyZXFgIApjbnZfcHJvcG9ydGlvbl9wbG90IDwtCiAgbGFwcGx5KAogICAgY252X2ZpbHRlcmVkLAogICAgR2VuVmlzUjo6Y25GcmVxLAogICAgZ2Vub21lID0gImhnMzgiLAogICAgQ05fbG93X2N1dG9mZiA9IDAsCiAgICBDTl9oaWdoX2N1dG9mZiA9IC4yLAogICAgcGxvdFR5cGUgPSAicHJvcG9ydGlvbiIKICApCmNudl9mcmVxdWVuY3lfcGxvdCA8LSBsYXBwbHkoCiAgY252X2ZpbHRlcmVkLAogIEdlblZpc1I6OmNuRnJlcSwKICBnZW5vbWUgPSAiaGczOCIsCiAgQ05fbG93X2N1dG9mZiA9IDAsCiAgQ05faGlnaF9jdXRvZmYgPSAuMiwKICBwbG90VHlwZSA9ICJmcmVxdWVuY3kiCikKCiMgUGxvdCBjb3dwbG90IG9mIGZyZXF1ZW5jeSBwbG90cyBhbmQgc2F2ZQpwbG90X2Nvd3Bsb3QoCiAgY252X3Byb3BvcnRpb25fcGxvdCwKICBvdXRwdXRfZGlyZWN0b3J5LAogICJjb21wYXJlX2Nudl9vdXRwdXRfcHJvcG9ydGlvbi5wZGYiCikKCiMgUGxvdCBjb3dwbG90IG9mIHByb3BvcnRpb24gcGxvdHMgYW5kIHNhdmUKcGxvdF9jb3dwbG90KAogIGNudl9mcmVxdWVuY3lfcGxvdCwKICBvdXRwdXRfZGlyZWN0b3J5LAogICJjb21wYXJlX2Nudl9vdXRwdXRfZnJlcXVlbmN5LnBkZiIKKQpgYGAKCiMgVmlvbGluIHBsb3RzCgpUaGVzZSBwbG90cyByZXByZXNlbnQgdGhlIHNpemUgb2YgYWJlcnJhdGlvbnMuIEluIG90aGVyIHdvcmRzLCB0aGVyZSBpcyBubyAKZGlmZmVyZW50aW9uIGJldHdlZW4gZ2FpbiBhbmQgbG9zcy4gCmBgYHtyLCBmaWcuaGVpZ2h0ID0gMjUsIGZpZy53aWR0aCA9IDQwfQojIFJ1biBgcGxvdF92aW9saW5gIG9uIENOViBkYXRhCmNudl92aW9saW5fcGxvdHMgPC0gcGxvdF92aW9saW4oY29tYmluZWRfY252X2ZpbHRlcmVkKQoKIyBTYXZlIHBsb3QKcGRmKAogIGZpbGUucGF0aCgicGxvdHMiLCAiY29tcGFyZV9jbnZfb3V0cHV0X3Zpb2xpbl9wbG90LnBkZiIpLAogIGhlaWdodCA9IDEyLAogIHdpZHRoID0gMzAKKQpjbnZfdmlvbGluX3Bsb3RzCmRldi5vZmYoKQoKY252X3Zpb2xpbl9wbG90cwpgYGAKCiMgQmFycGxvdHMKCmBgYHtyLCBmaWcuaGVpZ2h0ID0gMjUsIGZpZy53aWR0aCA9IDQwfQojIFJ1biBgcGxvdF9oaXN0b2xvZ3lfYmFycGxvdGAKY252X2hpc3RvbG9neV9iYXJwbG90cyA8LQogIHBsb3RfaGlzdG9sb2d5X2JhcnBsb3QoY29tYmluZWRfY252X2ZpbHRlcmVkLCBtZXRhZGF0YSkKCiMgU2F2ZSBwbG90CnBkZigKICBmaWxlLnBhdGgoInBsb3RzIiwgImNvbXBhcmVfY252X291dHB1dF9iYXJwbG90X2hpc3RvbG9neS5wZGYiKSwKICBoZWlnaHQgPSAxMiwKICB3aWR0aCA9IDMwCikKY252X2hpc3RvbG9neV9iYXJwbG90cwpkZXYub2ZmKCkKCiMgUnVuIGBwbG90X2FiZXJyYXRpb25fYmFycGxvdGAKY252X2FiZXJyYXRpb25fYmFycGxvdHMgPC0gcGxvdF9hYmVycmF0aW9uX2JhcnBsb3QoY29tYmluZWRfY252X2ZpbHRlcmVkKQoKIyBTYXZlIHBsb3QKcGRmKAogIGZpbGUucGF0aCgicGxvdHMiLCAiY29tcGFyZV9jbnZfb3V0cHV0X2JhcnBsb3RfYWJlcnJhdGlvbi5wZGYiKSwKICBoZWlnaHQgPSAxMiwKICB3aWR0aCA9IDMwCikKY252X2FiZXJyYXRpb25fYmFycGxvdHMKZGV2Lm9mZigpCgpjbnZfaGlzdG9sb2d5X2JhcnBsb3RzCmNudl9hYmVycmF0aW9uX2JhcnBsb3RzCmBgYAoKIyBTZXNzaW9uIEluZm8KCmBgYHtyfQpzZXNzaW9uSW5mbygpCmBgYAoK
+ + +
+
+ +
+ + + + + + + + + + + + + + + + + diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf new file mode 100644 index 0000000000..cb7c5a4f38 Binary files /dev/null and b/analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf differ diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf new file mode 100644 index 0000000000..0a7663fef7 Binary files /dev/null and b/analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf differ diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf new file mode 100644 index 0000000000..454fe1cac2 Binary files /dev/null and b/analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf differ diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf new file mode 100644 index 0000000000..1890782daa Binary files /dev/null and b/analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf differ diff --git a/analyses/cnv-comparison/plots/compare_cnv_output_violin_plot.pdf b/analyses/cnv-comparison/plots/compare_cnv_output_violin_plot.pdf new file mode 100644 index 0000000000..484e528ba8 Binary files /dev/null and b/analyses/cnv-comparison/plots/compare_cnv_output_violin_plot.pdf differ diff --git a/analyses/cnv-comparison/util/cnv-comparison-functions.R b/analyses/cnv-comparison/util/cnv-comparison-functions.R new file mode 100644 index 0000000000..bf194b74fc --- /dev/null +++ b/analyses/cnv-comparison/util/cnv-comparison-functions.R @@ -0,0 +1,179 @@ +# This script defines filtering and plotting functions to be sourced in +# `01-cnv-comparison-plotting.R` +# +# Chante Bethell for CCDL 2019 +# +# Usage: +# This script is intended to be run via the command line from the top directory +# of the repository as follows: +# +# Rscript analyses/cnv-comparison/util/cnv-comparison-functions.R + +read_in_cnv <- function(file_path){ + # Given the file path of the CNV data, read in the data. + # + # Arg: + # file_path: file path of input data + # + # Return: + # cnv_df: the data frame containing the input data + + # Read in cnv data + cnv_df <- + read.table(gzfile(file_path), header = TRUE, stringsAsFactors = FALSE) + + # Rearrange the columns + cnv_df <- cnv_df %>% + dplyr::select("chrom", "loc.start", "loc.end", "ID", "num.mark", "seg.mean") %>% + dplyr::mutate(chrom = factor(chrom, levels = paste0("chr", c(1:22, "X", "Y")))) + + return(cnv_df) + +} + +filter_segmean <- function(cnv_df, segmean_cutoff){ + # Given the data.frame containing CNV caller output, return a data.frame with + # a column with the values 0 and 1 to represent loss and gain, respectively. + # + # Args: + # cnv_df: data.frame containing CNV caller output + # segmean_cutoff: segment mean value cutoff + # + # Return: + # filtered_cnv_df: the data.frame given, returned with a column containing + # an aberration label for loss or gain + + # Rename columns for GenVisR function downstream + colnames(cnv_df) <- + c("chromosome", + "start", + "end", + "sample", + "probes", + "segmean") + + # Define an aberration column + filtered_cnv_df <- cnv_df %>% + dplyr::mutate(aberration = dplyr::case_when(segmean < -segmean_cutoff ~ 0, + segmean > segmean_cutoff ~ 1)) %>% + dplyr::filter(!is.na(aberration)) + + filtered_cnv_df$aberration <- + factor(filtered_cnv_df$aberration, labels = c("Loss", "Gain")) + + return(filtered_cnv_df) + +} + +plot_violin <- function(filtered_cnv_df) { + # Given the data.frame filtered for size of aberrations, plot the proportion + # or frequency (denoted by the plot_type argument) of aberrations across + # chromosomes with `geom_violin` + # + # Args: + # filtered_cnv_df: data.frame filtered for cutoff size of aberrations + # + # Return: + # violin_plot: violin plot depicting the log2 transformed sizes of + # aberrations detected across chromosomes + + # Create violin plot where the y-axis represents the log2 transformed segmean + violin_plot <- ggplot2::ggplot(filtered_cnv_df, + ggplot2::aes(x = chromosome, + y = (log2(abs(segmean)) + 1))) + + ggplot2::geom_violin() + + ggplot2::theme_bw() + + ggplot2::facet_grid(cnv_caller ~ .) + + ggplot2::ylab("Log transformed segmean values") + + ggplot2::theme(strip.text.y = ggplot2::element_text(size = 14)) + + return(violin_plot) + +} + +plot_histology_barplot <- function(filtered_cnv_df, metadata) { + # Given the data.frame filtered by cutoff segmean value with `filter_segmean`, + # plot the proportion of aberrations across chromosomes with `geom_barplot` + # + # Args: + # filtered_cnv_df: data.frame filtered for cutoff size of aberrations and + # joined with the metadata + # metadata: the relevant metadata data.frame + # + # Return: + # barplot: barplot depicting the proportion of aberrations detected across + # chromosomes, annotated by the `broad_histology`` variable in the + # metadata + + # Create a data.frame with the filtered dataframe joined with the metadata + meta_joined <- filtered_cnv_df %>% + dplyr::inner_join(metadata, by = c("sample" = "Kids_First_Biospecimen_ID")) + + # Create barplot where the y-axis represents the size of aberration + barplot <- ggplot2::ggplot(meta_joined, + ggplot2::aes(x = chromosome, + fill = broad_histology)) + + ggplot2::geom_bar() + + ggplot2::theme_bw() + + ggplot2::facet_grid(cnv_caller ~ aberration) + + ggplot2::theme(strip.text.x = ggplot2::element_text(size = 14), + strip.text.y = ggplot2::element_text(size = 14)) + + return(barplot) + +} + +plot_aberration_barplot <- function(filtered_cnv_df) { + # Given the data.frame filtered by cutoff segmean value with `filter_segmean`, + # plot the proportion of aberrations across chromosomes with `geom_barplot` + # + # Args: + # filtered_cnv_df: data.frame filtered by cutoff segmean value + # + # Return: + # barplot: barplot depicting the proportion of aberrations detected across + # chromosomes + + # Create barplot where the y-axis represents the size of aberration + barplot <- ggplot2::ggplot(filtered_cnv_df, + ggplot2::aes(x = chromosome, + fill = aberration)) + + ggplot2::geom_bar() + + ggplot2::theme_bw() + + ggplot2::facet_grid(cnv_caller ~ aberration) + + ggplot2::ylab("Proportion of Aberrations") + + ggplot2::theme(strip.text.x = ggplot2::element_text(size = 14), + strip.text.y = ggplot2::element_text(size = 14)) + + return(barplot) + +} + +plot_cowplot <- function(plot_list, output_path, plot_name){ + # Given two plots, create a combined cowplot and save as a PDF in the + # specified directory + # Args: + # plot_list: list of plots + # output_path: the file.path to the output directory + # plot_name: the name the plot should be saved as + + # Save a combined cowplot plot of the cnvkit and controlfreec plots + grid <- + cowplot::plot_grid( + plot_list[[1]], + plot_list[[2]], + ncol = 1, + labels = c("CNVkit", "ControlFREEC"), + label_y = 1.05 + ) + ggplot2::theme(plot.margin = ggplot2::unit(c(1, 1, 1, 2), "cm")) + + cowplot::save_plot( + file.path(output_path, plot_name), + grid, + base_height = 12, + base_width = 30 + ) + + return(grid) + +}