diff --git a/.circleci/config.yml b/.circleci/config.yml index 8e157e0c3e..d834c91284 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -53,6 +53,10 @@ jobs: name: ssGSEA Analysis command: OPENPBTA_ANOVAPVALUE=0.25 OPENPBTA_TUKEYPVALUE=0.50 OPENPBTA_PERCKEEP=0.50 ./scripts/run_in_ci.sh bash analyses/ssgsea-hallmark/run-ssgsea-hallmark.sh + - run: + name: CNV Caller Comparison + command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd', clean = TRUE)" + #### Add your analysis here #### deploy: diff --git a/Dockerfile b/Dockerfile index c76999804c..bd1f735d19 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,9 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ # maftools for proof of concept in create-subset-files RUN R -e "BiocManager::install(c('maftools'), update = FALSE)" +# GenVisR is needed for the CNV frequency and proportion aberration plots in analyses/cnv-comparison +RUN R -e "BiocManager::install(c('GenVisR'), update = FALSE)" + # These packages are for the genomic region analysis for snv-callers RUN R -e "BiocManager::install(c('annotatr', 'TxDb.Hsapiens.UCSC.hg38.knownGene', 'org.Hs.eg.db'), update = FALSE)" diff --git a/analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd b/analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd new file mode 100644 index 0000000000..20d7711550 --- /dev/null +++ b/analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd @@ -0,0 +1,190 @@ +--- +title: "CNV Comparison Plots" +output: + html_notebook: + toc: true + toc_float: true +--- + +This notebook plots and compares detected CNV aberrations given CNVkit and +Control-FREEC output. 
+
+## Output Files
+
+- `analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf`
+- `analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf`
+- `analyses/cnv-comparison/plots/compare_cnv_output_violin_plot.pdf`
+- `analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf`
+- `analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf`
+
+## Usage
+
+This notebook is intended to be rendered via the command line from the top
+directory of the repository as follows:
+
+```
+Rscript -e "rmarkdown::render('analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd',
+                              clean = TRUE)"
+```
+
+# Set Up
+
+```{r}
+# GenVisR is needed to create the frequency and proportion aberration plots.
+# `requireNamespace()` is used for the check because matching a name against
+# the whole `installed.packages()` matrix also matches its other columns.
+if (!requireNamespace("GenVisR", quietly = TRUE)) {
+  # Only install BiocManager when it is missing
+  if (!requireNamespace("BiocManager", quietly = TRUE)) {
+    install.packages("BiocManager")
+  }
+  # `update = FALSE` matches the Bioconductor installs in the Dockerfile
+  BiocManager::install("GenVisR", update = FALSE)
+}
+
+# cowplot is needed to combine plots
+if (!requireNamespace("cowplot", quietly = TRUE)) {
+  install.packages("cowplot")
+}
+
+# Magrittr pipe
+`%>%` <- dplyr::`%>%`
+
+# Source custom functions script
+source(file.path("util", "cnv-comparison-functions.R"))
+```
+
+# Directories and Files
+
+```{r}
+# Path to input directory
+input_directory <- file.path("..", "..", "data")
+
+# Path to output directory
+output_directory <- "plots"
+
+# Create the output directory if it does not exist
+if (!dir.exists(output_directory)) {
+  dir.create(output_directory, recursive = TRUE)
+}
+
+# Named list of file paths to the CNV data, one entry per CNV caller; the
+# names become the `cnv_caller` labels when the data are combined below
+cnv_list <-
+  list(
+    cnvkit = file.path(input_directory, "pbta-cnv-cnvkit.seg.gz"),
+    controlfreec = file.path(input_directory, "pbta-cnv-controlfreec.seg.gz")
+  )
+```
+
+# Read in data
+
+```{r}
+# Read in list of CNV data using custom `read_in_cnv` function
+cnv_data <- lapply(cnv_list, read_in_cnv)
+
+# Read in metadata
+metadata <-
+  readr::read_tsv(file.path(input_directory, "pbta-histologies.tsv"))
+```
+
+# Filter data
+
+```{r}
+# Filter CNV data by cutoff segmean using custom `filter_segmean` function
+cnv_filtered <-
+  lapply(cnv_data, filter_segmean, segmean_cutoff = 0.5)
+
+# Bind rows of dataframes in cnv_filtered for use with ggplots
+combined_cnv_filtered <-
+  dplyr::bind_rows(cnv_filtered, .id = "cnv_caller")
+```
+
+# GenVisR plots
+
+```{r, fig.height = 25, fig.width = 40}
+# Run `GenVisR::cnFreq` on each caller's data to get the proportion plots
+cnv_proportion_plot <-
+  lapply(
+    cnv_filtered,
+    GenVisR::cnFreq,
+    genome = "hg38",
+    CN_low_cutoff = 0,
+    CN_high_cutoff = 0.2,
+    plotType = "proportion"
+  )
+
+# Run `GenVisR::cnFreq` again with the same cutoffs to get the frequency plots
+cnv_frequency_plot <-
+  lapply(
+    cnv_filtered,
+    GenVisR::cnFreq,
+    genome = "hg38",
+    CN_low_cutoff = 0,
+    CN_high_cutoff = 0.2,
+    plotType = "frequency"
+  )
+
+# Plot cowplot of proportion plots and save
+plot_cowplot(
+  cnv_proportion_plot,
+  output_directory,
+  "compare_cnv_output_proportion.pdf"
+)
+
+# Plot cowplot of frequency plots and save
+plot_cowplot(
+  cnv_frequency_plot,
+  output_directory,
+  "compare_cnv_output_frequency.pdf"
+)
+```
+
+# Violin plots
+
+These plots represent the size of aberrations. In other words, there is no
+differentiation between gain and loss.
+```{r, fig.height = 25, fig.width = 40}
+# Save `plot` as a PDF named `filename` inside `directory`.
+# The plot is printed explicitly while the pdf device is open, and the device
+# is closed through `on.exit()` so it is not left open if printing fails.
+save_plot_pdf <- function(plot,
+                          directory,
+                          filename,
+                          height = 12,
+                          width = 30) {
+  pdf(file.path(directory, filename), height = height, width = width)
+  pdf_device <- dev.cur()
+  on.exit(dev.off(pdf_device), add = TRUE)
+  print(plot)
+  invisible(plot)
+}
+
+# Run `plot_violin` on CNV data
+cnv_violin_plots <- plot_violin(combined_cnv_filtered)
+
+# Save plot (uses `output_directory` like the cowplot outputs do)
+save_plot_pdf(
+  cnv_violin_plots,
+  output_directory,
+  "compare_cnv_output_violin_plot.pdf"
+)
+
+# Display plot in the notebook
+cnv_violin_plots
+```
+
+# Barplots
+
+```{r, fig.height = 25, fig.width = 40}
+# Run `plot_histology_barplot`
+cnv_histology_barplots <-
+  plot_histology_barplot(combined_cnv_filtered, metadata)
+
+# Save plot
+save_plot_pdf(
+  cnv_histology_barplots,
+  output_directory,
+  "compare_cnv_output_barplot_histology.pdf"
+)
+
+# Run `plot_aberration_barplot`
+cnv_aberration_barplots <- plot_aberration_barplot(combined_cnv_filtered)
+
+# Save plot
+save_plot_pdf(
+  cnv_aberration_barplots,
+  output_directory,
+  "compare_cnv_output_barplot_aberration.pdf"
+)
+
+# Display both barplots in the notebook
+cnv_histology_barplots
+cnv_aberration_barplots
+```
+
+# Session Info
+
+```{r}
+sessionInfo()
+```
+
diff --git a/analyses/cnv-comparison/01-cnv-comparison-plotting.nb.html b/analyses/cnv-comparison/01-cnv-comparison-plotting.nb.html
new file mode 100644
index 0000000000..47e5718d0d
--- /dev/null
+++ b/analyses/cnv-comparison/01-cnv-comparison-plotting.nb.html
@@ -0,0 +1,3304 @@
+
+
+
+
+
+ + + + + + + + + +This notebook plots and compares detected CNV aberrations given CNVkit and Control-FREEC output.
+analyses/cnv-comparison/plots/compare_cnv_output_proportion.pdf
analyses/cnv-comparison/plots/compare_cnv_output_frequency.pdf
analyses/cnv-comparison/plots/compare_cnv_output_violin_plot.pdf
analyses/cnv-comparison/plots/compare_cnv_output_barplot_histology.pdf
analyses/cnv-comparison/plots/compare_cnv_output_barplot_aberration.pdf
This script is intended to be run via the command line from the top directory of the repository as follows:
+Rscript -e "rmarkdown::render('analyses/cnv-comparison/01-cnv-comparison-plotting.Rmd',
+ clean = TRUE)"
+# GenVisR is needed to create the frequency and proportion aberration plots;
+# it is installed through BiocManager when the check below does not find it.
+# NOTE(review): `%in% installed.packages()` compares against every cell of the
+# returned matrix, not only the package names.
+if (!("GenVisR" %in% installed.packages())) {
+ install.packages("BiocManager")
+ BiocManager::install("GenVisR")
+}
+
+# cowplot is needed to combine plots; it is installed when not found
+if (!("cowplot" %in% installed.packages())) {
+ install.packages("cowplot")
+}
+
+# Magrittr pipe, assigned here so that dplyr does not have to be attached
+`%>%` <- dplyr::`%>%`
+
+# Source custom functions script (relative path)
+source(file.path("util", "cnv-comparison-functions.R"))
+
+
+
+# Path to input directory (relative: two levels above the working directory)
+input_directory <- file.path("..", "..", "data")
+
+# Path to output directory (relative to the working directory)
+output_directory <- "plots"
+
+# Create the output directory if it does not exist
+if (!dir.exists(output_directory)) {
+ dir.create(output_directory, recursive = TRUE)
+}
+
+# Named list of file paths to the CNV data, one entry per CNV caller
+cnv_list <-
+ list(
+ cnvkit = file.path(input_directory, "pbta-cnv-cnvkit.seg.gz"),
+ controlfreec = file.path(input_directory, "pbta-cnv-controlfreec.seg.gz")
+ )
+
+
+
+# Read in each CNV file using the custom `read_in_cnv` function; `lapply`
+# keeps the names of `cnv_list`, so the result has one named element per caller
+cnv_data <- lapply(cnv_list, read_in_cnv)
+
+# Read in the histologies metadata (tab-separated file)
+metadata <-
+ readr::read_tsv(file.path(input_directory, "pbta-histologies.tsv"))
+
+
+Parsed with column specification:
+cols(
+ .default = col_character(),
+ age_at_diagnosis_days = col_double(),
+ OS_days = col_double()
+)
+See spec(...) for full column specifications.
+
+
+
+# Filter each caller's CNV data using the custom `filter_segmean` function,
+# passing a segmean cutoff of 0.5
+cnv_filtered <-
+ lapply(cnv_data, filter_segmean, segmean_cutoff = 0.5)
+
+# Bind rows of dataframes in cnv_filtered for use with ggplots; the list names
+# are stored in the new `cnv_caller` column
+combined_cnv_filtered <-
+ dplyr::bind_rows(cnv_filtered, .id = "cnv_caller")
+
+
+
+# Run `GenVisR::cnFreq` on each caller's filtered data with
+# `plotType = "proportion"`; the result is a list with one plot per caller
+cnv_proportion_plot <-
+ lapply(
+ cnv_filtered,
+ GenVisR::cnFreq,
+ genome = "hg38",
+ CN_low_cutoff = 0,
+ CN_high_cutoff = .2,
+ plotType = "proportion"
+ )
+
+
+Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+
+
+# Run `GenVisR::cnFreq` again with the same genome and cutoffs, this time with
+# `plotType = "frequency"`
+cnv_frequency_plot <- lapply(
+ cnv_filtered,
+ GenVisR::cnFreq,
+ genome = "hg38",
+ CN_low_cutoff = 0,
+ CN_high_cutoff = .2,
+ plotType = "frequency"
+)
+
+
+Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+Did not detect identical genomic segments for all samples ...Performing disjoin operation
+Detected "chr" in the chromosome column of x... proceeding
+genome specified is preloaded, retrieving data...
+
+
+# Plot cowplot of proportion plots and save
+plot_cowplot(
+ cnv_proportion_plot,
+ output_directory,
+ "compare_cnv_output_proportion.pdf"
+)
+
+
+
+
+
+
+# Plot cowplot of frequency plots and save
+plot_cowplot(
+ cnv_frequency_plot,
+ output_directory,
+ "compare_cnv_output_frequency.pdf"
+)
+
+
+
+
+
+
+These plots represent the size of aberrations. In other words, there is no differentiation between gain and loss.
+ + + +# Run `plot_violin` on CNV data
+cnv_violin_plots <- plot_violin(combined_cnv_filtered)
+
+# Save plot: the plot is printed while the pdf device is open, then the
+# device is closed.
+# NOTE(review): the directory is hard-coded as "plots" here instead of using
+# `output_directory`.
+pdf(
+ file.path("plots", "compare_cnv_output_violin_plot.pdf"),
+ height = 12,
+ width = 30
+)
+cnv_violin_plots
+dev.off()
+
+
+null device
+ 1
+
+
+# Display the violin plots in the notebook
+cnv_violin_plots
+
+
+
+
+
+
+# Run `plot_histology_barplot` on the combined CNV data and the metadata
+cnv_histology_barplots <-
+ plot_histology_barplot(combined_cnv_filtered, metadata)
+
+# Save plot: the plot is printed while the pdf device is open, then the
+# device is closed.
+# NOTE(review): the directory is hard-coded as "plots" here instead of using
+# `output_directory`.
+pdf(
+ file.path("plots", "compare_cnv_output_barplot_histology.pdf"),
+ height = 12,
+ width = 30
+)
+cnv_histology_barplots
+dev.off()
+
+
+null device
+ 1
+
+
+# Run `plot_aberration_barplot` on the combined CNV data
+cnv_aberration_barplots <- plot_aberration_barplot(combined_cnv_filtered)
+
+# Save plot: the plot is printed while the pdf device is open, then the
+# device is closed.
+# NOTE(review): the directory is hard-coded as "plots" here instead of using
+# `output_directory`.
+pdf(
+ file.path("plots", "compare_cnv_output_barplot_aberration.pdf"),
+ height = 12,
+ width = 30
+)
+
+
+cnv_aberration_barplots
+dev.off()
+
+
+null device
+ 1
+
+
+# Display the histology barplots in the notebook
+cnv_histology_barplots
+
+
+
+
+
+# Display the aberration barplots in the notebook
+cnv_aberration_barplots
+
+
+
+
+
+
+# Print the R version, platform and package versions used for this run
+sessionInfo()
+
+
+R version 3.6.1 (2019-07-05)
+Platform: x86_64-apple-darwin15.6.0 (64-bit)
+Running under: macOS Mojave 10.14.4
+
+Matrix products: default
+BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
+LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
+
+locale:
+[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+
+attached base packages:
+[1] parallel stats graphics grDevices utils datasets methods base
+
+other attached packages:
+[1] maftools_2.0.15 Biobase_2.45.0 BiocGenerics_0.31.5
+
+loaded via a namespace (and not attached):
+ [1] bitops_1.0-6 matrixStats_0.54.0 bit64_0.9-7 doParallel_1.0.15 RColorBrewer_1.1-2
+ [6] progress_1.2.2 httr_1.4.1 GenomeInfoDb_1.21.1 tools_3.6.1 backports_1.1.4
+ [11] R6_2.4.0 DBI_1.0.0 lazyeval_0.2.2 colorspace_1.4-1 withr_2.1.2
+ [16] gridExtra_2.3 tidyselect_0.2.5 prettyunits_1.0.2 curl_4.0 bit_1.1-14
+ [21] compiler_3.6.1 DelayedArray_0.11.4 pkgmaker_0.27 labeling_0.3 rtracklayer_1.45.5
+ [26] scales_1.0.0 readr_1.3.1 NMF_0.21.0 askpass_1.1 rappdirs_0.3.1
+ [31] stringr_1.4.0 digest_0.6.20 Rsamtools_2.1.3 rmarkdown_1.14 XVector_0.25.0
+ [36] lintr_1.0.3 base64enc_0.1-3 htmltools_0.3.6 pkgconfig_2.0.2 bibtex_0.4.2
+ [41] dbplyr_1.4.2 BSgenome_1.53.2 rlang_0.4.0 rstudioapi_0.10 RSQLite_2.1.2
+ [46] jsonlite_1.6 gtools_3.8.1 BiocParallel_1.19.2 GenVisR_1.17.2 dplyr_0.8.3
+ [51] VariantAnnotation_1.31.4 RCurl_1.95-4.12 magrittr_1.5 GenomeInfoDbData_1.2.1 wordcloud_2.6
+ [56] Matrix_1.2-17 Rcpp_1.0.2 munsell_0.5.0 S4Vectors_0.23.23 viridis_0.5.1
+ [61] yaml_2.2.0 stringi_1.4.3 SummarizedExperiment_1.15.6 zlibbioc_1.31.0 plyr_1.8.4
+ [66] FField_0.1.0 BiocFileCache_1.9.1 grid_3.6.1 blob_1.2.0 crayon_1.3.4
+ [71] lattice_0.20-38 cowplot_1.0.0 Biostrings_2.53.2 splines_3.6.1 GenomicFeatures_1.37.4
+ [76] hms_0.5.0 zeallot_0.1.0 knitr_1.24 pillar_1.4.2 GenomicRanges_1.37.14
+ [81] rngtools_1.4 reshape2_1.4.3 codetools_0.2-16 biomaRt_2.41.8 stats4_3.6.1
+ [86] XML_3.98-1.20 glue_1.3.1 evaluate_0.14 rex_1.1.2 BiocManager_1.30.7
+ [91] data.table_1.12.2 vctrs_0.2.0 foreach_1.4.7 gtable_0.3.0 openssl_1.4.1
+ [96] purrr_0.3.2 assertthat_0.2.1 ggplot2_3.2.1 xfun_0.8 gridBase_0.4-7
+[101] xtable_1.8-4 viridisLite_0.3.0 survival_2.44-1.1 tibble_2.1.3 iterators_1.0.12
+[106] GenomicAlignments_1.21.4 AnnotationDbi_1.47.0 registry_0.5-1 memoise_1.1.0 IRanges_2.19.10
+[111] cluster_2.1.0
+
+
+
+
+