diff --git a/.circleci/config.yml b/.circleci/config.yml index 4020906ae1..eb9c2ef617 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -225,21 +225,21 @@ jobs: # name: RNA-Seq composition # command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/rna-seq-composition/rna-seq-composition.Rmd', clean = TRUE)" - # - run: - # name: TCGA SNV Caller Analysis - # command: ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-tcga.sh + - run: + name: TCGA SNV Caller Analysis + command: ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-tcga.sh - # - run: - # name: SNV Caller Analysis - # command: OPENPBTA_VAF_CUTOFF=0.5 ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-pbta.sh + - run: + name: SNV Caller Analysis + command: OPENPBTA_VAF_CUTOFF=0.5 ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-pbta.sh - # - run: - # name: Tumor mutation burden with TCGA - # command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tcga-pbta.Rmd', clean = TRUE)" + - run: + name: Tumor mutation burden with TCGA + command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tcga-pbta.Rmd', clean = TRUE)" - # - run: - # name: Exploration of nonsynonymous filter - # command: ./scripts/run_in_ci.sh bash analyses/snv-callers/explore_variant_classifications/run_explorations.sh + - run: + name: Exploration of nonsynonymous filter + command: ./scripts/run_in_ci.sh bash analyses/snv-callers/explore_variant_classifications/run_explorations.sh # This analysis was used to explore the TCGA PBTA data when the BED files used to calculate TCGA # were incorrect https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/568 @@ -275,9 +275,9 @@ jobs: # name: d3b TMB code # command: ./scripts/run_in_ci.sh bash analyses/tmb-compare/TMB_d3b_code/run_tmb_d3b.sh - # - run: - # name: Compare TMB calculations - # command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tmb-calculations.Rmd', clean = TRUE)" + - run: + name: Compare TMB calculations + command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tmb-calculations.Rmd', clean = TRUE)" - run: name: Run survival plots diff --git a/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html b/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html index 8ef4c44a03..7b082dc6cf 100644 --- a/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html +++ b/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html @@ -2971,7 +2971,7 @@

Setup

Read in set up script.

- +
# Magrittr pipe
 `%>%` <- dplyr::`%>%`
@@ -2980,7 +2980,7 @@

Setup

Set up directories.

- +
scratch_dir <- file.path("..", "..", "scratch")
 plots_dir <- file.path("plots", "tcga-caller-comparison")
 
@@ -2993,7 +2993,7 @@ 

Setup

Import custom function for upset plot.

- +
source(file.path("util", "upset_plot.R"))
@@ -3004,7 +3004,7 @@

Connect to TCGA database

Connect to SQLite database.

- +
# Start up connection
 con <- DBI::dbConnect(RSQLite::SQLite(), 
                       file.path(scratch_dir, "tcga_snv_db.sqlite"))
@@ -3014,7 +3014,7 @@

Connect to TCGA database

Note what columns we will join by.

- +
join_cols = c("Chromosome",
               "Start_Position",
               "Reference_Allele",
@@ -3027,7 +3027,7 @@ 

Connect to TCGA database

Set up tables from the database but only call the columns we need.

- +
strelka <- dplyr::tbl(con, "strelka") %>% 
   dplyr::select(join_cols, "VAF") %>% 
   as.data.frame()
@@ -3045,7 +3045,7 @@ 

Connect to TCGA database

Full join to get the summary of mutations called by each caller.

- +
# Bring out the data.frame
 all_caller_df <- strelka %>%
   dplyr::full_join(mutect, by = join_cols, 
@@ -3059,13 +3059,11 @@ 

Connect to TCGA database

# Take a peek at what this looks like head(all_caller_df)
-
- @@ -3074,7 +3072,7 @@

Get consensus numbers

Make a data.frame with only the VAFs and turn into detected or not.

- +
detect_mat <- all_caller_df %>% 
  # Bring over VAF columns and the index
  dplyr::select(dplyr::starts_with("VAF_")) %>% 
@@ -3091,7 +3089,7 @@ 

Get consensus numbers

Count up how many callers call each mutation.

- +
# Add up how many callers identify each mutation
 consensus_count <- rowSums(detect_mat)
 
@@ -3103,15 +3101,15 @@ 

Get consensus numbers

Print out the numbers for the three callers’ consensus.

- +
cat(" At least 1 out of 3:", length(consensus_count), "\n",
     "At least 2 out of 3:", sum(consensus_count >= 2), "\n",
     "3 out of 3:", sum(consensus_count == 3))
- -
 At least 1 out of 3: 157788 
- At least 2 out of 3: 50310 
- 3 out of 3: 28269
+ +
 At least 1 out of 3: 138482 
+ At least 2 out of 3: 47987 
+ 3 out of 3: 27841
@@ -3121,25 +3119,25 @@

Upset graphs

Make the upset plot.

- +
upset_png(detect_mat, 
           plot_file_path = file.path(plots_dir, "tcga-upset-plot.png"), 
           has_vardict = FALSE)
- -
null device 
-          1 
+ +
png 
+  2 
-Upset plot - all data +Upset plot - all data

Upset plot - all data

Transcripts only upset plot.

- +
transcripts <- !(all_caller_df$Variant_Classification %in% c("5'Flank", "3'Flank", "IGR", "Intron"))
 
 upset_png(detect_mat, 
@@ -3147,20 +3145,20 @@ 

Upset graphs

subset_vector = transcripts, has_vardict = FALSE)
- -
null device 
-          1 
+ +
png 
+  2 
-Upset plot - transcripts only +Upset plot - transcripts only

Upset plot - transcripts only

Non transcript upset plot.

- +
not_transcripts <- all_caller_df$Variant_Classification %in% c("5'Flank", "3'Flank", "IGR", "Intron") 
 
 upset_png(detect_mat,
@@ -3168,14 +3166,14 @@ 

Upset graphs

subset_vector = not_transcripts, has_vardict = FALSE)
- -
null device 
-          1 
+ +
png 
+  2 
-Upset plot - non-transcripts only +Upset plot - non-transcripts only

Upset plot - non-transcripts only

@@ -3184,7 +3182,7 @@

VAF distribution for each caller

Create consensus VAF based on strelka’s VAFs and which mutations would be included.

- +
# Make new VAF column based on strelka's VAFs
 all_caller_df$VAF_consensus <- all_caller_df$VAF_strelka
 
@@ -3199,7 +3197,7 @@ 

VAF distribution for each caller

Make a long form VAF data.frame for exploring mutations VAFs from each caller.

- +
vaf_df <- all_caller_df %>% 
   dplyr::select(index, dplyr::starts_with("VAF_"), Variant_Classification) %>%
   # Make long format
@@ -3212,7 +3210,7 @@ 

VAF distribution for each caller

- +
# Make this plot
 vaf_plot <- vaf_df %>%
   ggplot2::ggplot(ggplot2::aes(x = caller, y = vaf, color = caller)) +
@@ -3225,22 +3223,22 @@ 

VAF distribution for each caller

# Print out vaf_plot
- -

+ +

Lancet appears to be calling quite a few low VAF mutations.

- +
# Save this plot
 ggplot2::ggsave(plot = vaf_plot, 
                 file.path(plots_dir, "tcga-vaf-distribution-plot.png"))
- -
Saving 7 x 7 in image
- + +
Saving 7 x 5 in image
+ @@ -3249,30 +3247,36 @@

VAF correlations

Correlate VAF’s across callers.

- +
cor_vaf <- all_caller_df %>% 
   dplyr::select(dplyr::starts_with("VAF_"), -VAF_consensus) %>% 
-  GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05)) 
-
-# Print out without progress bar
+  GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05)) 
+ + +
Registered S3 method overwritten by 'GGally':
+  method from   
+  +.gg   ggplot2
+ + +
# Print out without progress bar
 cor_vaf %>%
   print(progress = FALSE)
- -

+ +

VAF’s are pretty related across callers.

- +
ggplot2::ggsave(filename = file.path(plots_dir, "tcga-vaf-cor-matrix.png"), 
                 plot = cor_vaf)
- -
Saving 7 x 7 in image
- + +
Saving 7 x 5 in image
+ @@ -3281,7 +3285,7 @@

Base changes

Summarize base change information per caller.

- +
# Summarize by the number of times each base change shows up in each category.
 frac_change_df <- all_caller_df %>% 
   # Select only the columns we need
@@ -3319,7 +3323,7 @@ 

Base changes

Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.

- +
frac_change_df %>%
   ggplot2::ggplot(ggplot2::aes(x = change, y = percent)) +
   ggplot2::theme_classic() +
@@ -3332,20 +3336,20 @@ 

Base changes

ggplot2::ylab("Percent of callers' mutations") + colorblindr::scale_fill_OkabeIto()
- -

+ +

- +
# Save this plot
 ggplot2::ggsave(file.path(plots_dir, "tcga-frac-base-change-plot.png"))
- -
Saving 7 x 7 in image
- + +
Saving 7 x 5 in image
+ @@ -3354,7 +3358,7 @@

Mutation region barplot

Summarize Variant_Classification information per caller.

- +
# Summarize by the number of times each base change shows up in each category.
 frac_var_df <- vaf_df %>% 
   # Summarize the number of mutations per caller
@@ -3368,7 +3372,7 @@ 

Mutation region barplot

Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.

- +
frac_var_df %>%
   ggplot2::ggplot(ggplot2::aes(x = caller, y = percent)) +
   ggplot2::theme_classic() +
@@ -3380,20 +3384,20 @@ 

Mutation region barplot

ggplot2::xlab("") + ggplot2::ylab("Fraction of callers' mutations")
- -

+ +

- +
# Save this plot
 ggplot2::ggsave(file.path(plots_dir, "tcga-variant-classification-plot.png"))
- -
Saving 7 x 7 in image
- + +
Saving 7 x 5 in image
+ @@ -3401,10 +3405,10 @@

Mutation region barplot

Session Info

- +
sessionInfo()
- +
R version 3.6.0 (2019-04-26)
 Platform: x86_64-pc-linux-gnu (64-bit)
 Running under: Debian GNU/Linux 9 (stretch)
@@ -3421,48 +3425,26 @@ 

Session Info

[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: -[1] stats4 parallel stats graphics grDevices utils -[7] datasets methods base - -other attached packages: - [1] plyr_1.8.4 DT_0.7 knitr_1.23 - [4] org.Hs.eg.db_3.10.0 AnnotationDbi_1.48.0 IRanges_2.20.2 - [7] S4Vectors_0.24.4 Biobase_2.46.0 BiocGenerics_0.32.0 -[10] MM2S_1.0.6 pheatmap_1.0.12 lattice_0.20-38 -[13] kknn_1.3.1 GSVA_1.34.0 medulloPackage_0.1.0 -[16] optparse_1.6.2 forcats_0.4.0 stringr_1.4.0 -[19] dplyr_0.8.3 purrr_0.3.2 readr_1.3.1 -[22] tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0 -[25] tidyverse_1.2.1 +[1] stats graphics grDevices utils datasets methods base loaded via a namespace (and not attached): - [1] nlme_3.1-140 bitops_1.0-6 lubridate_1.7.4 - [4] bit64_0.9-7 progress_1.2.2 RColorBrewer_1.1-2 - [7] httr_1.4.0 UpSetR_1.4.0 rprojroot_1.3-2 -[10] tools_3.6.0 backports_1.1.4 R6_2.4.0 -[13] DBI_1.0.0 lazyeval_0.2.2 colorspace_1.4-1 -[16] withr_2.1.2 prettyunits_1.0.2 gridExtra_2.3 -[19] GGally_1.4.0 tidyselect_0.2.5 bit_1.1-14 -[22] compiler_3.6.0 graph_1.64.0 cli_1.1.0 -[25] rvest_0.3.4 xml2_1.2.0 labeling_0.3 -[28] scales_1.0.0 digest_0.6.20 rmarkdown_1.13 -[31] base64enc_0.1-3 pkgconfig_2.0.2 htmltools_0.3.6 -[34] dbplyr_1.4.2 htmlwidgets_1.3 rlang_0.4.0 -[37] readxl_1.3.1 rstudioapi_0.10 RSQLite_2.1.1 -[40] shiny_1.3.2 generics_0.0.2 jsonlite_1.6 -[43] crosstalk_1.0.0 RCurl_1.95-4.12 magrittr_1.5 -[46] Matrix_1.2-17 Rcpp_1.0.1 munsell_0.5.0 -[49] stringi_1.4.3 yaml_2.2.0 grid_3.6.0 -[52] blob_1.1.1 promises_1.0.1 crayon_1.3.4 -[55] haven_2.1.1 annotate_1.64.0 hms_0.4.2 -[58] pillar_1.4.2 igraph_1.2.4.1 geneplotter_1.64.0 -[61] reshape2_1.4.3 XML_3.98-1.20 glue_1.3.1 -[64] evaluate_0.14 modelr_0.1.4 colorblindr_0.1.0 -[67] httpuv_1.5.1 cellranger_1.1.0 gtable_0.3.0 -[70] getopt_1.20.3 reshape_0.8.8 assertthat_0.2.1 -[73] xfun_0.8 mime_0.7 xtable_1.8-4 -[76] broom_0.5.2 later_0.8.0 shinythemes_1.1.2 -[79] memoise_1.1.0 GSEABase_1.48.0
+ [1] Rcpp_1.0.1 RColorBrewer_1.1-2 plyr_1.8.4 + [4] pillar_1.4.2 compiler_3.6.0 dbplyr_1.4.2 + [7] forcats_0.4.0 base64enc_0.1-3 tools_3.6.0 +[10] digest_0.6.20 bit_1.1-14 jsonlite_1.6 +[13] RSQLite_2.1.1 evaluate_0.14 memoise_1.1.0 +[16] tibble_2.1.3 gtable_0.3.0 pkgconfig_2.0.2 +[19] rlang_0.4.0 GGally_1.4.0 DBI_1.0.0 +[22] yaml_2.2.0 xfun_0.8 gridExtra_2.3 +[25] colorblindr_0.1.0 UpSetR_1.4.0 dplyr_0.8.3 +[28] stringr_1.4.0 knitr_1.23 bit64_0.9-7 +[31] grid_3.6.0 tidyselect_0.2.5 reshape_0.8.8 +[34] glue_1.3.1 R6_2.4.0 rmarkdown_1.13 +[37] reshape2_1.4.3 tidyr_0.8.3 purrr_0.3.2 +[40] ggplot2_3.2.0 blob_1.1.1 magrittr_1.5 +[43] scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 +[46] colorspace_1.4-1 labeling_0.3 stringi_1.4.3 +[49] lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
diff --git a/analyses/snv-callers/compare_snv_callers_plots.nb.html b/analyses/snv-callers/compare_snv_callers_plots.nb.html index 8775c72d7d..6e4bd52620 100644 --- a/analyses/snv-callers/compare_snv_callers_plots.nb.html +++ b/analyses/snv-callers/compare_snv_callers_plots.nb.html @@ -2974,7 +2974,7 @@

Setup

Read in set up script.

- +
# Magrittr pipe
 `%>%` <- dplyr::`%>%`
@@ -2983,7 +2983,7 @@

Setup

Set up directories.

- +
scratch_dir <- file.path("..", "..", "scratch")
 plots_dir <- file.path("plots", "pbta-caller-comparison")
 
@@ -2996,7 +2996,7 @@ 

Setup

Import custom function for upset plot.

- +
source(file.path("util", "upset_plot.R"))
@@ -3007,7 +3007,7 @@

Connect to database

Connect to SQLite database.

- +
# Start up connection
 con <- DBI::dbConnect(RSQLite::SQLite(), 
                       file.path(scratch_dir, "snv_db.sqlite"))
@@ -3017,7 +3017,7 @@

Connect to database

Note what columns we will join by.

- +
join_cols = c("Chromosome",
               "Start_Position",
               "Reference_Allele",
@@ -3030,7 +3030,7 @@ 

Connect to database

Set up tables from the database but only call the columns we need.

- +
strelka <- dplyr::tbl(con, "strelka") %>% 
   dplyr::select(join_cols, "VAF")
 
@@ -3048,7 +3048,7 @@ 

Connect to database

Because DBI does not support full_join, we had to use a series of left_join and union_all calls in order to get a full join of all the callers.

- +
# This script will do the full join of the data for us
 source(file.path("util", "full_join_callers.R"))
 
@@ -3060,6 +3060,11 @@ 

Connect to database

# Take a peek at what this looks like head(all_caller_df)
+
+ +
@@ -3068,7 +3073,7 @@

Get consensus numbers

Make a data.frame with only the VAFs and turn into detected or not.

- +
detect_mat <- all_caller_df %>% 
  # Bring over VAF columns and the index
  dplyr::select(dplyr::starts_with("VAF_")) %>% 
@@ -3085,7 +3090,7 @@ 

Get consensus numbers

Count up how many callers call each mutation but ignore VarDict.

- +
# Add up how many callers identify each mutation, but ignore VarDict
 consensus_count <- rowSums(detect_mat[, colnames(detect_mat) != "VAF_vardict"])
 
@@ -3097,11 +3102,16 @@ 

Get consensus numbers

Print out the numbers for the three callers’ consensus.

- +
cat(" At least 1 out of 3:", length(consensus_count), "\n",
     "At least 2 out of 3:", sum(consensus_count >= 2), "\n",
     "3 out of 3:", sum(consensus_count == 3))
+ +
 At least 1 out of 3: 13912263 
+ At least 2 out of 3: 5262082 
+ 3 out of 3: 1223664
+ @@ -3110,30 +3120,38 @@

Upset graphs

Make the upset plot.

- +
upset_png(detect_mat, 
           plot_file_path = file.path(plots_dir, "pbta-upset-plot.png"))
+ +
png 
+  2 
+
-Upset plot - all data +Upset plot - all data

Upset plot - all data

Transcripts only upset plot.

- +
transcripts <- !(all_caller_df$Variant_Classification %in% c("5'Flank", "3'Flank", "IGR", "Intron"))
 
 upset_png(detect_mat, 
           plot_file_path = file.path(plots_dir, "pbta-upset-plot-transcipts-only.png"), 
           subset_vector = transcripts)
+ +
png 
+  2 
+
-Upset plot - transcripts only +Upset plot - transcripts only

Upset plot - transcripts only

Non transcript upset plot.

@@ -3146,10 +3164,14 @@

Upset graphs

plot_file_path = file.path(plots_dir, "pbta-upset-plot-not-transcipts-only.png"), subset_vector = not_transcripts)
+ +
png 
+  2 
+
-Upset plot - non transcripts only +Upset plot - non transcripts only

Upset plot - non transcripts only

@@ -3158,7 +3180,7 @@

VAF distribution for each caller

Create consensus VAF based on strelka’s VAFs and which mutations would be included

- +
# Make new VAF column based on strelka's VAFs
 all_caller_df$VAF_consensus <- all_caller_df$VAF_strelka
 
@@ -3173,7 +3195,7 @@ 

VAF distribution for each caller

Make a long form VAF data.frame for exploring mutations VAFs from each caller.

- +
vaf_df <- all_caller_df %>% 
   dplyr::select(index, dplyr::starts_with("VAF_"), Variant_Classification) %>%
   # Make long format
@@ -3186,7 +3208,7 @@ 

VAF distribution for each caller

- +
# Make this plot
 vaf_plot <- vaf_df %>%
   ggplot2::ggplot(ggplot2::aes(x = caller, y = vaf, color = caller)) +
@@ -3199,15 +3221,21 @@ 

VAF distribution for each caller

# Print out vaf_plot
+ +

+ - +
# Save this plot
 ggplot2::ggsave(plot = vaf_plot, 
                 file.path(plots_dir, "pbta-vaf_distribution_plot.png"))
+ +
Saving 7 x 5 in image
+ @@ -3216,24 +3244,36 @@

VAF correlations

Correlate VAF’s across callers.

- +
cor_vaf <- all_caller_df %>% 
   dplyr::select(dplyr::starts_with("VAF_"), -VAF_consensus) %>% 
-  GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05))  
-
-# Print out without progress bar
+  GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05))  
+ + +
Registered S3 method overwritten by 'GGally':
+  method from   
+  +.gg   ggplot2
+ + +
# Print out without progress bar
 cor_vaf %>%
   print(progress = FALSE)
+ +

+

VAF’s are pretty related across callers.

- +
ggplot2::ggsave(filename = file.path(plots_dir, "pbta-vaf_cor_matrix.png"), 
                 plot = cor_vaf)
+ +
Saving 7 x 5 in image
+ @@ -3242,7 +3282,7 @@

Base changes

Summarize base change information per caller.

- +
# Summarize by the number of times each base change shows up in each category.
 frac_change_df <- all_caller_df %>% 
   # Select only the columns we need
@@ -3280,7 +3320,7 @@ 

Base changes

Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.

- +
frac_change_df %>%
   ggplot2::ggplot(ggplot2::aes(x = change, y = percent)) +
   ggplot2::theme_classic() +
@@ -3293,14 +3333,20 @@ 

Base changes

ggplot2::ylab("Percent of callers' mutations") + colorblindr::scale_fill_OkabeIto()
+ +

+ - +
# Save this plot
 ggplot2::ggsave(file.path(plots_dir, "pbta-frac_base_change_plot.png"))
+ +
Saving 7 x 5 in image
+ @@ -3309,7 +3355,7 @@

Mutation region barplot

Summarize Variant_Classification information per caller.

- +
# Summarize by the number of times each base change shows up in each category.
 frac_var_df <- vaf_df %>% 
   # Summarize the number of mutations per caller
@@ -3323,7 +3369,7 @@ 

Mutation region barplot

Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.

- +
frac_var_df %>%
   ggplot2::ggplot(ggplot2::aes(x = caller, y = percent)) +
   ggplot2::theme_classic() +
@@ -3335,14 +3381,20 @@ 

Mutation region barplot

ggplot2::xlab("") + ggplot2::ylab("Fraction of callers' mutations")
+ +

+ - +
# Save this plot
 ggplot2::ggsave(file.path(plots_dir, "pbta-variant_classification_plot.png"))
+ +
Saving 7 x 5 in image
+ @@ -3350,9 +3402,47 @@

Mutation region barplot

Session Info

- +
sessionInfo()
+ +
R version 3.6.0 (2019-04-26)
+Platform: x86_64-pc-linux-gnu (64-bit)
+Running under: Debian GNU/Linux 9 (stretch)
+
+Matrix products: default
+BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+ [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
+ [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
+ [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
+ [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
+
+attached base packages:
+[1] stats     graphics  grDevices utils     datasets  methods   base     
+
+loaded via a namespace (and not attached):
+ [1] Rcpp_1.0.1         RColorBrewer_1.1-2 plyr_1.8.4        
+ [4] pillar_1.4.2       compiler_3.6.0     dbplyr_1.4.2      
+ [7] forcats_0.4.0      base64enc_0.1-3    tools_3.6.0       
+[10] digest_0.6.20      bit_1.1-14         jsonlite_1.6      
+[13] RSQLite_2.1.1      evaluate_0.14      memoise_1.1.0     
+[16] tibble_2.1.3       gtable_0.3.0       pkgconfig_2.0.2   
+[19] rlang_0.4.0        GGally_1.4.0       DBI_1.0.0         
+[22] yaml_2.2.0         xfun_0.8           gridExtra_2.3     
+[25] colorblindr_0.1.0  UpSetR_1.4.0       dplyr_0.8.3       
+[28] stringr_1.4.0      knitr_1.23         bit64_0.9-7       
+[31] grid_3.6.0         tidyselect_0.2.5   reshape_0.8.8     
+[34] glue_1.3.1         R6_2.4.0           rmarkdown_1.13    
+[37] reshape2_1.4.3     tidyr_0.8.3        purrr_0.3.2       
+[40] ggplot2_3.2.0      blob_1.1.1         magrittr_1.5      
+[43] scales_1.0.0       htmltools_0.3.6    assertthat_0.2.1  
+[46] colorspace_1.4-1   labeling_0.3       stringi_1.4.3     
+[49] lazyeval_0.2.2     munsell_0.5.0      crayon_1.3.4      
+ diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png index 8e4757076c..7d6b00a7f8 100644 Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png differ diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png index d31a56fa56..6a5e58a000 100644 Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png differ diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png index d3c7a8ac05..f46cbdc504 100644 Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png differ diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png index 585dbd1444..c3f6c8aa06 100644 Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png differ diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png index c7f91e80cc..acfe991f04 100644 Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png differ diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png index 539c809921..e9eabe4df1 100644 Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png differ diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png index 054656c768..d9d41ca3cb 100644 Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png differ diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png index b4041aeb73..9df684fba9 100644 Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png differ diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png index cfdb3ff762..3966798c3f 100644 Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png differ diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png index ba483584b5..4a8079c08b 100644 Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png differ diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png index f20bab0902..5cb9991d51 100644 Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png differ diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png index ec1c8fa9c5..f3425cc6bf 100644 Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png differ diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png index d6ce8dc089..fc7d58fbbf 100644 Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png differ diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png index 430ce21ad6..f7686b9c90 100644 Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png differ diff --git a/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh b/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh index 7bc2a38e01..30181f386b 100644 --- a/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh +++ b/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh @@ -26,6 +26,7 @@ vaf_cutoff=${OPENPBTA_VAF_CUTOFF:-0} run_plots_nb=${OPENPBTA_PLOTS:-0} ################################ Set Up Database ################################ +echo "Setting up Database" python3 analyses/snv-callers/scripts/01-setup_db.py \ --db-file $dbfile \ --strelka-file data/pbta-snv-strelka2.vep.maf.gz \ @@ -35,6 +36,7 @@ python3 analyses/snv-callers/scripts/01-setup_db.py \ --meta-file data/pbta-histologies.tsv ##################### Merge callers' files into total files #################### +echo "Merging callers" Rscript analyses/snv-callers/scripts/02-merge_callers.R \ --db_file $dbfile \ --output_file $consensus_file \ @@ -42,12 +44,14 @@ Rscript analyses/snv-callers/scripts/02-merge_callers.R \ --overwrite ########################## Add consensus to db ################################ +echo "Adding consensus to database" python3 analyses/snv-callers/scripts/01-setup_db.py \ --db-file $dbfile \ --consensus-file $consensus_file ############# Create intersection BED files for TMB calculations ############### # Make All mutations BED files +echo "Making intersection bed files" bedtools intersect \ -a data/WGS.hg38.strelka2.unpadded.bed \ -b data/WGS.hg38.mutect2.vardict.unpadded.bed \ @@ -56,6 +60,7 @@ bedtools intersect \ #################### Make coding regions file # Convert GTF to BED file for use in bedtools # Here we are only extracting lines with as a CDS i.e. are coded in protein +echo "Making CDS bed file" gunzip -c data/gencode.v27.primary_assembly.annotation.gtf.gz \ | awk '$3 ~ /CDS/' \ | convert2bed --do-not-sort --input=gtf - \ @@ -64,6 +69,7 @@ gunzip -c data/gencode.v27.primary_assembly.annotation.gtf.gz \ > $cds_file ######################### Calculate consensus TMB ############################## +echo "Calculating TMB" Rscript analyses/snv-callers/scripts/03-calculate_tmb.R \ --db_file $dbfile \ --output analyses/snv-callers/results/consensus \ @@ -79,5 +85,6 @@ gzip $consensus_file ############################# Comparison Plots ################################# if [ "$run_plots_nb" -gt "0" ] then + echo "Making comparison plots" Rscript -e "rmarkdown::render('analyses/snv-callers/compare_snv_callers_plots.Rmd', clean = TRUE)" fi diff --git a/analyses/snv-callers/scripts/01-setup_db.py b/analyses/snv-callers/scripts/01-setup_db.py index cc0528669b..ce5e8616c9 100644 --- a/analyses/snv-callers/scripts/01-setup_db.py +++ b/analyses/snv-callers/scripts/01-setup_db.py @@ -101,19 +101,6 @@ ('Matched_Norm_Sample_Barcode', 'TEXT'), ('Match_Norm_Seq_Allele1', 'TEXT'), ('Match_Norm_Seq_Allele2', 'TEXT'), - ('Tumor_Validation_Allele1', 'TEXT'), - ('Tumor_Validation_Allele2', 'TEXT'), - ('Match_Norm_Validation_Allele1', 'TEXT'), - ('Match_Norm_Validation_Allele2', 'TEXT'), - ('Verification_Status', 'TEXT'), - ('Validation_Status', 'TEXT'), - ('Mutation_Status', 'TEXT'), - ('Sequencing_Phase', 'TEXT'), - ('Sequence_Source', 'TEXT'), - ('Validation_Method', 'TEXT'), - ('Score', 'TEXT'), - ('BAM_File', 'TEXT'), - ('Sequencer', 'TEXT'), ('Tumor_Sample_UUID', 'TEXT'), ('Matched_Norm_Sample_UUID', 'TEXT'), ('HGVSc', 'TEXT'), @@ -161,7 +148,6 @@ ('AF', 'TEXT'), ('AFR_AF', 'TEXT'), ('AMR_AF', 'TEXT'), - ('ASN_AF', 'TEXT'), ('EAS_AF', 'TEXT'), ('EUR_AF', 'TEXT'), ('SAS_AF', 'TEXT'), @@ -180,31 +166,11 @@ ('TSL', 'TEXT'), ('HGVS_OFFSET', 'TEXT'), ('PHENO', 'TEXT'), - ('MINIMISED', 'TEXT'), - ('ExAC_AF', 'TEXT'), - ('ExAC_AF_AFR', 'TEXT'), - ('ExAC_AF_AMR', 'TEXT'), - ('ExAC_AF_EAS', 'TEXT'), - ('ExAC_AF_FIN', 'TEXT'), - ('ExAC_AF_NFE', 'TEXT'), - ('ExAC_AF_OTH', 'TEXT'), - ('ExAC_AF_SAS', 'TEXT'), ('GENE_PHENO', 'TEXT'), ('FILTER', 'TEXT'), ('flanking_bps', 'TEXT'), ('vcf_id', 'TEXT'), ('vcf_qual', 'REAL'), - ('ExAC_AF_Adj', 'TEXT'), - ('ExAC_AC_AN_Adj', 'TEXT'), - ('ExAC_AC_AN', 'TEXT'), - ('ExAC_AC_AN_AFR', 'TEXT'), - ('ExAC_AC_AN_AMR', 'TEXT'), - ('ExAC_AC_AN_EAS', 'TEXT'), - ('ExAC_AC_AN_FIN', 'TEXT'), - ('ExAC_AC_AN_NFE', 'TEXT'), - ('ExAC_AC_AN_OTH', 'TEXT'), - ('ExAC_AC_AN_SAS', 'TEXT'), - ('ExAC_FILTER', 'TEXT'), ('gnomAD_AF', 'TEXT'), ('gnomAD_AFR_AF', 'TEXT'), ('gnomAD_AMR_AF', 'TEXT'), @@ -215,9 +181,12 @@ ('gnomAD_OTH_AF', 'TEXT'), ('gnomAD_SAS_AF', 'TEXT'), ('vcf_pos', 'INTEGER'), + ('HotSpotAllele', 'INTEGER'), ('VAF', 'REAL') ] +common_cols = [col for col, type in maf_types] + needed_cols = [ 'Hugo_Symbol', 'Entrez_Gene_Id', @@ -304,7 +273,9 @@ # process the chunk chunk['VAF'] = (chunk['t_alt_count'] / (chunk['t_ref_count'] + chunk['t_alt_count'])) - if table_name not in ('strelka', 'lancet', 'consensus'): + if table_name in ('strelka', 'lancet', 'consensus'): + chunk = chunk[common_cols] + else: chunk = chunk[needed_cols] chunk.to_sql(table_name, con, if_exists='append') # create indexes diff --git a/analyses/snv-callers/scripts/03-calculate_tmb.R b/analyses/snv-callers/scripts/03-calculate_tmb.R index 2778ecd747..abb091d401 100644 --- a/analyses/snv-callers/scripts/03-calculate_tmb.R +++ b/analyses/snv-callers/scripts/03-calculate_tmb.R @@ -284,8 +284,8 @@ strelka_mutect_maf_df <- strelka_mutect_maf_df %>% ), by = "Tumor_Sample_Barcode" ) %>% - # Remove samples if they are labeled "Panel" - dplyr::filter(experimental_strategy != "Panel") + # Remove samples if they are not WGS or WXS + dplyr::filter(experimental_strategy %in% c("WGS", "WXS")) ############################# Set Up BED Files ################################# # Make a data.frame of the unique BED file paths and their names