diff --git a/.circleci/config.yml b/.circleci/config.yml index 4020906ae1..eb9c2ef617 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -225,21 +225,21 @@ jobs: # name: RNA-Seq composition # command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/rna-seq-composition/rna-seq-composition.Rmd', clean = TRUE)" - # - run: - # name: TCGA SNV Caller Analysis - # command: ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-tcga.sh + - run: + name: TCGA SNV Caller Analysis + command: ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-tcga.sh - # - run: - # name: SNV Caller Analysis - # command: OPENPBTA_VAF_CUTOFF=0.5 ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-pbta.sh + - run: + name: SNV Caller Analysis + command: OPENPBTA_VAF_CUTOFF=0.5 ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_consensus_analysis-pbta.sh - # - run: - # name: Tumor mutation burden with TCGA - # command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tcga-pbta.Rmd', clean = TRUE)" + - run: + name: Tumor mutation burden with TCGA + command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tcga-pbta.Rmd', clean = TRUE)" - # - run: - # name: Exploration of nonsynonymous filter - # command: ./scripts/run_in_ci.sh bash analyses/snv-callers/explore_variant_classifications/run_explorations.sh + - run: + name: Exploration of nonsynonymous filter + command: ./scripts/run_in_ci.sh bash analyses/snv-callers/explore_variant_classifications/run_explorations.sh # This analysis was used to explore the TCGA PBTA data when the BED files used to calculate TCGA # were incorrect https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/568 @@ -275,9 +275,9 @@ jobs: # name: d3b TMB code # command: ./scripts/run_in_ci.sh bash analyses/tmb-compare/TMB_d3b_code/run_tmb_d3b.sh - # - run: - # name: Compare TMB calculations - # command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tmb-calculations.Rmd', clean = TRUE)" + - run: + name: Compare TMB calculations + command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/tmb-compare/compare-tmb-calculations.Rmd', clean = TRUE)" - run: name: Run survival plots diff --git a/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html b/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html index 8ef4c44a03..7b082dc6cf 100644 --- a/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html +++ b/analyses/snv-callers/compare_snv_callers_plots-tcga.nb.html @@ -2971,7 +2971,7 @@
Read in set up script.
- +# Magrittr pipe
`%>%` <- dplyr::`%>%`
@@ -2980,7 +2980,7 @@ Set up directories.
- +scratch_dir <- file.path("..", "..", "scratch")
plots_dir <- file.path("plots", "tcga-caller-comparison")
@@ -2993,7 +2993,7 @@ Setup
Import custom function for upset plot.
-
+
source(file.path("util", "upset_plot.R"))
@@ -3004,7 +3004,7 @@ Connect to TCGA database
Connect to SQLite database.
-
+
# Start up connection
con <- DBI::dbConnect(RSQLite::SQLite(),
file.path(scratch_dir, "tcga_snv_db.sqlite"))
@@ -3014,7 +3014,7 @@ Connect to TCGA database
Note what columns we will join by.
-
+
join_cols = c("Chromosome",
"Start_Position",
"Reference_Allele",
@@ -3027,7 +3027,7 @@ Connect to TCGA database
Set up tables from the database but only call the columns we need.
-
+
strelka <- dplyr::tbl(con, "strelka") %>%
dplyr::select(join_cols, "VAF") %>%
as.data.frame()
@@ -3045,7 +3045,7 @@ Connect to TCGA database
Full join to get the summary of mutations called by each caller.
-
+
# Bring out the data.frame
all_caller_df <- strelka %>%
dplyr::full_join(mutect, by = join_cols,
@@ -3059,13 +3059,11 @@ Connect to TCGA database
# Take a peek at what this looks like
head(all_caller_df)
-
-
@@ -3074,7 +3072,7 @@ Get consensus numbers
Make a data.frame with only the VAFs and turn into detected or not.
-
+
detect_mat <- all_caller_df %>%
# Bring over VAF columns and the index
dplyr::select(dplyr::starts_with("VAF_")) %>%
@@ -3091,7 +3089,7 @@ Get consensus numbers
Count up how many callers call each mutation.
-
+
# Add up how many callers identify each mutation
consensus_count <- rowSums(detect_mat)
@@ -3103,15 +3101,15 @@ Get consensus numbers
Print out the numbers for the three callers’ consensus.
-
+
cat(" At least 1 out of 3:", length(consensus_count), "\n",
"At least 2 out of 3:", sum(consensus_count >= 2), "\n",
"3 out of 3:", sum(consensus_count == 3))
-
- At least 1 out of 3: 157788
- At least 2 out of 3: 50310
- 3 out of 3: 28269
+
+ At least 1 out of 3: 138482
+ At least 2 out of 3: 47987
+ 3 out of 3: 27841
@@ -3121,25 +3119,25 @@ Upset graphs
Make the upset plot.
-
+
upset_png(detect_mat,
plot_file_path = file.path(plots_dir, "tcga-upset-plot.png"),
has_vardict = FALSE)
-
-null device
- 1
+
+png
+ 2
-
+
Transcripts only upset plot.
-
+
transcripts <- !(all_caller_df$Variant_Classification %in% c("5'Flank", "3'Flank", "IGR", "Intron"))
upset_png(detect_mat,
@@ -3147,20 +3145,20 @@ Upset graphs
subset_vector = transcripts,
has_vardict = FALSE)
-
-null device
- 1
+
+png
+ 2
-
+
Non transcript upset plot.
-
+
not_transcripts <- all_caller_df$Variant_Classification %in% c("5'Flank", "3'Flank", "IGR", "Intron")
upset_png(detect_mat,
@@ -3168,14 +3166,14 @@ Upset graphs
subset_vector = not_transcripts,
has_vardict = FALSE)
-
-null device
- 1
+
+png
+ 2
-
+
@@ -3184,7 +3182,7 @@
VAF distribution for each caller
Create consensus VAF based on strelka’s VAFs and which mutations would be included.
-
+
# Make new VAF column based on strelka's VAFs
all_caller_df$VAF_consensus <- all_caller_df$VAF_strelka
@@ -3199,7 +3197,7 @@ VAF distribution for each caller
Make a long form VAF data.frame for exploring mutations VAFs from each caller.
-
+
vaf_df <- all_caller_df %>%
dplyr::select(index, dplyr::starts_with("VAF_"), Variant_Classification) %>%
# Make long format
@@ -3212,7 +3210,7 @@ VAF distribution for each caller
-
+
# Make this plot
vaf_plot <- vaf_df %>%
ggplot2::ggplot(ggplot2::aes(x = caller, y = vaf, color = caller)) +
@@ -3225,22 +3223,22 @@ VAF distribution for each caller
# Print out
vaf_plot
-
-
+
+
Lancet appears to be calling quite a few low VAF mutations.
-
+
# Save this plot
ggplot2::ggsave(plot = vaf_plot,
file.path(plots_dir, "tcga-vaf-distribution-plot.png"))
-
-Saving 7 x 7 in image
-
+
+Saving 7 x 5 in image
+
@@ -3249,30 +3247,36 @@ VAF correlations
Correlate VAF’s across callers.
-
+
cor_vaf <- all_caller_df %>%
dplyr::select(dplyr::starts_with("VAF_"), -VAF_consensus) %>%
- GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05))
-
-# Print out without progress bar
+ GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05))
+
+
+Registered S3 method overwritten by 'GGally':
+ method from
+ +.gg ggplot2
+
+
+# Print out without progress bar
cor_vaf %>%
print(progress = FALSE)
-
-
+
+
VAF’s are pretty related across callers.
-
+
ggplot2::ggsave(filename = file.path(plots_dir, "tcga-vaf-cor-matrix.png"),
plot = cor_vaf)
-
-Saving 7 x 7 in image
-
+
+Saving 7 x 5 in image
+
@@ -3281,7 +3285,7 @@ Base changes
Summarize base change information per caller.
-
+
# Summarize by the number of times each base change shows up in each category.
frac_change_df <- all_caller_df %>%
# Select only the columns we need
@@ -3319,7 +3323,7 @@ Base changes
Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.
-
+
frac_change_df %>%
ggplot2::ggplot(ggplot2::aes(x = change, y = percent)) +
ggplot2::theme_classic() +
@@ -3332,20 +3336,20 @@ Base changes
ggplot2::ylab("Percent of callers' mutations") +
colorblindr::scale_fill_OkabeIto()
-
-
+
+
-
+
# Save this plot
ggplot2::ggsave(file.path(plots_dir, "tcga-frac-base-change-plot.png"))
-
-Saving 7 x 7 in image
-
+
+Saving 7 x 5 in image
+
@@ -3354,7 +3358,7 @@ Mutation region barplot
Summarize Variant_Classification
information per caller.
-
+
# Summarize by the number of times each base change shows up in each category.
frac_var_df <- vaf_df %>%
# Summarize the number of mutations per caller
@@ -3368,7 +3372,7 @@ Mutation region barplot
Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.
-
+
frac_var_df %>%
ggplot2::ggplot(ggplot2::aes(x = caller, y = percent)) +
ggplot2::theme_classic() +
@@ -3380,20 +3384,20 @@ Mutation region barplot
ggplot2::xlab("") +
ggplot2::ylab("Fraction of callers' mutations")
-
-
+
+
-
+
# Save this plot
ggplot2::ggsave(file.path(plots_dir, "tcga-variant-classification-plot.png"))
-
-Saving 7 x 7 in image
-
+
+Saving 7 x 5 in image
+
@@ -3401,10 +3405,10 @@ Mutation region barplot
Session Info
-
+
sessionInfo()
-
+
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
@@ -3421,48 +3425,26 @@ Session Info
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
-[1] stats4 parallel stats graphics grDevices utils
-[7] datasets methods base
-
-other attached packages:
- [1] plyr_1.8.4 DT_0.7 knitr_1.23
- [4] org.Hs.eg.db_3.10.0 AnnotationDbi_1.48.0 IRanges_2.20.2
- [7] S4Vectors_0.24.4 Biobase_2.46.0 BiocGenerics_0.32.0
-[10] MM2S_1.0.6 pheatmap_1.0.12 lattice_0.20-38
-[13] kknn_1.3.1 GSVA_1.34.0 medulloPackage_0.1.0
-[16] optparse_1.6.2 forcats_0.4.0 stringr_1.4.0
-[19] dplyr_0.8.3 purrr_0.3.2 readr_1.3.1
-[22] tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0
-[25] tidyverse_1.2.1
+[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
- [1] nlme_3.1-140 bitops_1.0-6 lubridate_1.7.4
- [4] bit64_0.9-7 progress_1.2.2 RColorBrewer_1.1-2
- [7] httr_1.4.0 UpSetR_1.4.0 rprojroot_1.3-2
-[10] tools_3.6.0 backports_1.1.4 R6_2.4.0
-[13] DBI_1.0.0 lazyeval_0.2.2 colorspace_1.4-1
-[16] withr_2.1.2 prettyunits_1.0.2 gridExtra_2.3
-[19] GGally_1.4.0 tidyselect_0.2.5 bit_1.1-14
-[22] compiler_3.6.0 graph_1.64.0 cli_1.1.0
-[25] rvest_0.3.4 xml2_1.2.0 labeling_0.3
-[28] scales_1.0.0 digest_0.6.20 rmarkdown_1.13
-[31] base64enc_0.1-3 pkgconfig_2.0.2 htmltools_0.3.6
-[34] dbplyr_1.4.2 htmlwidgets_1.3 rlang_0.4.0
-[37] readxl_1.3.1 rstudioapi_0.10 RSQLite_2.1.1
-[40] shiny_1.3.2 generics_0.0.2 jsonlite_1.6
-[43] crosstalk_1.0.0 RCurl_1.95-4.12 magrittr_1.5
-[46] Matrix_1.2-17 Rcpp_1.0.1 munsell_0.5.0
-[49] stringi_1.4.3 yaml_2.2.0 grid_3.6.0
-[52] blob_1.1.1 promises_1.0.1 crayon_1.3.4
-[55] haven_2.1.1 annotate_1.64.0 hms_0.4.2
-[58] pillar_1.4.2 igraph_1.2.4.1 geneplotter_1.64.0
-[61] reshape2_1.4.3 XML_3.98-1.20 glue_1.3.1
-[64] evaluate_0.14 modelr_0.1.4 colorblindr_0.1.0
-[67] httpuv_1.5.1 cellranger_1.1.0 gtable_0.3.0
-[70] getopt_1.20.3 reshape_0.8.8 assertthat_0.2.1
-[73] xfun_0.8 mime_0.7 xtable_1.8-4
-[76] broom_0.5.2 later_0.8.0 shinythemes_1.1.2
-[79] memoise_1.1.0 GSEABase_1.48.0
+ [1] Rcpp_1.0.1 RColorBrewer_1.1-2 plyr_1.8.4
+ [4] pillar_1.4.2 compiler_3.6.0 dbplyr_1.4.2
+ [7] forcats_0.4.0 base64enc_0.1-3 tools_3.6.0
+[10] digest_0.6.20 bit_1.1-14 jsonlite_1.6
+[13] RSQLite_2.1.1 evaluate_0.14 memoise_1.1.0
+[16] tibble_2.1.3 gtable_0.3.0 pkgconfig_2.0.2
+[19] rlang_0.4.0 GGally_1.4.0 DBI_1.0.0
+[22] yaml_2.2.0 xfun_0.8 gridExtra_2.3
+[25] colorblindr_0.1.0 UpSetR_1.4.0 dplyr_0.8.3
+[28] stringr_1.4.0 knitr_1.23 bit64_0.9-7
+[31] grid_3.6.0 tidyselect_0.2.5 reshape_0.8.8
+[34] glue_1.3.1 R6_2.4.0 rmarkdown_1.13
+[37] reshape2_1.4.3 tidyr_0.8.3 purrr_0.3.2
+[40] ggplot2_3.2.0 blob_1.1.1 magrittr_1.5
+[43] scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1
+[46] colorspace_1.4-1 labeling_0.3 stringi_1.4.3
+[49] lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
diff --git a/analyses/snv-callers/compare_snv_callers_plots.nb.html b/analyses/snv-callers/compare_snv_callers_plots.nb.html
index 8775c72d7d..6e4bd52620 100644
--- a/analyses/snv-callers/compare_snv_callers_plots.nb.html
+++ b/analyses/snv-callers/compare_snv_callers_plots.nb.html
@@ -2974,7 +2974,7 @@ Setup
Read in set up script.
-
+
# Magrittr pipe
`%>%` <- dplyr::`%>%`
@@ -2983,7 +2983,7 @@ Setup
Set up directories.
-
+
scratch_dir <- file.path("..", "..", "scratch")
plots_dir <- file.path("plots", "pbta-caller-comparison")
@@ -2996,7 +2996,7 @@ Setup
Import custom function for upset plot.
-
+
source(file.path("util", "upset_plot.R"))
@@ -3007,7 +3007,7 @@ Connect to database
Connect to SQLite database.
-
+
# Start up connection
con <- DBI::dbConnect(RSQLite::SQLite(),
file.path(scratch_dir, "snv_db.sqlite"))
@@ -3017,7 +3017,7 @@ Connect to database
Note what columns we will join by.
-
+
join_cols = c("Chromosome",
"Start_Position",
"Reference_Allele",
@@ -3030,7 +3030,7 @@ Connect to database
Set up tables from the database but only call the columns we need.
-
+
strelka <- dplyr::tbl(con, "strelka") %>%
dplyr::select(join_cols, "VAF")
@@ -3048,7 +3048,7 @@ Connect to database
Because DBI
does not support full_join, we had to use a series of left_join
and union_all
calls in order to get a full join of all the callers.
-
+
# This script will do the full join of the data for us
source(file.path("util", "full_join_callers.R"))
@@ -3060,6 +3060,11 @@ Connect to database
# Take a peek at what this looks like
head(all_caller_df)
+
+
+
@@ -3068,7 +3073,7 @@ Get consensus numbers
Make a data.frame with only the VAFs and turn into detected or not.
-
+
detect_mat <- all_caller_df %>%
# Bring over VAF columns and the index
dplyr::select(dplyr::starts_with("VAF_")) %>%
@@ -3085,7 +3090,7 @@ Get consensus numbers
Count up how many callers call each mutation but ignore VarDict.
-
+
# Add up how many callers identify each mutation, but ignore VarDict
consensus_count <- rowSums(detect_mat[, colnames(detect_mat) != "VAF_vardict"])
@@ -3097,11 +3102,16 @@ Get consensus numbers
Print out the numbers for the three callers’ consensus.
-
+
cat(" At least 1 out of 3:", length(consensus_count), "\n",
"At least 2 out of 3:", sum(consensus_count >= 2), "\n",
"3 out of 3:", sum(consensus_count == 3))
+
+ At least 1 out of 3: 13912263
+ At least 2 out of 3: 5262082
+ 3 out of 3: 1223664
+
@@ -3110,30 +3120,38 @@ Upset graphs
Make the upset plot.
-
+
upset_png(detect_mat,
plot_file_path = file.path(plots_dir, "pbta-upset-plot.png"))
+
+png
+ 2
+
-
+
Transcripts only upset plot.
-
+
transcripts <- !(all_caller_df$Variant_Classification %in% c("5'Flank", "3'Flank", "IGR", "Intron"))
upset_png(detect_mat,
plot_file_path = file.path(plots_dir, "pbta-upset-plot-transcipts-only.png"),
subset_vector = transcripts)
+
+png
+ 2
+
-
+
Non transcript upset plot.
@@ -3146,10 +3164,14 @@ Upset graphs
plot_file_path = file.path(plots_dir, "pbta-upset-plot-not-transcipts-only.png"),
subset_vector = not_transcripts)
+
+png
+ 2
+
-
+
@@ -3158,7 +3180,7 @@
VAF distribution for each caller
Create consensus VAF based on strelka’s VAFs and which mutations would be included
-
+
# Make new VAF column based on strelka's VAFs
all_caller_df$VAF_consensus <- all_caller_df$VAF_strelka
@@ -3173,7 +3195,7 @@ VAF distribution for each caller
Make a long form VAF data.frame for exploring mutations VAFs from each caller.
-
+
vaf_df <- all_caller_df %>%
dplyr::select(index, dplyr::starts_with("VAF_"), Variant_Classification) %>%
# Make long format
@@ -3186,7 +3208,7 @@ VAF distribution for each caller
-
+
# Make this plot
vaf_plot <- vaf_df %>%
ggplot2::ggplot(ggplot2::aes(x = caller, y = vaf, color = caller)) +
@@ -3199,15 +3221,21 @@ VAF distribution for each caller
# Print out
vaf_plot
+
+
+
-
+
# Save this plot
ggplot2::ggsave(plot = vaf_plot,
file.path(plots_dir, "pbta-vaf_distribution_plot.png"))
+
+Saving 7 x 5 in image
+
@@ -3216,24 +3244,36 @@ VAF correlations
Correlate VAF’s across callers.
-
+
cor_vaf <- all_caller_df %>%
dplyr::select(dplyr::starts_with("VAF_"), -VAF_consensus) %>%
- GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05))
-
-# Print out without progress bar
+ GGally::ggpairs(mapping = ggplot2::aes(alpha = 0.05))
+
+
+Registered S3 method overwritten by 'GGally':
+ method from
+ +.gg ggplot2
+
+
+# Print out without progress bar
cor_vaf %>%
print(progress = FALSE)
+
+
+
VAF’s are pretty related across callers.
-
+
ggplot2::ggsave(filename = file.path(plots_dir, "pbta-vaf_cor_matrix.png"),
plot = cor_vaf)
+
+Saving 7 x 5 in image
+
@@ -3242,7 +3282,7 @@ Base changes
Summarize base change information per caller.
-
+
# Summarize by the number of times each base change shows up in each category.
frac_change_df <- all_caller_df %>%
# Select only the columns we need
@@ -3280,7 +3320,7 @@ Base changes
Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.
-
+
frac_change_df %>%
ggplot2::ggplot(ggplot2::aes(x = change, y = percent)) +
ggplot2::theme_classic() +
@@ -3293,14 +3333,20 @@ Base changes
ggplot2::ylab("Percent of callers' mutations") +
colorblindr::scale_fill_OkabeIto()
+
+
+
-
+
# Save this plot
ggplot2::ggsave(file.path(plots_dir, "pbta-frac_base_change_plot.png"))
+
+Saving 7 x 5 in image
+
@@ -3309,7 +3355,7 @@ Mutation region barplot
Summarize Variant_Classification
information per caller.
-
+
# Summarize by the number of times each base change shows up in each category.
frac_var_df <- vaf_df %>%
# Summarize the number of mutations per caller
@@ -3323,7 +3369,7 @@ Mutation region barplot
Make a barplot illustrating the percent of the mutations for each caller that that are each type of change.
-
+
frac_var_df %>%
ggplot2::ggplot(ggplot2::aes(x = caller, y = percent)) +
ggplot2::theme_classic() +
@@ -3335,14 +3381,20 @@ Mutation region barplot
ggplot2::xlab("") +
ggplot2::ylab("Fraction of callers' mutations")
+
+
+
-
+
# Save this plot
ggplot2::ggsave(file.path(plots_dir, "pbta-variant_classification_plot.png"))
+
+Saving 7 x 5 in image
+
@@ -3350,9 +3402,47 @@ Mutation region barplot
Session Info
-
+
sessionInfo()
+
+R version 3.6.0 (2019-04-26)
+Platform: x86_64-pc-linux-gnu (64-bit)
+Running under: Debian GNU/Linux 9 (stretch)
+
+Matrix products: default
+BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
+ [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
+ [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
+ [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
+ [9] LC_ADDRESS=C LC_TELEPHONE=C
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
+
+attached base packages:
+[1] stats graphics grDevices utils datasets methods base
+
+loaded via a namespace (and not attached):
+ [1] Rcpp_1.0.1 RColorBrewer_1.1-2 plyr_1.8.4
+ [4] pillar_1.4.2 compiler_3.6.0 dbplyr_1.4.2
+ [7] forcats_0.4.0 base64enc_0.1-3 tools_3.6.0
+[10] digest_0.6.20 bit_1.1-14 jsonlite_1.6
+[13] RSQLite_2.1.1 evaluate_0.14 memoise_1.1.0
+[16] tibble_2.1.3 gtable_0.3.0 pkgconfig_2.0.2
+[19] rlang_0.4.0 GGally_1.4.0 DBI_1.0.0
+[22] yaml_2.2.0 xfun_0.8 gridExtra_2.3
+[25] colorblindr_0.1.0 UpSetR_1.4.0 dplyr_0.8.3
+[28] stringr_1.4.0 knitr_1.23 bit64_0.9-7
+[31] grid_3.6.0 tidyselect_0.2.5 reshape_0.8.8
+[34] glue_1.3.1 R6_2.4.0 rmarkdown_1.13
+[37] reshape2_1.4.3 tidyr_0.8.3 purrr_0.3.2
+[40] ggplot2_3.2.0 blob_1.1.1 magrittr_1.5
+[43] scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1
+[46] colorspace_1.4-1 labeling_0.3 stringi_1.4.3
+[49] lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
+
diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png
index 8e4757076c..7d6b00a7f8 100644
Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-frac_base_change_plot.png differ
diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png
index d31a56fa56..6a5e58a000 100644
Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-not-transcipts-only.png differ
diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png
index d3c7a8ac05..f46cbdc504 100644
Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot-transcipts-only.png differ
diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png
index 585dbd1444..c3f6c8aa06 100644
Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-upset-plot.png differ
diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png
index c7f91e80cc..acfe991f04 100644
Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_cor_matrix.png differ
diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png
index 539c809921..e9eabe4df1 100644
Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-vaf_distribution_plot.png differ
diff --git a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png
index 054656c768..d9d41ca3cb 100644
Binary files a/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png and b/analyses/snv-callers/plots/pbta-caller-comparison/pbta-variant_classification_plot.png differ
diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png
index b4041aeb73..9df684fba9 100644
Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-frac-base-change-plot.png differ
diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png
index cfdb3ff762..3966798c3f 100644
Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-not-transcipts-only.png differ
diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png
index ba483584b5..4a8079c08b 100644
Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot-transcipts-only.png differ
diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png
index f20bab0902..5cb9991d51 100644
Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-upset-plot.png differ
diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png
index ec1c8fa9c5..f3425cc6bf 100644
Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-cor-matrix.png differ
diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png
index d6ce8dc089..fc7d58fbbf 100644
Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-vaf-distribution-plot.png differ
diff --git a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png
index 430ce21ad6..f7686b9c90 100644
Binary files a/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png and b/analyses/snv-callers/plots/tcga-caller-comparison/tcga-variant-classification-plot.png differ
diff --git a/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh b/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh
index 7bc2a38e01..30181f386b 100644
--- a/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh
+++ b/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh
@@ -26,6 +26,7 @@ vaf_cutoff=${OPENPBTA_VAF_CUTOFF:-0}
run_plots_nb=${OPENPBTA_PLOTS:-0}
################################ Set Up Database ################################
+echo "Setting up Database"
python3 analyses/snv-callers/scripts/01-setup_db.py \
--db-file $dbfile \
--strelka-file data/pbta-snv-strelka2.vep.maf.gz \
@@ -35,6 +36,7 @@ python3 analyses/snv-callers/scripts/01-setup_db.py \
--meta-file data/pbta-histologies.tsv
##################### Merge callers' files into total files ####################
+echo "Merging callers"
Rscript analyses/snv-callers/scripts/02-merge_callers.R \
--db_file $dbfile \
--output_file $consensus_file \
@@ -42,12 +44,14 @@ Rscript analyses/snv-callers/scripts/02-merge_callers.R \
--overwrite
########################## Add consensus to db ################################
+echo "Adding consensus to database"
python3 analyses/snv-callers/scripts/01-setup_db.py \
--db-file $dbfile \
--consensus-file $consensus_file
############# Create intersection BED files for TMB calculations ###############
# Make All mutations BED files
+echo "Making intersection bed files"
bedtools intersect \
-a data/WGS.hg38.strelka2.unpadded.bed \
-b data/WGS.hg38.mutect2.vardict.unpadded.bed \
@@ -56,6 +60,7 @@ bedtools intersect \
#################### Make coding regions file
# Convert GTF to BED file for use in bedtools
# Here we are only extracting lines with as a CDS i.e. are coded in protein
+echo "Making CDS bed file"
gunzip -c data/gencode.v27.primary_assembly.annotation.gtf.gz \
| awk '$3 ~ /CDS/' \
| convert2bed --do-not-sort --input=gtf - \
@@ -64,6 +69,7 @@ gunzip -c data/gencode.v27.primary_assembly.annotation.gtf.gz \
> $cds_file
######################### Calculate consensus TMB ##############################
+echo "Calculating TMB"
Rscript analyses/snv-callers/scripts/03-calculate_tmb.R \
--db_file $dbfile \
--output analyses/snv-callers/results/consensus \
@@ -79,5 +85,6 @@ gzip $consensus_file
############################# Comparison Plots #################################
if [ "$run_plots_nb" -gt "0" ]
then
+ echo "Making comparison plots"
Rscript -e "rmarkdown::render('analyses/snv-callers/compare_snv_callers_plots.Rmd', clean = TRUE)"
fi
diff --git a/analyses/snv-callers/scripts/01-setup_db.py b/analyses/snv-callers/scripts/01-setup_db.py
index cc0528669b..ce5e8616c9 100644
--- a/analyses/snv-callers/scripts/01-setup_db.py
+++ b/analyses/snv-callers/scripts/01-setup_db.py
@@ -101,19 +101,6 @@
('Matched_Norm_Sample_Barcode', 'TEXT'),
('Match_Norm_Seq_Allele1', 'TEXT'),
('Match_Norm_Seq_Allele2', 'TEXT'),
- ('Tumor_Validation_Allele1', 'TEXT'),
- ('Tumor_Validation_Allele2', 'TEXT'),
- ('Match_Norm_Validation_Allele1', 'TEXT'),
- ('Match_Norm_Validation_Allele2', 'TEXT'),
- ('Verification_Status', 'TEXT'),
- ('Validation_Status', 'TEXT'),
- ('Mutation_Status', 'TEXT'),
- ('Sequencing_Phase', 'TEXT'),
- ('Sequence_Source', 'TEXT'),
- ('Validation_Method', 'TEXT'),
- ('Score', 'TEXT'),
- ('BAM_File', 'TEXT'),
- ('Sequencer', 'TEXT'),
('Tumor_Sample_UUID', 'TEXT'),
('Matched_Norm_Sample_UUID', 'TEXT'),
('HGVSc', 'TEXT'),
@@ -161,7 +148,6 @@
('AF', 'TEXT'),
('AFR_AF', 'TEXT'),
('AMR_AF', 'TEXT'),
- ('ASN_AF', 'TEXT'),
('EAS_AF', 'TEXT'),
('EUR_AF', 'TEXT'),
('SAS_AF', 'TEXT'),
@@ -180,31 +166,11 @@
('TSL', 'TEXT'),
('HGVS_OFFSET', 'TEXT'),
('PHENO', 'TEXT'),
- ('MINIMISED', 'TEXT'),
- ('ExAC_AF', 'TEXT'),
- ('ExAC_AF_AFR', 'TEXT'),
- ('ExAC_AF_AMR', 'TEXT'),
- ('ExAC_AF_EAS', 'TEXT'),
- ('ExAC_AF_FIN', 'TEXT'),
- ('ExAC_AF_NFE', 'TEXT'),
- ('ExAC_AF_OTH', 'TEXT'),
- ('ExAC_AF_SAS', 'TEXT'),
('GENE_PHENO', 'TEXT'),
('FILTER', 'TEXT'),
('flanking_bps', 'TEXT'),
('vcf_id', 'TEXT'),
('vcf_qual', 'REAL'),
- ('ExAC_AF_Adj', 'TEXT'),
- ('ExAC_AC_AN_Adj', 'TEXT'),
- ('ExAC_AC_AN', 'TEXT'),
- ('ExAC_AC_AN_AFR', 'TEXT'),
- ('ExAC_AC_AN_AMR', 'TEXT'),
- ('ExAC_AC_AN_EAS', 'TEXT'),
- ('ExAC_AC_AN_FIN', 'TEXT'),
- ('ExAC_AC_AN_NFE', 'TEXT'),
- ('ExAC_AC_AN_OTH', 'TEXT'),
- ('ExAC_AC_AN_SAS', 'TEXT'),
- ('ExAC_FILTER', 'TEXT'),
('gnomAD_AF', 'TEXT'),
('gnomAD_AFR_AF', 'TEXT'),
('gnomAD_AMR_AF', 'TEXT'),
@@ -215,9 +181,12 @@
('gnomAD_OTH_AF', 'TEXT'),
('gnomAD_SAS_AF', 'TEXT'),
('vcf_pos', 'INTEGER'),
+ ('HotSpotAllele', 'INTEGER'),
('VAF', 'REAL')
]
+common_cols = [col for col, type in maf_types]
+
needed_cols = [
'Hugo_Symbol',
'Entrez_Gene_Id',
@@ -304,7 +273,9 @@
# process the chunk
chunk['VAF'] = (chunk['t_alt_count'] /
(chunk['t_ref_count'] + chunk['t_alt_count']))
- if table_name not in ('strelka', 'lancet', 'consensus'):
+ if table_name in ('strelka', 'lancet', 'consensus'):
+ chunk = chunk[common_cols]
+ else:
chunk = chunk[needed_cols]
chunk.to_sql(table_name, con, if_exists='append')
# create indexes
diff --git a/analyses/snv-callers/scripts/03-calculate_tmb.R b/analyses/snv-callers/scripts/03-calculate_tmb.R
index 2778ecd747..abb091d401 100644
--- a/analyses/snv-callers/scripts/03-calculate_tmb.R
+++ b/analyses/snv-callers/scripts/03-calculate_tmb.R
@@ -284,8 +284,8 @@ strelka_mutect_maf_df <- strelka_mutect_maf_df %>%
),
by = "Tumor_Sample_Barcode"
) %>%
- # Remove samples if they are labeled "Panel"
- dplyr::filter(experimental_strategy != "Panel")
+ # Remove samples if they are not WGS or WXS
+ dplyr::filter(experimental_strategy %in% c("WGS", "WXS"))
############################# Set Up BED Files #################################
# Make a data.frame of the unique BED file paths and their names