diff --git a/.circleci/config.yml b/.circleci/config.yml index bee4586a58..343c04f91a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,14 +28,14 @@ jobs: - run: name: fusion filtering command: ./scripts/run_in_ci.sh Rscript analyses/fusion_filtering/01-fusion-standardization.R -f "data/pbta-fusion-starfusion.tsv.gz" -c "STARfusion" -o "scratch/star.RDS" - + - run: name: Transcriptome dimensionality reduction command: ./scripts/run_in_ci.sh ./analyses/transcriptomic-dimension-reduction/ci-dimension-reduction-plots.sh - + - run: - name: SNV Caller Set Up - command: ./scripts/run_in_ci.sh Rscript analyses/snv-callers/scripts/00-set_up.R + name: SNV Caller Analysis + command: ./scripts/run_in_ci.sh bash analyses/snv-callers/run_caller_evals.sh #### Add your analysis here #### diff --git a/analyses/snv-callers/README.md b/analyses/snv-callers/README.md index dcb23229d6..fdd42f9674 100644 --- a/analyses/snv-callers/README.md +++ b/analyses/snv-callers/README.md @@ -4,6 +4,8 @@ This analysis evaluates [MAF files](https://docs.gdc.cancer.gov/Data/File_Format The GDC has [good documentation on the fields](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) contained in a standard MAF file. **Table of Contents** +* [How to run this pipeline](#how-to-run-this-pipeline) +* [General usage of scripts](#general-usage-of-scripts) * [Individual caller evaluation](#individual-caller-evaluation) * [Base change analysis](#base-change-analysis) * [Variant allele fraction calculation](#variant-allele-fraction-calculation) @@ -14,7 +16,95 @@ The GDC has [good documentation on the fields](https://docs.gdc.cancer.gov/Data/ * [Mutation IDs](#mutation-ids) * [Overall file structure](#overall-file-structure) * [Summary of functions](#summary-of-custom-functions) -## Individual Caller Evaluation + +## How to run this pipeline + +**Run evaluations of each MAF file** + +To run the initial evaluations of all the SNV callers, call the bash script: +``` +bash run_caller_evals.sh +``` +This script will return results for each caller in the `plots` and `results` folder. +To see an overall summary report, look in the `results` folder for that caller. +(See [Overall File Structure](#overall-file-structure) for more details on +everything that is returned.) + +**Run comparison analysis of the callers** + +*Coming soon* + +## General usage of scripts + +**Overall notes about these scripts:** +- The scripts are sequential as noted by their number. +- All file path-related options assume the file path given is relative to `OpenPBTA-analysis`. +- By default, the scripts will not overwrite existing files of the same name. However, +this can be overridden with `--overwrite` option. + +### 00-set_up.R + +00-set_up.R creates the [annotation RDS file](#genomic-regional-analyses) and [COSMIC mutations file](#cosmic-mutation-overlap) that are used by +the subsequent scripts. +This set up script only needs to be run once and its three options are all relating to where the reference files should be stored. + +**Option descriptions** +``` + --annot_rds : File path to where you would like the annotation_rds file to be + stored + --cosmic_og : Path to original COSMIC file. Can be .gz compressed. Will need to + download this from COSMIC at https://cancer.sanger.ac.uk/cosmic/download + These data are available if you register. + --cosmic_clean : File path specifying where you would like the cleaned brain-related + COSMIC mutations file to be stored. This file is provided to you in + GitHub. Only coordinates are needed. +``` + +### 01-calculate_vaf_tmb.R + +This script sets up the given MAF file and outputs three files ([VAF](#variant-allele-fraction-calculation), +[TMB](#tumor-mutation-burden-calculation), and [Regional analysis](#genomic-regional-analyses)) +that are used to make an overall evaluation report in `02-run_eval.R`. + +**Option descriptions** +``` + -label : Label to be used for folder and all output. eg. 'strelka2'. Default is 'maf'. + -output : File path that specifies the folder where the output should go. + New folder will be created if it doesn't exist. + --maf : Relative file path to MAF file to be analyzed. Can be .gz compressed. + --metadata : Relative file path to original metadata file. + --annot_rds : Relative file path to annotation object RDS file to be analyzed. + --bed_wgs : File path that specifies the caller-specific BED regions file. + --bed_wxs : File path that specifies the WXS BED regions file. + --overwrite : If specified, will overwrite any files of the same name. Default is FALSE. +``` + +### 02-run_eval.R + +This script takes the files output by `01-calculate_vaf_tmb.R` and makes five +plots ([base_change](#base-change-analysis), [depth_vs_vaf](#variant-allele-fraction-calculation), [cosmic_plot](#cosmic-mutation-overlap), [snv_region](#genomic-regional-analyses), and [tmb_plot](#tumor-mutation-burden-calculation)) that are put into an overall report. + +**Option descriptions** +``` +# --label : Label to be used for folder and all output. eg. 'strelka2'. Optional. +# Default is 'maf' +# --plot_type : Specify what kind of plots you want printed out. Must be +# compatible with ggsave. eg pdf. Default is png +# --vaf : Folder from 01-calculate_vaf_tmb.R following files: +# _vaf.tsv +# _region.tsv +# _tmb.tsv +# --output : Where you would like the output from this script to be stored. +# --strategy : Specify whether you would like WXS and WGS separated for the plots. +# Analysis is still done on all data in the MAF file regardless. +# Acceptable options are 'wgs', 'wxs' or 'both', both for if you +# don't want to separate them. Default is both. +# --cosmic : Relative file path to COSMIC file to be analyzed. +# --overwrite : If TRUE, will overwrite any reports of the same name. Default is +# FALSE +``` + +# Individual Caller Evaluation The first step in this analysis is an individual evaluation of each MAF from each caller. This analysis prints out a report that is further subdivided by the WGS and WXS samples. @@ -45,7 +135,7 @@ This is following the [code used in `maftools`](https://github.com/PoisonAlien/maftools/blob/1d0270e35c2e0f49309eba08b62343ac0db10560/R/plot_vaf.R#L39). The VAF calculations and other special variables are added to the MAF fields and written to a TSV ending in `_vaf.tsv` in the caller's results folder. - *Output for this analysiss* + *Output for this analysis* * `results//_vaf.tsv` * `plots//__depth_vs_vaf.png` @@ -87,7 +177,7 @@ The sample-wise TMB calculations written to a TSV ending in `_tmb.tsv` in the ca The COSMIC mutation data were obtained from https://cancer.sanger.ac.uk/cosmic/download *To run this analysis, you need to obtain these data.* The full, unfiltered somatic mutations file `CosmicMutantExport.tsv` for grch38 is used here and the genomic coordinates is arranged to be in BED format. -The COSMIC set is unfiltered down to only mutations detected in brain-related +The COSMIC set is filtered down to only mutations detected in brain-related samples using the `Site subtype 1` field. COSMIC mutations are overlapped with the present data's mutations using `GenomicRanges`. The outcome of this overlap is added to the VAF data.frame with two `TRUE/FALSE` columns: diff --git a/analyses/snv-callers/run_caller_evals.sh b/analyses/snv-callers/run_caller_evals.sh new file mode 100644 index 0000000000..89f0f9cc46 --- /dev/null +++ b/analyses/snv-callers/run_caller_evals.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# C. Savonen +# CCDL for ALSF 2019 + +# Purpose:Run an intial evaluation of each variant caller's MAF file + +# Change directory +cd kitematic + +# The files named in these arrays will be ran in the analysis. +datasets=("strelka2" "mutect2" "lancet") +wgs_files=("WGS.hg38.strelka2.unpadded.bed" "WGS.hg38.mutect2.unpadded.bed" "WGS.hg38.lancet.300bp_padded.bed") + +# Reference file paths +cosmic=analyses/snv-callers/brain_cosmic_variants_coordinates.tsv +annot_rds=scratch/hg38_genomic_region_annotation.rds + +############################ Set Up Reference Files ############################ +# The original COSMIC file is obtained from: https://cancer.sanger.ac.uk/cosmic/download +# These data are available if you register. The full, unfiltered somatic mutations +# file CosmicMutantExport.tsv.gz for grch38 is used here. +Rscript analyses/snv-callers/scripts/00-set_up.R \ + --annot_rds $annot_rds \ + --cosmic_og scratch/CosmicMutantExport.tsv.gz \ + --cosmic_clean $cosmic + +########################## Calculate and Set Up Data ########################## +# Create files that contain calculated VAF, TMB, and regional analyses. +for ((i=0;i<${#datasets[@]};i++)); +do + echo "Processing dataset: ${datasets[$i]}" + Rscript analyses/snv-callers/scripts/01-calculate_vaf_tmb.R \ + --label ${datasets[$i]} \ + --output analyses/snv-callers/results/${datasets[$i]} \ + --maf data/pbta-snv-${datasets[$i]}.vep.maf.gz \ + --metadata data/pbta-histologies.tsv \ + --bed_wgs data/${wgs_files[$i]} \ + --bed_wxs data/WXS.hg38.100bp_padded.bed \ + --annot_rds $annot_rds \ + --overwrite +done diff --git a/analyses/snv-callers/scripts/00-set_up.R b/analyses/snv-callers/scripts/00-set_up.R index 6cf1dce3b2..9c57ca541e 100644 --- a/analyses/snv-callers/scripts/00-set_up.R +++ b/analyses/snv-callers/scripts/00-set_up.R @@ -1,22 +1,30 @@ -# SNV Caller set up and functions -# +# SNV-caller set up # 2019 # C. Savonen for ALSF - CCDL # -# Purpose: Set up for analyzing MAF files +# Purpose: Set up for reference files that are used by 01-calculate_vaf_tmb.R +# script. +# +# Option descriptions +# --annot_rds : File path to where you would like the annotation_rds file to be +# stored +# --cosmic_og : Path to original COSMIC file. Can be .gz compressed. Give path relative +# to top directory, 'OpenPBTA-analysis'. Will need to download this from +# COSMIC at https://cancer.sanger.ac.uk/cosmic/download. +# These data are available if you register. +# --cosmic_clean : File path specifying where you would like the cleaned brain-related +# COSMIC mutations file to be stored. # +#################################### Set Up #################################### # Establish base dir root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) -# Import special functions -source(file.path(root_dir, "analyses", "snv-callers", "util", "wrangle_functions.R")) -source(file.path(root_dir, "analyses", "snv-callers", "util", "plot_functions.R")) - -#################################### Set Up #################################### - # Magrittr pipe `%>%` <- dplyr::`%>%` +# Load the optparse library +library(optparse) + # Will need optparse for collecting options if (!("optparse" %in% installed.packages())) { install.packages("optparse", repos = "http://cran.us.r-project.org") @@ -34,50 +42,46 @@ if (!("data.table" %in% installed.packages())) { install.packages("data.table", repos = "http://cran.us.r-project.org") } -##################### Set base directories common file paths ################### -# Declare base directory names -scratch_dir <- file.path(root_dir, "scratch") -data_dir <- file.path(root_dir, "data") -snv_dir <- file.path(root_dir, "analyses", "snv-callers") - -# Directories that we will need -base_results_dir <- file.path(snv_dir, "results") -base_plots_dir <- file.path(snv_dir, "plots") -cosmic_dir <- file.path(snv_dir, "cosmic") - -# Create these folders if they haven't been created yet -if (!dir.exists(base_results_dir)) { - dir.create(base_results_dir) -} -if (!dir.exists(base_plots_dir)) { - dir.create(base_plots_dir) -} -if (!dir.exists(cosmic_dir)) { - dir.create(cosmic_dir) -} - -# Declare reference file names -original_metadata <- file.path(data_dir, "pbta-histologies.tsv") - -# These data were obtained from https://cancer.sanger.ac.uk/cosmic/download -# These data are available if you register. -# The full, unfiltered somatic mutations file CosmicMutantExport.tsv.gz for grch38 -# is used here. -cosmic_file <- file.path(snv_dir, "CosmicMutantExport.tsv.gz") - -# This is the cleaned version that only contains the genomic coordinates and -# base changes from the original file for brain sample mutations only. -cosmic_clean_file <- file.path(cosmic_dir, "brain_cosmic_variants_coordinates.tsv") +################################ Set up options ################################ +# Set up optparse options +option_list <- list( + make_option( + opt_str = "--cosmic_og", type = "character", default = "none", + help = "Relative file path (assuming from top directory of + 'OpenPBTA-analysis') to where the COSMIC mutation file is stored. + Will need to download this from COSMIC at + https://cancer.sanger.ac.uk/cosmic/download + These data are available if you register.", + metavar = "character" + ), + make_option( + opt_str = "--cosmic_clean", type = "character", default = "none", + help = "Relative file path (assuming from top directory of + 'OpenPBTA-analysis') where you would like the cleaned COSMIC file + to be stored.", + metavar = "character" + ), + make_option( + opt_str = "--annot_rds", type = "character", + default = "none", help = "Relative file path (assuming from top + directory of 'OpenPBTA-analysis') that specifies where you would + like the annotation_rds file to be stored.", + metavar = "character" + ) +) -# Will set up this annotation file below -annot_rds <- file.path(scratch_dir, "hg38_genomic_region_annotation.rds") +# Parse options +opt <- parse_args(OptionParser(option_list = option_list)) -# This will be used for all WXS samples, but WGS bed regions are caller-specific -wxs_bed_file <- file.path(data_dir, "WXS.hg38.100bp_padded.bed") +##################### Set base directories common file paths ################### +# Make all base names relative to root_dir +opt$cosmic_og <- file.path(root_dir, opt$cosmic_og) +opt$cosmic_clean <- file.path(root_dir, opt$cosmic_clean) +opt$annot_rds <- file.path(root_dir, opt$annot_rds) ################################ Build Annotation ############################## # We will only run this if it hasn't been run before -if (!file.exists(annot_rds)) { +if (opt$annot_rds != "none") { # Print progress message message("Setting up genomic region annotation file. Only need to do this once.") @@ -98,7 +102,7 @@ if (!file.exists(annot_rds)) { # Write this object so we don't have to write it again readr::write_rds(annotations, - annot_rds, + opt$annot_rds, compress = "gz" ) } @@ -108,37 +112,41 @@ if (!file.exists(annot_rds)) { # if you register. The full, unfiltered somatic mutations file # CosmicMutantExport.tsv for grch38 is used here but only mutations from brain # related samples are kept. -if (!file.exists(cosmic_clean_file)) { - # Print progress message - message("Setting up COSMIC mutation file. Only need to do this once.") - - # Read in original file - cosmic_variants <- data.table::fread(cosmic_file, - data.table = FALSE - ) %>% - # Keep only brain mutations so the file is smaller - dplyr::filter(`Site subtype 1` == "brain") - # Get rid of spaces in column names - cosmic_variants <- cosmic_variants %>% - dplyr::rename_all(dplyr::funs(stringr::str_replace_all(., " ", "_"))) %>% - # Separate the genome coordinates into their own BED like variables - dplyr::mutate( - Chromosome = paste0( - "chr", - stringr::word(Mutation_genome_position, sep = ":", 1) - ), - Start_Position = stringr::word(Mutation_genome_position, sep = ":|-", 1), - End_Position = stringr::word(Mutation_genome_position, sep = "-", 2), - # Make a base_change variable so we can compare to our set up for PBTA data - base_change = substr(Mutation_CDS, nchar(Mutation_CDS) - 2, nchar(Mutation_CDS)) - ) %>% - # Carry over the strand info, but rename to match our PBTA set up - dplyr::rename(Strand = Mutation_strand) %>% - # Narrow down to just the needed columns - dplyr::select( - "Chromosome", "Start_Position", "End_Position", "Strand", - "base_change" +if (opt$cosmic_clean != "none") { + if (file.exists(opt$cosmic_og)) { + # Print progress message + message("Setting up COSMIC mutation file. Only need to do this once.") + + # Read in original file + cosmic_variants <- data.table::fread(opt$cosmic_og, + data.table = FALSE ) %>% - # Write to a TSV file - readr::write_tsv(cosmic_clean_file) + # Keep only brain mutations so the file is smaller + dplyr::filter(`Site subtype 1` == "brain") + # Get rid of spaces in column names + cosmic_variants <- cosmic_variants %>% + dplyr::rename_all(dplyr::funs(stringr::str_replace_all(., " ", "_"))) %>% + # Separate the genome coordinates into their own BED like variables + dplyr::mutate( + Chromosome = paste0( + "chr", + stringr::word(Mutation_genome_position, sep = ":", 1) + ), + Start_Position = stringr::word(Mutation_genome_position, sep = ":|-", 1), + End_Position = stringr::word(Mutation_genome_position, sep = "-", 2), + # Make a base_change variable so we can compare to our set up for PBTA data + base_change = substr(Mutation_CDS, nchar(Mutation_CDS) - 2, nchar(Mutation_CDS)) + ) %>% + # Carry over the strand info, but rename to match our PBTA set up + dplyr::rename(Strand = Mutation_strand) %>% + # Narrow down to just the needed columns + dplyr::select( + "Chromosome", "Start_Position", "End_Position", "Strand", + "base_change" + ) %>% + # Write to a TSV file + readr::write_tsv(opt$cosmic_clean) + } else { + stop("Original COSMIC Mutation file not found. Specify with --cosmic_og") + } } diff --git a/analyses/snv-callers/scripts/01-calculate_vaf_tmb.R b/analyses/snv-callers/scripts/01-calculate_vaf_tmb.R new file mode 100644 index 0000000000..4100f330d0 --- /dev/null +++ b/analyses/snv-callers/scripts/01-calculate_vaf_tmb.R @@ -0,0 +1,294 @@ +# Run variant caller evaluation for a given MAF file. +# +# C. Savonen for ALSF - CCDL +# +# 2019 +# +# Option descriptions +# +# -label : Label to be used for folder and all output. eg. 'strelka2'. Default is 'maf'. +# -output : File path that specifies the folder where the output should go. +# New folder will be created if it doesn't exist. Assumes file path is +# given from top directory of 'OpenPBTA-analysis'. +# --maf : Relative file path to MAF file to be analyzed. Can be .gz compressed. +# Assumes file path is given from top directory of 'OpenPBTA-analysis'. +# --metadata : Relative file path to MAF file to be analyzed. Can be .gz compressed. +# Assumes file path is given from top directory of 'OpenPBTA-analysis'. +# --annot_rds : Relative file path to annotation object RDS file to be analyzed. +# Assumes file path is given from top directory of 'OpenPBTA-analysis'. +# --bed_wgs : File path that specifies the caller-specific BED regions file. +# Assumes from top directory, 'OpenPBTA-analysis'. +# --bed_wxs : File path that specifies the WXS BED regions file. Assumes file path +# is given from top directory of 'OpenPBTA-analysis' +# --overwrite : If specified, will overwrite any files of the same name. Default is FALSE. +# +# Command line example: +# +# Rscript analyses/snv-callers/scripts/01-calculate_vaf_tmb.R \ +# --label strelka2 \ +# --output analyses/snv-callers/results \ +# --maf scratch/snv_dummy_data/strelka2 \ +# --metadata data/pbta-histologies.tsv \ +# --bed_wgs data/WGS.hg38.mutect2.unpadded.bed \ +# --bed_wxs data/WXS.hg38.100bp_padded.bed \ +# --annot_rds scratch/hg38_genomic_region_annotation.rds + +################################ Initial Set Up ################################ +# Establish base dir +root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) + +# Import special functions +source(file.path(root_dir, "analyses", "snv-callers", "util", "wrangle_functions.R")) + +# Magrittr pipe +`%>%` <- dplyr::`%>%` + +# Load library: +library(optparse) + +################################ Set up options ################################ +# Set up optparse options +option_list <- list( + make_option( + opt_str = c("-l", "--label"), type = "character", + default = "maf", help = "Label to be used for folder and all + output. eg. 'strelka2'. Default is 'maf'", + metavar = "character" + ), + make_option( + opt_str = c("-o", "--output"), type = "character", default = "none", + help = "File path that specifies the folder where the output should + go. Assumes from top directory, 'OpenPBTA-analysis'. New folder + will be created if it doesn't exist.", + metavar = "character" + ), + make_option( + opt_str = "--maf", type = "character", default = "none", + help = "Relative file path (assuming from top directory of + 'OpenPBTA-analysis') to MAF file to be analyzed. Can be .gz compressed.", + metavar = "character" + ), + make_option( + opt_str = "--metadata", type = "character", default = "none", + help = "Relative file path (assuming from top directory of + 'OpenPBTA-analysis') to MAF file to be analyzed. Can be .gz compressed.", + metavar = "character" + ), + make_option( + opt_str = c("-a", "--annot_rds"), type = "character", default = "none", + help = "Relative file path (assuming from top directory of + 'OpenPBTA-analysis') to annotation object RDS file to be analyzed.", + metavar = "character" + ), + make_option( + opt_str = "--bed_wgs", type = "character", default = "none", + help = "File path that specifies the caller-specific + BED regions file. Assumes from top directory, 'OpenPBTA-analysis'", + metavar = "character" + ), + make_option( + opt_str = "--bed_wxs", type = "character", default = "none", + help = "File path that specifies the WXS BED regions file. Assumes + from top directory, 'OpenPBTA-analysis'", + metavar = "character" + ), + make_option( + opt_str = "--overwrite", action = "store_true", + default = FALSE, help = "If TRUE, will overwrite any files of + the same name. Default is FALSE", + metavar = "character" + ) +) + +# Parse options +opt <- parse_args(OptionParser(option_list = option_list)) + +########### Check that the files we need are in the paths specified ############ +needed_files <- c(opt$maf, opt$metadata, opt$bed_wgs, opt$bed_wxs, opt$annot_rds, + opt$cosmic) + +# Add root directory to the file paths +needed_files <- file.path(root_dir, needed_files) + +# Get list of which files were found +files_found <- file.exists(needed_files) + +# Report error if any of them aren't found +if (!all(files_found)) { + stop(paste("\n Could not find needed file(s):", + needed_files[which(!files_found)], + "Check your options and set up.", + sep = "\n" + )) +} + +################## Create output directories for this caller ################## +# Caller specific results directory path +caller_results_dir <- file.path(root_dir, opt$output) + +# Make caller specific results folder +if (!dir.exists(caller_results_dir)) { + dir.create(caller_results_dir, recursive = TRUE) +} + +####################### File paths for files we will create #################### +vaf_file <- file.path(caller_results_dir, paste0(opt$label, "_vaf.tsv")) +region_annot_file <- file.path(caller_results_dir, paste0(opt$label, "_region.tsv")) +tmb_file <- file.path(caller_results_dir, paste0(opt$label, "_tmb.tsv")) + +# Declare metadata file name for this caller +metadata_file <- file.path( + caller_results_dir, + paste0(opt$label, "_metadata_filtered.tsv") +) + +##################### Check for files if overwrite is FALSE #################### +# If overwrite is set to FALSE, check if these exist before continuing +if (!opt$overwrite) { + # Make a list of the output files + output_files <- c(vaf_file, region_annot_file, tmb_file) + + # Find out which of these exist + existing_files <- file.exists(output_files) + + # If all files exist; stop + if (all(existing_files)) { + stop(cat( + "Stopping; --overwrite is not being used and all output files already exist: \n", + vaf_file, "\n", + region_annot_file, "\n", + tmb_file + )) + } + # If some files exist, print a warning: + if (any(existing_files)) { + warning(cat( + "Some output files already exist and will not be overwritten unless you use --overwrite: \n", + paste0(output_files[which(existing_files)], "\n") + )) + } +} + +########################### Set up this caller's data ########################## +# Print progress message +message(paste("Reading in", opt$maf, "MAF data...")) + +# Read in this MAF, skip the version number +maf_df <- data.table::fread(opt$maf, skip = 1, data.table = FALSE) + +# Print progress message +message(paste("Setting up", opt$label, "metadata...")) + +# Isolate metadata to only the samples that are in the datasets +metadata <- readr::read_tsv(opt$metadata) %>% + dplyr::filter(Kids_First_Biospecimen_ID %in% maf_df$Tumor_Sample_Barcode) %>% + dplyr::distinct(Kids_First_Biospecimen_ID, .keep_all = TRUE) %>% + dplyr::arrange() %>% + dplyr::rename(Tumor_Sample_Barcode = Kids_First_Biospecimen_ID) %>% + readr::write_tsv(metadata_file) + +# Print out completion message +message(paste("Filtered metadata file saved to: \n", metadata_file)) + +# Make sure that we have metadata for all these samples. +if (!all(unique(maf_df$Tumor_Sample_Barcode) %in% metadata$Tumor_Sample_Barcode)) { + stop("There are samples in this MAF file that are not in the metadata.") +} + +################## Calculate VAF and set up other variables #################### +# If the file exists or the overwrite option is not being used, calculate VAF +if (file.exists(vaf_file) && !opt$overwrite) { + # Stop if this file exists and overwrite is set to FALSE + warning(cat( + "The VAF file already exists: \n", + vaf_file, "\n", + "Use --overwrite if you want to overwrite it." + )) +} else { + # Print out warning if this file is going to be overwritten + if (file.exists(vaf_file)) { + warning("Overwriting existing VAF file.") + } + # Print out progress message + message(paste("Calculating VAF for", opt$label, "MAF data...")) + + # Use the premade function to calculate VAF this will also merge the metadata + vaf_df <- set_up_maf(maf_df, metadata) %>% + readr::write_tsv(vaf_file) + + # Print out completion message + message(paste("VAF calculations saved to: \n", vaf_file)) +} +######################### Annotate genomic regions ############################# +# If the file exists or the overwrite option is not being used, run regional annotation analysis +if (file.exists(region_annot_file) && !opt$overwrite) { + # Stop if this file exists and overwrite is set to FALSE + warning(cat( + "The regional annotation file already exists: \n", + region_annot_file, "\n", + "Use --overwrite if you want to overwrite it." + )) +} else { + # Print out warning if this file is going to be overwritten + if (file.exists(vaf_file)) { + warning("Overwriting existing regional annotation file.") + } + # Print out progress message + message(paste("Annotating genomic regions for", opt$label, "MAF data...")) + + # Annotation genomic regions + maf_annot <- annotr_maf(vaf_df, annotation_file = opt$annot_rds) %>% + readr::write_tsv(region_annot_file) + + # Print out completion message + message(paste("Genomic region annotations saved to:", region_annot_file)) +} +############################# Calculate TMB #################################### +# If the file exists or the overwrite option is not being used, run TMB calculations +if (file.exists(region_annot_file) && !opt$overwrite) { + # Stop if this file exists and overwrite is set to FALSE + warning(cat( + "The Tumor Mutation Burden file already exists: \n", + tmb_file, "\n", + "Use --overwrite if you want to overwrite it." + )) +} else { + # Print out warning if this file is going to be overwritten + if (file.exists(vaf_file)) { + warning("Overwriting existing TMB file.") + } + # Print out progress message + message(paste("Calculating TMB for", opt$label, "MAF data...")) + + # Set up BED region files for TMB calculations + wgs_bed <- readr::read_tsv(opt$bed_wgs, col_names = FALSE) + wxs_bed <- readr::read_tsv(opt$bed_wxs, col_names = FALSE) + + # Calculate size of genome surveyed + wgs_genome_size <- sum(wgs_bed[, 3] - wgs_bed[, 2]) + wxs_exome_size <- sum(wxs_bed[, 3] - wxs_bed[, 2]) + + # Print out these genome sizes + cat( + " WGS size in bp:", wgs_genome_size, + "\n", + "WXS size in bp:", wxs_exome_size, + "\n" + ) + + # Only do this step if you have WXS samples + if (any(metadata$experimental_strategy == "WXS")) { + # Filter out mutations for WXS that are outside of these BED regions. + vaf_df <- wxs_bed_filter(vaf_df, wxs_bed_file = opt$bed_wxs) + } + + # Calculate TMBs and write to TMB file + tmb_df <- calculate_tmb(vaf_df, + wgs_size = wgs_genome_size, + wxs_size = wxs_exome_size + ) %>% + readr::write_tsv(tmb_file) + + # Print out completion message + message(paste("TMB calculations saved to:", tmb_file)) +} diff --git a/analyses/snv-callers/util/wrangle_functions.R b/analyses/snv-callers/util/wrangle_functions.R index 59487472dc..f0ca1948ec 100644 --- a/analyses/snv-callers/util/wrangle_functions.R +++ b/analyses/snv-callers/util/wrangle_functions.R @@ -29,8 +29,7 @@ set_up_maf <- function(maf_df, metadata_df = NULL) { maf_df <- maf_df %>% dplyr::mutate( # Calculate the variant allele frequency - vaf = as.numeric(t_alt_count) / (as.numeric(t_ref_count) + - as.numeric(t_alt_count)), + vaf = as.numeric(t_alt_count) / (as.numeric(t_ref_count) + as.numeric(t_alt_count)), # Create a base_change variable base_change = paste0(Reference_Allele, ">", Allele), @@ -69,7 +68,7 @@ set_up_maf <- function(maf_df, metadata_df = NULL) { if (!is.null(metadata_df)) { # Tack on the metadata so we have this info maf_df <- maf_df %>% - dplyr::left_join(metadata, by = "Tumor_Sample_Barcode") %>% + dplyr::left_join(metadata_df, by = "Tumor_Sample_Barcode") %>% # Get rid of any variables that have completely NAs. dplyr::select(-which(apply(is.na(.), 2, all))) } @@ -168,8 +167,8 @@ wxs_bed_filter <- function(maf_df, wxs_bed_file = NULL, bp_window = 0) { # What fraction of mutations are in these bed regions? cat( - "Ratio of variants in this BED:", ratio, - "Ratio of variants being filtered out:", 1 - ratio + "Ratio of variants in this BED:", ratio, "\n", + "Ratio of variants being filtered out:", 1 - ratio, "\n" ) # Only keep those in the BED regions that overlap the `wxs_bed_granges` @@ -326,7 +325,7 @@ find_cosmic_overlap <- function(maf_df, cosmic_clean_file, bp_window = 0) { # What fraction of mutations are in these bed regions? cat( " Ratio of variants overlapping with COSMIC:", ratio, "\n", - "Number of mutations with same base_change:", sum(same_change) + "Number of mutations with same base_change:", sum(same_change), "\n" ) # Make this a new column