AlexsLemonade · jaclyn-taroni · May 24, 2022 · Mar 24, 2022 · Apr 7, 2022 · Apr 12, 2022
diff --git a/analyses/focal-cn-file-preparation/README.md b/analyses/focal-cn-file-preparation/README.md
@@ -10,7 +10,7 @@ The purpose of this module is to map from those ranges to gene identifiers for c
 
 To run this analysis _only on consensus SEG file_, 
 
-use OPENPBTA_BASE_SUBTYPING=1 to run this module using the pbta-histologies-base.tsv from data folder and relative path to `copy_number_consensus_call/results/pbta-cnv-consensus.seg.gz` while running molecular-subtyping modules for release.
+use OPENPBTA_BASE_SUBTYPING=1 to run this module using the `pbta-histologies-base.tsv` from data folder and relative path to `copy_number_consensus_call/results/pbta-cnv-consensus.seg.gz` while running molecular-subtyping modules for release.
 
 ```
 OPENPBTA_BASE_SUBTYPING=1 bash analyses/focal-cn-file-preparation/run-prepare-cn.sh

diff --git a/analyses/focal-cn-file-preparation/run-prepare-cn.sh b/analyses/focal-cn-file-preparation/run-prepare-cn.sh
@@ -12,8 +12,8 @@ RUN_ORIGINAL=${RUN_ORIGINAL:-0}
 # Run testing files for circle CI - will not by default
 IS_CI=${OPENPBTA_TESTING:-0}
 
-# Run for subtyping
-RUN_FOR_SUBTYPING=${OPENPBTA_BASE_SUBTYPING:-0}
+# Run for data release
+RUN_FOR_RELEASE=${OPENPBTA_BASE_RELEASE:-0}
 
 # This script should always run as if it were being called from
 # the directory it lives in.
@@ -26,7 +26,7 @@ scratch_dir=../../scratch
 data_dir=../../data
 results_dir=../../analyses/focal-cn-file-preparation/results
 
-if [[ RUN_FOR_SUBTYPING == "0" ]]
+if [[ "$RUN_FOR_RELEASE" -eq "0" ]]
 then
    histologies_file="${data_dir}/pbta-histologies.tsv"
 else
@@ -38,7 +38,7 @@ goi_file=../../analyses/oncoprint-landscape/driver-lists/brain-goi-list-long.txt
 independent_specimens_file=${data_dir}/independent-specimens.wgswxs.primary.tsv
 
 # Prep the consensus SEG file data
-Rscript --vanilla -e "rmarkdown::render('02-add-ploidy-consensus.Rmd',params=list(base_run = $RUN_FOR_SUBTYPING), clean = TRUE)"
+Rscript --vanilla -e "rmarkdown::render('02-add-ploidy-consensus.Rmd',params=list(base_run = $RUN_FOR_RELEASE), clean = TRUE)"
 
 # Run snakemake script implementing `bedtools coverage` for each sample bed file in
 # `scratch/cytoband_status` -- these files are generated in

diff --git a/analyses/fusion_filtering/README.md b/analyses/fusion_filtering/README.md
@@ -48,12 +48,15 @@ The code to generate genelistreference.txt and fusionreference.txt is available
 * pbta-fusion-recurrently-fused-genes-bysample.tsv
 
 ### Run script
-use OPENPBTA_BASE_SUBTYPING=1 to run this module using the pbta-histologies-base.tsv from data folder while running molecular-subtyping modules for release.
+
+Use `OPENPBTA_BASE_RELEASE=1` to run this module using the `pbta-histologies-base.tsv` from data folder while running for release:
+
 ```sh
-OPENPBTA_BASE_SUBTYPING=1 run_fusion_merged.sh 
+OPENPBTA_BASE_RELEASE=1 bash run_fusion_merged.sh 
 ```
 
-OR by default uses pbta-histologies.tsv from data folder
+OR by default uses `pbta-histologies.tsv` from `data` folder
+
 ```sh
 bash run_fusion_merged.sh
 ```

diff --git a/analyses/fusion_filtering/run_fusion_merged.sh b/analyses/fusion_filtering/run_fusion_merged.sh
@@ -3,14 +3,13 @@
 # K S Gaonkar
 
 # Run fusion_filtering
-# Takes one environment variable, `OPENPBTA_BASE_SUBTYPING`, if value is 1 then
-# uses pbta-histologies-base.tsv for subtyping if value is 0 runs all modules (Default)
-# with pbta-histologies.tsv
+# Takes one environment variable, `OPENPBTA_BASE_RELEASE`, if value is 1 then
+# uses pbta-histologies-base.tsv, else runs all module with pbta-histologies.tsv (Default)
 
 set -e
 set -o pipefail
 
-RUN_FOR_SUBTYPING=${OPENPBTA_BASE_SUBTYPING:-0}
+RUN_FOR_RELEASE=${OPENPBTA_BASE_RELEASE:-0}
 
 # This script should always run as if it were being called from
 # the directory it lives in.
@@ -46,7 +45,7 @@ stranded_expression_file="${data_path}/pbta-gene-expression-rsem-fpkm.stranded.r
 normal_expression_file="${references_path}/Brain_FPKM_hg38_matrix.txt.zip"
 
 # metadata files
-if [[ RUN_FOR_SUBTYPING == "0" ]]
+if [[ "$RUN_FOR_RELEASE" -eq "0" ]]
 then
    histologies_file="${data_path}/pbta-histologies.tsv" 
    independent_samples_file="${data_path}/independent-specimens.wgswxs.primary-plus.tsv"
@@ -107,10 +106,10 @@ Rscript 03-Calc-zscore-annotate.R --standardFusionCalls "${scratch_path}/standar
                                   --outputfile "${scratch_path}/standardFusionStrandedExp_QC_expression"
 
 # Project specific filtering
-Rscript -e "rmarkdown::render('04-project-specific-filtering.Rmd',params=list(base_run = $RUN_FOR_SUBTYPING))"
+Rscript -e "rmarkdown::render('04-project-specific-filtering.Rmd',params=list(base_run = $RUN_FOR_RELEASE))"
 
 # QC filter putative oncogene found in more than 4 histologies
-Rscript -e "rmarkdown::render('05-QC_putative_onco_fusion_distribution.Rmd',params=list(base_run = $RUN_FOR_SUBTYPING))"
+Rscript -e "rmarkdown::render('05-QC_putative_onco_fusion_distribution.Rmd',params=list(base_run = $RUN_FOR_RELEASE))"
 
 # Recurrent fusion/fused genes
 Rscript 06-recurrent-fusions-per-histology.R --standardFusionCalls $putative_oncogenic_fusion \

diff --git a/analyses/independent-samples/README.md b/analyses/independent-samples/README.md
@@ -25,9 +25,10 @@ independent-specimens.rnaseq.primary-plus-stranded.tsv
 
 To generate the independent sample lists and associated analysis of redundancies in the overall data set, run the following script from the project root directory:
 
-use OPENPBTA_BASE_SUBTYPING=1 to run this module using the pbta-histologies-base.tsv from data folder while running molecular-subtyping modules for release.
+Use `OPENPBTA_BASE_RELEASE=1` to run this module using the `pbta-histologies-base.tsv` from data folder while preparing analysis files for release:
+
 ```sh
-OPENPBTA_BASE_SUBTYPING=1 ../analyses/independent-samples/run-independent-samples.sh 
+OPENPBTA_BASE_RELEASE=1 ../analyses/independent-samples/run-independent-samples.sh 
 ```
 
 OR by default uses pbta-histologies.tsv from data folder

diff --git a/analyses/independent-samples/run-independent-samples.sh b/analyses/independent-samples/run-independent-samples.sh
@@ -3,20 +3,21 @@
 # Josh Shapiro for CCDL 2019
 #
 # Runs 01-generate-independent-specimens.R with default settings.
-# Takes one environment variable, `OPENPBTA_BASE_SUBTYPING`, if value is 1 then
-# uses pbta-histologies-base.tsv for subtyping if value is 0 runs all modules with pbta-histologies.tsv(Default)
+# Takes one environment variable, `OPENPBTA_BASE_RELEASE`, if value is 1 then
+# uses pbta-histologies-base.tsv, else runs all module with pbta-histologies.tsv (Default)
 
 set -e
 set -o pipefail
 
-RUN_FOR_SUBTYPING=${OPENPBTA_BASE_SUBTYPING:-0}
+# If set to 1 (e.g., to generate files for a release), use the base histologies file
+RUN_FOR_RELEASE=${OPENPBTA_BASE_RELEASE:-0}
 
 # Set the working directory to the directory of this file
 cd "$(dirname "${BASH_SOURCE[0]}")"
 
-Rscript -e "rmarkdown::render('00-repeated-samples.Rmd',params=list(base_run = ${RUN_FOR_SUBTYPING}), clean = TRUE)"
+Rscript -e "rmarkdown::render('00-repeated-samples.Rmd', params=list(base_run = ${RUN_FOR_RELEASE}), clean = TRUE)"
 
-if [[ RUN_FOR_SUBTYPING == "0" ]]
+if [[ "$RUN_FOR_RELEASE" -eq "0" ]]
 then
    HISTOLOGY_FILE="../../data/pbta-histologies.tsv" 
 else 

diff --git a/analyses/snv-callers/README.md b/analyses/snv-callers/README.md
@@ -180,7 +180,6 @@ Two TMB files are created, one including *all snv* and a *coding snvs only*, the
 ```
  --db_file : Path to sqlite database file made from 01-setup_db.py
  --metadata : Relative file path to MAF file to be analyzed. Can be .gz compressed.
-              Assumes file path is given from top directory of 'OpenPBTA-analysis'.
  --all_bed_wgs : File path that specifies the BED regions file to be used for the
                  denominator for all mutations TMB for WGS samples.
  --all_bed_wxs : File path that specifies the BED regions file to be used for the

diff --git a/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh b/analyses/snv-callers/run_caller_consensus_analysis-pbta.sh
@@ -8,60 +8,72 @@
 set -e
 set -o pipefail
 
+# Set the working directory to the directory of this file
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
 # The sqlite database made from the callers will be called:
-dbfile=scratch/snv_db.sqlite
+dbfile=../../scratch/snv_db.sqlite
 
 # Designate output file
-consensus_file=analyses/snv-callers/results/consensus/pbta-snv-consensus-mutation.maf.tsv
+consensus_file=results/consensus/pbta-snv-consensus-mutation.maf.tsv
 
 # BED and GTF file paths
-cds_file=scratch/gencode.v27.primary_assembly.annotation.bed
-wgs_bed=scratch/intersect_strelka_mutect_WGS.bed
+cds_file=../../scratch/gencode.v27.primary_assembly.annotation.bed
+wgs_bed=../../scratch/intersect_strelka_mutect_WGS.bed
 
 # Set a default for the VAF filter if none is specified
 vaf_cutoff=${OPENPBTA_VAF_CUTOFF:-0}
 
+# If running during data generation, we want to use the BASE histologies file
+run_for_release=${OPENPBTA_BASE_RELEASE:-0}
+if [[ "$run_for_release" -eq "0" ]]
+then
+   histologies_file=../../data/pbta-histologies.tsv
+else
+   histologies_file=../../data/pbta-histologies-base.tsv
+fi
+
 # Unless told to run the plots, the default is to skip them
 # To run plots, set OPENPBTA_PLOTS to 1 or more
 run_plots_nb=${OPENPBTA_PLOTS:-0}
 
 ################################ Set Up Database ################################
 echo "Setting up Database"
-python3 analyses/snv-callers/scripts/01-setup_db.py \
+python3 scripts/01-setup_db.py \
   --db-file $dbfile \
-  --strelka-file data/pbta-snv-strelka2.vep.maf.gz \
-  --mutect-file data/pbta-snv-mutect2.vep.maf.gz \
-  --lancet-file data/pbta-snv-lancet.vep.maf.gz \
-  --vardict-file data/pbta-snv-vardict.vep.maf.gz \
-  --meta-file data/pbta-histologies.tsv
+  --strelka-file ../../data/pbta-snv-strelka2.vep.maf.gz \
+  --mutect-file ../../data/pbta-snv-mutect2.vep.maf.gz \
+  --lancet-file ../../data/pbta-snv-lancet.vep.maf.gz \
+  --vardict-file ../../data/pbta-snv-vardict.vep.maf.gz \
+  --meta-file $histologies_file
 
 ##################### Merge callers' files into total files ####################
 echo "Merging callers"
-Rscript analyses/snv-callers/scripts/02-merge_callers.R \
+Rscript scripts/02-merge_callers.R \
   --db_file $dbfile \
   --output_file $consensus_file \
   --vaf_filter $vaf_cutoff \
   --overwrite
 
 ########################## Add consensus to db ################################
 echo "Adding consensus to database"
-python3 analyses/snv-callers/scripts/01-setup_db.py \
+python3 scripts/01-setup_db.py \
   --db-file $dbfile \
   --consensus-file $consensus_file
 
 ############# Create intersection BED files for TMB calculations ###############
 # Make All mutations BED files
 echo "Making intersection bed files"
 bedtools intersect \
-  -a data/WGS.hg38.strelka2.unpadded.bed \
-  -b data/WGS.hg38.mutect2.vardict.unpadded.bed \
+  -a ../../data/WGS.hg38.strelka2.unpadded.bed \
+  -b ../../data/WGS.hg38.mutect2.vardict.unpadded.bed \
   > $wgs_bed
 
 #################### Make coding regions file
 # Convert GTF to BED file for use in bedtools
 # Here we are only extracting lines with as a CDS i.e. are coded in protein
 echo "Making CDS bed file"
-gunzip -c data/gencode.v27.primary_assembly.annotation.gtf.gz \
+gunzip -c ../../data/gencode.v27.primary_assembly.annotation.gtf.gz \
   | awk '$3 ~ /CDS/' \
   | convert2bed --do-not-sort --input=gtf - \
   | sort -k 1,1 -k 2,2n \
@@ -70,10 +82,10 @@ gunzip -c data/gencode.v27.primary_assembly.annotation.gtf.gz \
 
 ######################### Calculate consensus TMB ##############################
 echo "Calculating TMB"
-Rscript analyses/snv-callers/scripts/03-calculate_tmb.R \
+Rscript scripts/03-calculate_tmb.R \
   --db_file $dbfile \
-  --output analyses/snv-callers/results/consensus \
-  --metadata data/pbta-histologies.tsv \
+  --output results/consensus \
+  --metadata $histologies_file \
   --coding_regions $cds_file \
   --overwrite \
   --nonsynfilter_maf
@@ -86,5 +98,5 @@ gzip $consensus_file
 if [ "$run_plots_nb" -gt "0" ]
 then
  echo "Making comparison plots"
- Rscript -e "rmarkdown::render('analyses/snv-callers/compare_snv_callers_plots.Rmd', clean = TRUE)"
+ Rscript -e "rmarkdown::render('compare_snv_callers_plots.Rmd', clean = TRUE)"
 fi
diff --git a/analyses/snv-callers/run_caller_consensus_analysis-tcga.sh b/analyses/snv-callers/run_caller_consensus_analysis-tcga.sh
@@ -8,14 +8,17 @@
 set -e
 set -o pipefail
 
+# Set the working directory to the directory of this file
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
 # The sqlite database made from the callers will be called:
-dbfile=scratch/tcga_snv_db.sqlite
+dbfile=../../scratch/tcga_snv_db.sqlite
 
 # Designate output file
-consensus_file=analyses/snv-callers/results/consensus/tcga-snv-consensus-snv.maf.tsv
+consensus_file=results/consensus/tcga-snv-consensus-snv.maf.tsv
 
 # BED and GTF file paths
-cds_file=scratch/gencode.v27.primary_assembly.annotation.bed
+cds_file=../../scratch/gencode.v27.primary_assembly.annotation.bed
 
 # Set a default for the VAF filter if none is specified
 vaf_cutoff=${OPENPBTA_VAF_CUTOFF:-0}
@@ -25,40 +28,40 @@ vaf_cutoff=${OPENPBTA_VAF_CUTOFF:-0}
 run_plots_nb=${OPENPBTA_PLOTS:-0}
 
 ################################ Set Up Database ################################
-python3 analyses/snv-callers/scripts/01-setup_db.py \
+python3 scripts/01-setup_db.py \
   --db-file $dbfile \
-  --strelka-file data/pbta-tcga-snv-strelka2.vep.maf.gz \
-  --mutect-file data/pbta-tcga-snv-mutect2.vep.maf.gz \
-  --lancet-file data/pbta-tcga-snv-lancet.vep.maf.gz \
-  --meta-file data/pbta-tcga-manifest.tsv
+  --strelka-file ../../data/pbta-tcga-snv-strelka2.vep.maf.gz \
+  --mutect-file ../../data/pbta-tcga-snv-mutect2.vep.maf.gz \
+  --lancet-file ../../data/pbta-tcga-snv-lancet.vep.maf.gz \
+  --meta-file ../../data/pbta-tcga-manifest.tsv
 
 ##################### Merge callers' files into total files ####################
-Rscript analyses/snv-callers/scripts/02-merge_callers.R \
+Rscript scripts/02-merge_callers.R \
   --db_file $dbfile \
   --output_file $consensus_file \
   --vaf_filter $vaf_cutoff \
   --overwrite
 
 ########################## Add consensus to db ################################
-python3 analyses/snv-callers/scripts/01-setup_db.py \
+python3 scripts/01-setup_db.py \
   --db-file $dbfile \
   --consensus-file $consensus_file
 
 ###################### Create intersection BED files ###########################
 # Convert GTF to BED file for use in bedtools
 # Here we are only extracting lines with as a CDS i.e. are coded in protein
-gunzip -c data/gencode.v27.primary_assembly.annotation.gtf.gz \
+gunzip -c ../../data/gencode.v27.primary_assembly.annotation.gtf.gz \
   | awk '$3 ~ /CDS/' \
   | convert2bed --do-not-sort --input=gtf - \
   | sort -k 1,1 -k 2,2n \
   | bedtools merge  \
   > $cds_file
 
 ######################### Calculate consensus TMB ##############################
-Rscript analyses/snv-callers/scripts/03-calculate_tmb.R \
+Rscript scripts/03-calculate_tmb.R \
   --db_file $dbfile \
-  --output analyses/snv-callers/results/consensus \
-  --metadata data/pbta-tcga-manifest.tsv \
+  --output results/consensus \
+  --metadata ../../data/pbta-tcga-manifest.tsv \
   --coding_regions $cds_file \
   --overwrite \
   --tcga \
@@ -70,5 +73,5 @@ gzip $consensus_file
 ############################# Comparison Plots #################################
 if [ "$run_plots_nb" -gt "0" ]
 then
- Rscript -e "rmarkdown::render('analyses/snv-callers/compare_snv_callers_plots-tcga.Rmd', clean = TRUE)"
+ Rscript -e "rmarkdown::render('compare_snv_callers_plots-tcga.Rmd', clean = TRUE)"
 fi
diff --git a/analyses/snv-callers/scripts/02-merge_callers.R b/analyses/snv-callers/scripts/02-merge_callers.R
@@ -28,6 +28,7 @@
 #
 # Establish base dir
 root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+analysis_dir <- file.path(root_dir, "analyses", "snv-callers")
 
 # Magrittr pipe
 `%>%` <- dplyr::`%>%`
@@ -36,7 +37,7 @@ root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
 library(optparse)
 
 # Import special functions
-source(file.path(root_dir, "analyses", "snv-callers", "util", "split_mnv.R"))
+source(file.path(analysis_dir, "util", "split_mnv.R"))
 
 #--------------------------------Set up options--------------------------------#
 # Set up optparse options
@@ -48,7 +49,7 @@ option_list <- list(
   ),
   make_option(
     opt_str = c("-o", "--output_file"), type = "character",
-    default = NULL, help = "File path and file name of where you would like the 
+    default = NULL, help = "File path and file name of where you would like the
                             MAF-like output from this script to be stored.",
     metavar = "character"
   ),
@@ -73,8 +74,6 @@ opt <- parse_args(OptionParser(option_list = option_list))
 vaf_filter <- opt$vaf_filter # get out of opt list for sql
 
 ############################## Connect to database #############################
-# Normalize this file path
-opt$db_file <- file.path(root_dir, opt$db_file)
 
 # Check that the database specified exists
 if (!file.exists(opt$db_file)) {
@@ -85,8 +84,6 @@ if (!file.exists(opt$db_file)) {
 con <- DBI::dbConnect(RSQLite::SQLite(), opt$db_file)
 
 ############################### Set Up Output #####################################
-# Normalize file path
-opt$output_file <- file.path(root_dir, opt$output_file)
 
 # Make sure the folder is made
 output_dir <- stringr::word(opt$output_file, sep = "/", start = 1, end = -2)