Spark script improvements #5815

Merged · 2 commits · Mar 25, 2019
Changes from 1 commit
21 changes: 12 additions & 9 deletions scripts/spark_eval/README.md
@@ -8,13 +8,13 @@ This directory contains scripts for testing GATK pipelines on Spark - either on
export GCS_CLUSTER=...

# Sanity check on small data (a few mins)
- ./run_gcs_cluster.sh small_reads-pipeline_gcs.sh
+ ./run_gcs_cluster.sh ./small_reads-pipeline_gcs.sh

# Run on exome (<1hr)
- nohup ./run_gcs_cluster.sh exome_reads-pipeline_gcs.sh &
+ nohup ./run_gcs_cluster.sh ./exome_reads-pipeline_gcs.sh &

# Run on genome (<2hrs)
- NUM_WORKERS=20 nohup ./run_gcs_cluster.sh copy_genome_to_hdfs_on_gcs.sh genome_reads-pipeline_hdfs.sh &
+ NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./copy_genome_to_hdfs_on_gcs.sh ./genome_reads-pipeline_hdfs.sh &

# Check results
cat results/*
@@ -89,29 +89,32 @@ This will take a few hours.
The following starts a GCS cluster, runs the given pipeline, then deletes the cluster.

```bash
- nohup ./run_gcs_cluster.sh small_reads-pipeline_gcs.sh &
+ nohup ./run_gcs_cluster.sh ./small_reads-pipeline_gcs.sh &
```

To copy the dataset to HDFS use a copy script first:

```bash
- nohup ./run_gcs_cluster.sh copy_small_to_hdfs_on_gcs.sh small_reads-pipeline_hdfs.sh &
+ nohup ./run_gcs_cluster.sh ./copy_small_to_hdfs_on_gcs.sh ./small_reads-pipeline_hdfs.sh &
```

### More examples

```bash
# Exome Mark Duplicates, BQSR, Haplotype Caller on HDFS
- nohup ./run_gcs_cluster.sh copy_exome_to_hdfs_on_gcs.sh exome_md-bqsr-hc_hdfs.sh &
+ nohup ./run_gcs_cluster.sh ./copy_exome_to_hdfs_on_gcs.sh ./exome_md-bqsr-hc_hdfs.sh &

# Exome ReadsSparkPipeline on HDFS
- nohup ./run_gcs_cluster.sh copy_exome_to_hdfs_on_gcs.sh exome_reads-pipeline_hdfs.sh &
+ nohup ./run_gcs_cluster.sh ./copy_exome_to_hdfs_on_gcs.sh ./exome_reads-pipeline_hdfs.sh &

# Genome Mark Duplicates, BQSR, Haplotype Caller on HDFS using 20 workers
- NUM_WORKERS=20 nohup ./run_gcs_cluster.sh copy_genome_to_hdfs_on_gcs.sh genome_md-bqsr-hc_hdfs.sh &
+ NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./copy_genome_to_hdfs_on_gcs.sh ./genome_md-bqsr-hc_hdfs.sh &

# Genome ReadsSparkPipeline on HDFS using 20 workers
- NUM_WORKERS=20 nohup ./run_gcs_cluster.sh copy_genome_to_hdfs_on_gcs.sh genome_reads-pipeline_hdfs.sh &
+ NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./copy_genome_to_hdfs_on_gcs.sh ./genome_reads-pipeline_hdfs.sh &
+
+ # Genome ReadsSparkPipeline on GCS using 20 workers
+ NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./genome_reads-pipeline_gcs.sh &
```

## Running test cases
2 changes: 1 addition & 1 deletion scripts/spark_eval/exome_md-bqsr-hc_hdfs.sh
@@ -5,5 +5,5 @@
. utils.sh

time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/exome_spark_eval/NA12878.ga2.exome.maq.raw.bam -O hdfs:///user/$USER/exome_spark_eval/out/markdups-sharded --sharded-output true" 96 1 4g 4g
time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/exome_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf" 8 8 32g 4g
time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/exome_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf.gz" 8 8 32g 4g
time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/exome_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta -O hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/out/NA12878.ga2.exome.maq.raw.vcf -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 64 1 6g 4g
2 changes: 1 addition & 1 deletion scripts/spark_eval/exome_reads-pipeline_gcs.sh
@@ -4,4 +4,4 @@

. utils.sh

time_gatk "ReadsPipelineSpark -I gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.bam -O gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.vcf -R gs://broad-spark-eval-test-data/exome/Homo_sapiens_assembly18.fasta --known-sites gs://broad-spark-eval-test-data/exome/dbsnp_138.hg18.vcf -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 8 8 32g 4g
time_gatk "ReadsPipelineSpark -I gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.bam -O gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.vcf -R gs://broad-spark-eval-test-data/exome/Homo_sapiens_assembly18.fasta --known-sites gs://broad-spark-eval-test-data/exome/dbsnp_138.hg18.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 20 7 20g 4g
2 changes: 1 addition & 1 deletion scripts/spark_eval/exome_reads-pipeline_hdfs.sh
@@ -4,7 +4,7 @@

. utils.sh

time_gatk "ReadsPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/NA12878.ga2.exome.maq.raw.bam -O hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/out/NA12878.ga2.exome.maq.raw.vcf -R hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 20 7 28g 4g
time_gatk "ReadsPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/NA12878.ga2.exome.maq.raw.bam -O hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/out/NA12878.ga2.exome.maq.raw.vcf -R hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 20 7 20g 4g

# Notes
# 20 executors - 2 per node (this is run on a 10 node cluster of n1-standard-16, each with 16 cores, 60g)
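As an editorial aside, the sizing in that note works out as follows, assuming the n1-standard-16 shape it states; the 28g to 20g executor-memory change in this PR presumably buys the extra headroom:

```bash
# 10 nodes × n1-standard-16 (16 cores, 60g RAM each), 2 executors per node:
#   executors: 10 nodes × 2          = 20 (the "20 7 20g 4g" call above)
#   cores:     2 executors × 7 cores = 14 of 16, leaving 2 for OS/daemons
#   memory:    2 executors × 20g     = 40g of 60g, leaving headroom for
#              Spark memory overhead (the old 2 × 28g = 56g left almost none)
```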
2 changes: 1 addition & 1 deletion scripts/spark_eval/genome_md-bqsr-hc_hdfs.sh
@@ -5,5 +5,5 @@
. utils.sh

time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/q4_spark_eval/WGS-G94982-NA12878-no-NC_007605.bam -O hdfs:///user/$USER/q4_spark_eval/out/markdups-sharded --sharded-output true" 256 1 4g 4g
time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/q4_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/q4_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/q4_spark_eval/human_g1k_v37.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/q4_spark_eval/dbsnp_138.b37.vcf" 20 8 42g 4g
time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/q4_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/q4_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/q4_spark_eval/human_g1k_v37.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/q4_spark_eval/dbsnp_138.b37.vcf.gz" 20 8 42g 4g
time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/q4_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/q4_spark_eval/human_g1k_v37.fasta -O hdfs://${HDFS_HOST_PORT}/user/$USER/q4_spark_eval/out/WGS-G94982-NA12878.vcf -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 60 1 12g 8g
7 changes: 7 additions & 0 deletions scripts/spark_eval/genome_reads-pipeline_gcs.sh
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+
+ # Run the pipeline (ReadsPipelineSpark) on genome data in GCS.
+
+ . utils.sh
+
+ time_gatk "ReadsPipelineSpark -I gs://broad-spark-eval-test-data/genome/WGS-G94982-NA12878-no-NC_007605.bam -O gs://broad-spark-eval-test-data/genome/out/WGS-G94982-NA12878.vcf -R gs://broad-spark-eval-test-data/genome/human_g1k_v37.fasta --known-sites gs://broad-spark-eval-test-data/genome/dbsnp_138.b37.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 40 7 20g 8g
9 changes: 5 additions & 4 deletions scripts/spark_eval/run_gcs_cluster.sh
@@ -1,6 +1,8 @@
#!/usr/bin/env bash

# Starts a GCS cluster, runs scripts, then deletes the cluster.
+ # The cluster is only deleted if all the scripts run successfully, to allow for debugging.
+ # The cluster will automatically be deleted in any case after 3 hours.

if [ -z "$GCS_CLUSTER" ]; then
echo "Please set the GCS_CLUSTER environment variable to the name of the cluster you would like to start."
@@ -11,19 +13,18 @@ fi
gcloud beta dataproc clusters create "$GCS_CLUSTER" \
--zone us-central1-a \
--master-machine-type n1-standard-4 \
- --master-boot-disk-size 500 \
+ --master-boot-disk-size 1000 \
--num-workers ${NUM_WORKERS:-10} \
--worker-machine-type n1-standard-16 \
--worker-boot-disk-size 2000 \
- --image-version 1.2 \
+ --image-version 1.3 \
+ --max-age 3h \
--project broad-gatk-collab

# Run scripts
for script in "$@"
do
- SCRIPT_NAME="$script"
- source "$script"
+ eval "$script" || exit $?
Collaborator comment:

This should loudly inform the user that it has elected to keep their cluster alive. This seems like a dangerous change to make to the script for some developers' compute costs. Could you start the cluster with a time to live, perhaps?

Contributor (PR author) reply:

Thanks @jamesemery. The cluster already has a max age of 3 hours. I've added a message to say the cluster won't be deleted immediately.

done

# Delete cluster
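Following up on the review thread above: the loud failure message the author mentions was added in the second commit and is not visible in this one-commit view. Below is a hypothetical sketch of what such a failure path could look like; the echo text and the `status` variable are invented for illustration, and only `--max-age 3h` and `$GCS_CLUSTER` come from this PR.

```bash
# Hypothetical sketch (editorial, not the code from this PR): run each script,
# and if one fails, report it loudly while leaving the cluster up for debugging.
for script in "$@"
do
    eval "$script"
    status=$?
    if [ "$status" -ne 0 ]; then
        echo "Script $script failed (exit $status); leaving cluster $GCS_CLUSTER running for debugging." >&2
        echo "Dataproc will still delete it automatically after 3 hours (--max-age 3h)." >&2
        exit "$status"
    fi
done

# To reclaim the cluster immediately instead of waiting for --max-age
# (add --region/--zone flags if your gcloud config does not supply them):
gcloud dataproc clusters delete "$GCS_CLUSTER" --quiet
```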