From 4ba68e5c2445efd9132e9b62bce0765765d01c39 Mon Sep 17 00:00:00 2001
From: Tom White
Date: Wed, 30 Jan 2019 11:16:24 +0000
Subject: [PATCH 1/2] Spark script improvements

- Increase master disk size
- Use Dataproc 1.3
- Use gz for known sites
- Script for running genome dataset on gcs
---
 scripts/spark_eval/README.md                 | 21 +++++++++++--------
 scripts/spark_eval/exome_md-bqsr-hc_hdfs.sh  |  2 +-
 .../spark_eval/exome_reads-pipeline_gcs.sh   |  2 +-
 .../spark_eval/exome_reads-pipeline_hdfs.sh  |  2 +-
 scripts/spark_eval/genome_md-bqsr-hc_hdfs.sh |  2 +-
 .../spark_eval/genome_reads-pipeline_gcs.sh  |  7 +++++++
 scripts/spark_eval/run_gcs_cluster.sh        |  9 ++++----
 7 files changed, 28 insertions(+), 17 deletions(-)
 create mode 100755 scripts/spark_eval/genome_reads-pipeline_gcs.sh

diff --git a/scripts/spark_eval/README.md b/scripts/spark_eval/README.md
index 0081140dcd4..b123c200641 100644
--- a/scripts/spark_eval/README.md
+++ b/scripts/spark_eval/README.md
@@ -8,13 +8,13 @@ This directory contains scripts for testing GATK pipelines on Spark - either on
 export GCS_CLUSTER=...
 
 # Sanity check on small data (a few mins)
-./run_gcs_cluster.sh small_reads-pipeline_gcs.sh
+./run_gcs_cluster.sh ./small_reads-pipeline_gcs.sh
 
 # Run on exome (<1hr)
-nohup ./run_gcs_cluster.sh exome_reads-pipeline_gcs.sh &
+nohup ./run_gcs_cluster.sh ./exome_reads-pipeline_gcs.sh &
 
 # Run on genome (<2hrs)
-NUM_WORKERS=20 nohup ./run_gcs_cluster.sh copy_genome_to_hdfs_on_gcs.sh genome_reads-pipeline_hdfs.sh &
+NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./copy_genome_to_hdfs_on_gcs.sh ./genome_reads-pipeline_hdfs.sh &
 
 # Check results
 cat results/*
@@ -89,29 +89,32 @@ This will take a few hours.
 The following starts a GCS cluster, runs the given pipeline, then deletes the cluster.
 
 ```bash
-nohup ./run_gcs_cluster.sh small_reads-pipeline_gcs.sh &
+nohup ./run_gcs_cluster.sh ./small_reads-pipeline_gcs.sh &
 ```
 
 To copy the dataset to HDFS use a copy script first:
 
 ```bash
-nohup ./run_gcs_cluster.sh copy_small_to_hdfs_on_gcs.sh small_reads-pipeline_hdfs.sh &
+nohup ./run_gcs_cluster.sh ./copy_small_to_hdfs_on_gcs.sh ./small_reads-pipeline_hdfs.sh &
 ```
 
 ### More examples
 
 ```bash
 # Exome Mark Duplicates, BQSR, Haplotype Caller on HDFS
-nohup ./run_gcs_cluster.sh copy_exome_to_hdfs_on_gcs.sh exome_md-bqsr-hc_hdfs.sh &
+nohup ./run_gcs_cluster.sh ./copy_exome_to_hdfs_on_gcs.sh ./exome_md-bqsr-hc_hdfs.sh &
 
 # Exome ReadsSparkPipeline on HDFS
-nohup ./run_gcs_cluster.sh copy_exome_to_hdfs_on_gcs.sh exome_reads-pipeline_hdfs.sh &
+nohup ./run_gcs_cluster.sh ./copy_exome_to_hdfs_on_gcs.sh ./exome_reads-pipeline_hdfs.sh &
 
 # Genome Mark Duplicates, BQSR, Haplotype Caller on HDFS using 20 workers
-NUM_WORKERS=20 nohup ./run_gcs_cluster.sh copy_genome_to_hdfs_on_gcs.sh genome_md-bqsr-hc_hdfs.sh &
+NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./copy_genome_to_hdfs_on_gcs.sh ./genome_md-bqsr-hc_hdfs.sh &
 
 # Genome ReadsSparkPipeline on HDFS using 20 workers
-NUM_WORKERS=20 nohup ./run_gcs_cluster.sh copy_genome_to_hdfs_on_gcs.sh genome_reads-pipeline_hdfs.sh &
+NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./copy_genome_to_hdfs_on_gcs.sh ./genome_reads-pipeline_hdfs.sh &
+
+# Genome ReadsSparkPipeline on GCS using 20 workers
+NUM_WORKERS=20 nohup ./run_gcs_cluster.sh ./genome_reads-pipeline_gcs.sh &
 ```
 
 ## Running test cases
diff --git a/scripts/spark_eval/exome_md-bqsr-hc_hdfs.sh b/scripts/spark_eval/exome_md-bqsr-hc_hdfs.sh
index 8fcf95eaaf2..9f5285e0f6c 100755
--- a/scripts/spark_eval/exome_md-bqsr-hc_hdfs.sh
+++ b/scripts/spark_eval/exome_md-bqsr-hc_hdfs.sh
@@ -5,5 +5,5 @@
 . utils.sh
 
 time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/exome_spark_eval/NA12878.ga2.exome.maq.raw.bam -O hdfs:///user/$USER/exome_spark_eval/out/markdups-sharded --sharded-output true" 96 1 4g 4g
-time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/exome_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf" 8 8 32g 4g
+time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/exome_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf.gz" 8 8 32g 4g
 time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/exome_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta -O hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/out/NA12878.ga2.exome.maq.raw.vcf -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 64 1 6g 4g
\ No newline at end of file
diff --git a/scripts/spark_eval/exome_reads-pipeline_gcs.sh b/scripts/spark_eval/exome_reads-pipeline_gcs.sh
index 36374e813b3..5624b9dc2de 100755
--- a/scripts/spark_eval/exome_reads-pipeline_gcs.sh
+++ b/scripts/spark_eval/exome_reads-pipeline_gcs.sh
@@ -4,4 +4,4 @@
 
 . utils.sh
 
-time_gatk "ReadsPipelineSpark -I gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.bam -O gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.vcf -R gs://broad-spark-eval-test-data/exome/Homo_sapiens_assembly18.fasta --known-sites gs://broad-spark-eval-test-data/exome/dbsnp_138.hg18.vcf -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 8 8 32g 4g
\ No newline at end of file
+time_gatk "ReadsPipelineSpark -I gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.bam -O gs://broad-spark-eval-test-data/exome/NA12878.ga2.exome.maq.raw.vcf -R gs://broad-spark-eval-test-data/exome/Homo_sapiens_assembly18.fasta --known-sites gs://broad-spark-eval-test-data/exome/dbsnp_138.hg18.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 20 7 20g 4g
diff --git a/scripts/spark_eval/exome_reads-pipeline_hdfs.sh b/scripts/spark_eval/exome_reads-pipeline_hdfs.sh
index 8bf4816aa19..edd9c569887 100755
--- a/scripts/spark_eval/exome_reads-pipeline_hdfs.sh
+++ b/scripts/spark_eval/exome_reads-pipeline_hdfs.sh
@@ -4,7 +4,7 @@
 
 . utils.sh
 
-time_gatk "ReadsPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/NA12878.ga2.exome.maq.raw.bam -O hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/out/NA12878.ga2.exome.maq.raw.vcf -R hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 20 7 28g 4g
+time_gatk "ReadsPipelineSpark -I hdfs:///user/$USER/exome_spark_eval/NA12878.ga2.exome.maq.raw.bam -O hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/out/NA12878.ga2.exome.maq.raw.vcf -R hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/Homo_sapiens_assembly18.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/exome_spark_eval/dbsnp_138.hg18.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 20 7 20g 4g
 
 # Notes
 # 20 executors - 2 per node (this is run on a 10 node cluster of n1-standard-16, each with 16 cores, 60g)
diff --git a/scripts/spark_eval/genome_md-bqsr-hc_hdfs.sh b/scripts/spark_eval/genome_md-bqsr-hc_hdfs.sh
index 0cd88290823..7a0e8fbe855 100755
--- a/scripts/spark_eval/genome_md-bqsr-hc_hdfs.sh
+++ b/scripts/spark_eval/genome_md-bqsr-hc_hdfs.sh
@@ -5,5 +5,5 @@
 . utils.sh
 
 time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/q4_spark_eval/WGS-G94982-NA12878-no-NC_007605.bam -O hdfs:///user/$USER/q4_spark_eval/out/markdups-sharded --sharded-output true" 256 1 4g 4g
-time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/q4_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/q4_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/q4_spark_eval/human_g1k_v37.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/q4_spark_eval/dbsnp_138.b37.vcf" 20 8 42g 4g
+time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/q4_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/q4_spark_eval/out/bqsr-sharded --sharded-output true -R hdfs:///user/$USER/q4_spark_eval/human_g1k_v37.fasta --known-sites hdfs://${HDFS_HOST_PORT}/user/$USER/q4_spark_eval/dbsnp_138.b37.vcf.gz" 20 8 42g 4g
 time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/q4_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/q4_spark_eval/human_g1k_v37.fasta -O hdfs://${HDFS_HOST_PORT}/user/$USER/q4_spark_eval/out/WGS-G94982-NA12878.vcf -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 60 1 12g 8g
diff --git a/scripts/spark_eval/genome_reads-pipeline_gcs.sh b/scripts/spark_eval/genome_reads-pipeline_gcs.sh
new file mode 100755
index 00000000000..6d6a6e76012
--- /dev/null
+++ b/scripts/spark_eval/genome_reads-pipeline_gcs.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Run the pipeline (ReadsPipelineSpark) on genome data in GCS.
+
+. utils.sh
+
+time_gatk "ReadsPipelineSpark -I gs://broad-spark-eval-test-data/genome/WGS-G94982-NA12878-no-NC_007605.bam -O gs://broad-spark-eval-test-data/genome/out/WGS-G94982-NA12878.vcf -R gs://broad-spark-eval-test-data/genome/human_g1k_v37.fasta --known-sites gs://broad-spark-eval-test-data/genome/dbsnp_138.b37.vcf.gz -pairHMM AVX_LOGLESS_CACHING --maxReadsPerAlignmentStart 10" 40 7 20g 8g
diff --git a/scripts/spark_eval/run_gcs_cluster.sh b/scripts/spark_eval/run_gcs_cluster.sh
index fa84a62a31c..e9516a60f6f 100755
--- a/scripts/spark_eval/run_gcs_cluster.sh
+++ b/scripts/spark_eval/run_gcs_cluster.sh
@@ -1,6 +1,8 @@
 #!/usr/bin/env bash
 
 # Starts a GCS cluster, runs scripts, then deletes the cluster.
+# The cluster is only deleted if all the scripts run successfully, to allow for debugging.
+# The cluster will automatically be deleted in any case after 3 hours.
 
 if [ -z "$GCS_CLUSTER" ]; then
   echo "Please set the GCS_CLUSTER environment variable to the name of the cluster you would like to start."
@@ -11,19 +13,18 @@ fi
 gcloud beta dataproc clusters create "$GCS_CLUSTER" \
     --zone us-central1-a \
     --master-machine-type n1-standard-4 \
-    --master-boot-disk-size 500 \
+    --master-boot-disk-size 1000 \
     --num-workers ${NUM_WORKERS:-10} \
     --worker-machine-type n1-standard-16 \
    --worker-boot-disk-size 2000 \
-    --image-version 1.2 \
+    --image-version 1.3 \
     --max-age 3h \
     --project broad-gatk-collab
 
 # Run scripts
 for script in "$@"
 do
-  SCRIPT_NAME="$script"
-  source "$script"
+  eval "$script" || exit $?
 done
 
 # Delete cluster

From 7350990d7957fea1afe7b698469ef3c68f4220f2 Mon Sep 17 00:00:00 2001
From: Tom White
Date: Wed, 20 Mar 2019 09:47:33 +0000
Subject: [PATCH 2/2] Show message when script fails saying that cluster will not be deleted.

---
 scripts/spark_eval/run_gcs_cluster.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/spark_eval/run_gcs_cluster.sh b/scripts/spark_eval/run_gcs_cluster.sh
index e9516a60f6f..be5dbc45fea 100755
--- a/scripts/spark_eval/run_gcs_cluster.sh
+++ b/scripts/spark_eval/run_gcs_cluster.sh
@@ -24,7 +24,7 @@ gcloud beta dataproc clusters create "$GCS_CLUSTER" \
 # Run scripts
 for script in "$@"
 do
-  eval "$script" || exit $?
+  eval "$script" || (echo "Script $script returned exit status $?, exiting. NOT deleting cluster immediately." && exit 1)
 done
 
 # Delete cluster
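A note on the `time_gatk` calls in the patches above: the four positional arguments after the quoted GATK command (for example `20 7 28g 4g`) line up with the sizing comment in exome_reads-pipeline_hdfs.sh (20 executors, 2 per node on n1-standard-16 workers, 7 cores and 28g each), i.e. executor count, executor cores, executor memory and driver memory. utils.sh itself is not touched by these patches, so the sketch below is only a hypothetical illustration of how such a wrapper might forward those values to the GATK Spark runner; the real helper may differ.

```bash
#!/usr/bin/env bash

# Hypothetical sketch only -- NOT the utils.sh these scripts source.
# Assumed calling convention, inferred from the scripts above:
#   time_gatk "<tool and tool args>" <num-executors> <executor-cores> <executor-memory> <driver-memory>
# Also assumes the gatk launcher is on the PATH and the cluster runs YARN.
time_gatk() {
  local gatk_args="$1"
  local num_executors="$2"
  local executor_cores="$3"
  local executor_memory="$4"
  local driver_memory="$5"

  # With --spark-runner SPARK, arguments after "--" are passed through to spark-submit.
  # gatk_args is deliberately left unquoted so the single quoted string splits into words.
  time gatk ${gatk_args} \
    -- \
    --spark-runner SPARK --spark-master yarn \
    --num-executors "$num_executors" \
    --executor-cores "$executor_cores" \
    --executor-memory "$executor_memory" \
    --driver-memory "$driver_memory"
}
```

Under that reading, `time_gatk "ReadsPipelineSpark ..." 20 7 28g 4g` requests 20 executors of 7 cores and 28g each, two per n1-standard-16 worker (14 of 16 cores, 56g of 60g per node), which is exactly the layout described in the note in exome_reads-pipeline_hdfs.sh.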