diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 2c0b6a2a5ad..f72900f8886 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -1,3 +1,5 @@ +version 1.0 + ## Copyright Broad Institute, 2017 ## ## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, @@ -72,82 +74,91 @@ ## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information ## pertaining to the included programs. workflow Mutect2 { - # Mutect2 inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - File? pon_idx - Int scatter_count - File? gnomad - File? gnomad_idx - File? variants_for_contamination - File? variants_for_contamination_idx - File? realignment_index_bundle - String? realignment_extra_args - Boolean? run_orientation_bias_mixture_model_filter + input { + # Mutect2 inputs + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_reads + File tumor_reads_index + File? normal_reads + File? normal_reads_index + File? pon + File? pon_idx + Int scatter_count + File? gnomad + File? gnomad_idx + File? variants_for_contamination + File? variants_for_contamination_idx + File? realignment_index_bundle + String? realignment_extra_args + Boolean? run_orientation_bias_mixture_model_filter + String? m2_extra_args + String? m2_extra_filtering_args + String? split_intervals_extra_args + Boolean? make_bamout + Boolean? compress_vcfs + File? gga_vcf + File? gga_vcf_idx + + # oncotator inputs + Boolean? run_oncotator + File? onco_ds_tar_gz + String? onco_ds_local_db_dir + String? sequencing_center + String? sequence_source + File? default_config_file + String? oncotator_extra_args + + # Funcotator inputs + Boolean? run_funcotator + String? funco_reference_version + String? funco_output_format + Boolean? funco_compress + Boolean? 
funco_use_gnomad_AF + File? funco_data_sources_tar_gz + String? funco_transcript_selection_mode + File? funco_transcript_selection_list + Array[String]? funco_annotation_defaults + Array[String]? funco_annotation_overrides + Array[String]? funcotator_excluded_fields + Boolean? funco_filter_funcotations + String? funcotator_extra_args + + String funco_default_output_format = "MAF" + + # runtime + String gatk_docker + File? gatk_override + String basic_bash_docker = "ubuntu:16.04" + String? oncotator_docker + Boolean? filter_oncotator_maf + Boolean? filter_funcotations + + Int? preemptible_attempts + Int? max_retries + + # Use as a last resort to increase the disk given to every task in case of ill behaving data + Int? emergency_extra_disk + + # These are multipliers to multiply inputs by to make sure we have enough disk to accommodate possible output sizes + # Large is for Bams/WGS vcfs + # Small is for metrics/other vcfs + Float large_input_to_output_multiplier = 2.25 + Float small_input_to_output_multiplier = 2.0 + Float cram_to_bam_multiplier = 6.0 + } + + Boolean compress = select_first([compress_vcfs, false]) Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) - String? m2_extra_args - String? m2_extra_filtering_args - String? split_intervals_extra_args - Boolean? make_bamout Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean? compress_vcfs - Boolean compress = select_first([compress_vcfs, false]) - File? gga_vcf - File? gga_vcf_idx - - # oncotator inputs - Boolean? run_oncotator Boolean run_oncotator_or_default = select_first([run_oncotator, false]) - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? sequencing_center - String? sequence_source - File? default_config_file - String? oncotator_extra_args - - # Funcotator inputs - Boolean? run_funcotator Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? funco_reference_version - String? 
funco_output_format - Boolean? funco_compress - Boolean? funco_use_gnomad_AF - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? funco_filter_funcotations - String? funcotator_extra_args - - String funco_default_output_format = "MAF" - - - # runtime - String gatk_docker - File? gatk_override - String basic_bash_docker = "ubuntu:16.04" - String? oncotator_docker String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"]) - Boolean? filter_oncotator_maf Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true]) - Boolean? filter_funcotations Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) - Int? preemptible_attempts - Int? max_retries - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? 
emergency_extra_disk - # Disk sizes used for dynamic sizing Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) @@ -162,13 +173,6 @@ workflow Mutect2 { # This is added to every task as padding, should increase if systematically you need more disk for every call Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) - # These are multipliers to multipler inputs by to make sure we have enough disk to accommodate for possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - Float cram_to_bam_multiplier = 6.0 - # logic about output file names -- these are the names *without* .vcf extensions String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram String unfiltered_name = output_basename + "-unfiltered" @@ -177,10 +181,11 @@ workflow Mutect2 { String output_vcf_name = output_basename + ".vcf" - # Size M2 differently based on if we are using NIO or not Int tumor_cram_to_bam_disk = ceil(tumor_reads_size * cram_to_bam_multiplier) Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier) + + if (basename(tumor_reads) != basename(tumor_reads, ".cram")) { call CramToBam as TumorCramToBam { input: @@ -218,6 +223,7 @@ workflow Mutect2 { Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0 Int m2_output_size = tumor_bam_size / scatter_count + #TODO: do we need to change this disk size now that NIO is always going to happen (for the google backend only) Int m2_per_scatter_size = (tumor_bam_size + normal_bam_size) + ref_size + gnomad_vcf_size + m2_output_size + disk_pad call SplitIntervals { @@ -441,8 +447,8 @@ workflow Mutect2 { reference_version = 
select_first([funco_reference_version, "hg19"]), output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated", output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format, - compress = if defined(funco_compress) then funco_compress else false, - use_gnomad = if defined(funco_use_gnomad_AF) then funco_use_gnomad_AF else false, + compress = if defined(funco_compress) then select_first([funco_compress]) else false, + use_gnomad = if defined(funco_use_gnomad_AF) then select_first([funco_use_gnomad_AF]) else false, data_sources_tar_gz = funco_data_sources_tar_gz, case_id = M2.tumor_sample[0], control_id = M2.normal_sample[0], @@ -481,15 +487,17 @@ workflow Mutect2 { } task CramToBam { - - File ref_fasta - File ref_fai - File ref_dict - File cram - File crai - String name - Int disk_size - Int? mem + input { + File ref_fasta + File ref_fai + File ref_dict + #cram and crai must be optional since Normal cram is optional + File? cram + File? crai + String name + Int disk_size + Int? mem + } Int machine_mem = if defined(mem) then mem * 1000 else 6000 @@ -499,10 +507,10 @@ task CramToBam { set -e set -o pipefail - samtools view -h -T ${ref_fasta} ${cram} | - samtools view -b -o ${name}.bam - - samtools index -b ${name}.bam - mv ${name}.bam.bai ${name}.bai + samtools view -h -T ~{ref_fasta} ~{cram} | + samtools view -b -o ~{name}.bam - + samtools index -b ~{name}.bam + mv ~{name}.bam.bai ~{name}.bai } runtime { @@ -512,30 +520,31 @@ task CramToBam { } output { - File output_bam = "${name}.bam" - File output_bai = "${name}.bai" + File output_bam = "~{name}.bam" + File output_bai = "~{name}.bai" } } task SplitIntervals { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - String? split_intervals_extra_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + Int scatter_count + String? split_intervals_extra_args + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + } # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 3500 @@ -543,15 +552,15 @@ task SplitIntervals { command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} mkdir interval-files - gatk --java-options "-Xmx${command_mem}m" SplitIntervals \ - -R ${ref_fasta} \ - ${"-L " + intervals} \ - -scatter ${scatter_count} \ + gatk --java-options "-Xmx~{command_mem}m" SplitIntervals \ + -R ~{ref_fasta} \ + ~{"-L " + intervals} \ + -scatter ~{scatter_count} \ -O interval-files \ - ${split_intervals_extra_args} + ~{split_intervals_extra_args} cp interval-files/*.interval_list . } @@ -571,92 +580,111 @@ task SplitIntervals { } task M2 { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? pon - File? pon_idx - File? gnomad - File? gnomad_idx - String? m2_extra_args - Boolean? make_bamout - Boolean? run_ob_filter - Boolean compress - File? gga_vcf - File? gga_vcf_idx - File? variants_for_contamination - File? variants_for_contamination_idx + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_bam + File tumor_bai + File? normal_bam + File? normal_bai + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + String? m2_extra_args + Boolean? make_bamout + Boolean? run_ob_filter + Boolean compress + File? gga_vcf + File? gga_vcf_idx + File? variants_for_contamination + File? variants_for_contamination_idx + + File? 
gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + } String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" String output_stats = output_vcf + ".stats" - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 3500 Int command_mem = machine_mem - 500 + parameter_meta{ + intervals: {localization_optional: true} + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + tumor_bam: {localization_optional: true} + tumor_bai: {localization_optional: true} + normal_bam: {localization_optional: true} + normal_bai: {localization_optional: true} + pon: {localization_optional: true} + pon_idx: {localization_optional: true} + gnomad: {localization_optional: true} + gnomad_idx: {localization_optional: true} + gga_vcf: {localization_optional: true} + gga_vcf_idx: {localization_optional: true} + variants_for_contamination: {localization_optional: true} + variants_for_contamination_idx: {localization_optional: true} + } command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} # We need to create these files regardless, even if they stay empty touch bamout.bam touch f1r2.tar.gz echo "" > normal_name.txt - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode - tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`" + gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_bam} -O 
tumor_name.txt -encode + tumor_command_line="-I ~{tumor_bam} -tumor `cat tumor_name.txt`" - if [[ ! -z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode - normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`" + if [[ ! -z "~{normal_bam}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_bam} -O normal_name.txt -encode + normal_command_line="-I ~{normal_bam} -normal `cat normal_name.txt`" fi - gatk --java-options "-Xmx${command_mem}m" Mutect2 \ - -R ${ref_fasta} \ + gatk --java-options "-Xmx~{command_mem}m" Mutect2 \ + -R ~{ref_fasta} \ $tumor_command_line \ $normal_command_line \ - ${"--germline-resource " + gnomad} \ - ${"-pon " + pon} \ - ${"-L " + intervals} \ - ${"--alleles " + gga_vcf} \ - -O "${output_vcf}" \ - ${true='--bam-output bamout.bam' false='' make_bamout} \ - ${true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ - ${m2_extra_args} + ~{"--germline-resource " + gnomad} \ + ~{"-pon " + pon} \ + ~{"-L " + intervals} \ + ~{"--alleles " + gga_vcf} \ + -O "~{output_vcf}" \ + ~{true='--bam-output bamout.bam' false='' make_bamout} \ + ~{true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ + ~{m2_extra_args} ### GetPileupSummaries # These must be created, even if they remain empty, as cromwell doesn't support optional output touch tumor-pileups.table touch normal-pileups.table - if [[ ! -z "${variants_for_contamination}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O tumor-pileups.table + if [[ ! 
-z "~{variants_for_contamination}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{tumor_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O tumor-pileups.table - if [[ ! -z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal-pileups.table + if [[ ! -z "~{normal_bam}" ]]; then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{normal_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O normal-pileups.table fi fi >>> @@ -672,12 +700,12 @@ task M2 { } output { - File unfiltered_vcf = "${output_vcf}" - File unfiltered_vcf_idx = "${output_vcf_idx}" + File unfiltered_vcf = "~{output_vcf}" + File unfiltered_vcf_idx = "~{output_vcf_idx}" File output_bamOut = "bamout.bam" String tumor_sample = read_string("tumor_name.txt") String normal_sample = read_string("normal_name.txt") - File stats = "${output_stats}" + File stats = "~{output_stats}" File f1r2_counts = "f1r2.tar.gz" File tumor_pileups = "tumor-pileups.table" File normal_pileups = "normal-pileups.table" @@ -685,24 +713,27 @@ task M2 { } task MergeVCFs { - # inputs - Array[File] input_vcfs - Array[File] input_vcf_indices - String output_name - Boolean compress + input { + Array[File] input_vcfs + Array[File] input_vcf_indices + String output_name + Boolean compress + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? 
cpu + Boolean use_ssd = false + } + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 3500 @@ -712,8 +743,8 @@ task MergeVCFs { # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + gatk --java-options "-Xmx~{command_mem}m" MergeVcfs -I ~{sep=' -I ' input_vcfs} -O ~{output_vcf} } runtime { @@ -727,29 +758,30 @@ task MergeVCFs { } output { - File merged_vcf = "${output_vcf}" - File merged_vcf_idx = "${output_vcf_idx}" + File merged_vcf = "~{output_vcf}" + File merged_vcf_idx = "~{output_vcf_idx}" } } task MergeBamOuts { - # inputs - File ref_fasta - File ref_fai - File ref_dict - Array[File]+ bam_outs - String output_vcf_name - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false + input { + File ref_fasta + File ref_fai + File ref_dict + Array[File]+ bam_outs + String output_vcf_name + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? 
cpu + Boolean use_ssd = false + } # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 7000 @@ -759,17 +791,17 @@ task MergeBamOuts { # This command block assumes that there is at least one file in bam_outs. # Do not call this task if len(bam_outs) == 0 set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \ - -I ${sep=" -I " bam_outs} -O unsorted.out.bam -R ${ref_fasta} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + gatk --java-options "-Xmx~{command_mem}m" GatherBamFiles \ + -I ~{sep=" -I " bam_outs} -O unsorted.out.bam -R ~{ref_fasta} # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence # overlapping bamouts - gatk --java-options "-Xmx${command_mem}m" SortSam -I unsorted.out.bam \ - -O ${output_vcf_name}.out.bam \ + gatk --java-options "-Xmx~{command_mem}m" SortSam -I unsorted.out.bam \ + -O ~{output_vcf_name}.out.bam \ --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT - gatk --java-options "-Xmx${command_mem}m" BuildBamIndex -I ${output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT + gatk --java-options "-Xmx~{command_mem}m" BuildBamIndex -I ~{output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT >>> runtime { @@ -783,26 +815,27 @@ task MergeBamOuts { } output { - File merged_bam_out = "${output_vcf_name}.out.bam" - File merged_bam_out_index = "${output_vcf_name}.out.bai" + File merged_bam_out = "~{output_vcf_name}.out.bam" + File merged_bam_out_index = "~{output_vcf_name}.out.bai" } } task MergeStats { - # inputs - Array[File]+ stats - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false + input { + Array[File]+ stats + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? 
max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + } # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 2000 @@ -810,11 +843,11 @@ task MergeStats { command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeMutectStats \ - -stats ${sep=" -stats " stats} -O merged.stats + gatk --java-options "-Xmx~{command_mem}m" MergeMutectStats \ + -stats ~{sep=" -stats " stats} -O merged.stats } @@ -834,20 +867,22 @@ task MergeStats { } task MergePileupSummaries { - # input_tables needs to be optional because GetPileupSummaries is in an if-block - Array[File?] input_tables - String output_name - File? gatk_override - File ref_dict - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false + input { + # input_tables needs to be optional because GetPileupSummaries is in an if-block + Array[File?] input_tables + String output_name + File? gatk_override + File ref_dict + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? 
cpu + Boolean use_ssd = false + } # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 3500 @@ -855,12 +890,12 @@ task MergePileupSummaries { command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" GatherPileupSummaries \ - --sequence-dictionary ${ref_dict} \ - -I ${sep=' -I ' input_tables} \ - -O ${output_name}.tsv + gatk --java-options "-Xmx~{command_mem}m" GatherPileupSummaries \ + --sequence-dictionary ~{ref_dict} \ + -I ~{sep=' -I ' input_tables} \ + -O ~{output_name}.tsv } runtime { @@ -874,23 +909,25 @@ task MergePileupSummaries { } output { - File merged_table = "${output_name}.tsv" + File merged_table = "~{output_name}.tsv" } } # Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018 task LearnReadOrientationModel { - Array[File] f1r2_tar_gz - File? gatk_override - - # runtime - Int? max_retries - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false + input { + Array[File] f1r2_tar_gz + File? gatk_override + + # runtime + Int? max_retries + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? disk_space + Int? 
cpu + Boolean use_ssd = false + } # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 8000 @@ -898,10 +935,10 @@ task LearnReadOrientationModel { command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \ - -I ${sep=" -I " f1r2_tar_gz} \ + gatk --java-options "-Xmx~{command_mem}m" LearnReadOrientationModel \ + -I ~{sep=" -I " f1r2_tar_gz} \ -O "artifact-priors.tar.gz" } @@ -922,19 +959,20 @@ task LearnReadOrientationModel { } task CalculateContamination { - # inputs - String? intervals - File tumor_pileups - File? normal_pileups - - File? gatk_override - - # runtime - Int? preemptible_attempts - Int? max_retries - String gatk_docker - Int? disk_space - Int? mem + input { + String? intervals + File tumor_pileups + File? normal_pileups + + File? gatk_override + + # runtime + Int? preemptible_attempts + Int? max_retries + String gatk_docker + Int? disk_space + Int? mem + } # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 3000 @@ -943,10 +981,10 @@ task CalculateContamination { command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I ${tumor_pileups} \ - -O contamination.table --tumor-segmentation segments.table ${"-matched " + normal_pileups} + gatk --java-options "-Xmx~{command_mem}m" CalculateContamination -I ~{tumor_pileups} \ + -O contamination.table --tumor-segmentation segments.table ~{"-matched " + normal_pileups} } runtime { @@ -965,52 +1003,63 @@ task CalculateContamination { } task Filter { - # inputs - File? 
intervals - File ref_fasta - File ref_fai - File ref_dict - File unfiltered_vcf - File unfiltered_vcf_idx - String output_name - Boolean compress + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File unfiltered_vcf + File unfiltered_vcf_idx + String output_name + Boolean compress + File? mutect_stats + File? artifact_priors_tar_gz + File? contamination_table + File? maf_segments + String? m2_extra_filtering_args + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + } + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File? mutect_stats - File? artifact_priors_tar_gz - File? contamination_table - File? maf_segments - String? m2_extra_filtering_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 7000 Int command_mem = machine_mem - 500 + parameter_meta{ + intervals: {localization_optional: true} + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + unfiltered_vcf: {localization_optional: true} + unfiltered_vcf_idx: {localization_optional: true} + } + command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \ - -R ${ref_fasta} \ - -O ${output_vcf} \ - ${"--contamination-table " + contamination_table} \ - ${"--tumor-segmentation " + maf_segments} \ - ${"--ob-priors " + artifact_priors_tar_gz} \ - ${"-stats " + mutect_stats} \ - --filtering-stats filtering.stats \ - ${m2_extra_filtering_args} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{command_mem}m" FilterMutectCalls -V ~{unfiltered_vcf} \ + -R ~{ref_fasta} \ + -O ~{output_vcf} \ + ~{"--contamination-table " + contamination_table} \ + ~{"--tumor-segmentation " + maf_segments} \ + ~{"--ob-priors " + artifact_priors_tar_gz} \ + ~{"-stats " + mutect_stats} \ + --filtering-stats filtering.stats \ + ~{m2_extra_filtering_args} } runtime { @@ -1024,50 +1073,59 @@ task Filter { } output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" + File filtered_vcf = "~{output_vcf}" + File filtered_vcf_idx = "~{output_vcf_idx}" File filtering_stats = "filtering.stats" } } task FilterAlignmentArtifacts { - #input - File? gatk_override - File input_vcf - File input_vcf_idx - File bam - File bai - String output_name - Boolean compress + input { + File? 
gatk_override + File input_vcf + File input_vcf_idx + File bam + File bai + String output_name + Boolean compress + File realignment_index_bundle + String? realignment_extra_args + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + } + String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File realignment_index_bundle - String? realignment_extra_args - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem * 1000 else 9000 Int command_mem = machine_mem - 500 + parameter_meta{ + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + bam: {localization_optional: true} + bai: {localization_optional: true} + } + command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" FilterAlignmentArtifacts \ - -V ${input_vcf} \ - -I ${bam} \ - --bwa-mem-index-image ${realignment_index_bundle} \ - ${realignment_extra_args} \ - -O ${output_vcf} + gatk --java-options "-Xmx~{command_mem}m" FilterAlignmentArtifacts \ + -V ~{input_vcf} \ + -I ~{bam} \ + --bwa-mem-index-image ~{realignment_index_bundle} \ + ~{realignment_extra_args} \ + -O ~{output_vcf} } runtime { @@ -1081,34 +1139,36 @@ task FilterAlignmentArtifacts { } output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" + File filtered_vcf = "~{output_vcf}" + File filtered_vcf_idx = "~{output_vcf_idx}" } } task oncotate_m2 { - # inputs - File m2_vcf - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? oncotator_exe - String? 
sequencing_center - String? sequence_source - File? default_config_file - String case_id - String? control_id - String? oncotator_extra_args - - # runtime - String oncotator_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - Boolean? filter_maf + input { + File m2_vcf + File? onco_ds_tar_gz + String? onco_ds_local_db_dir + String? oncotator_exe + String? sequencing_center + String? sequence_source + File? default_config_file + String case_id + String? control_id + String? oncotator_extra_args + + # runtime + String oncotator_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? disk_space + Int? cpu + Boolean use_ssd = false + + Boolean? filter_maf + } + Boolean is_filter_maf = select_first([filter_maf, true]) String filter_maf_args = if (is_filter_maf) then " --collapse-filter-cols --prune-filter-cols " else "" @@ -1121,14 +1181,14 @@ task oncotate_m2 { set -e # local db dir is a directory and has been specified - if [[ -d "${onco_ds_local_db_dir}" ]]; then - echo "Using local db-dir: ${onco_ds_local_db_dir}" + if [[ -d "~{onco_ds_local_db_dir}" ]]; then + echo "Using local db-dir: ~{onco_ds_local_db_dir}" echo "THIS ONLY WORKS WITHOUT DOCKER!" - ln -s ${onco_ds_local_db_dir} onco_dbdir - elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then - echo "Using given tar file: ${onco_ds_tar_gz}" + ln -s ~{onco_ds_local_db_dir} onco_dbdir + elif [[ "~{onco_ds_tar_gz}" == *.tar.gz ]]; then + echo "Using given tar file: ~{onco_ds_tar_gz}" mkdir onco_dbdir - tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1 + tar zxvf ~{onco_ds_tar_gz} -C onco_dbdir --strip-components 1 else echo "Downloading and installing oncotator datasources from Broad FTP site..." 
# Download and untar the db-dir @@ -1137,15 +1197,15 @@ task oncotate_m2 { ln -s oncotator_v1_ds_April052016 onco_dbdir fi - ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ - -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --collapse-number-annotations --log_name oncotator.log \ - -a Center:${default="Unknown" sequencing_center} \ - -a source:${default="Unknown" sequence_source} \ - -a normal_barcode:${control_id} \ - -a tumor_barcode:${case_id} \ - ${"--default_config " + default_config_file} \ - ${filter_maf_args} \ - ${oncotator_extra_args} + ~{default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ + -v ~{m2_vcf} ~{case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --collapse-number-annotations --log_name oncotator.log \ + -a Center:~{default="Unknown" sequencing_center} \ + -a source:~{default="Unknown" sequence_source} \ + -a normal_barcode:~{control_id} \ + -a tumor_barcode:~{case_id} \ + ~{"--default_config " + default_config_file} \ + ~{filter_maf_args} \ + ~{oncotator_extra_args} >>> runtime { @@ -1159,20 +1219,22 @@ task oncotate_m2 { } output { - File oncotated_m2_maf="${case_id}.maf.annotated" + File oncotated_m2_maf="~{case_id}.maf.annotated" } } # Calculates sum of a list of floats task SumFloats { - Array[Float] sizes + input { + Array[Float] sizes - # Runtime parameters - Int? preemptible_attempts - Int? max_retries + # Runtime parameters + Int? preemptible_attempts + Int? 
max_retries + } command <<< - python -c "print ${sep="+" sizes}" + python -c "print ~{sep="+" sizes}" >>> output { @@ -1188,34 +1250,51 @@ task SumFloats { } task Funcotate { - # ============== - # Inputs - File ref_fasta - File ref_fai - File ref_dict - File input_vcf - File input_vcf_idx - String reference_version - String output_file_base_name - String output_format - Boolean compress - Boolean use_gnomad - # This should be updated when a new version of the data sources is released - # TODO: Make this dynamically chosen in the command. - File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz" - String? control_id - String? case_id - String? sequencing_center - String? sequence_source - String? transcript_selection_mode - File? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? filter_funcotations - File? interval_list - - String? extra_args + input { + File ref_fasta + File ref_fai + File ref_dict + File input_vcf + File input_vcf_idx + String reference_version + String output_file_base_name + String output_format + Boolean compress + Boolean use_gnomad + # This should be updated when a new version of the data sources is released + # TODO: Make this dynamically chosen in the command. + File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz" + String? control_id + String? case_id + String? sequencing_center + String? sequence_source + String? transcript_selection_mode + File? transcript_selection_list + Array[String]? annotation_defaults + Array[String]? annotation_overrides + Array[String]? funcotator_excluded_fields + Boolean? filter_funcotations + File? interval_list + + String? extra_args + + # ============== + # Runtime options: + String gatk_docker + File? gatk_override + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? 
disk_space_gb + Int? cpu + + Boolean use_ssd = false + + # You may have to change the following two parameter values depending on the task requirements + Int default_ram_mb = 3000 + # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. + Int default_disk_space_gb = 100 + } # ============== # Process input args: @@ -1233,77 +1312,68 @@ task Funcotate { String interval_list_arg = if defined(interval_list) then " -L " else "" String extra_args_arg = select_first([extra_args, ""]) - # ============== - # Runtime options: - String gatk_docker - File? gatk_override - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space_gb - Int? cpu - - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. - Int default_disk_space_gb = 100 - # Mem is in units of GB but our command and memory runtime values are in MB Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb Int command_mem = machine_mem - 1000 String dollar = "$" + parameter_meta{ + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + input_vcf: {localization_optional: true} + input_vcf_idx: {localization_optional: true} + } + command <<< set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} # Extract our data sources: echo "Extracting data sources zip file..." 
mkdir datasources_dir - tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 + tar zxvf ~{data_sources_tar_gz} -C datasources_dir --strip-components 1 DATA_SOURCES_FOLDER="$PWD/datasources_dir" # Handle gnomAD: - if ${use_gnomad} ; then + if ~{use_gnomad} ; then echo "Enabling gnomAD..." for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do - if [[ -f ${dollar}{DATA_SOURCES_FOLDER}/${dollar}{potential_gnomad_gz} ]] ; then - cd ${dollar}{DATA_SOURCES_FOLDER} - tar -zvxf ${dollar}{potential_gnomad_gz} + if [[ -f ~{dollar}{DATA_SOURCES_FOLDER}/~{dollar}{potential_gnomad_gz} ]] ; then + cd ~{dollar}{DATA_SOURCES_FOLDER} + tar -zvxf ~{dollar}{potential_gnomad_gz} cd - else - echo "ERROR: Cannot find gnomAD folder: ${dollar}{potential_gnomad_gz}" 1>&2 + echo "ERROR: Cannot find gnomAD folder: ~{dollar}{potential_gnomad_gz}" 1>&2 false fi done fi # Run Funcotator: - gatk --java-options "-Xmx${command_mem}m" Funcotator \ + gatk --java-options "-Xmx~{command_mem}m" Funcotator \ --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ${reference_version} \ - --output-file-format ${output_format} \ - -R ${ref_fasta} \ - -V ${input_vcf} \ - -O ${output_file} \ - ${interval_list_arg} ${default="" interval_list} \ - --annotation-default normal_barcode:${default="Unknown" control_id} \ - --annotation-default tumor_barcode:${default="Unknown" case_id} \ - --annotation-default Center:${default="Unknown" sequencing_center} \ - --annotation-default source:${default="Unknown" sequence_source} \ - ${"--transcript-selection-mode " + transcript_selection_mode} \ - ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \ - ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ - ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \ - ${excluded_fields_args}${default="" sep=" --exclude-field " funcotator_excluded_fields} \ - 
${filter_funcotations_args} \ - ${extra_args_arg} + --ref-version ~{reference_version} \ + --output-file-format ~{output_format} \ + -R ~{ref_fasta} \ + -V ~{input_vcf} \ + -O ~{output_file} \ + ~{interval_list_arg} ~{default="" interval_list} \ + --annotation-default normal_barcode:~{default="Unknown" control_id} \ + --annotation-default tumor_barcode:~{default="Unknown" case_id} \ + --annotation-default Center:~{default="Unknown" sequencing_center} \ + --annotation-default source:~{default="Unknown" sequence_source} \ + ~{"--transcript-selection-mode " + transcript_selection_mode} \ + ~{transcript_selection_arg}~{default="" sep=" --transcript-list " transcript_selection_list} \ + ~{annotation_def_arg}~{default="" sep=" --annotation-default " annotation_defaults} \ + ~{annotation_over_arg}~{default="" sep=" --annotation-override " annotation_overrides} \ + ~{excluded_fields_args}~{default="" sep=" --exclude-field " funcotator_excluded_fields} \ + ~{filter_funcotations_args} \ + ~{extra_args_arg} # Make sure we have a placeholder index for MAF files so this workflow doesn't fail: - if [[ "${output_format}" == "MAF" ]] ; then - touch ${output_maf_index} + if [[ "~{output_format}" == "MAF" ]] ; then + touch ~{output_maf_index} fi >>> @@ -1318,7 +1388,7 @@ task Funcotate { } output { - File funcotated_output_file = "${output_file}" - File funcotated_output_file_index = "${output_file_index}" + File funcotated_output_file = "~{output_file}" + File funcotated_output_file_index = "~{output_file_index}" } } diff --git a/scripts/mutect2_wdl/mutect2_multi_sample.wdl b/scripts/mutect2_wdl/mutect2_multi_sample.wdl index fd3848256cd..ef7b18fcd92 100644 --- a/scripts/mutect2_wdl/mutect2_multi_sample.wdl +++ b/scripts/mutect2_wdl/mutect2_multi_sample.wdl @@ -1,3 +1,5 @@ +version 1.0 + # Run Mutect 2 on a list of tumors or tumor-normal pairs # # Description of inputs @@ -20,22 +22,22 @@ import "mutect2.wdl" as m2 workflow Mutect2_Multi { - # Mutect2 inputs - File? 
intervals - File ref_fasta - File ref_fai - File ref_dict - File pair_list - Array[Array[String]] pairs = read_tsv(pair_list) - File? pon - File? pon_idx - File? gnomad - File? gnomad_idx - File? variants_for_contamination + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File pair_list + + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + File? variants_for_contamination File? variants_for_contamination_idx - Boolean? run_orientation_bias_mixture_model_filter - Int scatter_count - String? m2_extra_args + Boolean? run_orientation_bias_mixture_model_filter + Int scatter_count + String? m2_extra_args String? m2_extra_filtering_args Boolean? compress_vcfs Boolean? make_bamout @@ -63,6 +65,9 @@ workflow Mutect2_Multi { String? oncotator_docker Int? preemptible_attempts File? gatk_override + } + + Array[Array[String]] pairs = read_tsv(pair_list) scatter( row in pairs ) { # If the condition is true, variables inside the 'if' block retain their values outside the block. @@ -124,4 +129,4 @@ workflow Mutect2_Multi { Array[File?] m2_bamout = Mutect2.bamout Array[File?] m2_bamout_index = Mutect2.bamout_index } -} \ No newline at end of file +} diff --git a/scripts/mutect2_wdl/mutect2_nio.wdl b/scripts/mutect2_wdl/mutect2_nio.wdl deleted file mode 100755 index 85d40db06a5..00000000000 --- a/scripts/mutect2_wdl/mutect2_nio.wdl +++ /dev/null @@ -1,1296 +0,0 @@ -## Copyright Broad Institute, 2017 -## -## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, -## and performs additional filtering and functional annotation tasks. -## -## NOTE: this wdl is an exact copy of mutect2.wdl in the gatk repo except for replacing File with String in GATK task inputs in order to -## avoid localizing files in cromwell and thereby allowing the GATK engine to access cloud-based files with NIO. Once -## cromwell supports "smart" File variables that know when and when not to localize the two wdls should be merged. 
-## -## Main requirements/expectations : -## - One analysis-ready BAM file (and its index) for each sample -## -## Description of inputs: -## -## ** Runtime ** -## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator -## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) -## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors -## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar -## in the docker image. This must be supplied when running in an environment that does not support docker -## (e.g. SGE cluster on a Broad on-prem VM) -## -## ** Workflow options ** -## intervals: genomic intervals (will be used for scatter) -## scatter_count: number of parallel jobs to generate when scattering over intervals -## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional) -## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional) -## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model. -## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). Important: This requires a -## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on -## a Broad on-prem VM). Access to docker hub is also required, since the task downloads a public docker image. 
-## (optional, false by default) -## -## ** Primary inputs ** -## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary -## tumor_bam, tumor_bam_index: BAM and index for the tumor sample -## normal_bam, normal_bam_index: BAM and index for the normal sample -## -## ** Primary resources ** (optional but strongly recommended) -## pon: optional panel of normals in VCF format containing probable technical artifacts (false positves) -## gnomad: optional database of known germline variants (see http://gnomad.broadinstitute.org/downloads) -## variants_for_contamination: VCF of common variants with allele frequencies for calculating contamination -## -## ** Secondary resources ** (for optional tasks) -## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file -## sequencing_center, sequence_source: metadata for Oncotator -## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true -## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator. -## -## Funcotator parameters (see Funcotator help for more details). -## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19" -## funco_output_format: "MAF" to produce a MAF file, "VCF" to procude a VCF file. Default: "MAF" -## funco_compress: (Only valid if funco_output_format == "VCF" ) If true, will compress the output of Funcotator. If false, produces an uncompressed output file. Default: false -## funco_use_gnomad_AF: If true, will include gnomAD allele frequency annotations in output by connecting to the internet to query gnomAD (this impacts performance). If false, will not annotate with gnomAD. Default: false -## funco_transcript_selection_mode: How to select transcripts in Funcotator. 
ALL, CANONICAL, or BEST_EFFECT -## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process. -## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud. -## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as <ANNOTATION>:<VALUE>. For example: "Center:Broad" -## funco_annotation_overrides: Values for annotations, even when values are unspecified. Specified as <ANNOTATION>:<VALUE>. For example: "Center:Broad" -## funcotator_excluded_fields: Annotations that should not appear in the output (VCF or MAF). Specified as <ANNOTATION>. For example: "ClinVar_ALLELEID" -## funco_filter_funcotations: If true, will only annotate variants that have passed filtering (. or PASS value in the FILTER column). If false, will annotate all variants in the input file. Default: true -## funcotator_extra_args: Any additional arguments to pass to Funcotator. Default: "" -## -## Outputs : -## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam -## file of reassembled reads if requested -## -## Cromwell version support -## - Successfully tested on v34 -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information -## pertaining to the included programs. -workflow Mutect2 { - # Mutect2 inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - Int scatter_count - File? gnomad - File?
variants_for_contamination - File? realignment_index_bundle - String? realignment_extra_args - Boolean? run_orientation_bias_mixture_model_filter - Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) - String? m2_extra_args - String? m2_extra_filtering_args - String? split_intervals_extra_args - Boolean? make_bamout - Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean? compress_vcfs - Boolean compress = select_first([compress_vcfs, false]) - File? gga_vcf - - # oncotator inputs - Boolean? run_oncotator - Boolean run_oncotator_or_default = select_first([run_oncotator, false]) - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? sequencing_center - String? sequence_source - File? default_config_file - String? oncotator_extra_args - - # Funcotator inputs - Boolean? run_funcotator - Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? funco_reference_version - String? funco_output_format - Boolean? funco_compress - Boolean? funco_use_gnomad_AF - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? funco_filter_funcotations - String? funcotator_extra_args - - String funco_default_output_format = "MAF" - - - # runtime - String gatk_docker - File? gatk_override - String basic_bash_docker = "ubuntu:16.04" - String? oncotator_docker - String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.9.0"]) - Boolean? filter_oncotator_maf - Boolean filter_oncotator_maf_or_default = select_first([filter_oncotator_maf, true]) - Boolean? filter_funcotations - Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) - - Int? preemptible_attempts - Int? 
max_retries - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk - - # Disk sizes used for dynamic sizing - Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) - Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) - Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB")) else 0 - Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 - - # If no tar is provided, the task downloads one from broads ftp server - Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100 - Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 - Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 - - # This is added to every task as padding, should increase if systematically you need more disk for every call - Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) - - # These are multipliers to multipler inputs by to make sure we have enough disk to accommodate for possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - Float cram_to_bam_multiplier = 6.0 - - # logic about output file names -- these are the names *without* .vcf extensions - String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram - String unfiltered_name = output_basename + "-unfiltered" - String filtered_name = output_basename + "-filtered" - String funcotated_name = output_basename + "-funcotated" - - String output_vcf_name = output_basename + ".vcf" - - # Size M2 differently based on if we are using NIO or not - Int tumor_cram_to_bam_disk = 
ceil(tumor_reads_size * cram_to_bam_multiplier) - Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier) - - if (basename(tumor_reads) != basename(tumor_reads, ".cram")) { - call CramToBam as TumorCramToBam { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - cram = tumor_reads, - crai = tumor_reads_index, - name = output_basename, - disk_size = tumor_cram_to_bam_disk - } - } - - String normal_or_empty = select_first([normal_reads, ""]) - if (basename(normal_or_empty) != basename(normal_or_empty, ".cram")) { - String normal_basename = basename(basename(normal_or_empty, ".bam"),".cram") - call CramToBam as NormalCramToBam { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - cram = normal_reads, - crai = normal_reads_index, - name = normal_basename, - disk_size = normal_cram_to_bam_disk - } - } - - File tumor_bam = select_first([TumorCramToBam.output_bam, tumor_reads]) - File tumor_bai = select_first([TumorCramToBam.output_bai, tumor_reads_index]) - File? normal_bam = if defined(normal_reads) then select_first([NormalCramToBam.output_bam, normal_reads]) else normal_reads - File? 
normal_bai = if defined(normal_reads) then select_first([NormalCramToBam.output_bai, normal_reads_index]) else normal_reads_index - - Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB")) - Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0 - - Int m2_output_size = tumor_bam_size / scatter_count - Int m2_per_scatter_size = ((tumor_bam_size + normal_bam_size) / scatter_count) + ref_size + (gnomad_vcf_size / scatter_count) + m2_output_size + disk_pad - - call SplitIntervals { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - split_intervals_extra_args = split_intervals_extra_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad - } - - scatter (subintervals in SplitIntervals.interval_files ) { - call M2 { - input: - intervals = subintervals, - ref_fasta = ref_fasta, - tumor_bam = tumor_bam, - normal_bam = normal_bam, - pon = pon, - gnomad = gnomad, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - m2_extra_args = m2_extra_args, - variants_for_contamination = variants_for_contamination, - make_bamout = make_bamout_or_default, - run_ob_filter = run_ob_filter, - compress = compress, - gga_vcf = gga_vcf, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = m2_per_scatter_size - } - - Float sub_vcf_size = size(M2.unfiltered_vcf, "GB") - Float sub_bamout_size = size(M2.output_bamOut, "GB") - } - - call SumFloats as SumSubVcfs { - input: - sizes = sub_vcf_size, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - - if (run_ob_filter) { - call LearnReadOrientationModel { - input: - f1r2_tar_gz = M2.f1r2_counts, - gatk_override = gatk_override, - 
gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - } - - call MergeVCFs { - input: - input_vcfs = M2.unfiltered_vcf, - input_vcf_indices = M2.unfiltered_vcf_idx, - output_name = unfiltered_name, - compress = compress, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad - } - - if (make_bamout_or_default) { - call SumFloats as SumSubBamouts { - input: - sizes = sub_bamout_size, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - - call MergeBamOuts { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - bam_outs = M2.output_bamOut, - output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad, - max_retries = max_retries - } - } - - call MergeStats { - input: - stats = M2.stats, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - if (defined(variants_for_contamination)) { - call MergePileupSummaries as MergeTumorPileups { - input: - input_tables = M2.tumor_pileups, - output_name = output_basename, - ref_dict = ref_dict, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad - } - - if (defined(normal_bam)){ - call MergePileupSummaries as MergeNormalPileups { - input: - input_tables = M2.normal_pileups, - output_name = output_basename, - ref_dict = ref_dict, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(SumSubVcfs.total_size * 
large_input_to_output_multiplier) + disk_pad - } - } - - call CalculateContamination { - input: - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - gatk_docker = gatk_docker, - tumor_pileups = MergeTumorPileups.merged_table, - normal_pileups = MergeNormalPileups.merged_table, - disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad - } - } - - call Filter { - input: - ref_fasta = ref_fasta, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - unfiltered_vcf = MergeVCFs.merged_vcf, - output_name = filtered_name, - compress = compress, - preemptible_attempts = preemptible_attempts, - mutect_stats = MergeStats.merged_stats, - max_retries = max_retries, - contamination_table = CalculateContamination.contamination_table, - maf_segments = CalculateContamination.maf_segments, - artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table, - m2_extra_filtering_args = m2_extra_filtering_args, - disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad - } - - if (defined(realignment_index_bundle)) { - File realignment_filter_input = Filter.filtered_vcf - call FilterAlignmentArtifacts { - input: - gatk_override = gatk_override, - bam = tumor_bam, - realignment_index_bundle = select_first([realignment_index_bundle]), - realignment_extra_args = realignment_extra_args, - gatk_docker = gatk_docker, - max_retries = max_retries, - compress = compress, - output_name = filtered_name, - input_vcf = realignment_filter_input - } - } - - if (run_oncotator_or_default) { - File oncotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - call oncotate_m2 { - input: - m2_vcf = oncotate_vcf_input, - onco_ds_tar_gz = onco_ds_tar_gz, - onco_ds_local_db_dir = onco_ds_local_db_dir, - sequencing_center = sequencing_center, - 
sequence_source = sequence_source, - default_config_file = default_config_file, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - oncotator_docker = oncotator_docker_or_default, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad, - filter_maf = filter_oncotator_maf_or_default, - oncotator_extra_args = oncotator_extra_args - } - } - - if (run_funcotator_or_default) { - File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) - call Funcotate { - input: - ref_fasta = ref_fasta, - input_vcf = funcotate_vcf_input, - input_vcf_idx = funcotate_vcf_input_index, - reference_version = select_first([funco_reference_version, "hg19"]), - output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated", - output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format, - compress = if defined(funco_compress) then funco_compress else false, - use_gnomad = if defined(funco_use_gnomad_AF) then funco_use_gnomad_AF else false, - data_sources_tar_gz = funco_data_sources_tar_gz, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - sequencing_center = sequencing_center, - sequence_source = sequence_source, - transcript_selection_mode = funco_transcript_selection_mode, - transcript_selection_list = funco_transcript_selection_list, - annotation_defaults = funco_annotation_defaults, - annotation_overrides = funco_annotation_overrides, - funcotator_excluded_fields = funcotator_excluded_fields, - filter_funcotations = filter_funcotations_or_default, - extra_args = funcotator_extra_args, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - 
max_retries = max_retries, - disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad - } - } - - output { - File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - File filtered_vcf_idx = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) - File filtering_stats = Filter.filtering_stats - File mutect_stats = MergeStats.merged_stats - File? contamination_table = CalculateContamination.contamination_table - - File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf - File? funcotated_file = Funcotate.funcotated_output_file - File? funcotated_file_index = Funcotate.funcotated_output_file_index - File? bamout = MergeBamOuts.merged_bam_out - File? bamout_index = MergeBamOuts.merged_bam_out_index - File? maf_segments = CalculateContamination.maf_segments - File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table - } -} - -task CramToBam { - - File ref_fasta - File ref_fai - File ref_dict - File cram - File crai - String name - Int disk_size - Int? mem - - Int machine_mem = if defined(mem) then mem * 1000 else 6000 - - #Calls samtools view to do the conversion - command { - #Set -e and -o says if any command I run fails in this script, make sure to return a failure - set -e - set -o pipefail - - samtools view -h -T ${ref_fasta} ${cram} | - samtools view -b -o ${name}.bam - - samtools index -b ${name}.bam - mv ${name}.bam.bai ${name}.bai - } - - runtime { - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735" - memory: machine_mem + " MB" - disks: "local-disk " + disk_size + " HDD" - } - - output { - File output_bam = "${name}.bam" - File output_bai = "${name}.bai" - } -} - -task SplitIntervals { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - String? split_intervals_extra_args - - File? gatk_override - - # runtime - String gatk_docker - Int? 
mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - mkdir interval-files - gatk --java-options "-Xmx${command_mem}m" SplitIntervals \ - -R ${ref_fasta} \ - ${"-L " + intervals} \ - -scatter ${scatter_count} \ - -O interval-files \ - ${split_intervals_extra_args} - cp interval-files/*.interval_list . - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - Array[File] interval_files = glob("*.interval_list") - } -} - -task M2 { - # inputs - String? intervals - String ref_fasta - String tumor_bam - String? normal_bam - String? pon - String? gnomad - String? m2_extra_args - Boolean? make_bamout - Boolean? run_ob_filter - Boolean compress - String? gga_vcf - String? gga_vcf_idx - String? variants_for_contamination - - String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - - String output_stats = output_vcf + ".stats" - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - - command <<< - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - # We need to create these files regardless, even if they stay empty - touch bamout.bam - touch f1r2.tar.gz - echo "" > normal_name.txt - - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode - tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`" - - if [[ ! -z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode - normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`" - fi - - gatk --java-options "-Xmx${command_mem}m" Mutect2 \ - -R ${ref_fasta} \ - $tumor_command_line \ - $normal_command_line \ - ${"--germline-resource " + gnomad} \ - ${"-pon " + pon} \ - ${"-L " + intervals} \ - ${"--alleles " + gga_vcf} \ - -O "${output_vcf}" \ - ${true='--bam-output bamout.bam' false='' make_bamout} \ - ${true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ - ${m2_extra_args} - - ### GetPileupSummaries - # These must be created, even if they remain empty, as cromwell doesn't support optional output - touch tumor-pileups.table - touch normal-pileups.table - - if [[ ! -z "${variants_for_contamination}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O tumor-pileups.table - - if [[ ! 
-z "${normal_bam}" ]]; then - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal-pileups.table - fi - fi - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File unfiltered_vcf = "${output_vcf}" - File unfiltered_vcf_idx = "${output_vcf_idx}" - File output_bamOut = "bamout.bam" - String tumor_sample = read_string("tumor_name.txt") - String normal_sample = read_string("normal_name.txt") - File stats = "${output_stats}" - File f1r2_counts = "f1r2.tar.gz" - File tumor_pileups = "tumor-pileups.table" - File normal_pileups = "normal-pileups.table" - } -} - -task MergeVCFs { - # inputs - Array[File] input_vcfs - Array[File] input_vcf_indices - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 - - # using MergeVcfs instead of GatherVcfs so we can create indices - # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. 
- command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_vcf = "${output_vcf}" - File merged_vcf_idx = "${output_vcf_idx}" - } -} - -task MergeBamOuts { - # inputs - File ref_fasta - File ref_fai - File ref_dict - Array[File]+ bam_outs - String output_vcf_name - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 1000 - - command <<< - # This command block assumes that there is at least one file in bam_outs. 
- # Do not call this task if len(bam_outs) == 0 - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \ - -I ${sep=" -I " bam_outs} -O unsorted.out.bam -R ${ref_fasta} - - # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence - # overlapping bamouts - - gatk --java-options "-Xmx${command_mem}m" SortSam -I unsorted.out.bam \ - -O ${output_vcf_name}.out.bam \ - --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT - gatk --java-options "-Xmx${command_mem}m" BuildBamIndex -I ${output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_bam_out = "${output_vcf_name}.out.bam" - File merged_bam_out_index = "${output_vcf_name}.out.bai" - } -} - - -task MergeStats { - # inputs - Array[File]+ stats - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 2000 - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - - gatk --java-options "-Xmx${command_mem}m" MergeMutectStats \ - -stats ${sep=" -stats " stats} -O merged.stats - - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 10]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_stats = "merged.stats" - } -} - -task MergePileupSummaries { - # input_tables needs to be optional because GetPileupSummaries is in an if-block - Array[File?] input_tables - String output_name - File? gatk_override - File ref_dict - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" GatherPileupSummaries \ - --sequence-dictionary ${ref_dict} \ - -I ${sep=' -I ' input_tables} \ - -O ${output_name}.tsv - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 3]) - cpu: select_first([cpu, 1]) - } - - output { - File merged_table = "${output_name}.tsv" - } -} - -# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018 -task LearnReadOrientationModel { - Array[File] f1r2_tar_gz - File? gatk_override - - # runtime - Int? max_retries - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 8000 - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \ - -I ${sep=" -I " f1r2_tar_gz} \ - -O "artifact-priors.tar.gz" - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 3]) - cpu: select_first([cpu, 1]) - } - - output { - File artifact_prior_table = "artifact-priors.tar.gz" - } - -} - -task CalculateContamination { - # inputs - String? intervals - File tumor_pileups - File? normal_pileups - - File? gatk_override - - # runtime - Int? preemptible_attempts - Int? max_retries - String gatk_docker - Int? disk_space - Int? mem - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3000 - Int command_mem = machine_mem - 500 - - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I ${tumor_pileups} \ - -O contamination.table --tumor-segmentation segments.table ${"-matched " + normal_pileups} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - } - - output { - File contamination_table = "contamination.table" - File maf_segments = "segments.table" - } -} - -task Filter { - # inputs - String? 
intervals - String ref_fasta - String unfiltered_vcf - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File? mutect_stats - File? artifact_priors_tar_gz - File? contamination_table - File? maf_segments - String? m2_extra_filtering_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 7000 - Int command_mem = machine_mem - 500 - - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \ - -R ${ref_fasta} \ - -O ${output_vcf} \ - ${"--contamination-table " + contamination_table} \ - ${"--tumor-segmentation " + maf_segments} \ - ${"--ob-priors " + artifact_priors_tar_gz} \ - ${"-stats " + mutect_stats} \ - --filtering-stats filtering.stats \ - ${m2_extra_filtering_args} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" - File filtering_stats = "filtering.stats" - } -} - -task FilterAlignmentArtifacts { - #input - File? gatk_override - String input_vcf - String bam - String output_name - Boolean compress - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - File realignment_index_bundle - String? 
realignment_extra_args - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 9000 - Int command_mem = machine_mem - 500 - - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" FilterAlignmentArtifacts \ - -V ${input_vcf} \ - -I ${bam} \ - --bwa-mem-index-image ${realignment_index_bundle} \ - ${realignment_extra_args} \ - -O ${output_vcf} - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File filtered_vcf = "${output_vcf}" - File filtered_vcf_idx = "${output_vcf_idx}" - } -} - -task oncotate_m2 { - # inputs - File m2_vcf - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? oncotator_exe - String? sequencing_center - String? sequence_source - File? default_config_file - String case_id - String? control_id - String? oncotator_extra_args - - # runtime - String oncotator_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - - Boolean? 
filter_maf - Boolean is_filter_maf = select_first([filter_maf, true]) - String filter_maf_args = if (is_filter_maf) then " --collapse-filter-cols --prune-filter-cols " else "" - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - command <<< - # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails - set -e - - # local db dir is a directory and has been specified - if [[ -d "${onco_ds_local_db_dir}" ]]; then - echo "Using local db-dir: ${onco_ds_local_db_dir}" - echo "THIS ONLY WORKS WITHOUT DOCKER!" - ln -s ${onco_ds_local_db_dir} onco_dbdir - elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then - echo "Using given tar file: ${onco_ds_tar_gz}" - mkdir onco_dbdir - tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1 - else - echo "Downloading and installing oncotator datasources from Broad FTP site..." - # Download and untar the db-dir - wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz - tar zxvf oncotator_v1_ds_April052016.tar.gz - ln -s oncotator_v1_ds_April052016 onco_dbdir - fi - - ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \ - -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --collapse-number-annotations --log_name oncotator.log \ - -a Center:${default="Unknown" sequencing_center} \ - -a source:${default="Unknown" sequence_source} \ - -a normal_barcode:${control_id} \ - -a tumor_barcode:${case_id} \ - ${"--default_config " + default_config_file} \ - ${filter_maf_args} \ - ${oncotator_extra_args} - >>> - - runtime { - docker: oncotator_docker - memory: machine_mem + " MB" - bootDiskSizeGb: 12 - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: 
select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File oncotated_m2_maf="${case_id}.maf.annotated" - } -} - -# Calculates sum of a list of floats -task SumFloats { - Array[Float] sizes - - # Runtime parameters - Int? preemptible_attempts - Int? max_retries - - command <<< - python -c "print ${sep="+" sizes}" - >>> - - output { - Float total_size = read_float(stdout()) - } - - runtime { - docker: "python:2.7" - disks: "local-disk " + 10 + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 0]) - } -} - -task Funcotate { - # ============== - # Inputs - String ref_fasta - String input_vcf - String input_vcf_idx - String reference_version - String output_file_base_name - String output_format - Boolean compress - Boolean use_gnomad - # This should be updated when a new version of the data sources is released - # TODO: Make this dynamically chosen in the command. - File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz" - String? control_id - String? case_id - String? sequencing_center - String? sequence_source - String? transcript_selection_mode - File? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? filter_funcotations - File? interval_list - - String? 
extra_args - - # ============== - # Process input args: - String output_maf = output_file_base_name + ".maf" - String output_maf_index = output_maf + ".idx" - String output_vcf = output_file_base_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - String output_file = if output_format == "MAF" then output_maf else output_vcf - String output_file_index = if output_format == "MAF" then output_maf_index else output_vcf_idx - String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" - String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" - String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" - String filter_funcotations_args = if defined(filter_funcotations) && (filter_funcotations) then " --remove-filtered-variants " else "" - String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else "" - String interval_list_arg = if defined(interval_list) then " -L " else "" - String extra_args_arg = select_first([extra_args, ""]) - - # ============== - # Runtime options: - String gatk_docker - File? gatk_override - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space_gb - Int? cpu - - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. 
- Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - String dollar = "$" - - command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - # Extract our data sources: - echo "Extracting data sources zip file..." - mkdir datasources_dir - tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1 - DATA_SOURCES_FOLDER="$PWD/datasources_dir" - - # Handle gnomAD: - if ${use_gnomad} ; then - echo "Enabling gnomAD..." - for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do - if [[ -f ${dollar}{DATA_SOURCES_FOLDER}/${dollar}{potential_gnomad_gz} ]] ; then - cd ${dollar}{DATA_SOURCES_FOLDER} - tar -zvxf ${dollar}{potential_gnomad_gz} - cd - - else - echo "ERROR: Cannot find gnomAD folder: ${dollar}{potential_gnomad_gz}" 1>&2 - false - fi - done - fi - - # Run Funcotator: - gatk --java-options "-Xmx${command_mem}m" Funcotator \ - --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ${reference_version} \ - --output-file-format ${output_format} \ - -R ${ref_fasta} \ - -V ${input_vcf} \ - -O ${output_file} \ - ${interval_list_arg} ${default="" interval_list} \ - --annotation-default normal_barcode:${default="Unknown" control_id} \ - --annotation-default tumor_barcode:${default="Unknown" case_id} \ - --annotation-default Center:${default="Unknown" sequencing_center} \ - --annotation-default source:${default="Unknown" sequence_source} \ - ${"--transcript-selection-mode " + transcript_selection_mode} \ - ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \ - ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \ - ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \ - ${excluded_fields_args}${default="" sep=" --exclude-field " 
funcotator_excluded_fields} \ - ${filter_funcotations_args} \ - ${extra_args_arg} - # Make sure we have a placeholder index for MAF files so this workflow doesn't fail: - if [[ "${output_format}" == "MAF" ]] ; then - touch ${output_maf_index} - fi - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 20 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File funcotated_output_file = "${output_file}" - File funcotated_output_file_index = "${output_file_index}" - } - } \ No newline at end of file diff --git a/scripts/mutect2_wdl/mutect2_pon.wdl b/scripts/mutect2_wdl/mutect2_pon.wdl index a3a6a15afeb..befd82674d7 100644 --- a/scripts/mutect2_wdl/mutect2_pon.wdl +++ b/scripts/mutect2_wdl/mutect2_pon.wdl @@ -1,3 +1,5 @@ +version 1.0 + # Create a Mutect2 panel of normals # # Description of inputs @@ -9,17 +11,17 @@ # m2_extra_args: additional command line parameters for Mutect2. This should not involve --max-mnp-distance, # which the wdl hard-codes to 0 because GenpmicsDBImport can't handle MNPs -import "mutect2_nio.wdl" as m2 +import "mutect2.wdl" as m2 workflow Mutect2_Panel { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - Array[String] normal_bams - Array[String] normal_bais - String gnomad + input { + File? intervals + File ref_fasta + File ref_fai + File ref_dict + Int scatter_count + Array[String] normal_bams + Array[String] normal_bais + String gnomad String? m2_extra_args String? create_pon_extra_args Boolean? compress @@ -27,7 +29,6 @@ workflow Mutect2_Panel { Int? min_contig_size Int? num_contigs - Int contig_size = select_first([min_contig_size, 1000000]) File? gatk_override @@ -35,6 +36,9 @@ workflow Mutect2_Panel { String gatk_docker Int? 
preemptible_attempts Int? max_retries + } + + Int contig_size = select_first([min_contig_size, 1000000]) scatter (normal_bam in zip(normal_bams, normal_bais)) { call m2.Mutect2 { @@ -43,8 +47,8 @@ workflow Mutect2_Panel { ref_fasta = ref_fasta, ref_fai = ref_fai, ref_dict = ref_dict, - tumor_bam = normal_bam.left, - tumor_bai = normal_bam.right, + tumor_reads = normal_bam.left, + tumor_reads_index = normal_bam.right, scatter_count = scatter_count, m2_extra_args = select_first([m2_extra_args, ""]) + "--max-mnp-distance 0", gatk_override = gatk_override, @@ -97,43 +101,44 @@ workflow Mutect2_Panel { output { File pon = MergeVCFs.merged_vcf - File pon_idx = MergeVCFs.merged_vcf_index + File pon_idx = MergeVCFs.merged_vcf_idx Array[File] normal_calls = Mutect2.filtered_vcf - Array[File] normal_calls_idx = Mutect2.filtered_vcf_index + Array[File] normal_calls_idx = Mutect2.filtered_vcf_idx } } task CreatePanel { - # inputs - File intervals - Array[String] input_vcfs - File ref_fasta - File ref_fai - File ref_dict - String output_vcf_name - String gnomad - String? create_pon_extra_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? max_retries - Int? disk_space + input { + File intervals + Array[String] input_vcfs + File ref_fasta + File ref_fai + File ref_dict + String output_vcf_name + String gnomad + String? create_pon_extra_args + + File? gatk_override + + # runtime + String gatk_docker + Int? mem + Int? preemptible_attempts + Int? max_retries + Int? 
disk_space + } Int machine_mem = select_first([mem, 8]) Int command_mem = machine_mem - 1 command { set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - gatk GenomicsDBImport --genomicsdb-workspace-path pon_db -R ${ref_fasta} -V ${sep=' -V ' input_vcfs} -L ${intervals} + gatk GenomicsDBImport --genomicsdb-workspace-path pon_db -R ~{ref_fasta} -V ~{sep=' -V ' input_vcfs} -L ~{intervals} - gatk --java-options "-Xmx${command_mem}g" CreateSomaticPanelOfNormals -R ${ref_fasta} --germline-resource ${gnomad} \ - -V gendb://pon_db -O ${output_vcf_name}.vcf ${create_pon_extra_args} + gatk --java-options "-Xmx~{command_mem}g" CreateSomaticPanelOfNormals -R ~{ref_fasta} --germline-resource ~{gnomad} \ + -V gendb://pon_db -O ~{output_vcf_name}.vcf ~{create_pon_extra_args} } runtime { @@ -146,7 +151,7 @@ task CreatePanel { } output { - File output_vcf = "${output_vcf_name}.vcf" - File output_vcf_index = "${output_vcf_name}.vcf.idx" + File output_vcf = "~{output_vcf_name}.vcf" + File output_vcf_index = "~{output_vcf_name}.vcf.idx" } -} \ No newline at end of file +}