
Commit

Updating gatk version to get adaptive pruning (#5669)
meganshand authored Feb 14, 2019
1 parent acdb7f5 commit 0595f9c
Showing 3 changed files with 75 additions and 80 deletions.
78 changes: 3 additions & 75 deletions scripts/mitochondria_m2_wdl/MitochondriaCalling.wdl
@@ -18,6 +18,7 @@ workflow MitochondriaCalling {
String? m2_extra_args
File blacklisted_sites
File blacklisted_sites_index
+ Int mean_coverage
Float? autosomal_coverage
Float contamination
#max_read_length is only used for an optimization. If it's too small CollectWgsMetrics might fail, but the results are not affected by this number.
@@ -28,18 +29,7 @@
#Optional runtime arguments
Int? preemptible_tries

-   call CollectWgsMetrics {
-     input:
-       input_bam = input_bam,
-       input_bam_index = input_bam_index,
-       ref_fasta = ref_fasta,
-       ref_fasta_index = ref_fasta_index,
-       read_length = max_read_length,
-       coverage_cap = 100000,
-       preemptible_tries = preemptible_tries
-   }
-
-   Int? M2_mem = if CollectWgsMetrics.mean_coverage > 25000 then 14 else 7
+   Int? M2_mem = if mean_coverage > 25000 then 14 else 7

call M2AndFilter {
input:
@@ -49,10 +39,6 @@
input_bam = input_bam,
input_bai = input_bam_index,
compress = false,
-       # The minimum pruning value for the assembly graph in M2 defaults to 1. This only makes sense at low depths so we
-       # need this number to scale with mean coverage. Ideally this knob should be removed once dynamic pruning has been
-       # added to M2.
-       prune = (CollectWgsMetrics.mean_coverage / 500) + 1,
max_alt_allele_count = 4,
lod = lod_cutoff,
contamination = contamination,
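The deleted comment above carries the rationale for this hunk: Mutect2's --min-pruning defaults to 1, which is only sensible at low depth, so the old workflow scaled it by integer division with mean coverage, e.g. 100x gives 100 / 500 + 1 = 1 (the default) while 5,000x gives 11. A minimal sketch of the removed arithmetic, under a hypothetical workflow name that is not part of this commit:

workflow PruneScalingSketch {
  # Hypothetical illustration of the heuristic this commit deletes.
  Int mean_coverage = 5000

  # WDL integer division: 100x -> 1, 5000x -> 11, 25000x -> 51.
  Int prune = (mean_coverage / 500) + 1

  output {
    Int min_pruning_value = prune
  }
}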
@@ -66,67 +52,11 @@
}

output {
-     File coverage_metrics = CollectWgsMetrics.metrics
-     File theoretical_sensitivity_metrics = CollectWgsMetrics.theoretical_sensitivity
-     Int mean_coverage = CollectWgsMetrics.mean_coverage
File vcf = M2AndFilter.filtered_vcf
File vcf_index = M2AndFilter.filtered_vcf_idx
}
}

- task CollectWgsMetrics {
-   File input_bam
-   File input_bam_index
-   File ref_fasta
-   File ref_fasta_index
-   Int? read_length
-   Int read_length_for_optimization = select_first([read_length, 151])
-   Int? coverage_cap
-
-   Int? preemptible_tries
-   Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB")
-   Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
-
-   meta {
-     description: "Collect coverage metrics"
-   }
-   parameter_meta {
-     read_length: "Read length used for optimization only. If this is too small CollectWgsMetrics might fail. Default is 151."
-   }
-
-   command <<<
-     set -e
-
-     java -Xms2000m -jar /usr/gitc/picard.jar \
-       CollectWgsMetrics \
-       INPUT=${input_bam} \
-       VALIDATION_STRINGENCY=SILENT \
-       REFERENCE_SEQUENCE=${ref_fasta} \
-       OUTPUT=metrics.txt \
-       USE_FAST_ALGORITHM=true \
-       READ_LENGTH=${read_length_for_optimization} \
-       ${"COVERAGE_CAP=" + coverage_cap} \
-       INCLUDE_BQ_HISTOGRAM=true \
-       THEORETICAL_SENSITIVITY_OUTPUT=theoretical_sensitivity.txt
-
-     R --vanilla <<CODE
-     df = read.table("metrics.txt",skip=6,header=TRUE,stringsAsFactors=FALSE,sep='\t',nrows=1)
-     write.table(floor(df[,"MEAN_COVERAGE"]), "mean_coverage.txt", quote=F, col.names=F, row.names=F)
-     CODE
-   >>>
-   runtime {
-     preemptible: select_first([preemptible_tries, 5])
-     memory: "3 GB"
-     disks: "local-disk " + disk_size + " HDD"
-     docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
-   }
-   output {
-     File metrics = "metrics.txt"
-     File theoretical_sensitivity = "theoretical_sensitivity.txt"
-     Int mean_coverage = read_int("mean_coverage.txt")
-   }
- }
task M2AndFilter {
File ref_fasta
File ref_fai
@@ -138,7 +68,6 @@ task M2AndFilter {
Boolean compress
File? gga_vcf
File? gga_vcf_idx
- Int prune
Float? autosomal_coverage

String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
@@ -188,7 +117,6 @@
-O raw.vcf \
${true='--bam-output bamout.bam' false='' make_bamout} \
${m2_extra_args} \
- --min-pruning ${prune} \
${"--median-autosomal-coverage " + autosomal_coverage} \
--annotation StrandBiasBySample \
--mitochondria-mode \
@@ -209,7 +137,7 @@
--mask-name "blacklisted_site"
>>>
runtime {
docker: "us.gcr.io/broad-gatk/gatk:4.0.11.0"
docker: "us.gcr.io/broad-gatk/gatk:4.1.0.0"
memory: machine_mem + " MB"
disks: "local-disk " + disk_size + " HDD"
preemptible: select_first([preemptible_tries, 5])
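The docker bump in this last hunk is the substance of the commit: GATK 4.1.0.0 adds adaptive graph pruning, which sizes the pruning threshold from local depth, so the coverage-scaled --min-pruning plumbing removed above is no longer needed. A hedged sketch of a Mutect2 task under the new image, using only flags that already appear in this workflow; the assumption (per the commit title and the GATK 4.1 release notes) is that --mitochondria-mode now prunes adaptively with no extra knob:

task Mutect2AdaptivePruningSketch {
  # Hypothetical task, not part of this commit: no --min-pruning value is
  # passed; the GATK 4.1.0.0 assembler is assumed to prune adaptively.
  File ref_fasta
  File ref_fai
  File ref_dict
  File input_bam
  File input_bai

  command <<<
    set -e
    gatk Mutect2 \
      -R ${ref_fasta} \
      -I ${input_bam} \
      -O raw.vcf \
      --annotation StrandBiasBySample \
      --mitochondria-mode
  >>>
  runtime {
    docker: "us.gcr.io/broad-gatk/gatk:4.1.0.0"
    memory: "7 GB"
    disks: "local-disk 100 HDD"
  }
  output {
    File raw_vcf = "raw.vcf"
    File raw_vcf_idx = "raw.vcf.idx"
  }
}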
77 changes: 72 additions & 5 deletions scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
@@ -19,7 +19,8 @@ workflow MitochondriaPipeline {
# Using an older version of the default Mutect LOD cutoff. This value can be changed and is only useful at low depths
# to catch sites that would not get caught by the LOD divided by depth filter.
Float lod_cutoff = 6.3
- # Read length used for optimization only. If this is too small CollectWgsMetrics might fail. Default is 151.
+ # Read length used for optimization only. If this is too small CollectWgsMetrics might fail, but the results are not
+ # affected by this number. Default is 151.
Int? max_read_length

# Full reference is only required if starting with a CRAM (BAM doesn't need these files)
@@ -114,6 +115,17 @@
preemptible_tries = preemptible_tries
}

+   call CollectWgsMetrics {
+     input:
+       input_bam = AlignToMt.mt_aligned_bam,
+       input_bam_index = AlignToMt.mt_aligned_bai,
+       ref_fasta = mt_fasta,
+       ref_fasta_index = mt_fasta_index,
+       read_length = max_read_length,
+       coverage_cap = 100000,
+       preemptible_tries = preemptible_tries
+   }
+
call GetContamination {
input:
input_bam = AlignToMt.mt_aligned_bam,
@@ -136,6 +148,7 @@
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:576-16024 ",
blacklisted_sites = blacklisted_sites,
blacklisted_sites_index = blacklisted_sites_index,
+       mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
@@ -155,6 +168,7 @@
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:8025-9144 ",
blacklisted_sites = blacklisted_sites_shifted,
blacklisted_sites_index = blacklisted_sites_shifted_index,
+       mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
@@ -199,11 +213,11 @@
File out_vcf = LiftoverAndCombineVcfs.final_vcf
File out_vcf_index = LiftoverAndCombineVcfs.final_vcf_index
File duplicate_metrics = AlignToMt.duplicate_metrics
-     File coverage_metrics = CallAndFilterMt.coverage_metrics
-     File theoretical_sensitivity_metrics = CallAndFilterMt.theoretical_sensitivity_metrics
+     File coverage_metrics = CollectWgsMetrics.metrics
+     File theoretical_sensitivity_metrics = CollectWgsMetrics.theoretical_sensitivity
File contamination_metrics = GetContamination.contamination_file
File base_level_coverage_metrics = CoverageAtEveryBase.table
-     Int mean_coverage = CallAndFilterMt.mean_coverage
+     Int mean_coverage = CollectWgsMetrics.mean_coverage
String major_haplogroup = GetContamination.major_hg
Float contamination = GetContamination.minor_level
}
@@ -278,7 +292,7 @@ task AddOriginalAlignmentTags {
runtime {
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
docker: "us.gcr.io/broad-gatk/gatk:4.0.11.0"
docker: "us.gcr.io/broad-gatk/gatk:4.1.0.0"
preemptible: select_first([preemptible_tries, 5])
}
output {
@@ -440,6 +454,59 @@
CODE
}
}

+ task CollectWgsMetrics {
+   File input_bam
+   File input_bam_index
+   File ref_fasta
+   File ref_fasta_index
+   Int? read_length
+   Int read_length_for_optimization = select_first([read_length, 151])
+   Int? coverage_cap
+
+   Int? preemptible_tries
+   Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB")
+   Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
+
+   meta {
+     description: "Collect coverage metrics"
+   }
+   parameter_meta {
+     read_length: "Read length used for optimization only. If this is too small CollectWgsMetrics might fail. Default is 151."
+   }
+
+   command <<<
+     set -e
+
+     java -Xms2000m -jar /usr/gitc/picard.jar \
+       CollectWgsMetrics \
+       INPUT=${input_bam} \
+       VALIDATION_STRINGENCY=SILENT \
+       REFERENCE_SEQUENCE=${ref_fasta} \
+       OUTPUT=metrics.txt \
+       USE_FAST_ALGORITHM=true \
+       READ_LENGTH=${read_length_for_optimization} \
+       ${"COVERAGE_CAP=" + coverage_cap} \
+       INCLUDE_BQ_HISTOGRAM=true \
+       THEORETICAL_SENSITIVITY_OUTPUT=theoretical_sensitivity.txt
+
+     R --vanilla <<CODE
+     df = read.table("metrics.txt",skip=6,header=TRUE,stringsAsFactors=FALSE,sep='\t',nrows=1)
+     write.table(floor(df[,"MEAN_COVERAGE"]), "mean_coverage.txt", quote=F, col.names=F, row.names=F)
+     CODE
+   >>>
+   runtime {
+     preemptible: select_first([preemptible_tries, 5])
+     memory: "3 GB"
+     disks: "local-disk " + disk_size + " HDD"
+     docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
+   }
+   output {
+     File metrics = "metrics.txt"
+     File theoretical_sensitivity = "theoretical_sensitivity.txt"
+     Int mean_coverage = read_int("mean_coverage.txt")
+   }
+ }
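A note on the embedded R call in the task just added: Picard metrics files begin with a six-line header block (comment lines plus a blank), so skip=6 lands read.table on the column-header row, nrows=1 reads the single metrics row, and floor() truncates MEAN_COVERAGE so that WDL's read_int() can parse the result. A hedged sketch of consuming that integer output, under a hypothetical workflow name, mirroring the wiring added earlier in this file:

workflow MeanCoverageWiringSketch {
  # Hypothetical caller, not part of this commit.
  File bam
  File bai
  File fasta
  File fai

  call CollectWgsMetrics {
    input:
      input_bam = bam,
      input_bam_index = bai,
      ref_fasta = fasta,
      ref_fasta_index = fai,
      coverage_cap = 100000
  }

  # The floored mean drives downstream sizing, as in MitochondriaCalling.
  Int m2_mem_gb = if CollectWgsMetrics.mean_coverage > 25000 then 14 else 7

  output {
    Int mean_coverage = CollectWgsMetrics.mean_coverage
  }
}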
task CoverageAtEveryBase {
File input_bam_regular_ref
File input_bam_regular_ref_index
Binary file modified scripts/mitochondria_m2_wdl/MitochondriaPipelineDependencies.zip
