diff --git a/.dockstore.yml b/.dockstore.yml
index ca5feb45..8ceacb87 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -35,3 +35,10 @@ workflows:
     authors:
       - name: Terra Scientific Services
         email: teaspoons-developers@broadinstitute.org
+
+  - subclass: WDL
+    name: SubsetVcfByBedFile
+    primaryDescriptorPath: /pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl
+    authors:
+      - name: Terra Scientific Services
+        email: teaspoons-developers@broadinstitute.org
diff --git a/pipelines/imputation/scientificValidation/README.md b/pipelines/imputation/scientificValidation/README.md
index 3cf637a6..f40fe735 100644
--- a/pipelines/imputation/scientificValidation/README.md
+++ b/pipelines/imputation/scientificValidation/README.md
@@ -13,3 +13,23 @@ This wdl is basically a wrapper around that tool/image
 
 #### Outputs
 * recombined_reference_panel - output vcf after mitigation algorithm has been run
+
+
+## SubsetVcfByBedFile
+### Purpose
+This wdl subsets a vcf down to the sites provided in a
+bed file. It deliberately does not touch headers or
+annotations: the only header that is really required is
+the sequence dictionary, which is carried across as is,
+and the imputation tool only looks at GT (not at any
+INFO/FORMAT fields), so those can be left alone without
+affecting the results.
+
+#### Inputs
+* input_vcf - input vcf to be subset
+* input_vcf_index - index for the input vcf
+* bed_file - bed file containing the intervals to subset by
+
+#### Outputs
+* subset_vcf - subsetted vcf
+* subset_vcf_index - index for the subsetted vcf
diff --git a/pipelines/imputation/scientificValidation/ReshapeReferencePanel.wdl b/pipelines/imputation/scientificValidation/ReshapeReferencePanel.wdl
index 3bbd2cc4..d059c6c5 100644
--- a/pipelines/imputation/scientificValidation/ReshapeReferencePanel.wdl
+++ b/pipelines/imputation/scientificValidation/ReshapeReferencePanel.wdl
@@ -31,7 +31,7 @@ task ReshapeReferencePanel {
 
     Int disk_size_gb = ceil(3*size(ref_panel_vcf, "GiB")) + 20
     Int cpu = 1
-    Int memory_mb = 8000
+    Int memory_mb = 6000
   }
 
   command {
diff --git a/pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl b/pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl
new file mode 100644
index 00000000..cd1fe90c
--- /dev/null
+++ b/pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl
@@ -0,0 +1,61 @@
+version 1.0
+
+# This script is under review. It is not actively tested or maintained at this time.
+workflow SubsetVcfByBedFile {
+  input {
+    File input_vcf
+    File input_vcf_index
+    File bed_file
+  }
+
+  call BcftoolsSubsetVcf {
+    input:
+      input_vcf = input_vcf,
+      input_vcf_index = input_vcf_index,
+      bed_file = bed_file
+  }
+
+  output {
+    File subset_vcf = BcftoolsSubsetVcf.output_vcf
+    File subset_vcf_index = BcftoolsSubsetVcf.output_vcf_index
+  }
+}
+
+task BcftoolsSubsetVcf {
+  input {
+    File input_vcf
+    File input_vcf_index
+    File bed_file
+
+    Int disk_size_gb = ceil(3 * size(input_vcf, "GiB")) + 20
+    Int cpu = 1
+    Int memory_mb = 6000
+  }
+
+  String basename = basename(input_vcf, '.vcf.gz')
+
+  command {
+    set -e -o pipefail
+
+    bcftools view \
+      -R ~{bed_file} \
+      -O z \
+      -o ~{basename}.subset.vcf.gz \
+      ~{input_vcf}
+
+    bcftools index -t ~{basename}.subset.vcf.gz
+
+  }
+
+  output {
+    File output_vcf = "~{basename}.subset.vcf.gz"
+    File output_vcf_index = "~{basename}.subset.vcf.gz.tbi"
+  }
+
+  runtime {
+    docker: "us.gcr.io/broad-gatk/gatk:4.5.0.0"
+    disks: "local-disk ${disk_size_gb} HDD"
+    memory: "${memory_mb} MiB"
+    cpu: cpu
+  }
+}
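
For anyone who wants to smoke-test the new workflow outside of Terra/Dockstore, a minimal local run could look like the sketch below. This assumes miniwdl and Docker are available; `subset_inputs.json`, `sample.vcf.gz`, `sample.vcf.gz.tbi`, and `regions.bed` are placeholder names, not files included in this PR.

```bash
# Hypothetical smoke test for SubsetVcfByBedFile (all paths below are placeholders).
cat > subset_inputs.json <<'EOF'
{
  "SubsetVcfByBedFile.input_vcf": "sample.vcf.gz",
  "SubsetVcfByBedFile.input_vcf_index": "sample.vcf.gz.tbi",
  "SubsetVcfByBedFile.bed_file": "regions.bed"
}
EOF

# Validate the descriptor, then run it against the inputs above.
miniwdl check pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl
miniwdl run -i subset_inputs.json \
  pipelines/imputation/scientificValidation/SubsetVcfByBedFile.wdl
```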