Skip to content

Commit

Permalink
Added some fixes for minor CNV issues. (#5699)
Browse files Browse the repository at this point in the history
* Cleaned up intermediate files in gCNV WDL and fixed miscellaneous typos. (#5382)

* Added output of MAD values as floats in somatic CNV WDL. (#5591)

* Exposed boot disk space for Oncotator in somatic CNV WDL. (#3566)

* Added check to skip outlier truncation if number of matrix elements exceeds Integer.MAX_VALUE in CreateReadCountPanelOfNormals. (#4734)

* Miscellaneous boy scout activities.

* Fixed some issues concerning intervals in DetermineGermlineContigPloidy documentation.

* Fixed non-kebab-case argument in CollectAllelicCountsSpark and other minor issues.

* Improved consistency of style and input/output validation across CNV tools. (#4825)
  • Loading branch information
samuelklee authored Mar 12, 2019
1 parent 913f24d commit a6c89de
Show file tree
Hide file tree
Showing 54 changed files with 804 additions and 661 deletions.
52 changes: 35 additions & 17 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,11 @@ task CollectAllelicCounts {
}
}

# Users should consult the IntervalListTools documentation and/or manually inspect the results of this task
# to ensure that the number of intervals in each shard is as desired, as the interval-subdivision logic
# used by IntervalListTools can yield shards containing more intervals than num_intervals_per_scatter.
# Depending on their use case, users may want to modify this task to instead use the SCATTER_COUNT option of
# IntervalListTools, which allows the number of shards to be directly specified.
task ScatterIntervals {
File interval_list
Int num_intervals_per_scatter
Expand All @@ -301,28 +306,40 @@ task ScatterIntervals {

String base_filename = basename(interval_list, ".interval_list")

String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819
command <<<
set -e
# IntervalListTools will fail if the output directory does not exist, so we create it
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

{
>&2 echo "Attempting to run IntervalListTools..."

# IntervalListTools behaves differently when scattering to a single or multiple shards, so we do some handling in bash

# IntervalListTools tries to equally divide intervals across shards to give at least INTERVAL_COUNT in each and
# puts remainder intervals in the last shard, so integer division gives the number of shards
        # (unless NUM_INTERVALS < num_intervals_per_scatter, in which case NUM_SCATTERS = 0 but we still want a single shard)
NUM_INTERVALS=${dollar}(grep -v '@' ${interval_list} | wc -l)
NUM_SCATTERS=${dollar}(echo ${dollar}((NUM_INTERVALS / ${num_intervals_per_scatter})))

if [ $NUM_SCATTERS -le 1 ]; then
# if only a single shard is required, then we can just rename the original interval list
>&2 echo "Not running IntervalListTools because only a single shard is required. Copying original interval list..."
cp ${interval_list} ${output_dir_}/${base_filename}.scattered.0001.interval_list
else
gatk --java-options "-Xmx${command_mem_mb}m" IntervalListTools \
--INPUT ${interval_list} \
--SUBDIVISION_MODE INTERVAL_COUNT \
--SCATTER_CONTENT ${num_intervals_per_scatter} \
--OUTPUT ${output_dir_} &&
# output files are named output_dir_/temp_0001_of_N/scattered.interval_list, etc. (N = num_intervals_per_scatter);
# we rename them as output_dir_/base_filename.scattered.0000.interval_list, etc.
ls ${output_dir_}/*/scattered.interval_list | \
--OUTPUT ${output_dir_}

# output files are named output_dir_/temp_0001_of_N/scattered.interval_list, etc. (N = number of scatters);
# we rename them as output_dir_/base_filename.scattered.0001.interval_list, etc.
ls -v ${output_dir_}/*/scattered.interval_list | \
cat -n | \
while read n filename; do mv $filename ${output_dir_}/${base_filename}.scattered.$(printf "%04d" $n).interval_list; done
} || {
# if only a single shard is required, then we can just rename the original interval list
>&2 echo "IntervalListTools failed because only a single shard is required. Copying original interval list..."
cp ${interval_list} ${output_dir_}/${base_filename}.scattered.1.interval_list
}
rm -rf ${output_dir_}/temp_*_of_*
fi
>>>

runtime {
Expand Down Expand Up @@ -405,21 +422,22 @@ task PostprocessGermlineCNVCalls {
model_args="$model_args --model-shard-path MODEL_$index"
done

mkdir extracted-contig-ploidy-calls
tar xzf ${contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls
mkdir contig-ploidy-calls
tar xzf ${contig_ploidy_calls_tar} -C contig-ploidy-calls

gatk --java-options "-Xmx${command_mem_mb}m" PostprocessGermlineCNVCalls \
$calls_args \
$model_args \
${sep=" " allosomal_contigs_args} \
--autosomal-ref-copy-number ${ref_copy_number_autosomal_contigs} \
--contig-ploidy-calls extracted-contig-ploidy-calls \
--contig-ploidy-calls contig-ploidy-calls \
--sample-index ${sample_index} \
--output-genotyped-intervals ${genotyped_intervals_vcf_filename} \
--output-genotyped-segments ${genotyped_segments_vcf_filename}

rm -r CALLS_*
rm -r MODEL_*
rm -rf CALLS_*
rm -rf MODEL_*
rm -rf contig-ploidy-calls
>>>

runtime {
Expand Down
19 changes: 11 additions & 8 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -273,24 +273,25 @@ task DetermineGermlineContigPloidyCaseMode {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}

mkdir input-contig-ploidy-model
tar xzf ${contig_ploidy_model_tar} -C input-contig-ploidy-model
mkdir contig-ploidy-model
tar xzf ${contig_ploidy_model_tar} -C contig-ploidy-model

gatk --java-options "-Xmx${command_mem_mb}m" DetermineGermlineContigPloidy \
--input ${sep=" --input " read_count_files} \
--model input-contig-ploidy-model \
--model contig-ploidy-model \
--output ${output_dir_} \
--output-prefix case \
--verbosity DEBUG \
--mapping-error-rate ${default="0.01" mapping_error_rate} \
--sample-psi-scale ${default="0.0001" sample_psi_scale}

tar czf case-contig-ploidy-calls.tar.gz -C ${output_dir_}/case-calls .

rm -rf contig-ploidy-model
>>>

runtime {
Expand Down Expand Up @@ -367,21 +368,20 @@ task GermlineCNVCallerCaseMode {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}

mkdir contig-ploidy-calls-dir
tar xzf ${contig_ploidy_calls_tar} -C contig-ploidy-calls-dir
mkdir contig-ploidy-calls
tar xzf ${contig_ploidy_calls_tar} -C contig-ploidy-calls

mkdir gcnv-model
tar xzf ${gcnv_model_tar} -C gcnv-model

gatk --java-options "-Xmx${command_mem_mb}m" GermlineCNVCaller \
--run-mode CASE \
--input ${sep=" --input " read_count_files} \
--contig-ploidy-calls contig-ploidy-calls-dir \
--contig-ploidy-calls contig-ploidy-calls \
--model gcnv-model \
--output ${output_dir_} \
--output-prefix case \
Expand Down Expand Up @@ -425,6 +425,9 @@ task GermlineCNVCallerCaseMode {
tar czf case-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done

rm -rf contig-ploidy-calls
rm -rf gcnv-model
>>>

runtime {
Expand Down
10 changes: 5 additions & 5 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,6 @@ task DetermineGermlineContigPloidyCohortMode {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}
Expand Down Expand Up @@ -482,19 +481,18 @@ task GermlineCNVCallerCohortMode {
command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}

mkdir contig-ploidy-calls-dir
tar xzf ${contig_ploidy_calls_tar} -C contig-ploidy-calls-dir
mkdir contig-ploidy-calls
tar xzf ${contig_ploidy_calls_tar} -C contig-ploidy-calls

gatk --java-options "-Xmx${command_mem_mb}m" GermlineCNVCaller \
--run-mode COHORT \
-L ${intervals} \
--input ${sep=" --input " read_count_files} \
--contig-ploidy-calls contig-ploidy-calls-dir \
--contig-ploidy-calls contig-ploidy-calls \
${"--annotated-intervals " + annotated_intervals} \
--interval-merging-rule OVERLAPPING_ONLY \
--output ${output_dir_} \
Expand Down Expand Up @@ -549,6 +547,8 @@ task GermlineCNVCallerCohortMode {
tar czf ${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done

rm -rf contig-ploidy-calls
>>>

runtime {
Expand Down
5 changes: 4 additions & 1 deletion scripts/cnv_wdl/somatic/cnv_somatic_oncotator_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ workflow CNVOncotatorWorkflow {
String? additional_args
String? oncotator_docker
Int? mem_gb_for_oncotator
Int? boot_disk_space_gb_for_oncotator
Int? preemptible_attempts

call OncotateSegments {
Expand All @@ -19,6 +20,7 @@ workflow CNVOncotatorWorkflow {
additional_args = additional_args,
oncotator_docker = oncotator_docker,
mem_gb = mem_gb_for_oncotator,
boot_disk_space_gb = boot_disk_space_gb_for_oncotator,
preemptible_attempts = preemptible_attempts
}

Expand All @@ -36,6 +38,7 @@ task OncotateSegments {
String? oncotator_docker
Int? mem_gb
Int? disk_space_gb
Int? boot_disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
Expand Down Expand Up @@ -69,7 +72,7 @@ task OncotateSegments {
disks: "local-disk " + select_first([disk_space_gb, 50]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 2])
bootDiskSizeGb: 50
bootDiskSizeGb: select_first([boot_disk_space_gb, 20])
}

output {
Expand Down
17 changes: 14 additions & 3 deletions scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ workflow CNVSomaticPairWorkflow {
String? additional_args_for_oncotator
String? oncotator_docker
Int? mem_gb_for_oncotator
Int? boot_disk_space_gb_for_oncotator

Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_dict, "GB") + size(ref_fasta_fai, "GB"))
Int read_count_pon_size = ceil(size(read_count_pon, "GB"))
Expand Down Expand Up @@ -427,6 +428,7 @@ workflow CNVSomaticPairWorkflow {
additional_args = additional_args_for_oncotator,
oncotator_docker = oncotator_docker,
mem_gb_for_oncotator = mem_gb_for_oncotator,
boot_disk_space_gb_for_oncotator = boot_disk_space_gb_for_oncotator,
preemptible_attempts = preemptible_attempts
}
}
Expand Down Expand Up @@ -456,9 +458,13 @@ workflow CNVSomaticPairWorkflow {
File denoised_copy_ratios_plot_tumor = PlotDenoisedCopyRatiosTumor.denoised_copy_ratios_plot
File denoised_copy_ratios_lim_4_plot_tumor = PlotDenoisedCopyRatiosTumor.denoised_copy_ratios_lim_4_plot
File standardized_MAD_tumor = PlotDenoisedCopyRatiosTumor.standardized_MAD
Float standardized_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.standardized_MAD_value
File denoised_MAD_tumor = PlotDenoisedCopyRatiosTumor.denoised_MAD
Float denoised_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.denoised_MAD_value
File delta_MAD_tumor = PlotDenoisedCopyRatiosTumor.delta_MAD
Float delta_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.delta_MAD_value
File scaled_delta_MAD_tumor = PlotDenoisedCopyRatiosTumor.scaled_delta_MAD
Float scaled_delta_MAD_value_tumor = PlotDenoisedCopyRatiosTumor.scaled_delta_MAD_value
File modeled_segments_plot_tumor = PlotModeledSegmentsTumor.modeled_segments_plot

File? read_counts_entity_id_normal = CollectCountsNormal.entity_id
Expand All @@ -483,9 +489,13 @@ workflow CNVSomaticPairWorkflow {
File? denoised_copy_ratios_plot_normal = PlotDenoisedCopyRatiosNormal.denoised_copy_ratios_plot
File? denoised_copy_ratios_lim_4_plot_normal = PlotDenoisedCopyRatiosNormal.denoised_copy_ratios_lim_4_plot
File? standardized_MAD_normal = PlotDenoisedCopyRatiosNormal.standardized_MAD
Float? standardized_MAD_value_normal = PlotDenoisedCopyRatiosNormal.standardized_MAD_value
File? denoised_MAD_normal = PlotDenoisedCopyRatiosNormal.denoised_MAD
Float? denoised_MAD_value_normal = PlotDenoisedCopyRatiosNormal.denoised_MAD_value
File? delta_MAD_normal = PlotDenoisedCopyRatiosNormal.delta_MAD
Float? delta_MAD_value_normal = PlotDenoisedCopyRatiosNormal.delta_MAD_value
File? scaled_delta_MAD_normal = PlotDenoisedCopyRatiosNormal.scaled_delta_MAD
Float? scaled_delta_MAD_value_normal = PlotDenoisedCopyRatiosNormal.scaled_delta_MAD_value
File? modeled_segments_plot_normal = PlotModeledSegmentsNormal.modeled_segments_plot

File oncotated_called_file_tumor = select_first([CNVOncotatorWorkflow.oncotated_called_file, "null"])
Expand Down Expand Up @@ -587,7 +597,6 @@ task ModelSegments {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

gatk --java-options "-Xmx${command_mem_mb}m" ModelSegments \
Expand Down Expand Up @@ -717,7 +726,6 @@ task PlotDenoisedCopyRatios {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

gatk --java-options "-Xmx${command_mem_mb}m" PlotDenoisedCopyRatios \
Expand All @@ -741,9 +749,13 @@ task PlotDenoisedCopyRatios {
File denoised_copy_ratios_plot = "${output_dir_}/${entity_id}.denoised.png"
File denoised_copy_ratios_lim_4_plot = "${output_dir_}/${entity_id}.denoisedLimit4.png"
File standardized_MAD = "${output_dir_}/${entity_id}.standardizedMAD.txt"
Float standardized_MAD_value = read_float(standardized_MAD)
File denoised_MAD = "${output_dir_}/${entity_id}.denoisedMAD.txt"
Float denoised_MAD_value = read_float(denoised_MAD)
File delta_MAD = "${output_dir_}/${entity_id}.deltaMAD.txt"
Float delta_MAD_value = read_float(delta_MAD)
File scaled_delta_MAD = "${output_dir_}/${entity_id}.scaledDeltaMAD.txt"
Float scaled_delta_MAD_value = read_float(scaled_delta_MAD)
}
}

Expand Down Expand Up @@ -773,7 +785,6 @@ task PlotModeledSegments {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

gatk --java-options "-Xmx${command_mem_mb}m" PlotModeledSegments \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ public VariantContextWriter createVCFWriter(final Path outPath) {

if (createOutputVariantIndex) {
if (null == sequenceDictionary) {
logger.warn("An variant index will not be created - a sequence dictionary is required to create an output index");
logger.warn("A variant index will not be created - a sequence dictionary is required to create an output index");
// fall through and create without an index
} else {
options.add(Options.INDEX_ON_THE_FLY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand Down Expand Up @@ -157,7 +156,7 @@ public boolean requiresIntervals() {

@Override
public void onTraversalStart() {
CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
validateArguments();

logger.info("Loading intervals for annotation...");
sequenceDictionary = getBestAvailableSequenceDictionary();
Expand Down Expand Up @@ -191,6 +190,11 @@ public void onTraversalStart() {
logger.info("Annotating intervals...");
}

private void validateArguments() {
CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
CopyNumberArgumentValidationUtils.validateOutputFiles(outputAnnotatedIntervalsFile);
}

private static void checkForOverlaps(final FeatureManager featureManager,
final FeatureInput<BEDFeature> featureTrackPath,
final SAMSequenceDictionary sequenceDictionary) {
Expand Down Expand Up @@ -229,8 +233,12 @@ public void traverse() {
public Object onTraversalSuccess() {
reference.close();
features.close();
logger.info(String.format("Writing annotated intervals to %s...", outputAnnotatedIntervalsFile));

logger.info(String.format("Writing annotated intervals to %s...", outputAnnotatedIntervalsFile.getAbsolutePath()));
annotatedIntervals.write(outputAnnotatedIntervalsFile);

logger.info(String.format("%s complete.", getClass().getSimpleName()));

return super.onTraversalSuccess();
}

Expand Down
Loading

0 comments on commit a6c89de

Please sign in to comment.