Improved consistency of style and input/output validation across CNV …

…tools. (#4825)
broadinstitute · Mar 4, 2019 · f8c68e7 · f8c68e7
1 parent ffa96a4
commit f8c68e7
Show file tree

Hide file tree

Showing 41 changed files with 644 additions and 524 deletions.
diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -301,29 +301,39 @@ task ScatterIntervals {
 
     String base_filename = basename(interval_list, ".interval_list")
 
+    String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819
+
     command <<<
         set -e
+        # IntervalListTools will fail if the output directory does not exist, so we create it
         mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
-
-        {
-            >&2 echo "Attempting to run IntervalListTools..."
+
+        # IntervalListTools behaves differently when scattering to a ssingle or multiple shards, so we do some handling in bash
+
+        # IntervalListTools puts remainder intervals in the last shard, so integer division gives the number of shards
+        # (unless NUM_INTERVALS < num_intervals_per_scatter and NUM_SCATTERS = 0, in which case we still want a single shard)
+        NUM_INTERVALS=${dollar}(grep -v '@' ${interval_list} | wc -l)
+        NUM_SCATTERS=${dollar}(echo ${dollar}((NUM_INTERVALS / ${num_intervals_per_scatter})))
+
+        if [ $NUM_SCATTERS -le 1 ]; then
+            # if only a single shard is required, then we can just rename the original interval list
+            >&2 echo "Not running IntervalListTools because only a single shard is required. Copying original interval list..."
+            cp ${interval_list} ${output_dir_}/${base_filename}.scattered.0001.interval_list
+        else
             gatk --java-options "-Xmx${command_mem_mb}m" IntervalListTools \
                 --INPUT ${interval_list} \
                 --SUBDIVISION_MODE INTERVAL_COUNT \
                 --SCATTER_CONTENT ${num_intervals_per_scatter} \
-                --OUTPUT ${output_dir_} &&
+                --OUTPUT ${output_dir_}
+
             # output files are named output_dir_/temp_0001_of_N/scattered.interval_list, etc. (N = number of scatters);
-            # we rename them as output_dir_/base_filename.scattered.0000.interval_list, etc.
-            ls ${output_dir_}/*/scattered.interval_list | \
+            # we rename them as output_dir_/base_filename.scattered.0001.interval_list, etc.
+            ls -v ${output_dir_}/*/scattered.interval_list | \
                 cat -n | \
                 while read n filename; do mv $filename ${output_dir_}/${base_filename}.scattered.$(printf "%04d" $n).interval_list; done
             rm -rf ${output_dir_}/temp_*_of_*
-        } || {
-            # if only a single shard is required, then we can just rename the original interval list
-            >&2 echo "IntervalListTools failed because only a single shard is required. Copying original interval list..."
-            cp ${interval_list} ${output_dir_}/${base_filename}.scattered.1.interval_list
-        }
+        fi
     >>>
 
     runtime {

diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
@@ -273,7 +273,6 @@ task DetermineGermlineContigPloidyCaseMode {
 
     command <<<
         set -e
-        mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
         export MKL_NUM_THREADS=${default=8 cpu}
         export OMP_NUM_THREADS=${default=8 cpu}
@@ -369,7 +368,6 @@ task GermlineCNVCallerCaseMode {
 
     command <<<
         set -e
-        mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
         export MKL_NUM_THREADS=${default=8 cpu}
         export OMP_NUM_THREADS=${default=8 cpu}

diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
@@ -374,7 +374,6 @@ task DetermineGermlineContigPloidyCohortMode {
 
     command <<<
         set -e
-        mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
         export MKL_NUM_THREADS=${default=8 cpu}
         export OMP_NUM_THREADS=${default=8 cpu}
@@ -482,7 +481,6 @@ task GermlineCNVCallerCohortMode {
 
     command <<<
         set -e
-        mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
         export MKL_NUM_THREADS=${default=8 cpu}
         export OMP_NUM_THREADS=${default=8 cpu}

diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
@@ -597,7 +597,6 @@ task ModelSegments {
 
     command <<<
         set -e
-        mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
 
         gatk --java-options "-Xmx${command_mem_mb}m" ModelSegments \
@@ -727,7 +726,6 @@ task PlotDenoisedCopyRatios {
 
     command <<<
         set -e
-        mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
 
         gatk --java-options "-Xmx${command_mem_mb}m" PlotDenoisedCopyRatios \
@@ -787,7 +785,6 @@ task PlotModeledSegments {
 
     command <<<
         set -e
-        mkdir ${output_dir_}
         export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
 
         gatk --java-options "-Xmx${command_mem_mb}m" PlotModeledSegments \

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java
@@ -7,7 +7,6 @@
 import org.apache.commons.collections4.IteratorUtils;
 import org.apache.commons.lang3.tuple.Pair;
 import org.broadinstitute.barclay.argparser.Argument;
-import org.broadinstitute.barclay.argparser.BetaFeature;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
@@ -157,7 +156,7 @@ public boolean requiresIntervals() {
 
     @Override
     public void onTraversalStart() {
-        CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
+        validateArguments();
 
         logger.info("Loading intervals for annotation...");
         sequenceDictionary = getBestAvailableSequenceDictionary();
@@ -191,6 +190,11 @@ public void onTraversalStart() {
         logger.info("Annotating intervals...");
     }
 
+    private void validateArguments() {
+        CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
+        CopyNumberArgumentValidationUtils.validateOutputFiles(outputAnnotatedIntervalsFile);
+    }
+
     private static void checkForOverlaps(final FeatureManager featureManager,
                                          final FeatureInput<BEDFeature> featureTrackPath,
                                          final SAMSequenceDictionary sequenceDictionary) {
@@ -229,8 +233,12 @@ public void traverse() {
     public Object onTraversalSuccess() {
         reference.close();
         features.close();
-        logger.info(String.format("Writing annotated intervals to %s...", outputAnnotatedIntervalsFile));
+
+        logger.info(String.format("Writing annotated intervals to %s...", outputAnnotatedIntervalsFile.getAbsolutePath()));
         annotatedIntervals.write(outputAnnotatedIntervalsFile);
+
+        logger.info("AnnotateIntervals complete.");
+
         return super.onTraversalSuccess();
     }
 

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java
@@ -3,12 +3,12 @@
 import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.io.FilenameUtils;
 import org.broadinstitute.barclay.argparser.Argument;
-import org.broadinstitute.barclay.argparser.BetaFeature;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
+import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
 import org.broadinstitute.hellbender.tools.copynumber.caller.SimpleCopyRatioCaller;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledLegacySegmentCollection;
@@ -56,6 +56,16 @@
  *         a row specifying the column headers contained in {@link CalledCopyRatioSegmentCollection.CalledCopyRatioSegmentTableColumn},
  *         and the corresponding entry rows.
  *     </li>
+ *     <li>
+ *         CBS-formatted .igv.seg file (compatible with IGV).
+ *         This is a tab-separated values (TSV) file with CBS-format column headers
+ *         (see <a href="http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide#CBS">
+ *             http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide#CBS</a>)
+ *         and the corresponding entry rows that can be plotted using IGV (see
+ *         <a href="https://software.broadinstitute.org/software/igv/SEG">
+ *             https://software.broadinstitute.org/software/igv/SEG</a>).
+ *         The mean log2 copy ratio is given in the SEGMENT_MEAN column.
+ *     </li>
  * </ul>
  *
  * <h3>Usage example</h3>
@@ -131,29 +141,48 @@ public final class CallCopyRatioSegments extends CommandLineProgram {
     )
     private double callingCopyRatioZScoreThreshold = 2.;
 
+    private File outputCalledLegacySegmentsFile;
+
     @Override
     protected Object doWork() {
-        Utils.validateArg(neutralSegmentCopyRatioLowerBound < neutralSegmentCopyRatioUpperBound,
-                "Copy-neutral lower bound must be less than upper bound.");
+        validateArguments();
 
         final CopyRatioSegmentCollection copyRatioSegments = new CopyRatioSegmentCollection(inputCopyRatioSegmentsFile);
         final CalledCopyRatioSegmentCollection calledCopyRatioSegments =
                 new SimpleCopyRatioCaller(copyRatioSegments,
                         neutralSegmentCopyRatioLowerBound, neutralSegmentCopyRatioUpperBound,
                         outlierNeutralSegmentCopyRatioZScoreThreshold, callingCopyRatioZScoreThreshold)
                         .makeCalls();
+
+        logger.info(String.format("Writing called segments to %s...", outputCalledCopyRatioSegmentsFile.getAbsolutePath()));
         calledCopyRatioSegments.write(outputCalledCopyRatioSegmentsFile);
 
         // Write an IGV compatible collection
         final CalledLegacySegmentCollection legacySegmentCollection = createCalledLegacySegmentCollection(calledCopyRatioSegments);
-        legacySegmentCollection.write(createCalledLegacyOutputFilename(outputCalledCopyRatioSegmentsFile));
+        logger.info(String.format("Writing called segments in IGV-compatible format to %s...", outputCalledLegacySegmentsFile.getAbsolutePath()));
+        legacySegmentCollection.write(outputCalledLegacySegmentsFile);
 
-        return "SUCCESS";
+        logger.info("CallCopyRatioSegments complete.");
+
+        return null;
+    }
+
+    private void validateArguments() {
+        Utils.validateArg(neutralSegmentCopyRatioLowerBound < neutralSegmentCopyRatioUpperBound,
+                String.format("Copy-neutral lower bound (%s) must be less than upper bound (%s).",
+                        NEUTRAL_SEGMENT_COPY_RATIO_LOWER_BOUND_LONG_NAME,
+                        NEUTRAL_SEGMENT_COPY_RATIO_UPPER_BOUND_LONG_NAME));
+
+        CopyNumberArgumentValidationUtils.validateInputs(inputCopyRatioSegmentsFile);
+        outputCalledLegacySegmentsFile = createCalledLegacySegmentsFile(outputCalledCopyRatioSegmentsFile);
+        CopyNumberArgumentValidationUtils.validateOutputFiles(
+                outputCalledCopyRatioSegmentsFile,
+                outputCalledLegacySegmentsFile);
     }
 
     @VisibleForTesting
-    public static File createCalledLegacyOutputFilename(final File calledCopyRatioBaseFilename) {
-        return new File(FilenameUtils.removeExtension(calledCopyRatioBaseFilename.getAbsolutePath()) + LEGACY_SEGMENTS_FILE_SUFFIX);
+    static File createCalledLegacySegmentsFile(final File calledCopyRatioSegmentsFile) {
+        return new File(FilenameUtils.removeExtension(calledCopyRatioSegmentsFile.getAbsolutePath()) + LEGACY_SEGMENTS_FILE_SUFFIX);
     }
 
     private static CalledLegacySegmentCollection createCalledLegacySegmentCollection(final CalledCopyRatioSegmentCollection segments) {

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java
@@ -2,8 +2,6 @@
 
 import com.google.common.collect.ImmutableList;
 import htsjdk.samtools.SAMSequenceDictionary;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
@@ -80,11 +78,7 @@
 )
 @DocumentedFeature
 public final class CollectAllelicCounts extends LocusWalker {
-    private static final Logger logger = LogManager.getLogger(CollectAllelicCounts.class);
-
-    static final String MINIMUM_BASE_QUALITY_LONG_NAME = "minimum-base-quality";
-
-    static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 30;
+    private static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 30;
     static final int DEFAULT_MINIMUM_BASE_QUALITY = 20;
 
     static final List<ReadFilter> DEFAULT_ADDITIONAL_READ_FILTERS = ImmutableList.of(
@@ -93,6 +87,8 @@ public final class CollectAllelicCounts extends LocusWalker {
             ReadFilterLibrary.NOT_DUPLICATE,
             new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));
 
+    public static final String MINIMUM_BASE_QUALITY_LONG_NAME = "minimum-base-quality";
+
     @Argument(
             doc = "Output file for allelic counts.",
             fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
@@ -134,20 +130,31 @@ public List<ReadFilter> getDefaultReadFilters() {
 
     @Override
     public void onTraversalStart() {
+        validateArguments();
+
         final SampleLocatableMetadata metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
         final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
+        //this check is currently redundant, since the master dictionary is taken from the reads;
+        //however, if any other dictionary is added in the future, such a check should be performed
         if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
             logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
         }
         allelicCountCollector = new AllelicCountCollector(metadata);
         logger.info("Collecting allelic counts...");
     }
 
+    private void validateArguments() {
+        CopyNumberArgumentValidationUtils.validateOutputFiles(outputAllelicCountsFile);
+    }
+
     @Override
     public Object onTraversalSuccess() {
+        logger.info(String.format("Writing allelic counts to %s...", outputAllelicCountsFile.getAbsolutePath()));
         allelicCountCollector.getAllelicCounts().write(outputAllelicCountsFile);
-        logger.info("Allelic counts written to " + outputAllelicCountsFile);
-        return("SUCCESS");
+
+        logger.info("CollectAllelicCounts complete.");
+
+        return null;
     }
 
     @Override

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsSpark.java
@@ -1,18 +1,18 @@
 package org.broadinstitute.hellbender.tools.copynumber;
 
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
+import htsjdk.samtools.SAMSequenceDictionary;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.broadcast.Broadcast;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
+import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.spark.LocusWalkerContext;
 import org.broadinstitute.hellbender.engine.spark.LocusWalkerSpark;
+import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
 import org.broadinstitute.hellbender.tools.copynumber.datacollection.AllelicCountCollector;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.Metadata;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.MetadataUtils;
@@ -29,16 +29,13 @@
  * See {@link CollectAllelicCounts}.  This behaves the same, except that it supports spark.
  */
 @CommandLineProgramProperties(
-        summary = "Collects ref/alt counts at sites",
-        oneLineSummary = "Collects ref/alt counts at sites",
-        programGroup = CopyNumberProgramGroup.class
+        summary = "Collects reference and alternate allele counts at specified sites",
+        oneLineSummary = "Collects reference and alternate allele counts at specified sites",
+        programGroup = CoverageAnalysisProgramGroup.class
 )
 public class CollectAllelicCountsSpark extends LocusWalkerSpark {
-
     private static final long serialVersionUID = 1L;
 
-    private static final Logger logger = LogManager.getLogger(CollectAllelicCounts.class);
-
     @Argument(
             doc = "Output file for allelic counts.",
             fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
@@ -72,13 +69,29 @@ public List<ReadFilter> getDefaultReadFilters() {
 
     @Override
     protected void processAlignments(JavaRDD<LocusWalkerContext> rdd, JavaSparkContext ctx) {
+        validateArguments();
+
         final SampleLocatableMetadata metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
+        final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
+        //this check is currently redundant, since the master dictionary is taken from the reads;
+        //however, if any other dictionary is added in the future, such a check should be performed
+        if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
+            logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
+        }
         final Broadcast<SampleLocatableMetadata> sampleMetadataBroadcast = ctx.broadcast(metadata);
 
         final AllelicCountCollector finalAllelicCountCollector =
                 rdd.mapPartitions(distributedCount(sampleMetadataBroadcast, minimumBaseQuality))
                 .reduce((a1, a2) -> combineAllelicCountCollectors(a1, a2, sampleMetadataBroadcast.getValue()));
+
+        logger.info(String.format("Writing allelic counts to %s...", outputAllelicCountsFile.getAbsolutePath()));
         finalAllelicCountCollector.getAllelicCounts().write(outputAllelicCountsFile);
+
+        logger.info("CollectAllelicCountsSpark complete.");
+    }
+
+    private void validateArguments() {
+        CopyNumberArgumentValidationUtils.validateOutputFiles(outputAllelicCountsFile);
     }
 
     private static FlatMapFunction<Iterator<LocusWalkerContext>, AllelicCountCollector> distributedCount(final Broadcast<SampleLocatableMetadata> sampleMetadataBroadcast,