Skip to content

Commit

Permalink
Improved consistency of style and input/output validation across CNV …
Browse files Browse the repository at this point in the history
…tools. (#4825)
  • Loading branch information
samuelklee committed Mar 4, 2019
1 parent ffa96a4 commit f8c68e7
Show file tree
Hide file tree
Showing 41 changed files with 644 additions and 524 deletions.
32 changes: 21 additions & 11 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -301,29 +301,39 @@ task ScatterIntervals {

String base_filename = basename(interval_list, ".interval_list")

String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819
command <<<
set -e
# IntervalListTools will fail if the output directory does not exist, so we create it
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

{
>&2 echo "Attempting to run IntervalListTools..."

# IntervalListTools behaves differently when scattering to a ssingle or multiple shards, so we do some handling in bash

# IntervalListTools puts remainder intervals in the last shard, so integer division gives the number of shards
# (unless NUM_INTERVALS < num_intervals_per_scatter and NUM_SCATTERS = 0, in which case we still want a single shard)
NUM_INTERVALS=${dollar}(grep -v '@' ${interval_list} | wc -l)
NUM_SCATTERS=${dollar}(echo ${dollar}((NUM_INTERVALS / ${num_intervals_per_scatter})))

if [ $NUM_SCATTERS -le 1 ]; then
# if only a single shard is required, then we can just rename the original interval list
>&2 echo "Not running IntervalListTools because only a single shard is required. Copying original interval list..."
cp ${interval_list} ${output_dir_}/${base_filename}.scattered.0001.interval_list
else
gatk --java-options "-Xmx${command_mem_mb}m" IntervalListTools \
--INPUT ${interval_list} \
--SUBDIVISION_MODE INTERVAL_COUNT \
--SCATTER_CONTENT ${num_intervals_per_scatter} \
--OUTPUT ${output_dir_} &&
--OUTPUT ${output_dir_}

# output files are named output_dir_/temp_0001_of_N/scattered.interval_list, etc. (N = number of scatters);
# we rename them as output_dir_/base_filename.scattered.0000.interval_list, etc.
ls ${output_dir_}/*/scattered.interval_list | \
# we rename them as output_dir_/base_filename.scattered.0001.interval_list, etc.
ls -v ${output_dir_}/*/scattered.interval_list | \
cat -n | \
while read n filename; do mv $filename ${output_dir_}/${base_filename}.scattered.$(printf "%04d" $n).interval_list; done
rm -rf ${output_dir_}/temp_*_of_*
} || {
# if only a single shard is required, then we can just rename the original interval list
>&2 echo "IntervalListTools failed because only a single shard is required. Copying original interval list..."
cp ${interval_list} ${output_dir_}/${base_filename}.scattered.1.interval_list
}
fi
>>>

runtime {
Expand Down
2 changes: 0 additions & 2 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,6 @@ task DetermineGermlineContigPloidyCaseMode {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}
Expand Down Expand Up @@ -369,7 +368,6 @@ task GermlineCNVCallerCaseMode {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}
Expand Down
2 changes: 0 additions & 2 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,6 @@ task DetermineGermlineContigPloidyCohortMode {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}
Expand Down Expand Up @@ -482,7 +481,6 @@ task GermlineCNVCallerCohortMode {
command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
export MKL_NUM_THREADS=${default=8 cpu}
export OMP_NUM_THREADS=${default=8 cpu}
Expand Down
3 changes: 0 additions & 3 deletions scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,6 @@ task ModelSegments {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

gatk --java-options "-Xmx${command_mem_mb}m" ModelSegments \
Expand Down Expand Up @@ -727,7 +726,6 @@ task PlotDenoisedCopyRatios {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

gatk --java-options "-Xmx${command_mem_mb}m" PlotDenoisedCopyRatios \
Expand Down Expand Up @@ -787,7 +785,6 @@ task PlotModeledSegments {

command <<<
set -e
mkdir ${output_dir_}
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

gatk --java-options "-Xmx${command_mem_mb}m" PlotModeledSegments \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand Down Expand Up @@ -157,7 +156,7 @@ public boolean requiresIntervals() {

@Override
public void onTraversalStart() {
CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
validateArguments();

logger.info("Loading intervals for annotation...");
sequenceDictionary = getBestAvailableSequenceDictionary();
Expand Down Expand Up @@ -191,6 +190,11 @@ public void onTraversalStart() {
logger.info("Annotating intervals...");
}

private void validateArguments() {
CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
CopyNumberArgumentValidationUtils.validateOutputFiles(outputAnnotatedIntervalsFile);
}

private static void checkForOverlaps(final FeatureManager featureManager,
final FeatureInput<BEDFeature> featureTrackPath,
final SAMSequenceDictionary sequenceDictionary) {
Expand Down Expand Up @@ -229,8 +233,12 @@ public void traverse() {
public Object onTraversalSuccess() {
reference.close();
features.close();
logger.info(String.format("Writing annotated intervals to %s...", outputAnnotatedIntervalsFile));

logger.info(String.format("Writing annotated intervals to %s...", outputAnnotatedIntervalsFile.getAbsolutePath()));
annotatedIntervals.write(outputAnnotatedIntervalsFile);

logger.info("AnnotateIntervals complete.");

return super.onTraversalSuccess();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.io.FilenameUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.caller.SimpleCopyRatioCaller;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledLegacySegmentCollection;
Expand Down Expand Up @@ -56,6 +56,16 @@
* a row specifying the column headers contained in {@link CalledCopyRatioSegmentCollection.CalledCopyRatioSegmentTableColumn},
* and the corresponding entry rows.
* </li>
* <li>
* CBS-formatted .igv.seg file (compatible with IGV).
* This is a tab-separated values (TSV) file with CBS-format column headers
* (see <a href="http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide#CBS">
* http://software.broadinstitute.org/cancer/software/genepattern/file-formats-guide#CBS</a>)
* and the corresponding entry rows that can be plotted using IGV (see
* <a href="https://software.broadinstitute.org/software/igv/SEG">
* https://software.broadinstitute.org/software/igv/SEG</a>).
* The mean log2 copy ratio is given in the SEGMENT_MEAN column.
* </li>
* </ul>
*
* <h3>Usage example</h3>
Expand Down Expand Up @@ -131,29 +141,48 @@ public final class CallCopyRatioSegments extends CommandLineProgram {
)
private double callingCopyRatioZScoreThreshold = 2.;

private File outputCalledLegacySegmentsFile;

@Override
protected Object doWork() {
Utils.validateArg(neutralSegmentCopyRatioLowerBound < neutralSegmentCopyRatioUpperBound,
"Copy-neutral lower bound must be less than upper bound.");
validateArguments();

final CopyRatioSegmentCollection copyRatioSegments = new CopyRatioSegmentCollection(inputCopyRatioSegmentsFile);
final CalledCopyRatioSegmentCollection calledCopyRatioSegments =
new SimpleCopyRatioCaller(copyRatioSegments,
neutralSegmentCopyRatioLowerBound, neutralSegmentCopyRatioUpperBound,
outlierNeutralSegmentCopyRatioZScoreThreshold, callingCopyRatioZScoreThreshold)
.makeCalls();

logger.info(String.format("Writing called segments to %s...", outputCalledCopyRatioSegmentsFile.getAbsolutePath()));
calledCopyRatioSegments.write(outputCalledCopyRatioSegmentsFile);

// Write an IGV compatible collection
final CalledLegacySegmentCollection legacySegmentCollection = createCalledLegacySegmentCollection(calledCopyRatioSegments);
legacySegmentCollection.write(createCalledLegacyOutputFilename(outputCalledCopyRatioSegmentsFile));
logger.info(String.format("Writing called segments in IGV-compatible format to %s...", outputCalledLegacySegmentsFile.getAbsolutePath()));
legacySegmentCollection.write(outputCalledLegacySegmentsFile);

return "SUCCESS";
logger.info("CallCopyRatioSegments complete.");

return null;
}

private void validateArguments() {
Utils.validateArg(neutralSegmentCopyRatioLowerBound < neutralSegmentCopyRatioUpperBound,
String.format("Copy-neutral lower bound (%s) must be less than upper bound (%s).",
NEUTRAL_SEGMENT_COPY_RATIO_LOWER_BOUND_LONG_NAME,
NEUTRAL_SEGMENT_COPY_RATIO_UPPER_BOUND_LONG_NAME));

CopyNumberArgumentValidationUtils.validateInputs(inputCopyRatioSegmentsFile);
outputCalledLegacySegmentsFile = createCalledLegacySegmentsFile(outputCalledCopyRatioSegmentsFile);
CopyNumberArgumentValidationUtils.validateOutputFiles(
outputCalledCopyRatioSegmentsFile,
outputCalledLegacySegmentsFile);
}

@VisibleForTesting
public static File createCalledLegacyOutputFilename(final File calledCopyRatioBaseFilename) {
return new File(FilenameUtils.removeExtension(calledCopyRatioBaseFilename.getAbsolutePath()) + LEGACY_SEGMENTS_FILE_SUFFIX);
static File createCalledLegacySegmentsFile(final File calledCopyRatioSegmentsFile) {
return new File(FilenameUtils.removeExtension(calledCopyRatioSegmentsFile.getAbsolutePath()) + LEGACY_SEGMENTS_FILE_SUFFIX);
}

private static CalledLegacySegmentCollection createCalledLegacySegmentCollection(final CalledCopyRatioSegmentCollection segments) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import com.google.common.collect.ImmutableList;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
Expand Down Expand Up @@ -80,11 +78,7 @@
)
@DocumentedFeature
public final class CollectAllelicCounts extends LocusWalker {
private static final Logger logger = LogManager.getLogger(CollectAllelicCounts.class);

static final String MINIMUM_BASE_QUALITY_LONG_NAME = "minimum-base-quality";

static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 30;
private static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 30;
static final int DEFAULT_MINIMUM_BASE_QUALITY = 20;

static final List<ReadFilter> DEFAULT_ADDITIONAL_READ_FILTERS = ImmutableList.of(
Expand All @@ -93,6 +87,8 @@ public final class CollectAllelicCounts extends LocusWalker {
ReadFilterLibrary.NOT_DUPLICATE,
new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));

public static final String MINIMUM_BASE_QUALITY_LONG_NAME = "minimum-base-quality";

@Argument(
doc = "Output file for allelic counts.",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
Expand Down Expand Up @@ -134,20 +130,31 @@ public List<ReadFilter> getDefaultReadFilters() {

@Override
public void onTraversalStart() {
validateArguments();

final SampleLocatableMetadata metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
//this check is currently redundant, since the master dictionary is taken from the reads;
//however, if any other dictionary is added in the future, such a check should be performed
if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
}
allelicCountCollector = new AllelicCountCollector(metadata);
logger.info("Collecting allelic counts...");
}

private void validateArguments() {
CopyNumberArgumentValidationUtils.validateOutputFiles(outputAllelicCountsFile);
}

@Override
public Object onTraversalSuccess() {
logger.info(String.format("Writing allelic counts to %s...", outputAllelicCountsFile.getAbsolutePath()));
allelicCountCollector.getAllelicCounts().write(outputAllelicCountsFile);
logger.info("Allelic counts written to " + outputAllelicCountsFile);
return("SUCCESS");

logger.info("CollectAllelicCounts complete.");

return null;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
package org.broadinstitute.hellbender.tools.copynumber;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.spark.LocusWalkerContext;
import org.broadinstitute.hellbender.engine.spark.LocusWalkerSpark;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.datacollection.AllelicCountCollector;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.Metadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.MetadataUtils;
Expand All @@ -29,16 +29,13 @@
* See {@link CollectAllelicCounts}. This behaves the same, except that it supports spark.
*/
@CommandLineProgramProperties(
summary = "Collects ref/alt counts at sites",
oneLineSummary = "Collects ref/alt counts at sites",
programGroup = CopyNumberProgramGroup.class
summary = "Collects reference and alternate allele counts at specified sites",
oneLineSummary = "Collects reference and alternate allele counts at specified sites",
programGroup = CoverageAnalysisProgramGroup.class
)
public class CollectAllelicCountsSpark extends LocusWalkerSpark {

private static final long serialVersionUID = 1L;

private static final Logger logger = LogManager.getLogger(CollectAllelicCounts.class);

@Argument(
doc = "Output file for allelic counts.",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
Expand Down Expand Up @@ -72,13 +69,29 @@ public List<ReadFilter> getDefaultReadFilters() {

@Override
protected void processAlignments(JavaRDD<LocusWalkerContext> rdd, JavaSparkContext ctx) {
validateArguments();

final SampleLocatableMetadata metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
//this check is currently redundant, since the master dictionary is taken from the reads;
//however, if any other dictionary is added in the future, such a check should be performed
if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
}
final Broadcast<SampleLocatableMetadata> sampleMetadataBroadcast = ctx.broadcast(metadata);

final AllelicCountCollector finalAllelicCountCollector =
rdd.mapPartitions(distributedCount(sampleMetadataBroadcast, minimumBaseQuality))
.reduce((a1, a2) -> combineAllelicCountCollectors(a1, a2, sampleMetadataBroadcast.getValue()));

logger.info(String.format("Writing allelic counts to %s...", outputAllelicCountsFile.getAbsolutePath()));
finalAllelicCountCollector.getAllelicCounts().write(outputAllelicCountsFile);

logger.info("CollectAllelicCountsSpark complete.");
}

private void validateArguments() {
CopyNumberArgumentValidationUtils.validateOutputFiles(outputAllelicCountsFile);
}

private static FlatMapFunction<Iterator<LocusWalkerContext>, AllelicCountCollector> distributedCount(final Broadcast<SampleLocatableMetadata> sampleMetadataBroadcast,
Expand Down
Loading

0 comments on commit f8c68e7

Please sign in to comment.