Added the ability for MarkDuplicatesSpark to accept multiple inputs #5430
GATKSparkTool.java (diff, changes from all 9 commits)
@@ -1,8 +1,9 @@
 package org.broadinstitute.hellbender.engine.spark;

-import htsjdk.samtools.SAMFileHeader;
-import htsjdk.samtools.SAMSequenceDictionary;
+import com.google.common.annotations.VisibleForTesting;
+import htsjdk.samtools.*;
 import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
 import htsjdk.samtools.util.CloseableIterator;
 import htsjdk.samtools.util.GZIIndex;
 import htsjdk.samtools.util.IOUtil;
 import htsjdk.variant.vcf.VCFHeaderLine;
@@ -25,6 +26,7 @@
 import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
 import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink;
 import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
+import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
 import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
@@ -121,7 +123,7 @@ public abstract class GATKSparkTool extends SparkCommandLineProgram {

     private ReadsSparkSource readsSource;
     private SAMFileHeader readsHeader;
-    private String readInput;
+    private LinkedHashMap<String, SAMFileHeader> readInputs;
     private ReferenceMultiSparkSource referenceSource;
     private SAMSequenceDictionary referenceDictionary;
     private List<SimpleInterval> userIntervals;
@@ -158,6 +160,20 @@ public boolean requiresReads() {
         return false;
     }

+    /**
+     * Does this tool support multiple inputs? Tools that do should override this method with the desired {@link ReadInputMergingPolicy}.
+     *
+     * @return doNotMerge by default
+     */
+    public ReadInputMergingPolicy getReadInputMergingPolicy() {
+        return ReadInputMergingPolicy.doNotMerge;
+    }
+
+    public static enum ReadInputMergingPolicy {
+        doNotMerge,
+        concatMerge
+    }
+
     /**
      * Does this tool require intervals? Tools that do should override to return true.
      *
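For orientation, here is a minimal hedged sketch of how a Spark tool opts in to multiple inputs by overriding getReadInputMergingPolicy(); the tool name is hypothetical and the @CommandLineProgramProperties boilerplate a real GATK tool needs is omitted:

```java
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;

// Hypothetical example, not part of this PR: a tool that accepts several -I inputs.
public class ExampleMultiInputSparkTool extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    @Override
    public boolean requiresReads() {
        return true;
    }

    @Override
    public ReadInputMergingPolicy getReadInputMergingPolicy() {
        // Opt in: initializeReads() will accept multiple read inputs and
        // getReads() will return the union of the per-input RDDs.
        return ReadInputMergingPolicy.concatMerge;
    }

    @Override
    protected void runTool(final JavaSparkContext ctx) {
        // All inputs are read under the merged header (see createHeaderMerger below).
        logger.info("Total reads across all inputs: " + getReads().count());
    }
}
```

Tools that keep the default doNotMerge continue to reject more than one reads input in initializeReads(), exactly as before.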
@@ -274,24 +290,39 @@ public JavaRDD<GATKRead> getUnfilteredReads() {
             }
             traversalParameters = new TraversalParameters(getIntervals(), traverseUnmapped);
         } else {
-            traversalParameters = null; // no intervals were specified so return all reads (mapped and unmapped)
+            traversalParameters = null;
         }

-        // TODO: This if statement is a temporary hack until #959 gets resolved.
-        if (readInput.endsWith(".adam")) {
+        JavaRDD<GATKRead> output = null;
+        ReadsSparkSource source = readsSource;
+        for (String input : readInputs.keySet()) {
+            if (output == null) {
+                output = getGatkReadJavaRDD(traversalParameters, source, input);
+            } else {
+                output = output.union(getGatkReadJavaRDD(traversalParameters, source, input));
+            }
+        }
+        return output;
+    }
+
+    protected JavaRDD<GATKRead> getGatkReadJavaRDD(TraversalParameters traversalParameters, ReadsSparkSource source, String input) {
+        JavaRDD<GATKRead> output;
+        // TODO: This if statement is a temporary hack until #959 gets resolved.
+        if (input.endsWith(".adam")) {
             try {
-                return readsSource.getADAMReads(readInput, traversalParameters, getHeaderForReads());
+                output = source.getADAMReads(input, traversalParameters, getHeaderForReads());
             } catch (IOException e) {
-                throw new UserException("Failed to read ADAM file " + readInput, e);
+                throw new UserException("Failed to read ADAM file " + input, e);
             }

         } else {
             if (hasCramInput() && !hasReference()){
                 throw new UserException.MissingReference("A reference file is required when using CRAM files.");
             }
             final String refPath = hasReference() ? referenceArguments.getReferenceFileName() : null;
-            return readsSource.getParallelReads(readInput, refPath, traversalParameters, bamPartitionSplitSize);
+            output = source.getParallelReads(input, refPath, traversalParameters, bamPartitionSplitSize);
         }
+        return output;
     }

     /**
@@ -334,7 +365,8 @@ public int getRecommendedNumReducers() {
         if (numReducers != 0) {
             return numReducers;
         }
-        return 1 + (int) (BucketUtils.dirSize(getReadSourceName()) / getTargetPartitionSize());
+        int size = readInputs.keySet().stream().mapToInt(k -> (int) BucketUtils.dirSize(k)).sum();
+        return 1 + (size / getTargetPartitionSize());
     }

     /**
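For intuition on the new reducer estimate, a small illustrative sketch of the same arithmetic over several inputs; the sizes and target partition size below are made-up values, not GATK defaults:

```java
// Illustrative only: mirrors "1 + (sum of input sizes / target partition size)".
public class ReducerCountSketch {
    public static void main(String[] args) {
        long[] inputSizesBytes = { 300L * 1024 * 1024, 200L * 1024 * 1024 }; // two inputs: 300MB + 200MB (made up)
        long targetPartitionSize = 128L * 1024 * 1024;                       // assumed size per partition

        long totalSize = 0;
        for (long size : inputSizesBytes) {
            totalSize += size; // analogous to summing BucketUtils.dirSize over readInputs.keySet()
        }
        int recommendedReducers = 1 + (int) (totalSize / targetPartitionSize);
        System.out.println(recommendedReducers); // 1 + (524288000 / 134217728) = 1 + 3 = 4
    }
}
```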
@@ -445,8 +477,18 @@ public Collection<Annotation> makeVariantAnnotations() {
     /**
      * Returns the name of the source of reads data. It can be a file name or URL.
      */
-    protected String getReadSourceName(){
-        return readInput;
+    protected List<String> getReadSourceName(){
+        if (readInputs.size() > 1) {
+            throw new GATKException("Multiple ReadsDataSources specified but a single source requested by the tool");
Review comment: not ideal maybe but I don't know what else to do about this method... maybe change it to getReadSourceNames and return a list?
Reply: Right, I wanted to avoid returning a list because there are a bunch of places in the code where tools expect there to be only one read source kicking around, and I didn't want to risk breaking something or having to uproot everything... I agree it's pretty gross... Theoretically it shouldn't be a problem for most tools, which don't accept multiple inputs anyway.
Reply: I think we should change it to a list, or even get rid of it. All of the existing consumers except one use it to generate an output name for saving metrics (relatively easy to change); the other one (
Reply: @cmnbroad Alright, I'll return a list.
+        }
+        return new ArrayList<>(readInputs.keySet());
+    }
+
+    /**
+     * Returns a map of read input to header.
+     */
+    protected LinkedHashMap<String, SAMFileHeader> getReadSouceHeaderMap(){
+        return readInputs;
+    }

     /**
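To show how the new per-input header accessor might be consumed, a hedged sketch of a small helper; the class and method are hypothetical and not part of this PR:

```java
import java.util.LinkedHashMap;
import java.util.Map;

import htsjdk.samtools.SAMFileHeader;

// Hypothetical consumer of getReadSouceHeaderMap(): report each input's sort order,
// e.g. so a multi-input tool can sanity-check its inputs before relying on the merged header.
public final class ReadInputSummary {
    private ReadInputSummary() {}

    public static void printSortOrders(final LinkedHashMap<String, SAMFileHeader> readInputsToHeaders) {
        for (final Map.Entry<String, SAMFileHeader> entry : readInputsToHeaders.entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue().getSortOrder());
        }
    }
}
```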
@@ -489,15 +531,37 @@ private void initializeReads(final JavaSparkContext sparkContext) {
             return;
         }

-        if ( readArguments.getReadFilesNames().size() != 1 ) {
-            throw new UserException("Sorry, we only support a single reads input for spark tools for now.");
+        if (getReadInputMergingPolicy() == ReadInputMergingPolicy.doNotMerge && readArguments.getReadFilesNames().size() != 1 ) {
+            throw new UserException("Sorry, we only support a single reads input for this spark tool.");
         }

-        readInput = readArguments.getReadFilesNames().get(0);
+        readInputs = new LinkedHashMap<>();
         readsSource = new ReadsSparkSource(sparkContext, readArguments.getReadValidationStringency());
-        readsHeader = readsSource.getHeader(
-                readInput,
-                hasReference() ? referenceArguments.getReferenceFileName() : null);
+        for (String input : readArguments.getReadFilesNames()) {
+            readInputs.put(input, readsSource.getHeader(
+                    input, hasReference() ? referenceArguments.getReferenceFileName() : null));
+        }
+        readsHeader = createHeaderMerger().getMergedHeader();
     }

+    /**
+     * Create a header merger from the individual SAM/BAM headers in our readers
+     *
+     * @return a header merger containing all individual headers in this data source
+     */
+    private SamFileHeaderMerger createHeaderMerger() {
+        return new SamFileHeaderMerger(identifySortOrder(readInputs.values()), readInputs.values(), true);
+    }
Review comment: There should be a newline here (suggested change).
+    @VisibleForTesting
+    static SAMFileHeader.SortOrder identifySortOrder(final Collection<SAMFileHeader> headers){
+        final Set<SAMFileHeader.SortOrder> sortOrders = headers.stream().map(SAMFileHeader::getSortOrder).collect(Collectors.toSet());
Review comment: Clever way to check this.
+
+        final SAMFileHeader.SortOrder order;
+        if (sortOrders.size() == 1) {
+            order = sortOrders.iterator().next();
+        } else {
+            order = SAMFileHeader.SortOrder.unsorted;
+        }
+        return order;
+    }
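To make the sort-order behavior concrete, a hedged demo of identifySortOrder together with the header-merger pattern used above; the demo class is hypothetical and sits in the same package so the package-private helper is visible:

```java
package org.broadinstitute.hellbender.engine.spark;

import java.util.Arrays;
import java.util.List;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamFileHeaderMerger;

// Hypothetical demo, not part of this PR.
public class SortOrderMergeDemo {
    public static void main(String[] args) {
        // Two in-memory headers with different sort orders (no files needed).
        final SAMFileHeader coordinateSorted = new SAMFileHeader();
        coordinateSorted.setSortOrder(SAMFileHeader.SortOrder.coordinate);

        final SAMFileHeader queryNameSorted = new SAMFileHeader();
        queryNameSorted.setSortOrder(SAMFileHeader.SortOrder.queryname);

        final List<SAMFileHeader> headers = Arrays.asList(coordinateSorted, queryNameSorted);

        // One distinct order is preserved; any mix collapses to unsorted.
        final SAMFileHeader.SortOrder order = GATKSparkTool.identifySortOrder(headers);
        System.out.println(order); // unsorted

        // Same pattern as createHeaderMerger(): merge the headers under that order.
        final SAMFileHeader merged = new SamFileHeaderMerger(order, headers, true).getMergedHeader();
        System.out.println(merged.getSortOrder()); // unsorted
    }
}
```

The collapse to unsorted is one reason merging is opt-in rather than the default, as the discussion below spells out.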

     /**
Review comment: Why not default to concatMerge and support multiple inputs for all Spark tools?
Reply: I debated this with @lbergelson. It seems like undesirable behavior for tools that tailor their behavior to the header sort order to union the RDDs of multiple inputs, potentially invalidating any assumptions about input ordering.
Reply: I'm in favor of tools opting in.
Reply: OK