Skip to content

Commit

Permalink
A port of FastaAlternateReferenceMaker and FastaReferenceMaker from G…
Browse files Browse the repository at this point in the history
…ATK3 (#5549)

* Ported FastaAlternateReferenceMaker and FastaReferenceMaker from GATK3

* Added a new walker type ReferenceWalker, and a new tool CountBasesInReference to demonstrate it

* Made changes to the close behavior in FastaWriter, and added additional buffering

Resolves #4891
  • Loading branch information
lbergelson authored and droazen committed Jan 29, 2019
1 parent 5cdb1d9 commit 64e2d3a
Show file tree
Hide file tree
Showing 111 changed files with 2,212 additions and 112 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@

import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequence;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.iterators.ByteArrayIterator;

import java.util.Arrays;
import java.util.Iterator;

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package org.broadinstitute.hellbender.engine;

import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.tools.examples.ExampleReferenceWalker;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.iterators.IntervalLocusIterator;

/**
* A reference walker is a tool which processes each base in a given reference. Each base can be processed individually
* or in a context of multiple bases in a window. Reads and Feature tracks can optionally be included as well. The
* reference bases to process can be subset to a given set of intervals.
*
* ReferenceWalker authors must implement the apply() method to process each position, and may optionally implement
* {@link #onTraversalStart()} and/or {@link #onTraversalSuccess()}.
*
* Tool authors may also implement {@link #getReferenceWindow(SimpleInterval)} to provide additional bases of context
* around each position.
*
* See the {@link ExampleReferenceWalker} walker for an example.
*/
public abstract class ReferenceWalker extends GATKTool {

@Override
public String getProgressMeterRecordLabel() { return "bases"; }

@Override
public final boolean requiresReference() { return true; }

/**
* Initialize data sources for traversal.
*
* Marked final so that tool authors don't override it. Tool authors should override onTraversalStart() instead.
*/
@Override
protected final void onStartup() {
super.onStartup();
}

/**
* Implementation of reference-locus-based traversal.
* Subclasses can override to provide own behavior but default implementation should be suitable for most uses.
*/
@Override
public void traverse() {
final CountingReadFilter readFilter = makeReadFilter();

for(final SimpleInterval locus : getIntervalIterator()){
final SimpleInterval referenceWindow = getReferenceWindow(locus);
final ReferenceContext referenceContext = new ReferenceContext(reference, locus, referenceWindow);
apply(referenceContext,
new ReadsContext(reads, referenceContext.getWindow(), readFilter), // Will create an empty ReadsContext if reads == null
new FeatureContext(features, referenceContext.getWindow())); // Will create an empty FeatureContext if features == null

progressMeter.update(referenceContext.getInterval());
};

}

/**
* Determine the window to use when creating the ReferenceContext in apply. This determines which reference bases are
* seen at each position, as well as which reads / features are considered to overlap the reference site.
*
* The default implementation returns the single reference locus passed that is passed in, but subclasses may override
* this method in order to change the window size.
*
* @param locus the current locus being processed in the traversal
* @return the window to use when creating the ReferenceContext that is passed into {@link #apply(ReferenceContext, ReadsContext, FeatureContext)}
*/
protected SimpleInterval getReferenceWindow(SimpleInterval locus){
return locus;
}

private Iterable<SimpleInterval> getIntervalIterator(){
return () -> new IntervalLocusIterator(getTraversalIntervals().iterator());
}

/**
* Process an individual reference locus (with optional contextual information). Must be implemented by tool authors.
* In general, tool authors should simply stream their output from apply(), and maintain as little internal state
* as possible.
*
* @param referenceContext Reference bases at the current locus. The window around the single locus is provided by {@link #getReferenceWindow(SimpleInterval)}
* Additional bases my be retrieved by invoking the by invoking {@link ReferenceContext#setWindow}
* on this object before calling {@link ReferenceContext#getBases}, however this will not effect
* the set of reads and features that are considered overlapping which are based on the preset
* window passed in.
* @param readsContext Reads spanning the current reference window. Will be an empty, but non-null, context object
* if there is no backing source of Read data (in which case all queries on it will return an
* empty List).
* @param featureContext Features spanning the current reference window. Will be an empty, but non-null, context object
* if there is no backing source of Feature data (in which case all queries on it will return an
* empty List).
*
*/
public abstract void apply(ReferenceContext referenceContext, ReadsContext readsContext, FeatureContext featureContext );

/**
* Shutdown data sources.
*
* Marked final so that tool authors don't override it. Tool authors should override {@link #onTraversalSuccess()} instead.
*/
@Override
protected final void onShutdown() {
// Overridden only to make final so that concrete tool implementations don't override
super.onShutdown();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package org.broadinstitute.hellbender.tools.examples;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ExampleProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

/**
* Counts the number of times each reference context is seen as well as how many times it's overlapped by reads and variants.
* Why you would want to do this is a mystery.
*
* This is a toy example to illustrate how to implement a ReferenceWalker.
*/
@CommandLineProgramProperties(
summary = "Example of how to implement a ReferenceWalker that uses reads and features as well as a custom window",
oneLineSummary = "Example of how to implement a ReferenceWalker",
programGroup = ExampleProgramGroup.class,
omitFromCommandLine = true)
public class ExampleReferenceWalker extends ReferenceWalker {

@Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME,
shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME,
doc="variants to count overlaps of")
private FeatureInput<VariantContext> variants;

@VisibleForTesting
final Map<String, OverlapCounts> contextCounts = new HashMap<>();

@Override
protected SimpleInterval getReferenceWindow(SimpleInterval locus) {
// add one base of padding to each side of each reference locus.
return new SimpleInterval(locus.getContig(), Math.max(locus.getStart() -1, 1), locus.getStart()+1);
}

@Override
public void apply(ReferenceContext referenceContext, ReadsContext readsContext, FeatureContext featureContext) {
final byte[] bases = referenceContext.getBases();
final String baseString = new String(bases);
final OverlapCounts counts = contextCounts.getOrDefault(baseString, new OverlapCounts());
if(readsContext.iterator().hasNext()){
counts.overlappedByReads++;
}

if(!featureContext.getValues(variants).isEmpty()){
counts.overlappedByVariants++;
}

counts.timesSeen++;

contextCounts.put(baseString, counts);
}

@Override
public Object onTraversalSuccess(){
contextCounts.entrySet().stream()
.sorted(Comparator.comparing(Entry::getKey))
.forEachOrdered(entry -> System.out.println(entry.getKey() + " " + entry.getValue().toString()));
return 0;
}

@VisibleForTesting
static class OverlapCounts {
long timesSeen = 0;
long overlappedByReads = 0;
long overlappedByVariants = 0;

@Override
public String toString(){
return String.format("Seen: %d Reads %d Variants %d", timesSeen, overlappedByReads, overlappedByVariants);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package org.broadinstitute.hellbender.tools.walkers.fasta;

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.ReferenceWalker;
import picard.cmdline.programgroups.ReferenceProgramGroup;

@DocumentedFeature
@CommandLineProgramProperties(
oneLineSummary = "Count the numbers of each base in a reference file",
summary = "Count the number of times each individual base occurs in a reference file.",
programGroup = ReferenceProgramGroup.class
)
public class CountBasesInReference extends ReferenceWalker {
@VisibleForTesting
final long[] baseCounts = new long[256];

@Override
public void apply(ReferenceContext referenceContext, ReadsContext read, FeatureContext featureContext) {
baseCounts[referenceContext.getBase()]++;
}

@Override
public Object onTraversalSuccess(){
for (int i = 0; i < baseCounts.length; i++) {
final long count = baseCounts[i];
if (count > 0){
System.out.println((char)i + " : " + count );
}
}
return 0;
}
}
Loading

0 comments on commit 64e2d3a

Please sign in to comment.