Skip to content

Commit

Permalink
More refactoring PDHCE and preparing for joint detection (#8492)
Browse files Browse the repository at this point in the history
* streamlining EventGroup code

* PartiallyDeterminedHaplotype can handle multiple determined events

* Branching in terms of included events, not excluded events

* unit tests for event group clustering and branching
  • Loading branch information
davidbenjamin authored Sep 21, 2023
1 parent c9bf941 commit 70e047d
Show file tree
Hide file tree
Showing 9 changed files with 469 additions and 310 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ public void clearHaplotypes() {
haplotypes.clear();
refHaplotype = null;
}
public void replaceAllHaplotypes(Set<Haplotype> list) {
public void replaceAllHaplotypes(Collection<Haplotype> list) {
haplotypes.clear();
refHaplotype = null;
for ( Haplotype h : list ) {
Expand All @@ -612,8 +612,8 @@ private void removeHaplotype(final Haplotype hap) {
}
}

public void setPartiallyDeterminedMode() {
this.isPartiallyDeterminedList = true;
public void setPartiallyDeterminedMode(final boolean isPartiallyDetermined) {
this.isPartiallyDeterminedList = isPartiallyDetermined;
}

public boolean isPartiallyDeterminedList() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -832,7 +832,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
final SWParameters readToHaplotypeSWParameters = hcArgs.getReadToHaplotypeSWParameters();
// TODO Yes we skip realignment entirely when we are in DRAGEN-GATK PDHMM mode. Realignment of the reads makes no sense when
// TODO the bases of the haplotypes used for calling no longer reflect specified variants present.
if (!(hcArgs.pileupDetectionArgs.generatePDHaplotypes && !hcArgs.pileupDetectionArgs.determinePDHaps)) {
if (!(hcArgs.pileupDetectionArgs.generatePDHaplotypes && !hcArgs.pileupDetectionArgs.useDeterminedHaplotypesDespitePdhmmMode)) {
final Map<GATKRead, GATKRead> readRealignments = AssemblyBasedCallerUtils.realignReadsToTheirBestHaplotype(subsettedReadLikelihoodsFinal, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc(), aligner, readToHaplotypeSWParameters);
subsettedReadLikelihoodsFinal.changeEvidence(readRealignments);
}
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public final class PileupDetectionArgumentCollection {
public GATKPath pdhmmDebugOutputResults = null;
@Hidden
@Argument(fullName= DETERMINE_PD_HAPS, doc = "PDHMM: As an alternative to using the PDHMM, run all of the haplotype branching/determination code and instead of using the PDHMM use the old HMM with determined haplotypes. NOTE: this often fails and fallsback to other code due to combinatorial expansion. (Requires '--"+GENERATE_PARTIALLY_DETERMINED_HAPLOTYPES_LONG_NAME+"' argument)", optional = true)
public boolean determinePDHaps = false;
public boolean useDeterminedHaplotypesDespitePdhmmMode = false;
@Hidden
@Argument(fullName= DEBUG_PILEUPCALLING_ARG, doc = "PDHMM: If set, print to stdout a prodigious amount of debugging information about each of the steps involved in artificial haplotype construction and filtering. (Requires '--"+GENERATE_PARTIALLY_DETERMINED_HAPLOTYPES_LONG_NAME+"' argument)", optional = true)
public boolean debugPileupStdout = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import java.util.Collection;
import java.util.Iterator;
import java.util.stream.IntStream;

/**
* Small BitSet with a capacity of 30 elements, corresponding to the number of bits in an int.
Expand Down Expand Up @@ -121,6 +122,11 @@ public boolean get(final int element) {
return (bits & elementIndex(element)) != 0;
}

// IntStream of indices set to true i.e. contained in the set
public IntStream stream(final int capacity) {
return IntStream.range(0, capacity).filter(this::get);
}

public boolean isEmpty() { return bits == 0; }

public boolean hasElementGreaterThan(final int element) { return bits >= 1 << element; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ public static void buildEventMapsForHaplotypes( final Collection<Haplotype> hapl
//TODO h.recomputeAndSetEventMap() with overrides for the two haplotype classes should replace this casting+checking code here which is prone to error.

// Since PD haplotypes Know what alleles are variants, simply ask it and generate the map that way.
final EventMap events = h.isPartiallyDetermined() ? new EventMap(((PartiallyDeterminedHaplotype) h).getDeterminedAlleles()) :
final EventMap events = h.isPartiallyDetermined() ? new EventMap(((PartiallyDeterminedHaplotype) h).getDeterminedEvents()) :
fromHaplotype(h, ref, refLoc, maxMnpDistance);
h.setEventMap(events);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -75,48 +72,43 @@ public byte[] getAlternateBases() {

private final byte[] alternateBases;
private final List<Event> constituentBuiltEvents;
// NOTE: we must store ALL of the determined events at this site (which is different than the constituent events, we expect the constituent
// events for one of these objects to only be a single element) for the purposes of the overlapping reads PDHMM optimization.
// At Multiallelic sites, we ultimately genotype all of the alleles at once. If we aren't careful, reads that only overlap some
// alleles at a site will end up with incorrect/undetermined PDHMM scores for a subset of alleles in the genotyper which can
// lead to false positives/poorly called sites.
private List<Event> allDeterminedEventsAtThisSite;

//TODO Eventually these will have to be refactored to support multiple determined alleles per PDHaplotype
private final Event alleleBearingEvent; // NOTE this must be in both of the previous lists
private final long determinedPosition;
private final boolean isDeterminedAlleleRef;
private final Set<Event> determinedEvents; // NOTE this must be a subset (possibly empty if the determined allele is ref) of both of the previous lists
private final int determinedPosition;

private SimpleInterval cachedExtent;

/**
* Compares two haplotypes first by their lengths and then by lexicographic order of their bases.
*/
public static final Comparator<Haplotype> SIZE_AND_BASE_ORDER =
Comparator.comparingInt((Haplotype hap) -> hap.getBases().length)
.thenComparing(Allele::getBaseString);
// NOTE: we need all of the events at the determined site (all of which are determined in *some* PD haplotype) for the purposes
// of the overlapping reads PDHMM optimization. At Multiallelic sites, we ultimately genotype all of the alleles at once.
// If we aren't careful, reads that only overlap some alleles at a site will end up with incorrect/undetermined PDHMM scores
// for a subset of alleles in the genotyper which can lead to false positives/poorly called sites.
//NOTE: we never want the genotyper to handle reads that were not HMM scored, caching this extent helps keep us safe from messy sites
private final SimpleInterval determinedExtent;

/**
* @param base base (reference) haplotype used to construct the PDHaplotype
* @param isRefAllele is the determined allele reference or alt
* @param pdBytes array of bytes indicating what bases are skips for the pdhmm
* @param constituentEvents events (both determined and undetermined) covered by this haplotype, should follow the rules for PD variants
* @param eventWithVariant event from @param constituentEvents that is determined for this pd haplotype
* @param determinedEvents events from @param constituentEvents that are determined for this pd haplotype. Empty if determined allele is ref
* @param cigar haplotype cigar agianst the reference
* @param determinedPosition position (wrt the reference contig) that the haplotype should be considered determined //TODO this will be refactored to be a range of events in JointDetection
* @param getAlignmentStartHapwrtRef alignment startHapwrtRef from baseHaplotype corresponding to the in-memory storage of reference bases (must be set for trimming/clipping ops to work)
*/
public PartiallyDeterminedHaplotype(final Haplotype base, boolean isRefAllele, byte[] pdBytes, List<Event> constituentEvents,
final Event eventWithVariant, final Cigar cigar, long determinedPosition, int getAlignmentStartHapwrtRef) {
public PartiallyDeterminedHaplotype(final Haplotype base, byte[] pdBytes, List<Event> constituentEvents, final Set<Event> determinedEvents,
final Cigar cigar, final int determinedPosition, List<Event> allEventsAtDeterminedLocus, int getAlignmentStartHapwrtRef) {
super(base.getBases(), false, base.getAlignmentStartHapwrtRef(), cigar);
Utils.validateArg(base.length() == pdBytes.length, "pdBytes array must have same length as base haplotype.");
this.setGenomeLocation(base.getGenomeLocation());
this.alternateBases = pdBytes;
this.constituentBuiltEvents = constituentEvents;
this.alleleBearingEvent = eventWithVariant;
this.allDeterminedEventsAtThisSite = Collections.singletonList(eventWithVariant);
// TODO: this needs to generalize to a set of determined events or empty if ref is determined
this.determinedEvents = determinedEvents;

final int minStart = allEventsAtDeterminedLocus.stream().mapToInt(Event::getStart).min().orElse(determinedPosition);
final int maxEnd = allEventsAtDeterminedLocus.stream().mapToInt(Event::getEnd).max().orElse(determinedPosition);
determinedExtent = new SimpleInterval(getContig(), minStart, maxEnd);

// TODO: eventually determined events can be at different positions
this.determinedPosition = determinedPosition;
this.isDeterminedAlleleRef = isRefAllele;
setAlignmentStartHapwrtRef(getAlignmentStartHapwrtRef);
}

Expand All @@ -136,7 +128,7 @@ public String toString() {
String output = "HapLen:"+length() +", "+new String(getDisplayBases());
output = output + "\nUnresolved Bases["+alternateBases.length+"] "+Arrays.toString(alternateBases);
return output + "\n"+getCigar().toString()+" "+ constituentBuiltEvents.stream()
.map(v ->(v==this.alleleBearingEvent ?"*":"")+ getDRAGENDebugEventString( getStart()).apply(v) )
.map(v ->(determinedEvents.contains(v) ?"*":"")+ getDRAGENDebugEventString( getStart()).apply(v) )
.collect(Collectors.joining("->"));
}

Expand All @@ -162,24 +154,13 @@ public int hashCode() {
return Objects.hash(Arrays.hashCode(getBases()),Arrays.hashCode(alternateBases));
}

public List<Event> getDeterminedAlleles() {
return isDeterminedAlleleRef ? Collections.emptyList() : Collections.singletonList(alleleBearingEvent);
}

public void setAllDeterminedEventsAtThisSite(List<Event> allDeterminedEventsAtThisSite) {
this.allDeterminedEventsAtThisSite = allDeterminedEventsAtThisSite;
cachedExtent = null;
public Set<Event> getDeterminedEvents() {
return determinedEvents;
}

//NOTE: we never want the genotyper to handle reads that were not HMM scored, caching this extent helps keep us safe from messy sites
public SimpleInterval getMaximumExtentOfSiteDeterminedAlleles() {
if (cachedExtent == null) {
cachedExtent = new SimpleInterval(alleleBearingEvent);
for( Event event : allDeterminedEventsAtThisSite) {
cachedExtent = cachedExtent.mergeWithContiguous(event);
}
}
return cachedExtent;
return determinedExtent;
}

public long getDeterminedPosition() {
Expand Down
Loading

0 comments on commit 70e047d

Please sign in to comment.