Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed ref conf calculation near indels #5172

Merged
merged 1 commit into from
Sep 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ public ActivityProfileState isActive( final AlignmentContext context, final Refe
for( final Map.Entry<String, AlignmentContext> sample : splitContexts.entrySet() ) {
// The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not.
final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(activeRegionDetectionHackishSamplePloidy,sample.getValue().getBasePileup(), ref.getBase(), hcArgs.minBaseQualityScore, averageHQSoftClips).genotypeLikelihoods;
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(activeRegionDetectionHackishSamplePloidy,sample.getValue().getBasePileup(), ref.getBase(), hcArgs.minBaseQualityScore, averageHQSoftClips, false).genotypeLikelihoods;
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.tools.walkers.genotyper.PloidyModel;
import org.broadinstitute.hellbender.tools.walkers.variantutils.PosteriorProbabilitiesUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
Expand All @@ -26,6 +23,7 @@
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;
Expand Down Expand Up @@ -292,7 +290,7 @@ private VariantContext makeReferenceConfidenceVariantContext(final int ploidy,
// Assume infinite population on a single sample.
final int refOffset = offset + globalRefOffset;
final byte refBase = ref[refOffset];
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, BASE_QUAL_THRESHOLD, null);
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, BASE_QUAL_THRESHOLD, null, true);

final Allele refAllele = Allele.create(refBase, true);
final List<Allele> refSiteAlleles = Arrays.asList(refAllele, Allele.NON_REF_ALLELE);
Expand Down Expand Up @@ -405,7 +403,8 @@ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
final ReadPileup pileup,
final byte refBase,
final byte minBaseQual,
final MathUtils.RunningAverage hqSoftClips) {
final MathUtils.RunningAverage hqSoftClips,
final boolean readsWereRealigned) {

final int likelihoodCount = ploidy + 1;
final double log10Ploidy = MathUtils.log10(ploidy);
Expand All @@ -418,7 +417,7 @@ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
continue;
}
readCount++;
applyPileupElementRefVsNonRefLikelihoodAndCount(refBase, likelihoodCount, log10Ploidy, result, p, qual, hqSoftClips);
applyPileupElementRefVsNonRefLikelihoodAndCount(refBase, likelihoodCount, log10Ploidy, result, p, qual, hqSoftClips, readsWereRealigned);
}
final double denominator = readCount * log10Ploidy;
for (int i = 0; i < likelihoodCount; i++) {
Expand All @@ -427,9 +426,8 @@ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
return result;
}

private void applyPileupElementRefVsNonRefLikelihoodAndCount(final byte refBase, final int likelihoodCount, final double log10Ploidy, final RefVsAnyResult result, final PileupElement element, final byte qual, final MathUtils.RunningAverage hqSoftClips) {
final boolean isAlt = element.getBase() != refBase || element.isDeletion() || element.isBeforeDeletionStart()
|| element.isAfterDeletionEnd() || element.isBeforeInsertion() || element.isAfterInsertion() || element.isNextToSoftClip();
private void applyPileupElementRefVsNonRefLikelihoodAndCount(final byte refBase, final int likelihoodCount, final double log10Ploidy, final RefVsAnyResult result, final PileupElement element, final byte qual, final MathUtils.RunningAverage hqSoftClips, final boolean readsWereRealigned) {
final boolean isAlt = readsWereRealigned ? isAltAfterAssembly(element, refBase) : isAltBeforeAssembly(element, refBase);
final double referenceLikelihood;
final double nonRefLikelihood;
if (isAlt) {
Expand All @@ -456,6 +454,15 @@ private void applyPileupElementRefVsNonRefLikelihoodAndCount(final byte refBase,
}
}

// Pre-assembly alt test: before local realignment, any indel- or clip-adjacent
// evidence (not just a base mismatch) marks this pileup element as potentially non-ref.
private boolean isAltBeforeAssembly(final PileupElement element, final byte refBase){
    if (element.getBase() != refBase) {
        return true;
    }
    return element.isDeletion()
            || element.isBeforeDeletionStart() || element.isAfterDeletionEnd()
            || element.isBeforeInsertion() || element.isAfterInsertion()
            || element.isNextToSoftClip();
}

// Post-assembly alt test: reads have been realigned to their best haplotype, so
// soft clips no longer exist and only mismatches and deletions count as alt evidence.
private boolean isAltAfterAssembly(final PileupElement element, final byte refBase){
    final boolean mismatchesRef = element.getBase() != refBase;
    return mismatchesRef || element.isDeletion();
}

/**
* Get a list of pileups that span the entire active region span, in order, one for each position
*/
Expand All @@ -468,7 +475,8 @@ private List<ReadPileup> getPileupsOverReference(final Haplotype refHaplotype,
Utils.validateArg(calledHaplotypes.contains(refHaplotype), "calledHaplotypes must contain the refHaplotype");
Utils.validateArg(readLikelihoods.numberOfSamples() == 1, () -> "readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.numberOfSamples());

final List<GATKRead> reads = activeRegion.getReads();
final List<GATKRead> reads = new ArrayList<>(readLikelihoods.sampleReads(0));
reads.sort(new ReadCoordinateComparator(activeRegion.getHeader())); //because we updated the reads based on the local realignments we have to re-sort or the pileups will be... unpredictable

final LocusIteratorByState libs = new LocusIteratorByState(reads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING,
false, samples.asSetOfSamples(), activeRegion.getHeader(), true);
Expand Down Expand Up @@ -547,7 +555,7 @@ int sumMismatchingQualities(final byte[] readBases,
for ( int i = 0; i < n; i++ ) {
final byte readBase = readBases[readStart + i];
final byte refBase = refBases[refStart + i];
if ( readBase != refBase ) {
if ( !Nucleotide.intersect(readBase, refBase) && !(readBase == AlignmentUtils.GAP_CHARACTER)) {
sum += readQuals[readStart + i];
if ( sum > maxSum ) { // abort early
return sum;
Expand All @@ -562,7 +570,7 @@ int sumMismatchingQualities(final byte[] readBases,
* Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart
*
* @param read the read
* @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
* @param readStart the index with respect to {@code refBases} where the read starts
* @param refBases the reference bases
* @param refStart the offset into refBases that aligns to the readStart position in readBases
* @param maxIndelSize the max indel size to consider for the read to be informative
Expand All @@ -582,8 +590,9 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
// We are safe to use the faster no-copy versions of getBases and getBaseQualities here,
// since we're not modifying the returned arrays in any way. This makes a small difference
// in the HaplotypeCaller profile, since this method is a major hotspot.
final byte[] readBases = read.getBasesNoCopy();
final byte[] readQuals = read.getBaseQualitiesNoCopy();
final byte[] readBases = AlignmentUtils.getBasesAlignedOneToOne(read); //calls getBasesNoCopy if CIGAR is all match
final byte[] readQuals = AlignmentUtils.getBaseQualsAlignedOneToOne(read);


final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);

Expand All @@ -607,7 +616,7 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
* Calculate the number of indel informative reads at pileup
*
* @param pileup a pileup
* @param pileupOffsetIntoRef the position of the pileup in the reference
* @param pileupOffsetIntoRef index along the reference corresponding to the pileup
* @param ref the ref bases
* @param maxIndelSize maximum indel size to consider in the informativeness calculation
* @return an integer >= 0
Expand All @@ -616,16 +625,14 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
int calcNIndelInformativeReads(final ReadPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) {
int nInformative = 0;
for ( final PileupElement p : pileup ) {
final GATKRead read = p.getRead();
final int offset = p.getOffset();

// doesn't count as evidence
if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) {
continue;
}

// todo -- this code really should handle CIGARs directly instead of relying on the above tests
if ( isReadInformativeAboutIndelsOfSize(read, offset, ref, pileupOffsetIntoRef, maxIndelSize) ) {
final int offset = getCigarModifiedOffset(p);

if ( isReadInformativeAboutIndelsOfSize(p.getRead(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) {
nInformative++;
if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) {
return MAX_N_INDEL_INFORMATIVE_READS;
Expand All @@ -635,6 +642,25 @@ int calcNIndelInformativeReads(final ReadPileup pileup, final int pileupOffsetIn
return nInformative;
}

/**
* Calculate the index of the current pileup position against the reference-aligned read
* This offset should be representative of the "IGV view" for the read where insertions are collapsed and deletions
* are padded so that we can easily count the mismatches against the reference
* @param p the PileupElement containing the offset as an index into the read base sequence
* @return the new reference-aligned index/offset
*/
@VisibleForTesting
protected int getCigarModifiedOffset (final PileupElement p){
Copy link
Contributor

@vruano vruano Sep 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that this method intent boils down to calculate the number of "consumed reference bases" (+ softclips?) by the read alignment up to the location of the pileup in the read... CigarOperator have a consumesReferenceBases method that one can use to figure out this number without the need to make explicit reference to the operation types such INSERTION or DELETION or any other.

In other words I think there is a more straight forward solution that would count the consumed reference bases rather than "adjust" the consumed read bases to get to the same number....

perhaps would look something like this:

int offset = p.getCurrentCigarElement().getOperator().consumesReferenceBases() ? p.getIndexInCurrentElement() : 0;
for (CigarElement elem : read.getCigar().getCigarElements().subList(0, p.getCurrentCigarOffset())) {
    if (elem.getOperator().consumesReferenceBases() || elem.getOperator() == CigarOperator.S) {
        offset += elem.getLength();
    }
} 

final GATKRead read = p.getRead();
int offset = (p.getCurrentCigarElement().getOperator().consumesReferenceBases() || p.getCurrentCigarElement().getOperator() == CigarOperator.S)? p.getOffsetInCurrentCigar() : 0;
for (final CigarElement elem : read.getCigar().getCigarElements().subList(0, p.getCurrentCigarOffset())) {
if (elem.getOperator().consumesReferenceBases() || elem.getOperator() == CigarOperator.S) {
offset += elem.getLength();
}
}
return offset;
}

/**
* Create a reference haplotype for an active region
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.read.*;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
Expand Down Expand Up @@ -303,12 +302,12 @@ private static GATKRead splitReadBasedOnCigar(final GATKRead read, final int cig

// we keep only the section of the read that is aligned to the reference between startRefIndex and stopRefIndex (inclusive).
// the other sections of the read are clipped:
final int startRefIndex = read.getUnclippedStart() + CigarUtils.countRefBasesBasedOnCigar(read, 0, cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index.
final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index.
final int startRefIndex = read.getUnclippedStart() + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read, 0, cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index.
final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index.

if ( forSplitPositions != null ) {
final String contig = read.getContig();
final int splitStart = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end.
final int splitStart = startRefIndex + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end.
final int splitEnd = splitStart + read.getCigarElement(cigarEndIndex).getLength() - 1;
forSplitPositions.addSplicePosition(contig, splitStart, splitEnd);
}
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/utils/Nucleotide.java
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,16 @@ public Nucleotide intersect(final Nucleotide other) {
return maskToValue[mask & other.mask];
}

/**
 * Determines whether the nucleotides represented by two byte encodings share at least one concrete base.
 * @param a byte encoding of the first nucleotide.
 * @param b byte encoding of the second nucleotide.
 * @return {@code true} iff the two encodings intersect.
 */
public static boolean intersect(final byte a, final byte b) {
    final int sharedMask = baseToValue[0xFF & a].mask & baseToValue[0xFF & b].mask;
    return sharedMask != 0;
}

/**
* Checks whether two base encodings make reference to the same {@link #Nucleotide}
* instance regardless of their case.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ public static GATKRead hardClipToRegion( final GATKRead read, final int refStart
*/
public static GATKRead hardClipToRegionIncludingClippedBases( final GATKRead read, final int refStart, final int refStop ) {
final int start = read.getUnclippedStart();
final int stop = start + CigarUtils.countRefBasesBasedOnCigar(read, 0, read.numCigarElements()) - 1;
final int stop = start + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read, 0, read.numCigarElements()) - 1;
return hardClipToRegion(read, refStart, refStop, start, stop);
}

Expand Down Expand Up @@ -519,7 +519,7 @@ protected GATKRead clipByReferenceCoordinates(final int refStart, final int refS
*/
public static GATKRead softClipToRegionIncludingClippedBases( final GATKRead read, final int refStart, final int refStop ) {
final int start = read.getUnclippedStart();
final int stop = start + CigarUtils.countRefBasesBasedOnCigar(read, 0, read.numCigarElements()) - 1;
final int stop = start + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read, 0, read.numCigarElements()) - 1;

if (start <= refStop && stop >= refStart) {
if (start < refStart && stop > refStop) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,22 @@
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;

import java.util.*;
import java.util.function.Function;


public final class AlignmentUtils {
private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X);
private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S);
public final static String HAPLOTYPE_TAG = "HC";
public final static byte GAP_CHARACTER = (byte)'-';

// cannot be instantiated
private AlignmentUtils() { }
Expand Down Expand Up @@ -197,6 +200,51 @@ public static byte[] getBasesCoveringRefInterval(final int refStart, final int r
return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
}

/**
 * Projects the read's bases onto a one-to-one alignment with the reference:
 * inserted bases are dropped and deleted reference positions are padded with {@link #GAP_CHARACTER}.
 */
public static byte[] getBasesAlignedOneToOne(final GATKRead read) {
    return getSequenceAlignedOneToOne(read, GATKRead::getBasesNoCopy, GAP_CHARACTER);
}

/**
 * Projects the read's base qualities onto a one-to-one alignment with the reference:
 * qualities under insertions are dropped and deleted reference positions are padded with quality 0.
 */
public static byte[] getBaseQualsAlignedOneToOne(final GATKRead read) {
    return getSequenceAlignedOneToOne(read, GATKRead::getBaseQualitiesNoCopy, (byte) 0);
}

/**
 * Returns the read's per-base sequence (bases or qualities, as chosen by {@code bytesProvider})
 * projected onto a one-to-one alignment with the reference: bases under insertions are dropped
 * and deleted reference positions are padded with {@code padWith}.
 *
 * @param read the read whose sequence is projected; must not be null
 * @param bytesProvider extracts the per-base array (bases or base qualities) from the read; must not be null
 * @param padWith filler value written at deleted reference positions
 * @return the provider's array unchanged when the CIGAR contains no indels, otherwise a new padded array
 */
public static byte[] getSequenceAlignedOneToOne(final GATKRead read, final Function<GATKRead, byte[]> bytesProvider, final byte padWith) {
    Utils.nonNull(read);
    Utils.nonNull(bytesProvider);
    final Cigar cigar = read.getCigar();
    final byte[] sequence = bytesProvider.apply(read);

    if (!cigar.containsOperator(CigarOperator.DELETION) && !cigar.containsOperator(CigarOperator.INSERTION)) {
        // No indels: the read already aligns one-to-one with the reference.
        return sequence;
    }

    final byte[] paddedBases = new byte[CigarUtils.countRefBasesIncludingSoftClips(read, 0, cigar.numCigarElements())];
    int literalPos = 0;  // index into the raw read sequence
    int paddedPos = 0;   // index into the reference-projected output
    for (final CigarElement ce : cigar.getCigarElements()) {
        final CigarOperator co = ce.getOperator();
        if (co.consumesReadBases()) {
            if (co.consumesReferenceBases()) {
                // M/EQ/X: copy through unchanged.
                System.arraycopy(sequence, literalPos, paddedBases, paddedPos, ce.getLength());
                paddedPos += ce.getLength();
            }
            // Insertions (read-only ops) are skipped in the projection.
            // NOTE(review): soft clips also land here and are skipped, even though the output
            // is sized by countRefBasesIncludingSoftClips — confirm that is intended.
            literalPos += ce.getLength();
        } else if (co.consumesReferenceBases()) {
            // Deletions: pad the reference gap with the filler value.
            Arrays.fill(paddedBases, paddedPos, paddedPos + ce.getLength(), padWith);
            paddedPos += ce.getLength();
        }
    }
    return paddedBases;
}

/**
* Get the number of bases at which refSeq and readSeq differ, given their alignment
*
Expand Down
Loading