Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed ref conf calculation near indels #5172

Merged
merged 1 commit into from
Sep 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ public ActivityProfileState isActive( final AlignmentContext context, final Refe
for( final Map.Entry<String, AlignmentContext> sample : splitContexts.entrySet() ) {
// The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not.
final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(activeRegionDetectionHackishSamplePloidy,sample.getValue().getBasePileup(), ref.getBase(), hcArgs.minBaseQualityScore, averageHQSoftClips).genotypeLikelihoods;
final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(activeRegionDetectionHackishSamplePloidy,sample.getValue().getBasePileup(), ref.getBase(), hcArgs.minBaseQualityScore, averageHQSoftClips, false).genotypeLikelihoods;
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.tools.walkers.genotyper.PloidyModel;
import org.broadinstitute.hellbender.tools.walkers.variantutils.PosteriorProbabilitiesUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
Expand All @@ -26,6 +23,7 @@
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;
Expand Down Expand Up @@ -292,7 +290,7 @@ private VariantContext makeReferenceConfidenceVariantContext(final int ploidy,
// Assume infinite population on a single sample.
final int refOffset = offset + globalRefOffset;
final byte refBase = ref[refOffset];
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, BASE_QUAL_THRESHOLD, null);
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(ploidy, pileup, refBase, BASE_QUAL_THRESHOLD, null, true);

final Allele refAllele = Allele.create(refBase, true);
final List<Allele> refSiteAlleles = Arrays.asList(refAllele, Allele.NON_REF_ALLELE);
Expand Down Expand Up @@ -405,7 +403,8 @@ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
final ReadPileup pileup,
final byte refBase,
final byte minBaseQual,
final MathUtils.RunningAverage hqSoftClips) {
final MathUtils.RunningAverage hqSoftClips,
final boolean readsWereRealigned) {

final int likelihoodCount = ploidy + 1;
final double log10Ploidy = MathUtils.log10(ploidy);
Expand All @@ -418,7 +417,7 @@ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
continue;
}
readCount++;
applyPileupElementRefVsNonRefLikelihoodAndCount(refBase, likelihoodCount, log10Ploidy, result, p, qual, hqSoftClips);
applyPileupElementRefVsNonRefLikelihoodAndCount(refBase, likelihoodCount, log10Ploidy, result, p, qual, hqSoftClips, readsWereRealigned);
}
final double denominator = readCount * log10Ploidy;
for (int i = 0; i < likelihoodCount; i++) {
Expand All @@ -427,9 +426,8 @@ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final int ploidy,
return result;
}

private void applyPileupElementRefVsNonRefLikelihoodAndCount(final byte refBase, final int likelihoodCount, final double log10Ploidy, final RefVsAnyResult result, final PileupElement element, final byte qual, final MathUtils.RunningAverage hqSoftClips) {
final boolean isAlt = element.getBase() != refBase || element.isDeletion() || element.isBeforeDeletionStart()
|| element.isAfterDeletionEnd() || element.isBeforeInsertion() || element.isAfterInsertion() || element.isNextToSoftClip();
private void applyPileupElementRefVsNonRefLikelihoodAndCount(final byte refBase, final int likelihoodCount, final double log10Ploidy, final RefVsAnyResult result, final PileupElement element, final byte qual, final MathUtils.RunningAverage hqSoftClips, final boolean readsWereRealigned) {
final boolean isAlt = readsWereRealigned ? isAltAfterAssembly(element, refBase) : isAltBeforeAssembly(element, refBase);
final double referenceLikelihood;
final double nonRefLikelihood;
if (isAlt) {
Expand All @@ -456,6 +454,15 @@ private void applyPileupElementRefVsNonRefLikelihoodAndCount(final byte refBase,
}
}

// Pre-assembly alt test: before local realignment, any indel- or clip-adjacent
// evidence (not just a base mismatch) marks this pileup element as potentially non-ref.
private boolean isAltBeforeAssembly(final PileupElement element, final byte refBase){
    if (element.getBase() != refBase) {
        return true;
    }
    return element.isDeletion()
            || element.isBeforeDeletionStart() || element.isAfterDeletionEnd()
            || element.isBeforeInsertion() || element.isAfterInsertion()
            || element.isNextToSoftClip();
}

// Post-assembly alt test: reads have been realigned to their best haplotype, so
// soft clips no longer exist and only mismatches and deletions count as alt evidence.
private boolean isAltAfterAssembly(final PileupElement element, final byte refBase){
    final boolean mismatchesRef = element.getBase() != refBase;
    return mismatchesRef || element.isDeletion();
}

/**
* Get a list of pileups that span the entire active region span, in order, one for each position
*/
Expand All @@ -468,7 +475,8 @@ private List<ReadPileup> getPileupsOverReference(final Haplotype refHaplotype,
Utils.validateArg(calledHaplotypes.contains(refHaplotype), "calledHaplotypes must contain the refHaplotype");
Utils.validateArg(readLikelihoods.numberOfSamples() == 1, () -> "readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.numberOfSamples());

final List<GATKRead> reads = activeRegion.getReads();
final List<GATKRead> reads = new ArrayList<>(readLikelihoods.sampleReads(0));
reads.sort(new ReadCoordinateComparator(activeRegion.getHeader())); //because we updated the reads based on the local realignments we have to re-sort or the pileups will be... unpredictable

final LocusIteratorByState libs = new LocusIteratorByState(reads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING,
false, samples.asSetOfSamples(), activeRegion.getHeader(), true);
Expand Down Expand Up @@ -547,7 +555,7 @@ int sumMismatchingQualities(final byte[] readBases,
for ( int i = 0; i < n; i++ ) {
final byte readBase = readBases[readStart + i];
final byte refBase = refBases[refStart + i];
if ( readBase != refBase ) {
if ( !Nucleotide.intersect(readBase, refBase) && !(readBase == AlignmentUtils.GAP_CHARACTER)) {
sum += readQuals[readStart + i];
if ( sum > maxSum ) { // abort early
return sum;
Expand All @@ -562,7 +570,7 @@ int sumMismatchingQualities(final byte[] readBases,
* Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart
*
* @param read the read
* @param readStart the starting position of the read (i.e., that aligns it to a position in the reference)
* @param readStart the index with respect to {@code refBases} where the read starts
* @param refBases the reference bases
* @param refStart the offset into refBases that aligns to the readStart position in readBases
* @param maxIndelSize the max indel size to consider for the read to be informative
Expand All @@ -582,8 +590,9 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
// We are safe to use the faster no-copy versions of getBases and getBaseQualities here,
// since we're not modifying the returned arrays in any way. This makes a small difference
// in the HaplotypeCaller profile, since this method is a major hotspot.
final byte[] readBases = read.getBasesNoCopy();
final byte[] readQuals = read.getBaseQualitiesNoCopy();
final byte[] readBases = AlignmentUtils.getBasesAlignedOneToOne(read); //calls getBasesNoCopy if CIGAR is all match
final byte[] readQuals = AlignmentUtils.getBaseQualsAlignedOneToOne(read);


final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);

Expand All @@ -607,7 +616,7 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
* Calculate the number of indel informative reads at pileup
*
* @param pileup a pileup
* @param pileupOffsetIntoRef the position of the pileup in the reference
* @param pileupOffsetIntoRef index along the reference corresponding to the pileup
* @param ref the ref bases
* @param maxIndelSize maximum indel size to consider in the informativeness calculation
* @return an integer >= 0
Expand All @@ -616,16 +625,14 @@ boolean isReadInformativeAboutIndelsOfSize(final GATKRead read,
int calcNIndelInformativeReads(final ReadPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) {
int nInformative = 0;
for ( final PileupElement p : pileup ) {
final GATKRead read = p.getRead();
final int offset = p.getOffset();

// doesn't count as evidence
if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) {
continue;
}

// todo -- this code really should handle CIGARs directly instead of relying on the above tests
if ( isReadInformativeAboutIndelsOfSize(read, offset, ref, pileupOffsetIntoRef, maxIndelSize) ) {
final int offset = getCigarModifiedOffset(p);

if ( isReadInformativeAboutIndelsOfSize(p.getRead(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) {
nInformative++;
if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) {
return MAX_N_INDEL_INFORMATIVE_READS;
Expand All @@ -635,6 +642,25 @@ int calcNIndelInformativeReads(final ReadPileup pileup, final int pileupOffsetIn
return nInformative;
}

/**
* Calculate the index of the current pileup position against the reference-aligned read
* This offset should be representative of the "IGV view" for the read where insertions are collapsed and deletions
* are padded so that we can easily count the mismatches against the reference
* @param p the PileupElement containing the offset as an index into the read base sequence
* @return the new reference-aligned index/offset
*/
@VisibleForTesting
protected int getCigarModifiedOffset (final PileupElement p){
Copy link
Contributor

@vruano vruano Sep 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that this method intent boils down to calculate the number of "consumed reference bases" (+ softclips?) by the read alignment up to the location of the pileup in the read... CigarOperator have a consumesReferenceBases method that one can use to figure out this number without the need to make explicit reference to the operation types such INSERTION or DELETION or any other.

In other words I think there is a more straight forward solution that would count the consumed reference bases rather than "adjust" the consumed read bases to get to the same number....

perhaps would look something like this:

int offset = p.getCurrentCigarElement().getOperator().consumesReferenceBases() ? p.getIndexInCurrentElement() : 0;
for (CigarElement elem : read.getCigar().getCigarElements().subList(0, p.getCurrentCigarOffset())) {
    if (elem.getOperator().consumesReferenceBases() || elem.getOperator() == CigarOperator.S) {
        offset += elem.getLength();
    }
} 

final GATKRead read = p.getRead();
int offset = (p.getCurrentCigarElement().getOperator().consumesReferenceBases() || p.getCurrentCigarElement().getOperator() == CigarOperator.S)? p.getOffsetInCurrentCigar() : 0;
for (final CigarElement elem : read.getCigar().getCigarElements().subList(0, p.getCurrentCigarOffset())) {
if (elem.getOperator().consumesReferenceBases() || elem.getOperator() == CigarOperator.S) {
offset += elem.getLength();
}
}
return offset;
}

/**
* Create a reference haplotype for an active region
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.read.*;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
Expand Down Expand Up @@ -303,12 +302,12 @@ private static GATKRead splitReadBasedOnCigar(final GATKRead read, final int cig

// we keep only the section of the read that is aligned to the reference between startRefIndex and stopRefIndex (inclusive).
// the other sections of the read are clipped:
final int startRefIndex = read.getUnclippedStart() + CigarUtils.countRefBasesBasedOnCigar(read, 0, cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index.
final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index.
final int startRefIndex = read.getUnclippedStart() + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read, 0, cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index.
final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index.

if ( forSplitPositions != null ) {
final String contig = read.getContig();
final int splitStart = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end.
final int splitStart = startRefIndex + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end.
final int splitEnd = splitStart + read.getCigarElement(cigarEndIndex).getLength() - 1;
forSplitPositions.addSplicePosition(contig, splitStart, splitEnd);
}
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/utils/Nucleotide.java
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,16 @@ public Nucleotide intersect(final Nucleotide other) {
return maskToValue[mask & other.mask];
}

/**
 * Determines whether the nucleotides represented by two byte encodings share at least one concrete base.
 * @param a byte encoding of the first nucleotide.
 * @param b byte encoding of the second nucleotide.
 * @return {@code true} iff the two encodings intersect.
 */
public static boolean intersect(final byte a, final byte b) {
    final int sharedMask = baseToValue[0xFF & a].mask & baseToValue[0xFF & b].mask;
    return sharedMask != 0;
}

/**
* Checks whether two base encodings make reference to the same {@link #Nucleotide}
* instance regardless of their case.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ public static GATKRead hardClipToRegion( final GATKRead read, final int refStart
*/
public static GATKRead hardClipToRegionIncludingClippedBases( final GATKRead read, final int refStart, final int refStop ) {
final int start = read.getUnclippedStart();
final int stop = start + CigarUtils.countRefBasesBasedOnCigar(read, 0, read.numCigarElements()) - 1;
final int stop = start + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read, 0, read.numCigarElements()) - 1;
return hardClipToRegion(read, refStart, refStop, start, stop);
}

Expand Down Expand Up @@ -519,7 +519,7 @@ protected GATKRead clipByReferenceCoordinates(final int refStart, final int refS
*/
public static GATKRead softClipToRegionIncludingClippedBases( final GATKRead read, final int refStart, final int refStop ) {
final int start = read.getUnclippedStart();
final int stop = start + CigarUtils.countRefBasesBasedOnCigar(read, 0, read.numCigarElements()) - 1;
final int stop = start + CigarUtils.countRefBasesBasedOnUnclippedAlignment(read, 0, read.numCigarElements()) - 1;

if (start <= refStop && stop >= refStart) {
if (start < refStart && stop > refStop) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,22 @@
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;

import java.util.*;
import java.util.function.Function;


public final class AlignmentUtils {
private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X);
private static final EnumSet<CigarOperator> ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S);
public final static String HAPLOTYPE_TAG = "HC";
public final static byte GAP_CHARACTER = (byte)'-';

// cannot be instantiated
private AlignmentUtils() { }
Expand Down Expand Up @@ -197,6 +200,51 @@ public static byte[] getBasesCoveringRefInterval(final int refStart, final int r
return Arrays.copyOfRange(bases, basesStart, basesStop + 1);
}

/**
 * Projects the read's bases onto a one-to-one alignment with the reference:
 * inserted bases are dropped and deleted reference positions are padded with {@link #GAP_CHARACTER}.
 */
public static byte[] getBasesAlignedOneToOne(final GATKRead read) {
    return getSequenceAlignedOneToOne(read, GATKRead::getBasesNoCopy, GAP_CHARACTER);
}

/**
 * Projects the read's base qualities onto a one-to-one alignment with the reference:
 * qualities under insertions are dropped and deleted reference positions are padded with quality 0.
 */
public static byte[] getBaseQualsAlignedOneToOne(final GATKRead read) {
    return getSequenceAlignedOneToOne(read, GATKRead::getBaseQualitiesNoCopy, (byte) 0);
}

/**
 * Returns the read's per-base sequence (bases or qualities, as chosen by {@code bytesProvider})
 * projected onto a one-to-one alignment with the reference: bases under insertions are dropped
 * and deleted reference positions are padded with {@code padWith}.
 *
 * @param read the read whose sequence is projected; must not be null
 * @param bytesProvider extracts the per-base array (bases or base qualities) from the read; must not be null
 * @param padWith filler value written at deleted reference positions
 * @return the provider's array unchanged when the CIGAR contains no indels, otherwise a new padded array
 */
public static byte[] getSequenceAlignedOneToOne(final GATKRead read, final Function<GATKRead, byte[]> bytesProvider, final byte padWith) {
    Utils.nonNull(read);
    Utils.nonNull(bytesProvider);
    final Cigar cigar = read.getCigar();
    final byte[] sequence = bytesProvider.apply(read);

    if (!cigar.containsOperator(CigarOperator.DELETION) && !cigar.containsOperator(CigarOperator.INSERTION)) {
        // No indels: the read already aligns one-to-one with the reference.
        return sequence;
    }

    final byte[] paddedBases = new byte[CigarUtils.countRefBasesIncludingSoftClips(read, 0, cigar.numCigarElements())];
    int literalPos = 0;  // index into the raw read sequence
    int paddedPos = 0;   // index into the reference-projected output
    for (final CigarElement ce : cigar.getCigarElements()) {
        final CigarOperator co = ce.getOperator();
        if (co.consumesReadBases()) {
            if (co.consumesReferenceBases()) {
                // M/EQ/X: copy through unchanged.
                System.arraycopy(sequence, literalPos, paddedBases, paddedPos, ce.getLength());
                paddedPos += ce.getLength();
            }
            // Insertions (read-only ops) are skipped in the projection.
            // NOTE(review): soft clips also land here and are skipped, even though the output
            // is sized by countRefBasesIncludingSoftClips — confirm that is intended.
            literalPos += ce.getLength();
        } else if (co.consumesReferenceBases()) {
            // Deletions: pad the reference gap with the filler value.
            Arrays.fill(paddedBases, paddedPos, paddedPos + ce.getLength(), padWith);
            paddedPos += ce.getLength();
        }
    }
    return paddedBases;
}

/**
* Get the number of bases at which refSeq and readSeq differ, given their alignment
*
Expand Down
Loading