diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 360b05ee749..a39212b19d6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -39,6 +39,7 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; +import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; @@ -96,6 +97,8 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator { private SmithWatermanAligner aligner; + private SmithWatermanAligner alignerHaplotypeToRef; + public static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; /** @@ -154,6 +157,8 @@ public HaplotypeCallerEngine( final HaplotypeCallerArgumentCollection hcArgs, bo this.referenceReader = Utils.nonNull(referenceReader); this.annotationEngine = Utils.nonNull(annotationEngine); this.aligner = SmithWatermanAligner.getAligner(hcArgs.smithWatermanImplementation); + boolean haplotypeToRef = true; + this.alignerHaplotypeToRef = new SmithWatermanJavaAligner(haplotypeToRef); initialize(createBamOutIndex, createBamOutMD5); } @@ -539,7 +544,7 @@ public List callRegion(final AssemblyRegion region, final Featur } // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities); + final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, alignerHaplotypeToRef, !hcArgs.doNotCorrectOverlappingBaseQualities); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance); @@ -699,7 +704,10 @@ private List referenceModelForNoVariation(final AssemblyRegion r */ public void shutdown() { likelihoodCalculationEngine.close(); + aligner.close(); + alignerHaplotypeToRef.close(); + if ( haplotypeBAMWriter.isPresent() ) { haplotypeBAMWriter.get().close(); } @@ -710,8 +718,6 @@ public void shutdown() { throw new RuntimeIOException(e); } } - - } private Set filterNonPassingReads( final AssemblyRegion activeRegion ) { diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index b51658114b7..a7d0296e3e0 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -8,10 +8,12 @@ import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.math3.random.RandomDataGenerator; import org.apache.commons.math3.random.Well19937c; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters; import org.broadinstitute.hellbender.engine.FeatureDataSource; import org.broadinstitute.hellbender.exceptions.GATKException; @@ -1033,6 +1035,11 @@ public static boolean xor(final boolean x, final boolean y) { return x != y; } + //overload method which calls lasIndexOf with the refIndexBoun param being 0 + public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final byte[] query, final int allowedMismatches) { + return lastIndexOfAtMostTwoMismatches(reference, query, allowedMismatches, 0); + } + /** * Find the last occurrence of the query sequence in the reference sequence * @@ -1041,22 +1048,336 @@ public static boolean xor(final boolean x, final boolean y) { * @param reference the reference sequence * @param query the query sequence */ - public static int lastIndexOf(final byte[] reference, final byte[] query) { + public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final byte[] query, final int allowedMismatches, int refIndexBound) { int queryLength = query.length; + if(refIndexBound < 0){ + refIndexBound = 0; + } // start search from the last possible matching position and search to the left - for (int r = reference.length - queryLength; r >= 0; r--) { - int q = 0; - while (q < queryLength && reference[r+q] == query[q]) { - q++; + for (int refIndex = reference.length - queryLength; refIndex >= refIndexBound; refIndex--) { + int mismatchCount = 0; + for (int queryIndex = 0; queryIndex < queryLength && mismatchCount <= allowedMismatches; queryIndex++) { + if (reference[refIndex+queryIndex] != query[queryIndex]) { + mismatchCount++; + } } - if (q == queryLength) { - return r; + + if (mismatchCount <= allowedMismatches) { + return refIndex; } } return -1; } + /** + * Finds the location of one indel in the query sequence in relation to the reference sequence + * Global Alignment + * + * Returns the index and size of the indel (as a 2 element int array) or -1 and 0 if an indel less than 4 bases is not found + * + * @param reference the reference sequence + * @param query the query sequence + * @param maxIndelLength the maximum length indel we look for + */ + public static ImmutablePair oneIndelHapToRef(final byte[] reference, final byte[] query, int maxIndelLength){ + int lengthOfIndel = Math.abs(reference.length - query.length); + + if(lengthOfIndel > maxIndelLength){ + return new ImmutablePair(-1,0); + } + //deletion + if(query.length < reference.length){ + return findIndel(query, reference, lengthOfIndel); + } + //insertion + else if(query.length > reference.length){ + return findIndel(reference, query, lengthOfIndel); + } + //lengths are equal, no one indel + else{ + return new ImmutablePair(-1,0); + } + } + + private static ImmutablePair findIndel(final byte[] shorter, final byte[] longer, int lengthOfIndel){ + int indelEnd = -1; + //traverse backwards until you hit mismatch/end of indel + for(int i = shorter.length - 1; i >= 0; i--){ + if(shorter[i] != longer[i + lengthOfIndel]){ + indelEnd = i + 1; + break; + } + } + if(indelEnd == -1){ + //deletion located at beginning of the query + return new ImmutablePair(0, lengthOfIndel); + } + //traverse forwards until you hit start of indel + for(int i = 0; i < indelEnd; i++){ + if(shorter[i] != longer[i]){ + return new ImmutablePair(-1,0); + } + } + //traversed entire query, one indel and no mismatches found + return new ImmutablePair(indelEnd, lengthOfIndel); + } + + + /** + * Finds the location of one indel in the query sequence in relation to the reference sequence + * local Alignment + * + * Returns an Indel object that contains all the necessary information for generating a cigar string + * + * @param reference the reference sequence + * @param query the query sequence + * @param parameters the SW parameters + * @param maxInsertionSize allowed size of insertion so that we can confidently say nothing can beat it + * @param maxDeletionSize allowed size of deletion so that we can confidently say nothing can beat it + */ + public static Indel oneIndelReadToHap(final byte[] reference, final byte[] query, SWParameters parameters, int maxInsertionSize, int maxDeletionSize){ + + Indel insertion = new Indel(-1, -1, maxInsertionSize, true); + Indel deletion = new Indel(-1, -1, maxDeletionSize, false); + + //set bounds for indices that are eligible to have indels + int refBackInsertionBound = query.length - 1 - maxInsertionSize; + int refBackDeletionBound = query.length; + + int refIndexBack = reference.length - 1; + + //traverse ref up to insertion bound + while(refIndexBack >= refBackInsertionBound){ + boolean skipDeletion = false; + + int queryIndexBack = query.length - 1; + + //iterate until you hit a match + while(reference[refIndexBack] != query[queryIndexBack] && refIndexBack >= refBackInsertionBound){ + if(refIndexBack < refBackDeletionBound){ + skipDeletion = true; + } + refIndexBack--; + } + + //keep track of potential starting position for alignment. this is needed in case the "traverse until mismatch found" step iterates over the correct alignment location + /* + e.g - the 1st while loop takes you to index 74, and the 2nd while loop takes you to index 41, so you passed over the index in the ref that represents the canonical alignment end position, 68 + 41 68 74 + ref AACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGT + + read AACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA CCCTAACCCTAACCCTAACCCTAA - happens if you don't use queue + + read AACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA CCCTAACCCTAACCCTAA + + */ + Queue startingPositions = new LinkedList<>(); + startingPositions.add(refIndexBack); + byte startingLetter = reference[refIndexBack]; + boolean inListOfStartingPositions = false; + int queryIndex = queryIndexBack; + + //while there are still potential starting indices, attempt alignments at each index + while(!startingPositions.isEmpty()){ + + //if element in queue is the last, begin new StartingPositionList + if (inListOfStartingPositions == true && startingPositions.size() == 1){ + inListOfStartingPositions = false; + } + refIndexBack = startingPositions.remove(); + if(refIndexBack < refBackInsertionBound){ + break; + } + queryIndexBack = queryIndex; + + //iterate until you hit mismatch + while(reference[refIndexBack] == query[queryIndexBack]){ + + if(refIndexBack == 1 || queryIndexBack == 2){ + refIndexBack--; + queryIndexBack--; + break; + } + + refIndexBack--; + queryIndexBack--; + + //if you've found a potential alignment starting position, add it to the queue + if(!inListOfStartingPositions){ + if(reference[refIndexBack] == startingLetter){ + startingPositions.add(refIndexBack); + } + } + } + + //once you've filled your queue, set this to true to make sure you don't create additional queues for the elements already in a queue + inListOfStartingPositions = true; + + //check for deletion code + //************************************ + if(!skipDeletion){ + byte[] ref = new byte[refIndexBack + 1]; + System.arraycopy(reference,0, ref, 0, refIndexBack + 1); + byte[] que = new byte[queryIndexBack + 1]; + System.arraycopy(query, 0, que, 0, queryIndexBack + 1); + int matchIndex = lastIndexOfAtMostTwoMismatches(ref, que, 0, ref.length - que.length - maxDeletionSize); + if(matchIndex != -1){ + int alignmentOffset = matchIndex; + int matchingBases = queryIndexBack + 1; + int indelSize = refIndexBack - matchIndex + 1 - matchingBases; + if(indelSize <= deletion.getIndelSize()){ + deletion.setAlignmentOffset(alignmentOffset); + deletion.setMatchingBases(matchingBases); + deletion.setIndelSize(indelSize); + deletion.setIndelType(false); + } + } + } + //************************************ + + int queryIndexFront = queryIndexBack - 1; + //set bound. this ensures that you don't look for insertions greater than the maxInsertionSize + int queryFrontBound = queryIndexBack - maxInsertionSize; + + //repeat the process, but this time you're determining the bases on the left side of the insertion + while(queryIndexFront >= queryFrontBound){ + + int refIndexFront = refIndexBack; + + //look for match + while(reference[refIndexFront] != query[queryIndexFront] && queryIndexFront >= queryFrontBound){ + queryIndexFront--; + + if(queryIndexFront < queryFrontBound){ + break; + } + } + + if(queryIndexFront < queryFrontBound){ + break; + } + + Queue overlap2 = new LinkedList<>(); + overlap2.add(queryIndexFront); + byte repeatLetter2 = query[queryIndexFront]; + boolean inListOfStartingPositions2 = false; + int refIndex = refIndexFront; + + while(!overlap2.isEmpty()){ + + queryIndexFront = overlap2.remove(); + if(queryIndexFront < queryFrontBound){ + break; + } + refIndexFront = refIndex; + + //iterate until mismatch or insertion is found + while(queryIndexFront >= 0 && refIndexFront >= 0 && reference[refIndexFront] == query[queryIndexFront]){ + queryIndexFront--; + refIndexFront--; + + //you've reached the end of the query, which means you have an insertion + if(queryIndexFront == -1){ + int alignmentOffset = refIndexFront + 1; + int matchingBases = refIndexBack - alignmentOffset + 1; + int indelSize = queryIndexBack - matchingBases + 1; + if(indelSize <= insertion.getIndelSize()){ + insertion.setAlignmentOffset(alignmentOffset); + insertion.setMatchingBases(matchingBases); + insertion.setIndelSize(indelSize); + insertion.setIndelType(true); + } + } + + if(!inListOfStartingPositions2 && queryIndexFront != -1){ + if(query[queryIndexFront] == repeatLetter2){ + overlap2.add(queryIndexFront); + } + } + } + + inListOfStartingPositions2 = true; + + if(queryIndexFront == -1){ + break; + } + } + } + } + } + + if(insertion.getAlignmentOffset() != -1 && deletion.getAlignmentOffset() == -1){ + return insertion; + } + else if(insertion.getAlignmentOffset() == -1 && deletion.getAlignmentOffset() != -1){ + return deletion; + } + else if(insertion.getAlignmentOffset() == -1 && deletion.getAlignmentOffset() == -1){ + return insertion; + } + else{ + int insertionBases = query.length - insertion.getIndelSize(); + int inScore = (insertionBases * parameters.getMatchValue()) + parameters.getGapOpenPenalty() + ((insertion.getIndelSize() - 1) * parameters.getGapExtendPenalty()); + + int delScore = (query.length * parameters.getMatchValue()) + parameters.getGapOpenPenalty() + ((deletion.getIndelSize() - 1) * parameters.getGapExtendPenalty()); + if(delScore == inScore){ + //tiebreaker is number of ref bases consumed - will need to be changed + return insertionBases > (query.length + deletion.getIndelSize()) ? insertion: deletion; + } + else{ + return delScore > inScore ? deletion : insertion; + } + } + } + + //inner class that stores all necessary information to produce cigar string + public static class Indel{ + int alignmentOffset; + int matchingBases; + int indelSize; + boolean insertion; + + public Indel(int alignmentOffset, int matchingBases, int indelSize, boolean insertion){ + this.alignmentOffset = alignmentOffset; + this.indelSize = indelSize; + this.matchingBases = matchingBases; + this.insertion = insertion; + } + + public int getIndelSize(){ + return indelSize; + } + + public int getAlignmentOffset(){ + return alignmentOffset; + } + + public int getMatchingBases(){ + return matchingBases; + } + + public boolean getIndelType(){ + return insertion; + } + + public void setIndelSize(int indelSize){ + this.indelSize = indelSize; + } + + public void setAlignmentOffset(int alignmentOffset){ + this.alignmentOffset = alignmentOffset; + } + + public void setMatchingBases(int matchingBases){ + this.matchingBases = matchingBases; + } + + public void setIndelType(boolean insertion){ + this.insertion = insertion; + } + } + /** * Simple wrapper for sticking elements of a int[] array into a List * @param ar - the array whose elements should be listified diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java index ef6068c05ea..76a8d5a68b8 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java @@ -74,6 +74,7 @@ public static GATKRead createReadAlignedToRef(final GATKRead originalRead, // compute the smith-waterman alignment of read -> haplotype final SmithWatermanAlignment swPairwiseAlignment = aligner.align(haplotype.getBases(), originalRead.getBases(), CigarUtils.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); + if ( swPairwiseAlignment.getAlignmentOffset() == -1 ) { // sw can fail (reasons not clear) so if it happens just don't realign the read return originalRead; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java index 612794bb497..78d50f3be25 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java @@ -37,7 +37,7 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { // Use a substring search to find an exact match of the alternate in the reference // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies - matchIndex = Utils.lastIndexOf(reference, alternate); + matchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 0); } final SmithWatermanAlignment alignmentResult; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 16bd2051b40..9d4c4f3b4cf 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -3,6 +3,7 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters; import org.broadinstitute.hellbender.utils.Utils; @@ -24,6 +25,7 @@ public final class SmithWatermanJavaAligner implements SmithWatermanAligner { private static final SmithWatermanJavaAligner ALIGNER = new SmithWatermanJavaAligner(); private long totalComputeTime = 0; + private boolean haplotypeToref = false; /** * return the stateless singleton instance of SmithWatermanJavaAligner @@ -48,6 +50,13 @@ protected enum State { */ private SmithWatermanJavaAligner(){} + public SmithWatermanJavaAligner(boolean haplotypeToref){ + this.haplotypeToref = haplotypeToref; + } + + int noSW = 0; + int yesSW = 0; + long totalExactMatchTime = 0; /** * Aligns the alternate sequence to the reference sequence * @@ -64,37 +73,153 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna Utils.nonNull(parameters); Utils.nonNull(overhangStrategy); - // avoid running full Smith-Waterman if there is an exact match of alternate in reference - int matchIndex = -1; - if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { - // Use a substring search to find an exact match of the alternate in the reference - // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies - matchIndex = Utils.lastIndexOf(reference, alternate); - } - final SmithWatermanAlignment alignmentResult; - if (matchIndex != -1) { - // generate the alignment result when the substring search was successful - final List lce = new ArrayList<>(alternate.length); - lce.add(makeElement(State.MATCH, alternate.length)); - alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); - } - else { - // run full Smith-Waterman - final int n = reference.length+1; - final int m = alternate.length+1; - final int[][] sw = new int[n][m]; - final int[][] btrack=new int[n][m]; - - calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); - alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) + if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE){ + //exact match + long startTime1 = System.nanoTime(); + int exactMatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 0); + totalExactMatchTime += System.nanoTime() - startTime1; + if (exactMatchIndex != -1) { + noSW++; + // generate the alignment result when the substring search was successful + final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), exactMatchIndex); + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; + } + + /* + //one mismatch + int singleMismatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 1); + if (singleMismatchIndex != -1) { + final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), singleMismatchIndex); + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; + } + + //one indel + if(this.haplotypeToref){ + int maxIndelLength = calculateAllowedLengthOfIndelHapToRef(parameters); + ImmutablePair indelStartAndSize = Utils.oneIndelHapToRef(reference, alternate, maxIndelLength); + int oneIndelIndex = indelStartAndSize.getLeft(); + if (oneIndelIndex != -1){ + int indelLength = indelStartAndSize.getRight(); + alignmentResult = calculateOneIndelCigar(indelLength, reference, alternate, oneIndelIndex); + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; + } + } + else{ + int maxInsertionSize = calculateMaxInsertionSizeReadToHap(parameters); + int maxDeletionSize = calculateMaxDeletionSizeReadToHap(parameters); + Utils.Indel indel = Utils.oneIndelReadToHap(reference, alternate, parameters, maxInsertionSize, maxDeletionSize); + int alignmentOffset = indel.getAlignmentOffset(); + if(alignmentOffset != -1){ + int matchingBases = indel.getMatchingBases(); + int indelSize = indel.getIndelSize(); + //construct cigar + State state; + int length; + if(indel.getIndelType()){ + state = State.INSERTION; + length = alternate.length - indelSize - matchingBases; + } + else{ + state = State.DELETION; + length = alternate.length - matchingBases; + + } + final List lce = Arrays.asList(makeElement(State.MATCH, matchingBases), + makeElement(state, indelSize), makeElement(State.MATCH, length)); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), alignmentOffset); + + Cigar cigar1 = alignmentResult.getCigar(); + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + SWPairwiseAlignmentResult alignmentResult2 = calculateCigar(sw, btrack, overhangStrategy); + Cigar cigar2 = alignmentResult2.getCigar(); + if(!cigar1.equals(cigar2) && !cigar2.containsOperator(CigarOperator.S)){ + System.out.println("CIGARS NOT EQUAL"); + System.out.println(new String(reference)); + System.out.println(""); + System.out.println(new String(alternate)); + System.out.println("Heuristic:" + cigar1); + System.out.println("SW: " + cigar2); + } + + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; + } + } + */ } + yesSW++; + // run full Smith-Waterman + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) totalComputeTime += System.nanoTime() - startTime; return alignmentResult; } + private static SWPairwiseAlignmentResult calculateOneIndelCigar(int indelLength, final byte[] reference, final byte[] alternate, int oneIndelIndex){ + State state = null; + int cigarThirdElementLength = 0; + + if(alternate.length < reference.length){ + state = State.DELETION; + cigarThirdElementLength = alternate.length - oneIndelIndex; + } + if(alternate.length > reference.length){ + state = State.INSERTION; + cigarThirdElementLength = alternate.length - indelLength - oneIndelIndex; + } + + final List lce = Arrays.asList(makeElement(State.MATCH, oneIndelIndex), + makeElement(state, indelLength), makeElement(State.MATCH, cigarThirdElementLength)); + return new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); + } + + private static int calculateAllowedLengthOfIndelHapToRef(final SWParameters parameters){ + //calculate allowed length for indel to be less of a penalty than 2 mismatches + int mismatchScore = parameters.getMismatchPenalty(); + int indelExtendScore = parameters.getGapExtendPenalty(); + int indelOpenScore = parameters.getGapOpenPenalty(); + int maxIndelLength = (((2 * mismatchScore) - indelOpenScore)/indelExtendScore) + 1; + return maxIndelLength; + } + + //calculates insertion size whose score is better than any other scenario + private static int calculateMaxInsertionSizeReadToHap(final SWParameters parameters){ + int matchScore = parameters.getMatchValue(); + int mismatchPenalty = -1 * parameters.getMismatchPenalty(); + int gapOpenPenalty = -1 * parameters.getGapOpenPenalty(); + int gapExtensionPenalty = -1 * parameters.getGapExtendPenalty(); + + int insertionCapSize = (2*matchScore + 2*mismatchPenalty - gapOpenPenalty + gapExtensionPenalty - 1) / (matchScore + gapExtensionPenalty); + return insertionCapSize; + } + + private static int calculateMaxDeletionSizeReadToHap(final SWParameters parameters){ + int matchScore = parameters.getMatchValue(); + int mismatchPenalty = -1*parameters.getMismatchPenalty(); + int gapOpenPenalty = -1*parameters.getGapOpenPenalty(); + int gapExtensionPenalty = -1*parameters.getGapExtendPenalty(); + + int deletionCapSize = (2*matchScore + 2*mismatchPenalty - gapOpenPenalty + gapExtensionPenalty - 1) / (gapExtensionPenalty); + return deletionCapSize; + } + /** * Calculates the SW matrices for the given sequences * @param reference ref sequence @@ -389,5 +514,9 @@ private static CigarElement makeElement(final State state, final int length) { @Override public void close() { logger.info(String.format("Total compute time in java Smith-Waterman : %.2f sec", totalComputeTime * 1e-9)); + logger.info(String.format("Total compute time in exactMatch heuristic : %.2f sec", totalExactMatchTime * 1e-9)); + logger.info("Total calls to exactMatchHeuristic: " + noSW); + logger.info("Total calls to SW: " + yesSW); + } } diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 28eee06c005..0dc46e016a4 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -591,7 +591,7 @@ public void testLastIndexOfQueryTooLong() { final String reference = "AAAA"; final String query = "AAAAAAA"; - final int result = Utils.lastIndexOf(reference.getBytes(), query.getBytes()); + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query.getBytes(), 0); final int expected = reference.lastIndexOf(query); Assert.assertEquals(result, expected); } @@ -602,17 +602,120 @@ public void testLastIndexOfLastBoundaries() { // match right boundary of reference String query = "TGGGG"; - int result = Utils.lastIndexOf(reference.getBytes(), query.getBytes()); + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query.getBytes(), 0); int expected = reference.lastIndexOf(query); Assert.assertEquals(result, expected); // match left boundary of reference query = "AAAAC"; - result = Utils.lastIndexOf(reference.getBytes(), query.getBytes()); + result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query.getBytes(), 0); expected = reference.lastIndexOf(query); Assert.assertEquals(result, expected); } + @Test + public void testLastIndexOfAtMostOneMismatchLastBoundaries() { + final String reference = "AAAACCCCTTTTGGGG"; + + // match right boundary of reference + final String query1 = "TGAGG"; + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query1.getBytes(), 1); + int expected = reference.length() - query1.length(); + Assert.assertEquals(result, expected); + + // match left boundary of reference + final String query2 = "AAGAC"; + result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query2.getBytes(), 1); + Assert.assertEquals(result, 0); + } + + @Test + public void testLastIndexOfAtMostOneMismatchRandomExactMatch() { + final int numTests = 100; + final int referenceLength = 100; + final int queryLength = 10; + + byte[] reference = new byte[referenceLength]; + byte[] query = new byte[queryLength]; + + final Random rng = Utils.getRandomGenerator(); + + for (int i = 0; i < numTests; i++) { + randomByteString(rng, reference); + randomByteString(rng, query); + + int index = -1; + + //add query to reference at a random lo cation for 75% of the tests + if (i % 4 > 0) { + index = rng.nextInt(referenceLength - queryLength); + System.arraycopy(query,0, reference, index, queryLength); + } + + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference, query, 1); + final int expected = index; + Assert.assertEquals(result, expected); + } + } + + @Test + public void testLastIndexOfAtMostOneMismatchTwoMismatches() { + final String reference = "AAAACCCCTTTTGGGG"; + + // match right boundary of reference + final String query1 = "AGAGG"; + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query1.getBytes(), 1); + Assert.assertEquals(result, -1); + + // match right boundary of reference + final String query2 = "GGAAC"; + int result2 = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query2.getBytes(), 1); + Assert.assertEquals(result2, -1); + } + + @Test + public void testLastIndexOfAtMostOneMismatchOneMismatchTwice() { + final String reference = "AGGCCCCCTTTTGGGG"; + + //matches both boundaries of reference + final String query1 = "AGGG"; + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query1.getBytes(), 1); + //will catch first match it encounters from the right + int expected = reference.length() - query1.length(); + Assert.assertEquals(result, expected); + } + + @Test + public void testLastIndexOfAtMostOneMismatchRandom() { + final int numTests = 100; + final int referenceLength = 100; + final int queryLength = 10; + + byte[] reference = new byte[referenceLength]; + byte[] query = new byte[queryLength]; + + final Random rng = Utils.getRandomGenerator(); + + for (int i = 0; i < numTests; i++) { + randomByteString(rng, reference); + randomByteString(rng, query); + + int index = -1; + + // add one-off query to reference at a random location for 75% of the tests + if (i % 4 > 0) { + index = rng.nextInt(referenceLength - queryLength); + System.arraycopy(query,0, reference, index, queryLength); + int mismatchPosition = rng.nextInt(queryLength); + reference[index + mismatchPosition] = BaseUtils.simpleComplement(query[mismatchPosition]); + } + + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference, query, 1); + final int expected = index; + Assert.assertEquals(result, expected); + } + } + private void randomByteString(Random rng, byte[] bytes) { for (int i = 0; i < bytes.length; i++) { bytes[i] = (byte)(rng.nextInt(94) + 32); @@ -642,12 +745,53 @@ public void testLastIndexOfRandom() { } } - final int result = Utils.lastIndexOf(reference, query); + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference, query, 0); final int expected = new String(reference).lastIndexOf(new String(query)); Assert.assertEquals(result, expected); } } + @Test + public void atMostOneIndel(){ + //deletion in middle of query + String reference = "AGGATTTGGGATTAC"; + String query = "AGGAGGGATTAC"; + int result = (Utils.oneIndelHapToRef(reference.getBytes(), query.getBytes(), 3)).getLeft(); + Assert.assertEquals(result, 4); + + //deletion in end of query + String reference2 = "AGGATTTGGGATTAC"; + String query2 = "AGGATTTGGGAT"; + int result2 = (Utils.oneIndelHapToRef(reference2.getBytes(), query2.getBytes(), 3)).getLeft(); + int expected2 = query2.length(); + Assert.assertEquals(result2, expected2); + + //insertion in middle of query + String reference3 = "ATTTAGTGGGATTA"; + String query3 = "ATTTAGTAGTGGGATTA"; + int result3 = (Utils.oneIndelHapToRef(reference3.getBytes(), query3.getBytes(), 3)).getLeft(); + Assert.assertEquals(result3, 3); + + //insertion in end of query + String reference4 = "AGTAGTGTGCGTCA"; + String query4 = "AGTAGTGTGCGTCAACT"; + int result4 = (Utils.oneIndelHapToRef(reference4.getBytes(), query4.getBytes(), 3)).getLeft(); + int expected4 = reference4.length(); + Assert.assertEquals(result4, expected4); + + //too long of an indel + String reference5 = "AGTAGTGTGCGTCA"; + String query5 = "AGTAGTGTGCGTCAACTA"; + int result5 = (Utils.oneIndelHapToRef(reference5.getBytes(), query5.getBytes(), 3)).getLeft(); + Assert.assertEquals(result5, -1); + + //deletion in beginning of query + String reference6 = "AGTACCGTTTGAC"; + String query6 = "TACCGTTTGAC"; + int result6 = (Utils.oneIndelHapToRef(reference6.getBytes(), query6.getBytes(), 3)).getLeft(); + Assert.assertEquals(result6, 0); + } + @Test(expectedExceptions = IllegalArgumentException.class) public void testListFromPrimitivesNull() throws Exception { Utils.listFromPrimitives(null);