From 7cdde7c71c8d7107a3185d65bc3a69e54323d70c Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Wed, 19 Jun 2019 16:33:43 -0400 Subject: [PATCH 01/23] added counts for SW and non-SW calls print number of SW and nonSW alignments added oneMismatch heuristic and tests added count for oneMismatch heuristic --- .../HaplotypeCallerEngine.java | 8 ++- .../hellbender/utils/Utils.java | 25 +++++++ .../hellbender/utils/read/AlignmentUtils.java | 1 + .../smithwaterman/SWNativeAlignerWrapper.java | 18 +++++ .../smithwaterman/SmithWatermanAligner.java | 5 ++ .../SmithWatermanIntelAligner.java | 17 +++++ .../SmithWatermanJavaAligner.java | 69 ++++++++++++++++--- .../hellbender/utils/UtilsUnitTest.java | 69 +++++++++++++++++++ 8 files changed, 202 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 360b05ee749..ffa295a51f6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -699,6 +699,12 @@ private List referenceModelForNoVariation(final AssemblyRegion r */ public void shutdown() { likelihoodCalculationEngine.close(); + + //print out alignments + System.out.println("TOTAL NUMBER OF ALIGNMENTS:" + aligner.getNumOfAlignments()); + System.out.println("NO SW:" + aligner.noSW()); + System.out.println("YES SW:" + aligner.yesSW()); + aligner.close(); if ( haplotypeBAMWriter.isPresent() ) { haplotypeBAMWriter.get().close(); @@ -710,8 +716,6 @@ public void shutdown() { throw new RuntimeIOException(e); } } - - } private Set filterNonPassingReads( final AssemblyRegion activeRegion ) { diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index b51658114b7..baccc13e174 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1042,6 +1042,7 @@ public static boolean xor(final boolean x, final boolean y) { * @param query the query sequence */ public static int lastIndexOf(final byte[] reference, final byte[] query) { + int queryLength = query.length; // start search from the last possible matching position and search to the left @@ -1057,6 +1058,30 @@ public static int lastIndexOf(final byte[] reference, final byte[] query) { return -1; } + public static int lastIndexOfOneMismatch(final byte[] reference, final byte[] query) + { + int queryLength = query.length; + + boolean oneMismatch; + + // start search from the last possible matching position and search to the left + for (int r = reference.length - queryLength; r >= 0; r--) { + oneMismatch = false; + int q = 0; + while (q < queryLength && (reference[r+q] == query[q] || oneMismatch == false)) { + if (reference[r+q] != query[q]) + { + oneMismatch = true; + } + q++; + } + if (q == queryLength) { + return r; + } + } + return -1; + } + /** * Simple wrapper for sticking elements of a int[] array into a List * @param ar - the array whose elements should be listified diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java index ef6068c05ea..76a8d5a68b8 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/AlignmentUtils.java @@ -74,6 +74,7 @@ public static GATKRead createReadAlignedToRef(final GATKRead originalRead, // compute the smith-waterman alignment of read -> haplotype final SmithWatermanAlignment swPairwiseAlignment = aligner.align(haplotype.getBases(), originalRead.getBases(), CigarUtils.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); + if ( swPairwiseAlignment.getAlignmentOffset() == -1 ) { // sw can fail (reasons not clear) so if it happens just don't realign the read return originalRead; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java index 612794bb497..4bc53a983da 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java @@ -25,6 +25,24 @@ public SWNativeAlignerWrapper(final SWAlignerNativeBinding aligner) { this.aligner = aligner; } + + //methods for printing out number of SW/non-SW alignments. not needed + public int getNumOfAlignments() + { + System.out.println("SW native wrapper being used"); + return 0; + } + public int noSW() + { + System.out.println("native SW being used"); + return 0; + } + public int yesSW() + { + System.out.println("native SW being used"); + return 0; + } + @Override public SmithWatermanAlignment align(final byte[] reference, final byte[] alternate, final SWParameters parameters, final SWOverhangStrategy overhangStrategy){ long startTime = System.nanoTime(); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java index 84f869523b1..86ea53349c2 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java @@ -28,6 +28,11 @@ public interface SmithWatermanAligner extends Closeable { */ SmithWatermanAlignment align(final byte[] ref, final byte[] alt, SWParameters parameters, SWOverhangStrategy overhangStrategy); + //methods for printing out number of SW/non-SW alignments + int getNumOfAlignments(); + int noSW(); + int yesSW(); + /** * Implementations may optionally implement close in order to release any resources that they are holding. * diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java index 8a8f058e9a1..74129b78113 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java @@ -45,6 +45,23 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna return alignerWrapper.align(reference, alternate, parameters, overhangStrategy); } + //methods for printing out number of SW/non-SW alignments. not needed for now + public int getNumOfAlignments() + { + System.out.println("Intel SW being used"); + return 0; + } + public int noSW() + { + System.out.println("Intel SW being used"); + return 0; + } + public int yesSW() + { + System.out.println("Intel SW being used"); + return 0; + } + /** * Close the aligner */ diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 16bd2051b40..248816d0a41 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -24,6 +24,8 @@ public final class SmithWatermanJavaAligner implements SmithWatermanAligner { private static final SmithWatermanJavaAligner ALIGNER = new SmithWatermanJavaAligner(); private long totalComputeTime = 0; + private long totalHeuristicTime = 0; + private long totalMismatchHeuristicTime = 0; /** * return the stateless singleton instance of SmithWatermanJavaAligner @@ -42,6 +44,27 @@ protected enum State { CLIP } + private int numOfAlignments = 0; + private int noSW = 0; + private int yesSW = 0; + + //methods for printing out number of SW/non-SW alignments + public int getNumOfAlignments() + { + return numOfAlignments; + } + + public int noSW() + { + return noSW; + } + + public int yesSW() + { + return yesSW; + } + + /** * Create a new SW pairwise aligner, this has no state so instead of creating new instances, we create a singleton which is * accessible via {@link #getInstance} @@ -58,6 +81,8 @@ private SmithWatermanJavaAligner(){} public SmithWatermanAlignment align(final byte[] reference, final byte[] alternate, final SWParameters parameters, final SWOverhangStrategy overhangStrategy) { long startTime = System.nanoTime(); + numOfAlignments++; + if ( reference == null || reference.length == 0 || alternate == null || alternate.length == 0 ) { throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation"); } @@ -69,29 +94,55 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { // Use a substring search to find an exact match of the alternate in the reference // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies + long startHeuristic = System.nanoTime(); matchIndex = Utils.lastIndexOf(reference, alternate); + totalHeuristicTime += System.nanoTime() - startHeuristic; } final SmithWatermanAlignment alignmentResult; + //if exact match if (matchIndex != -1) { + + noSW++; + // generate the alignment result when the substring search was successful final List lce = new ArrayList<>(alternate.length); lce.add(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); } else { - // run full Smith-Waterman - final int n = reference.length+1; - final int m = alternate.length+1; - final int[][] sw = new int[n][m]; - final int[][] btrack=new int[n][m]; - - calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); - alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) + //look for one mismatch + long startOneMismatchHeuristic = System.nanoTime(); + matchIndex = Utils.lastIndexOfOneMismatch(reference, alternate); + totalMismatchHeuristicTime += System.nanoTime() - startOneMismatchHeuristic; + + if (matchIndex != -1) + { + noSW++; + + // generate the alignment result when the substring search was successful + final List lce = new ArrayList<>(alternate.length); + lce.add(makeElement(State.MATCH, alternate.length)); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); + } + else{ + // run full Smith-Waterman + + yesSW++; + + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) + } } totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; } @@ -389,5 +440,7 @@ private static CigarElement makeElement(final State state, final int length) { @Override public void close() { logger.info(String.format("Total compute time in java Smith-Waterman : %.2f sec", totalComputeTime * 1e-9)); + logger.info(String.format("Total compute time in heuristic : %.2f sec", totalHeuristicTime * 1e-9)); + logger.info(String.format("Total compute time in oneMismatch heuristic : %.2f sec", totalMismatchHeuristicTime * 1e-9)); } } diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 28eee06c005..a32837f26ac 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -613,6 +613,75 @@ public void testLastIndexOfLastBoundaries() { Assert.assertEquals(result, expected); } + @Test + public void testLastIndexOfOneMismatchLastBoundaries() { + final String reference = "AAAACCCCTTTTGGGG"; + + // match right boundary of reference + String query = "TGAGG"; + int result = Utils.lastIndexOfOneMismatch(reference.getBytes(), query.getBytes()); + int expected = 11; + Assert.assertEquals(result, expected); + + // match left boundary of reference + query = "AAGAC"; + result = Utils.lastIndexOfOneMismatch(reference.getBytes(), query.getBytes()); + expected = 0; + Assert.assertEquals(result, expected); + } + + @Test + public void testLastIndexOfOneMismatchRandom() { + final int num_tests = 100; + final int referenceLength = 1000; + final int queryLength = 100; + + byte [] reference = new byte[referenceLength]; + byte [] query = new byte[queryLength]; + + final Random rng = Utils.getRandomGenerator(); + + for (int i = 0; i < num_tests; i++) { + randomByteString(rng, reference); + randomByteString(rng, query); + + int index = -1; + // add one-off query to reference at a random location for 75% of the tests + if (i % 4 > 0) { + index = rng.nextInt(referenceLength - queryLength); + int mismatch = index + rng.nextInt(queryLength); + for (int j = 0; j < queryLength; j++) { + if (index + j == mismatch) + { + if ((char)(query[j] & 0xFF) == 'A') + { + reference[index + j] = (byte)'G'; + } + if ((char)(query[j] & 0xFF) == 'G') + { + reference[index + j] = (byte)'C'; + } + if ((char)(query[j] & 0xFF) == 'C') + { + reference[index + j] = (byte)'T'; + } + if ((char)(query[j] & 0xFF) == 'T') + { + reference[index + j] = (byte)'A'; + } + } + else { + reference[index + j] = query[j]; + } + } + } + + final int result = Utils.lastIndexOfOneMismatch(reference, query); + final int expected = index; + Assert.assertEquals(result, expected); + } + } + private void randomByteString(Random rng, byte[] bytes) { for (int i = 0; i < bytes.length; i++) { bytes[i] = (byte)(rng.nextInt(94) + 32); From 0ef2b14b5d7b48b99b510a7fe9cb8f57ccacf2a6 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 25 Jun 2019 10:02:40 -0400 Subject: [PATCH 02/23] add another aligner object for testing --- .../haplotypecaller/HaplotypeCallerEngine.java | 14 ++++++++++++-- .../smithwaterman/SmithWatermanJavaAligner.java | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index ffa295a51f6..e8c191d4afc 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -39,6 +39,7 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; +import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; @@ -96,6 +97,8 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator { private SmithWatermanAligner aligner; + private SmithWatermanAligner alignerHaplotypeToRef; + public static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; /** @@ -154,6 +157,7 @@ public HaplotypeCallerEngine( final HaplotypeCallerArgumentCollection hcArgs, bo this.referenceReader = Utils.nonNull(referenceReader); this.annotationEngine = Utils.nonNull(annotationEngine); this.aligner = SmithWatermanAligner.getAligner(hcArgs.smithWatermanImplementation); + this.alignerHaplotypeToRef = new SmithWatermanJavaAligner(); initialize(createBamOutIndex, createBamOutMD5); } @@ -539,7 +543,7 @@ public List callRegion(final AssemblyRegion region, final Featur } // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities); + final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, alignerHaplotypeToRef, !hcArgs.doNotCorrectOverlappingBaseQualities); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance); @@ -701,11 +705,17 @@ public void shutdown() { likelihoodCalculationEngine.close(); //print out alignments - System.out.println("TOTAL NUMBER OF ALIGNMENTS:" + aligner.getNumOfAlignments()); + System.out.println("TOTAL NUMBER OF ALIGNMENTS (reads to ref):" + aligner.getNumOfAlignments()); System.out.println("NO SW:" + aligner.noSW()); System.out.println("YES SW:" + aligner.yesSW()); + System.out.println("TOTAL NUMBER OF ALIGNMENTS (haplotypes to ref):" + alignerHaplotypeToRef.getNumOfAlignments()); + System.out.println("NO SW:" + alignerHaplotypeToRef.noSW()); + System.out.println("YES SW:" + alignerHaplotypeToRef.yesSW()); + aligner.close(); + alignerHaplotypeToRef.close(); + if ( haplotypeBAMWriter.isPresent() ) { haplotypeBAMWriter.get().close(); } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 248816d0a41..4aac03baa9b 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -69,7 +69,7 @@ public int yesSW() * Create a new SW pairwise aligner, this has no state so instead of creating new instances, we create a singleton which is * accessible via {@link #getInstance} */ - private SmithWatermanJavaAligner(){} + public SmithWatermanJavaAligner(){} /** * Aligns the alternate sequence to the reference sequence From b456351c1ff506dc5ca1bbd10e619146f1a3d09e Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 25 Jun 2019 15:58:10 -0400 Subject: [PATCH 03/23] edited for David Benjamin's comments added more tests for atMostOneMismatch heuristic --- .../hellbender/utils/Utils.java | 24 ++-- .../SmithWatermanJavaAligner.java | 21 ++-- .../hellbender/utils/UtilsUnitTest.java | 104 +++++++++++------- 3 files changed, 79 insertions(+), 70 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index baccc13e174..8e799acbbe7 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1042,7 +1042,6 @@ public static boolean xor(final boolean x, final boolean y) { * @param query the query sequence */ public static int lastIndexOf(final byte[] reference, final byte[] query) { - int queryLength = query.length; // start search from the last possible matching position and search to the left @@ -1058,25 +1057,20 @@ public static int lastIndexOf(final byte[] reference, final byte[] query) { return -1; } - public static int lastIndexOfOneMismatch(final byte[] reference, final byte[] query) - { + public static int lastIndexOfAtMostOneMismatch(final byte[] reference, final byte[] query) { int queryLength = query.length; - boolean oneMismatch; - // start search from the last possible matching position and search to the left - for (int r = reference.length - queryLength; r >= 0; r--) { - oneMismatch = false; - int q = 0; - while (q < queryLength && (reference[r+q] == query[q] || oneMismatch == false)) { - if (reference[r+q] != query[q]) - { - oneMismatch = true; + for (int refIndex = reference.length - queryLength; refIndex >= 0; refIndex--) { + int mismatchCount = 0; + for (int queryIndex = 0; queryIndex < queryLength && mismatchCount < 2; queryIndex++) { + if (reference[refIndex+queryIndex] != query[queryIndex]) { + mismatchCount++; } - q++; } - if (q == queryLength) { - return r; + + if (mismatchCount < 2) { + return refIndex; } } return -1; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 4aac03baa9b..26fa50a9b1d 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -49,18 +49,15 @@ protected enum State { private int yesSW = 0; //methods for printing out number of SW/non-SW alignments - public int getNumOfAlignments() - { + public int getNumOfAlignments() { return numOfAlignments; } - public int noSW() - { + public int noSW() { return noSW; } - public int yesSW() - { + public int yesSW() { return yesSW; } @@ -107,28 +104,24 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna noSW++; // generate the alignment result when the substring search was successful - final List lce = new ArrayList<>(alternate.length); - lce.add(makeElement(State.MATCH, alternate.length)); + final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); } else { //look for one mismatch long startOneMismatchHeuristic = System.nanoTime(); - matchIndex = Utils.lastIndexOfOneMismatch(reference, alternate); + matchIndex = Utils.lastIndexOfAtMostOneMismatch(reference, alternate); totalMismatchHeuristicTime += System.nanoTime() - startOneMismatchHeuristic; - if (matchIndex != -1) - { + if (matchIndex != -1) { noSW++; // generate the alignment result when the substring search was successful - final List lce = new ArrayList<>(alternate.length); - lce.add(makeElement(State.MATCH, alternate.length)); + final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); } else{ // run full Smith-Waterman - yesSW++; final int n = reference.length+1; diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index a32837f26ac..416dd82086c 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -614,69 +614,91 @@ public void testLastIndexOfLastBoundaries() { } @Test - public void testLastIndexOfOneMismatchLastBoundaries() { + public void testLastIndexOfAtMostOneMismatchLastBoundaries() { final String reference = "AAAACCCCTTTTGGGG"; // match right boundary of reference - String query = "TGAGG"; - int result = Utils.lastIndexOfOneMismatch(reference.getBytes(), query.getBytes()); - int expected = 11; + final String query1 = "TGAGG"; + int result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query1.getBytes()); + int expected = reference.length() - query1.length(); Assert.assertEquals(result, expected); // match left boundary of reference - query = "AAGAC"; - result = Utils.lastIndexOfOneMismatch(reference.getBytes(), query.getBytes()); - expected = 0; - Assert.assertEquals(result, expected); + final String query2 = "AAGAC"; + result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query2.getBytes()); + Assert.assertEquals(result, 0); } @Test - public void testLastIndexOfOneMismatchRandom() { - final int num_tests = 100; - final int referenceLength = 1000; - final int queryLength = 100; + public void testLastIndexOfAtMostOneMismatchTwoMismatches() { + final String reference = "AAAACCCCTTTTGGGG"; - byte [] reference = new byte[referenceLength]; - byte [] query = new byte[queryLength]; + // match right boundary of reference + final String query1 = "AGAGG"; + int result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query1.getBytes()); + Assert.assertEquals(result, -1); + + // match right boundary of reference + final String query2 = "GGAAC"; + int result2 = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query2.getBytes()); + Assert.assertEquals(result2, -1); + } + + @Test + public void testLastIndexOfAtMostOneMismatchRandom() { + final int numTests = 100; + final int referenceLength = 100; + final int queryLength = 10; + + byte[] reference = new byte[referenceLength]; + byte[] query = new byte[queryLength]; final Random rng = Utils.getRandomGenerator(); - for (int i = 0; i < num_tests; i++) { + for (int i = 0; i < numTests; i++) { randomByteString(rng, reference); randomByteString(rng, query); int index = -1; + // add one-off query to reference at a random location for 75% of the tests if (i % 4 > 0) { index = rng.nextInt(referenceLength - queryLength); - int mismatch = index + rng.nextInt(queryLength); - for (int j = 0; j < queryLength; j++) { - if (index + j == mismatch) - { - if ((char)(query[j] & 0xFF) == 'A') - { - reference[index + j] = (byte)'G'; - } - if ((char)(query[j] & 0xFF) == 'G') - { - reference[index + j] = (byte)'C'; - } - if ((char)(query[j] & 0xFF) == 'C') - { - reference[index + j] = (byte)'T'; - } - if ((char)(query[j] & 0xFF) == 'T') - { - reference[index + j] = (byte)'A'; - } - } - else { - reference[index + j] = query[j]; - } - } + System.arraycopy(query,0, reference, index, queryLength); + int mismatchPosition = rng.nextInt(queryLength); + reference[index + mismatchPosition] = BaseUtils.simpleComplement(query[mismatchPosition]); + } + + final int result = Utils.lastIndexOfAtMostOneMismatch(reference, query); + final int expected = index; + Assert.assertEquals(result, expected); + } + } + + @Test + public void testLastIndexOfAtMostOneMismatchRandomExactMatch() { + final int numTests = 100; + final int referenceLength = 100; + final int queryLength = 10; + + byte[] reference = new byte[referenceLength]; + byte[] query = new byte[queryLength]; + + final Random rng = Utils.getRandomGenerator(); + + for (int i = 0; i < numTests; i++) { + randomByteString(rng, reference); + randomByteString(rng, query); + + int index = -1; + + //add query to reference at a random lo cation for 75% of the tests + if (i % 4 > 0) { + index = rng.nextInt(referenceLength - queryLength); + System.arraycopy(query,0, reference, index, queryLength); } - final int result = Utils.lastIndexOfOneMismatch(reference, query); + final int result = Utils.lastIndexOfAtMostOneMismatch(reference, query); final int expected = index; Assert.assertEquals(result, expected); } From 8ff93a25372099c55899ebedfcc51c5807f01558 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 25 Jun 2019 16:23:04 -0400 Subject: [PATCH 04/23] added test for multiple oneMismatches --- .../hellbender/utils/UtilsUnitTest.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 416dd82086c..482b9947bfe 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -644,6 +644,18 @@ public void testLastIndexOfAtMostOneMismatchTwoMismatches() { Assert.assertEquals(result2, -1); } + @Test + public void testLastIndexOfAtMostOneMismatchOneMismatchTwice() { + final String reference = "AGGCCCCCTTTTGGGG"; + + //matches both boundaries of reference + final String query1 = "AGGG"; + int result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query1.getBytes()); + //will catch first match it encounters from the right + int expected = reference.length() - query1.length(); + Assert.assertEquals(result, expected); + } + @Test public void testLastIndexOfAtMostOneMismatchRandom() { final int numTests = 100; From 323072a3a151f4dbd7a5ae35060debe57aca81fb Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 25 Jun 2019 16:24:55 -0400 Subject: [PATCH 05/23] stylistic change with brackes --- .../java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 482b9947bfe..d4ca261e7d3 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -654,7 +654,7 @@ public void testLastIndexOfAtMostOneMismatchOneMismatchTwice() { //will catch first match it encounters from the right int expected = reference.length() - query1.length(); Assert.assertEquals(result, expected); - } + } @Test public void testLastIndexOfAtMostOneMismatchRandom() { From 5a7344a80410f4d0d5d318a87e11c5d50528bec9 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Wed, 26 Jun 2019 10:32:20 -0400 Subject: [PATCH 06/23] removed test code --- .../HaplotypeCallerEngine.java | 15 +---- .../smithwaterman/SmithWatermanAligner.java | 5 -- .../SmithWatermanIntelAligner.java | 17 ------ .../SmithWatermanJavaAligner.java | 35 +---------- .../hellbender/utils/UtilsUnitTest.java | 58 +++++++++---------- 5 files changed, 31 insertions(+), 99 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index e8c191d4afc..176c2e5848f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -97,8 +97,6 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator { private SmithWatermanAligner aligner; - private SmithWatermanAligner alignerHaplotypeToRef; - public static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; /** @@ -157,7 +155,6 @@ public HaplotypeCallerEngine( final HaplotypeCallerArgumentCollection hcArgs, bo this.referenceReader = Utils.nonNull(referenceReader); this.annotationEngine = Utils.nonNull(annotationEngine); this.aligner = SmithWatermanAligner.getAligner(hcArgs.smithWatermanImplementation); - this.alignerHaplotypeToRef = new SmithWatermanJavaAligner(); initialize(createBamOutIndex, createBamOutMD5); } @@ -543,7 +540,7 @@ public List callRegion(final AssemblyRegion region, final Featur } // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, alignerHaplotypeToRef, !hcArgs.doNotCorrectOverlappingBaseQualities); + final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance); @@ -704,17 +701,7 @@ private List referenceModelForNoVariation(final AssemblyRegion r public void shutdown() { likelihoodCalculationEngine.close(); - //print out alignments - System.out.println("TOTAL NUMBER OF ALIGNMENTS (reads to ref):" + aligner.getNumOfAlignments()); - System.out.println("NO SW:" + aligner.noSW()); - System.out.println("YES SW:" + aligner.yesSW()); - - System.out.println("TOTAL NUMBER OF ALIGNMENTS (haplotypes to ref):" + alignerHaplotypeToRef.getNumOfAlignments()); - System.out.println("NO SW:" + alignerHaplotypeToRef.noSW()); - System.out.println("YES SW:" + alignerHaplotypeToRef.yesSW()); - aligner.close(); - alignerHaplotypeToRef.close(); if ( haplotypeBAMWriter.isPresent() ) { haplotypeBAMWriter.get().close(); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java index 86ea53349c2..84f869523b1 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanAligner.java @@ -28,11 +28,6 @@ public interface SmithWatermanAligner extends Closeable { */ SmithWatermanAlignment align(final byte[] ref, final byte[] alt, SWParameters parameters, SWOverhangStrategy overhangStrategy); - //methods for printing out number of SW/non-SW alignments - int getNumOfAlignments(); - int noSW(); - int yesSW(); - /** * Implementations may optionally implement close in order to release any resources that they are holding. * diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java index 74129b78113..8a8f058e9a1 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanIntelAligner.java @@ -45,23 +45,6 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna return alignerWrapper.align(reference, alternate, parameters, overhangStrategy); } - //methods for printing out number of SW/non-SW alignments. not needed for now - public int getNumOfAlignments() - { - System.out.println("Intel SW being used"); - return 0; - } - public int noSW() - { - System.out.println("Intel SW being used"); - return 0; - } - public int yesSW() - { - System.out.println("Intel SW being used"); - return 0; - } - /** * Close the aligner */ diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 26fa50a9b1d..710d1353835 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -44,29 +44,11 @@ protected enum State { CLIP } - private int numOfAlignments = 0; - private int noSW = 0; - private int yesSW = 0; - - //methods for printing out number of SW/non-SW alignments - public int getNumOfAlignments() { - return numOfAlignments; - } - - public int noSW() { - return noSW; - } - - public int yesSW() { - return yesSW; - } - - /** * Create a new SW pairwise aligner, this has no state so instead of creating new instances, we create a singleton which is * accessible via {@link #getInstance} */ - public SmithWatermanJavaAligner(){} + private SmithWatermanJavaAligner(){} /** * Aligns the alternate sequence to the reference sequence @@ -78,8 +60,6 @@ public SmithWatermanJavaAligner(){} public SmithWatermanAlignment align(final byte[] reference, final byte[] alternate, final SWParameters parameters, final SWOverhangStrategy overhangStrategy) { long startTime = System.nanoTime(); - numOfAlignments++; - if ( reference == null || reference.length == 0 || alternate == null || alternate.length == 0 ) { throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation"); } @@ -91,39 +71,28 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { // Use a substring search to find an exact match of the alternate in the reference // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies - long startHeuristic = System.nanoTime(); matchIndex = Utils.lastIndexOf(reference, alternate); - totalHeuristicTime += System.nanoTime() - startHeuristic; } final SmithWatermanAlignment alignmentResult; //if exact match if (matchIndex != -1) { - - noSW++; - // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); } else { //look for one mismatch - long startOneMismatchHeuristic = System.nanoTime(); matchIndex = Utils.lastIndexOfAtMostOneMismatch(reference, alternate); - totalMismatchHeuristicTime += System.nanoTime() - startOneMismatchHeuristic; if (matchIndex != -1) { - noSW++; - // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); } else{ // run full Smith-Waterman - yesSW++; - final int n = reference.length+1; final int m = alternate.length+1; final int[][] sw = new int[n][m]; @@ -433,7 +402,5 @@ private static CigarElement makeElement(final State state, final int length) { @Override public void close() { logger.info(String.format("Total compute time in java Smith-Waterman : %.2f sec", totalComputeTime * 1e-9)); - logger.info(String.format("Total compute time in heuristic : %.2f sec", totalHeuristicTime * 1e-9)); - logger.info(String.format("Total compute time in oneMismatch heuristic : %.2f sec", totalMismatchHeuristicTime * 1e-9)); } } diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index d4ca261e7d3..453306dd65e 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -629,6 +629,35 @@ public void testLastIndexOfAtMostOneMismatchLastBoundaries() { Assert.assertEquals(result, 0); } + @Test + public void testLastIndexOfAtMostOneMismatchRandomExactMatch() { + final int numTests = 100; + final int referenceLength = 100; + final int queryLength = 10; + + byte[] reference = new byte[referenceLength]; + byte[] query = new byte[queryLength]; + + final Random rng = Utils.getRandomGenerator(); + + for (int i = 0; i < numTests; i++) { + randomByteString(rng, reference); + randomByteString(rng, query); + + int index = -1; + + //add query to reference at a random lo cation for 75% of the tests + if (i % 4 > 0) { + index = rng.nextInt(referenceLength - queryLength); + System.arraycopy(query,0, reference, index, queryLength); + } + + final int result = Utils.lastIndexOfAtMostOneMismatch(reference, query); + final int expected = index; + Assert.assertEquals(result, expected); + } + } + @Test public void testLastIndexOfAtMostOneMismatchTwoMismatches() { final String reference = "AAAACCCCTTTTGGGG"; @@ -687,35 +716,6 @@ public void testLastIndexOfAtMostOneMismatchRandom() { } } - @Test - public void testLastIndexOfAtMostOneMismatchRandomExactMatch() { - final int numTests = 100; - final int referenceLength = 100; - final int queryLength = 10; - - byte[] reference = new byte[referenceLength]; - byte[] query = new byte[queryLength]; - - final Random rng = Utils.getRandomGenerator(); - - for (int i = 0; i < numTests; i++) { - randomByteString(rng, reference); - randomByteString(rng, query); - - int index = -1; - - //add query to reference at a random lo cation for 75% of the tests - if (i % 4 > 0) { - index = rng.nextInt(referenceLength - queryLength); - System.arraycopy(query,0, reference, index, queryLength); - } - - final int result = Utils.lastIndexOfAtMostOneMismatch(reference, query); - final int expected = index; - Assert.assertEquals(result, expected); - } - } - private void randomByteString(Random rng, byte[] bytes) { for (int i = 0; i < bytes.length; i++) { bytes[i] = (byte)(rng.nextInt(94) + 32); From e1757143264fe4026e605b03a8d8977b367901e0 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Wed, 26 Jun 2019 14:36:35 -0400 Subject: [PATCH 07/23] include correct paramaters before heuristic execution --- .../utils/smithwaterman/SmithWatermanJavaAligner.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 710d1353835..7c6a6e05994 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -83,13 +83,16 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); } else { - //look for one mismatch - matchIndex = Utils.lastIndexOfAtMostOneMismatch(reference, alternate); + int matchIndex2 = -1; + if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { + //look for one mismatch + matchIndex2 = Utils.lastIndexOfAtMostOneMismatch(reference, alternate); + } - if (matchIndex != -1) { + if (matchIndex2 != -1) { // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); - alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex2); } else{ // run full Smith-Waterman From c9b20ec480325a3eca64d95e87f15023dd9af369 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 2 Jul 2019 15:11:53 -0400 Subject: [PATCH 08/23] edited for David Benjamin's final comments for stage 1 --- .../broadinstitute/hellbender/utils/Utils.java | 8 ++++++++ .../smithwaterman/SWNativeAlignerWrapper.java | 18 ------------------ .../SmithWatermanJavaAligner.java | 16 ++++++++-------- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index 8e799acbbe7..efdd91b8b1f 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1057,6 +1057,14 @@ public static int lastIndexOf(final byte[] reference, final byte[] query) { return -1; } + /** + * Find the last one-off occurrence of the query sequence in the reference sequence + * + * Returns the index of the last occurrence or -1 if the query sequence is not found + * + * @param reference the reference sequence + * @param query the query sequence + */ public static int lastIndexOfAtMostOneMismatch(final byte[] reference, final byte[] query) { int queryLength = query.length; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java index 4bc53a983da..612794bb497 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java @@ -25,24 +25,6 @@ public SWNativeAlignerWrapper(final SWAlignerNativeBinding aligner) { this.aligner = aligner; } - - //methods for printing out number of SW/non-SW alignments. not needed - public int getNumOfAlignments() - { - System.out.println("SW native wrapper being used"); - return 0; - } - public int noSW() - { - System.out.println("native SW being used"); - return 0; - } - public int yesSW() - { - System.out.println("native SW being used"); - return 0; - } - @Override public SmithWatermanAlignment align(final byte[] reference, final byte[] alternate, final SWParameters parameters, final SWOverhangStrategy overhangStrategy){ long startTime = System.nanoTime(); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 7c6a6e05994..e4fbb671683 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -67,32 +67,32 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna Utils.nonNull(overhangStrategy); // avoid running full Smith-Waterman if there is an exact match of alternate in reference - int matchIndex = -1; + int exactMatchIndex = -1; if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { // Use a substring search to find an exact match of the alternate in the reference // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies - matchIndex = Utils.lastIndexOf(reference, alternate); + exactMatchIndex = Utils.lastIndexOf(reference, alternate); } final SmithWatermanAlignment alignmentResult; //if exact match - if (matchIndex != -1) { + if (exactMatchIndex != -1) { // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); - alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), exactMatchIndex); } else { - int matchIndex2 = -1; + int singleMismatchIndex = -1; if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { //look for one mismatch - matchIndex2 = Utils.lastIndexOfAtMostOneMismatch(reference, alternate); + singleMismatchIndex = Utils.lastIndexOfAtMostOneMismatch(reference, alternate); } - if (matchIndex2 != -1) { + if (singleMismatchIndex != -1) { // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); - alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), matchIndex2); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), singleMismatchIndex); } else{ // run full Smith-Waterman From 21532cbf4f9a076f409ba35af206ac84892dc149 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 2 Jul 2019 15:43:06 -0400 Subject: [PATCH 09/23] added atMostOneIndel heuristic --- .../hellbender/utils/Utils.java | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index efdd91b8b1f..9367dc74a4d 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1084,6 +1084,85 @@ public static int lastIndexOfAtMostOneMismatch(final byte[] reference, final byt return -1; } + /** + * Finds the location of one indel in the query sequence in relation to the reference sequence + * + * Returns the index and size of the indel or -1 and 0 if an indel less than 13 bases is not found + * + * @param reference the reference sequence + * @param query the query sequence + * @param allowedLengthOfIndel the maximum size of the indel before 2 mismatches becomes less penalized + */ + public static int[] atMostOneIndel(final byte[] reference, final byte[] query, int allowedLengthOfIndel){ + int referenceLength = reference.length; + int queryLength = query.length; + + int indelStart = -1; + int[] indelStartAndSize = {-1,0}; + + if(queryLength < referenceLength){ + int lengthOfIndel = referenceLength - queryLength; + if(lengthOfIndel > allowedLengthOfIndel){ + return indelStartAndSize; + } + + //traverse until you hit mismatch/start of indel + for(int i = 0; i < queryLength; i++){ + if(query[i] != reference[i]){ + indelStart = i; + + //traverse backwards until you hit end of indel + for(i = queryLength - 1; i >= indelStart; i--){ + if(query[i] != reference[i + lengthOfIndel]){ + return indelStartAndSize; + } + } + + //traversed entire query, one indel and no mismatches found + indelStartAndSize[0] = indelStart; + indelStartAndSize[1] = lengthOfIndel; + return indelStartAndSize; + } + } + + //deletion located at the end of the query + indelStartAndSize[0] = queryLength; + indelStartAndSize[1] = referenceLength - queryLength; + return indelStartAndSize; + } + if(referenceLength < queryLength){ + int lengthOfIndel = queryLength - referenceLength; + if(lengthOfIndel > allowedLengthOfIndel){ + return indelStartAndSize; + } + + for(int i = 0; i < referenceLength; i++){ + if(reference[i] != query[i]){ + indelStart = i; + + for(i = referenceLength - 1; i >= indelStart; i--){ + if(reference[i] != query[i + lengthOfIndel]){ + return indelStartAndSize; + } + } + + //traversed entire query, one indel and no mismatches found + indelStartAndSize[0] = indelStart; + indelStartAndSize[1] = lengthOfIndel; + return indelStartAndSize; + } + } + + //insertion located at end of query + indelStartAndSize[0] = referenceLength; + indelStartAndSize[1] = queryLength - referenceLength; + return indelStartAndSize; + } + + //lengths are equal, no one indel + return indelStartAndSize; + } + /** * Simple wrapper for sticking elements of a int[] array into a List * @param ar - the array whose elements should be listified From 4735712f9086fc9b8d5c12186d8fea1533e746eb Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 2 Jul 2019 16:04:12 -0400 Subject: [PATCH 10/23] added test for atMostOneIndel --- .../hellbender/utils/Utils.java | 2 +- .../hellbender/utils/UtilsUnitTest.java | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index 9367dc74a4d..8e98e5752aa 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1087,7 +1087,7 @@ public static int lastIndexOfAtMostOneMismatch(final byte[] reference, final byt /** * Finds the location of one indel in the query sequence in relation to the reference sequence * - * Returns the index and size of the indel or -1 and 0 if an indel less than 13 bases is not found + * Returns the index and size of the indel (as a 2 element int array) or -1 and 0 if an indel less than 4 bases is not found * * @param reference the reference sequence * @param query the query sequence diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 453306dd65e..257650ee2e0 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -751,6 +751,41 @@ public void testLastIndexOfRandom() { } } + @Test + public void atMostOneIndel(){ + //deletion in middle of query + String reference = "AGGATTTGGGATTAC"; + String query = "AGGAGGGATTAC"; + int result = (Utils.atMostOneIndel(reference.getBytes(), query.getBytes(), 3))[0]; + Assert.assertEquals(result, 4); + + //deletion in end of query + String reference2 = "AGGATTTGGGATTAC"; + String query2 = "AGGATTTGGGAT"; + int result2 = (Utils.atMostOneIndel(reference2.getBytes(), query2.getBytes(), 3))[0]; + int expected2 = query2.length(); + Assert.assertEquals(result2, expected2); + + //insertion in middle of query + String reference3 = "ATTTAGTGGGATTA"; + String query3 = "ATTTAGTAGTGGGATTA"; + int result3 = (Utils.atMostOneIndel(reference3.getBytes(), query3.getBytes(), 3))[0]; + Assert.assertEquals(result3, 7); + + //insertion in end of query + String reference4 = "AGTAGTGTGCGTCA"; + String query4 = "AGTAGTGTGCGTCAACT"; + int result4 = (Utils.atMostOneIndel(reference4.getBytes(), query4.getBytes(), 3))[0]; + int expected4 = reference4.length(); + Assert.assertEquals(result4, expected4); + + //too long of an indel + String reference5 = "AGTAGTGTGCGTCA"; + String query5 = "AGTAGTGTGCGTCAACTA"; + int result5 = (Utils.atMostOneIndel(reference5.getBytes(), query5.getBytes(), 3))[0]; + Assert.assertEquals(result5, -1); + } + @Test(expectedExceptions = IllegalArgumentException.class) public void testListFromPrimitivesNull() throws Exception { Utils.listFromPrimitives(null); From 16753db6fd905defe705634089d23a641cc62b1c Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Wed, 3 Jul 2019 10:12:25 -0400 Subject: [PATCH 11/23] added oneIndel heuristic to alignment and haplotypeToRef aligner object to HaplotypeCallerEngine --- .../HaplotypeCallerEngine.java | 7 +- .../SmithWatermanJavaAligner.java | 68 ++++++++++++++++--- 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 176c2e5848f..a39212b19d6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -97,6 +97,8 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator { private SmithWatermanAligner aligner; + private SmithWatermanAligner alignerHaplotypeToRef; + public static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; /** @@ -155,6 +157,8 @@ public HaplotypeCallerEngine( final HaplotypeCallerArgumentCollection hcArgs, bo this.referenceReader = Utils.nonNull(referenceReader); this.annotationEngine = Utils.nonNull(annotationEngine); this.aligner = SmithWatermanAligner.getAligner(hcArgs.smithWatermanImplementation); + boolean haplotypeToRef = true; + this.alignerHaplotypeToRef = new SmithWatermanJavaAligner(haplotypeToRef); initialize(createBamOutIndex, createBamOutMD5); } @@ -540,7 +544,7 @@ public List callRegion(final AssemblyRegion region, final Featur } // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities); + final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, alignerHaplotypeToRef, !hcArgs.doNotCorrectOverlappingBaseQualities); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance); @@ -702,6 +706,7 @@ public void shutdown() { likelihoodCalculationEngine.close(); aligner.close(); + alignerHaplotypeToRef.close(); if ( haplotypeBAMWriter.isPresent() ) { haplotypeBAMWriter.get().close(); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index e4fbb671683..4bf02dc8ef1 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -24,8 +24,7 @@ public final class SmithWatermanJavaAligner implements SmithWatermanAligner { private static final SmithWatermanJavaAligner ALIGNER = new SmithWatermanJavaAligner(); private long totalComputeTime = 0; - private long totalHeuristicTime = 0; - private long totalMismatchHeuristicTime = 0; + private boolean haplotypeToref = false; /** * return the stateless singleton instance of SmithWatermanJavaAligner @@ -50,6 +49,10 @@ protected enum State { */ private SmithWatermanJavaAligner(){} + public SmithWatermanJavaAligner(boolean haplotypeToref){ + this.haplotypeToref = haplotypeToref; + } + /** * Aligns the alternate sequence to the reference sequence * @@ -95,14 +98,59 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), singleMismatchIndex); } else{ - // run full Smith-Waterman - final int n = reference.length+1; - final int m = alternate.length+1; - final int[][] sw = new int[n][m]; - final int[][] btrack=new int[n][m]; - - calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); - alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) + if(haplotypeToref){ + //check for one indel + int oneIndelIndex = -1; + int[] indelStartAndSize = null; + + if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { + //calculate allowed length for indel to be less of a penalty than 2 mismatches + int mismatchScore = parameters.getMismatchPenalty(); + int indelExtendScore = parameters.getGapExtendPenalty(); + int indelOpenScore = parameters.getGapOpenPenalty(); + int allowedLengthOfIndel = ((2 * mismatchScore) - indelOpenScore)/indelExtendScore; + + indelStartAndSize = Utils.atMostOneIndel(reference, alternate, allowedLengthOfIndel); + oneIndelIndex = indelStartAndSize[0]; + + } + + if (oneIndelIndex != -1){ + int indelLength = indelStartAndSize[1]; + State state = null; + if(alternate.length < reference.length){ + state = State.DELETION; + } + if(alternate.length > reference.length){ + state = State.INSERTION; + } + + final List lce = Arrays.asList(makeElement(State.MATCH, oneIndelIndex), + makeElement(state, indelLength), makeElement(State.MATCH, alternate.length - oneIndelIndex)); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); + } + else{ + //run full Smith-Waterman + + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) + } + } + else{ + // run full Smith-Waterman + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) + } } } From 6713458e85053a840bbb94d3e2f4fd63d6b51ff4 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Wed, 3 Jul 2019 10:15:26 -0400 Subject: [PATCH 12/23] completed indel alignment implementation --- .../utils/smithwaterman/SmithWatermanJavaAligner.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 4bf02dc8ef1..769886ffdaa 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -109,10 +109,9 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna int indelExtendScore = parameters.getGapExtendPenalty(); int indelOpenScore = parameters.getGapOpenPenalty(); int allowedLengthOfIndel = ((2 * mismatchScore) - indelOpenScore)/indelExtendScore; - + //returns int array of 2 elements - location, size indelStartAndSize = Utils.atMostOneIndel(reference, alternate, allowedLengthOfIndel); oneIndelIndex = indelStartAndSize[0]; - } if (oneIndelIndex != -1){ @@ -124,7 +123,7 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if(alternate.length > reference.length){ state = State.INSERTION; } - + //produce cigar string of 3 elements final List lce = Arrays.asList(makeElement(State.MATCH, oneIndelIndex), makeElement(state, indelLength), makeElement(State.MATCH, alternate.length - oneIndelIndex)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); From 6f2fd007f6671d8a94c50c8c8c962eee576a821f Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Wed, 3 Jul 2019 14:00:32 -0400 Subject: [PATCH 13/23] for indel detection, traverse backwards first and then forward --- .../hellbender/utils/Utils.java | 34 ++++++++++--------- .../SmithWatermanJavaAligner.java | 29 ++++++++++++++-- .../hellbender/utils/UtilsUnitTest.java | 2 +- 3 files changed, 46 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index 8e98e5752aa..55ad3d14946 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1097,7 +1097,7 @@ public static int[] atMostOneIndel(final byte[] reference, final byte[] query, i int referenceLength = reference.length; int queryLength = query.length; - int indelStart = -1; + int indelEnd = -1; int[] indelStartAndSize = {-1,0}; if(queryLength < referenceLength){ @@ -1106,20 +1106,20 @@ public static int[] atMostOneIndel(final byte[] reference, final byte[] query, i return indelStartAndSize; } - //traverse until you hit mismatch/start of indel - for(int i = 0; i < queryLength; i++){ - if(query[i] != reference[i]){ - indelStart = i; + //traverse backwards until you hit mismatch/end of indel + for(int i = queryLength - 1; i >= 0; i--){ + if(query[i] != reference[i + lengthOfIndel]){ + indelEnd = i + 1; - //traverse backwards until you hit end of indel - for(i = queryLength - 1; i >= indelStart; i--){ - if(query[i] != reference[i + lengthOfIndel]){ + //traverse forwards until you hit start of indel + for(i = 0; i < indelEnd; i++){ + if(query[i] != reference[i]){ return indelStartAndSize; } } //traversed entire query, one indel and no mismatches found - indelStartAndSize[0] = indelStart; + indelStartAndSize[0] = indelEnd; indelStartAndSize[1] = lengthOfIndel; return indelStartAndSize; } @@ -1136,24 +1136,26 @@ public static int[] atMostOneIndel(final byte[] reference, final byte[] query, i return indelStartAndSize; } - for(int i = 0; i < referenceLength; i++){ - if(reference[i] != query[i]){ - indelStart = i; + //traverse backwards until you hit mismatch/end of indel + for(int i = referenceLength - 1; i >= 0; i--){ + if(reference[i] != query[i + lengthOfIndel]){ + indelEnd = i + 1; - for(i = referenceLength - 1; i >= indelStart; i--){ - if(reference[i] != query[i + lengthOfIndel]){ + //traverse forwards until you hit start of indel + for(i = 0; i < indelEnd; i++){ + if(reference[i] != query[i]){ return indelStartAndSize; } } //traversed entire query, one indel and no mismatches found - indelStartAndSize[0] = indelStart; + indelStartAndSize[0] = indelEnd; indelStartAndSize[1] = lengthOfIndel; return indelStartAndSize; } } - //insertion located at end of query + //deletion located at the end of the query indelStartAndSize[0] = referenceLength; indelStartAndSize[1] = queryLength - referenceLength; return indelStartAndSize; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 769886ffdaa..b89dc0d5d57 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -117,16 +117,41 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (oneIndelIndex != -1){ int indelLength = indelStartAndSize[1]; State state = null; + int cigarThirdElementLength = 0; + if(alternate.length < reference.length){ state = State.DELETION; + cigarThirdElementLength = alternate.length - oneIndelIndex; } if(alternate.length > reference.length){ state = State.INSERTION; + cigarThirdElementLength = alternate.length - indelLength - oneIndelIndex; } - //produce cigar string of 3 elements + final List lce = Arrays.asList(makeElement(State.MATCH, oneIndelIndex), - makeElement(state, indelLength), makeElement(State.MATCH, alternate.length - oneIndelIndex)); + makeElement(state, indelLength), makeElement(State.MATCH, cigarThirdElementLength)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); + + //test to see if SW output and indel heurisitic output are the same + Cigar cigar1 = alignmentResult.getCigar(); + + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + SmithWatermanAlignment alignmentResult2 = calculateCigar(sw, btrack, overhangStrategy); + Cigar cigar2 = alignmentResult2.getCigar(); + + if(!cigar1.equals(cigar2)){ + System.out.println("CIGARS NOT EQUAL"); + System.out.println(new String(reference)); + System.out.println(""); + System.out.println(new String(alternate)); + System.out.println("heuristic: " + cigar1.toString()); + System.out.println("SW : " + cigar2.toString()); + } } else{ //run full Smith-Waterman diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 257650ee2e0..2af0dd3a1bd 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -770,7 +770,7 @@ public void atMostOneIndel(){ String reference3 = "ATTTAGTGGGATTA"; String query3 = "ATTTAGTAGTGGGATTA"; int result3 = (Utils.atMostOneIndel(reference3.getBytes(), query3.getBytes(), 3))[0]; - Assert.assertEquals(result3, 7); + Assert.assertEquals(result3, 3); //insertion in end of query String reference4 = "AGTAGTGTGCGTCA"; From 07e6968d317d88355d4b35c8c1a3686c0a831ebe Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Wed, 3 Jul 2019 14:01:29 -0400 Subject: [PATCH 14/23] removed test code --- .../SmithWatermanJavaAligner.java | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index b89dc0d5d57..91e6855a75d 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -131,27 +131,6 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna final List lce = Arrays.asList(makeElement(State.MATCH, oneIndelIndex), makeElement(state, indelLength), makeElement(State.MATCH, cigarThirdElementLength)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); - - //test to see if SW output and indel heurisitic output are the same - Cigar cigar1 = alignmentResult.getCigar(); - - final int n = reference.length+1; - final int m = alternate.length+1; - final int[][] sw = new int[n][m]; - final int[][] btrack=new int[n][m]; - - calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); - SmithWatermanAlignment alignmentResult2 = calculateCigar(sw, btrack, overhangStrategy); - Cigar cigar2 = alignmentResult2.getCigar(); - - if(!cigar1.equals(cigar2)){ - System.out.println("CIGARS NOT EQUAL"); - System.out.println(new String(reference)); - System.out.println(""); - System.out.println(new String(alternate)); - System.out.println("heuristic: " + cigar1.toString()); - System.out.println("SW : " + cigar2.toString()); - } } else{ //run full Smith-Waterman From 2e5f3e94574d34d7459172e6d12ec7fc7beb3cb6 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Mon, 8 Jul 2019 10:52:25 -0400 Subject: [PATCH 15/23] consolidated all mismatch heuristics into one method using allowedMismatches parameter --- .../hellbender/utils/Utils.java | 30 ++----------------- .../smithwaterman/SWNativeAlignerWrapper.java | 2 +- .../SmithWatermanJavaAligner.java | 4 +-- .../hellbender/utils/UtilsUnitTest.java | 22 +++++++------- 4 files changed, 17 insertions(+), 41 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index 55ad3d14946..6b3c54dddc2 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1033,30 +1033,6 @@ public static boolean xor(final boolean x, final boolean y) { return x != y; } - /** - * Find the last occurrence of the query sequence in the reference sequence - * - * Returns the index of the last occurrence or -1 if the query sequence is not found - * - * @param reference the reference sequence - * @param query the query sequence - */ - public static int lastIndexOf(final byte[] reference, final byte[] query) { - int queryLength = query.length; - - // start search from the last possible matching position and search to the left - for (int r = reference.length - queryLength; r >= 0; r--) { - int q = 0; - while (q < queryLength && reference[r+q] == query[q]) { - q++; - } - if (q == queryLength) { - return r; - } - } - return -1; - } - /** * Find the last one-off occurrence of the query sequence in the reference sequence * @@ -1065,19 +1041,19 @@ public static int lastIndexOf(final byte[] reference, final byte[] query) { * @param reference the reference sequence * @param query the query sequence */ - public static int lastIndexOfAtMostOneMismatch(final byte[] reference, final byte[] query) { + public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final byte[] query, final int allowedMismatches) { int queryLength = query.length; // start search from the last possible matching position and search to the left for (int refIndex = reference.length - queryLength; refIndex >= 0; refIndex--) { int mismatchCount = 0; - for (int queryIndex = 0; queryIndex < queryLength && mismatchCount < 2; queryIndex++) { + for (int queryIndex = 0; queryIndex < queryLength && mismatchCount <= allowedMismatches; queryIndex++) { if (reference[refIndex+queryIndex] != query[queryIndex]) { mismatchCount++; } } - if (mismatchCount < 2) { + if (mismatchCount <= allowedMismatches) { return refIndex; } } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java index 612794bb497..78d50f3be25 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SWNativeAlignerWrapper.java @@ -37,7 +37,7 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { // Use a substring search to find an exact match of the alternate in the reference // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies - matchIndex = Utils.lastIndexOf(reference, alternate); + matchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 0); } final SmithWatermanAlignment alignmentResult; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 91e6855a75d..e674ab28f21 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -74,7 +74,7 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { // Use a substring search to find an exact match of the alternate in the reference // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies - exactMatchIndex = Utils.lastIndexOf(reference, alternate); + exactMatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 0); } final SmithWatermanAlignment alignmentResult; @@ -89,7 +89,7 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna int singleMismatchIndex = -1; if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { //look for one mismatch - singleMismatchIndex = Utils.lastIndexOfAtMostOneMismatch(reference, alternate); + singleMismatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 1); } if (singleMismatchIndex != -1) { diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 2af0dd3a1bd..184469f7a67 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -591,7 +591,7 @@ public void testLastIndexOfQueryTooLong() { final String reference = "AAAA"; final String query = "AAAAAAA"; - final int result = Utils.lastIndexOf(reference.getBytes(), query.getBytes()); + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query.getBytes(), 0); final int expected = reference.lastIndexOf(query); Assert.assertEquals(result, expected); } @@ -602,13 +602,13 @@ public void testLastIndexOfLastBoundaries() { // match right boundary of reference String query = "TGGGG"; - int result = Utils.lastIndexOf(reference.getBytes(), query.getBytes()); + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query.getBytes(), 0); int expected = reference.lastIndexOf(query); Assert.assertEquals(result, expected); // match left boundary of reference query = "AAAAC"; - result = Utils.lastIndexOf(reference.getBytes(), query.getBytes()); + result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query.getBytes(), 0); expected = reference.lastIndexOf(query); Assert.assertEquals(result, expected); } @@ -619,13 +619,13 @@ public void testLastIndexOfAtMostOneMismatchLastBoundaries() { // match right boundary of reference final String query1 = "TGAGG"; - int result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query1.getBytes()); + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query1.getBytes(), 1); int expected = reference.length() - query1.length(); Assert.assertEquals(result, expected); // match left boundary of reference final String query2 = "AAGAC"; - result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query2.getBytes()); + result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query2.getBytes(), 1); Assert.assertEquals(result, 0); } @@ -652,7 +652,7 @@ public void testLastIndexOfAtMostOneMismatchRandomExactMatch() { System.arraycopy(query,0, reference, index, queryLength); } - final int result = Utils.lastIndexOfAtMostOneMismatch(reference, query); + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference, query, 1); final int expected = index; Assert.assertEquals(result, expected); } @@ -664,12 +664,12 @@ public void testLastIndexOfAtMostOneMismatchTwoMismatches() { // match right boundary of reference final String query1 = "AGAGG"; - int result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query1.getBytes()); + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query1.getBytes(), 1); Assert.assertEquals(result, -1); // match right boundary of reference final String query2 = "GGAAC"; - int result2 = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query2.getBytes()); + int result2 = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query2.getBytes(), 1); Assert.assertEquals(result2, -1); } @@ -679,7 +679,7 @@ public void testLastIndexOfAtMostOneMismatchOneMismatchTwice() { //matches both boundaries of reference final String query1 = "AGGG"; - int result = Utils.lastIndexOfAtMostOneMismatch(reference.getBytes(), query1.getBytes()); + int result = Utils.lastIndexOfAtMostTwoMismatches(reference.getBytes(), query1.getBytes(), 1); //will catch first match it encounters from the right int expected = reference.length() - query1.length(); Assert.assertEquals(result, expected); @@ -710,7 +710,7 @@ public void testLastIndexOfAtMostOneMismatchRandom() { reference[index + mismatchPosition] = BaseUtils.simpleComplement(query[mismatchPosition]); } - final int result = Utils.lastIndexOfAtMostOneMismatch(reference, query); + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference, query, 1); final int expected = index; Assert.assertEquals(result, expected); } @@ -745,7 +745,7 @@ public void testLastIndexOfRandom() { } } - final int result = Utils.lastIndexOf(reference, query); + final int result = Utils.lastIndexOfAtMostTwoMismatches(reference, query, 0); final int expected = new String(reference).lastIndexOf(new String(query)); Assert.assertEquals(result, expected); } From 3d6cf81bf1a83a8efae52f837f913c5ee771dee4 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Mon, 8 Jul 2019 14:17:06 -0400 Subject: [PATCH 16/23] extracted private indel method to avoid code duplication --- .../hellbender/utils/Utils.java | 104 +++++++----------- .../SmithWatermanJavaAligner.java | 8 +- .../hellbender/utils/UtilsUnitTest.java | 16 ++- 3 files changed, 55 insertions(+), 73 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index 6b3c54dddc2..7b116451f26 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -8,6 +8,7 @@ import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.math3.random.RandomDataGenerator; import org.apache.commons.math3.random.Well19937c; import org.apache.logging.log4j.LogManager; @@ -1034,7 +1035,7 @@ public static boolean xor(final boolean x, final boolean y) { } /** - * Find the last one-off occurrence of the query sequence in the reference sequence + * Find the last occurrence, whether it be an exact match, off-by-one, or off-by-two, of the query sequence in the reference sequence * * Returns the index of the last occurrence or -1 if the query sequence is not found * @@ -1062,83 +1063,56 @@ public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final b /** * Finds the location of one indel in the query sequence in relation to the reference sequence + * Global Alignment * * Returns the index and size of the indel (as a 2 element int array) or -1 and 0 if an indel less than 4 bases is not found * * @param reference the reference sequence * @param query the query sequence - * @param allowedLengthOfIndel the maximum size of the indel before 2 mismatches becomes less penalized + * @param maxIndelLength the maximum length indel we look for */ - public static int[] atMostOneIndel(final byte[] reference, final byte[] query, int allowedLengthOfIndel){ - int referenceLength = reference.length; - int queryLength = query.length; - - int indelEnd = -1; - int[] indelStartAndSize = {-1,0}; + public static ImmutablePair atMostOneIndel(final byte[] reference, final byte[] query, int maxIndelLength){ + int lengthOfIndel = Math.abs(reference.length - query.length); - if(queryLength < referenceLength){ - int lengthOfIndel = referenceLength - queryLength; - if(lengthOfIndel > allowedLengthOfIndel){ - return indelStartAndSize; - } - - //traverse backwards until you hit mismatch/end of indel - for(int i = queryLength - 1; i >= 0; i--){ - if(query[i] != reference[i + lengthOfIndel]){ - indelEnd = i + 1; + if(lengthOfIndel > maxIndelLength){ + return new ImmutablePair(-1,0); + } + //deletion + if(query.length < reference.length){ + return findIndel(query, reference, lengthOfIndel); + } + //insertion + else if(query.length > reference.length){ + return findIndel(reference, query, lengthOfIndel); + } + //lengths are equal, no one indel + else{ + return new ImmutablePair(-1,0); + } + } - //traverse forwards until you hit start of indel - for(i = 0; i < indelEnd; i++){ - if(query[i] != reference[i]){ - return indelStartAndSize; - } - } + private static ImmutablePair findIndel(final byte[] shorter, final byte[] longer, int lengthOfIndel){ - //traversed entire query, one indel and no mismatches found - indelStartAndSize[0] = indelEnd; - indelStartAndSize[1] = lengthOfIndel; - return indelStartAndSize; - } + int indelEnd = -1; + //traverse backwards until you hit mismatch/end of indel + for(int i = shorter.length - 1; i >= 0; i--){ + if(shorter[i] != longer[i + lengthOfIndel]){ + indelEnd = i + 1; + break; } - - //deletion located at the end of the query - indelStartAndSize[0] = queryLength; - indelStartAndSize[1] = referenceLength - queryLength; - return indelStartAndSize; } - if(referenceLength < queryLength){ - int lengthOfIndel = queryLength - referenceLength; - if(lengthOfIndel > allowedLengthOfIndel){ - return indelStartAndSize; - } - - //traverse backwards until you hit mismatch/end of indel - for(int i = referenceLength - 1; i >= 0; i--){ - if(reference[i] != query[i + lengthOfIndel]){ - indelEnd = i + 1; - - //traverse forwards until you hit start of indel - for(i = 0; i < indelEnd; i++){ - if(reference[i] != query[i]){ - return indelStartAndSize; - } - } - - //traversed entire query, one indel and no mismatches found - indelStartAndSize[0] = indelEnd; - indelStartAndSize[1] = lengthOfIndel; - return indelStartAndSize; - } + if(indelEnd == -1){ + //deletion located at beginning of the query + return new ImmutablePair(0, lengthOfIndel); + } + //traverse forwards until you hit start of indel + for(int i = 0; i < indelEnd; i++){ + if(shorter[i] != longer[i]){ + return new ImmutablePair(-1,0); } - - //deletion located at the end of the query - indelStartAndSize[0] = referenceLength; - indelStartAndSize[1] = queryLength - referenceLength; - return indelStartAndSize; } - - //lengths are equal, no one indel - return indelStartAndSize; + //traversed entire query, one indel and no mismatches found + return new ImmutablePair(indelEnd, lengthOfIndel); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index e674ab28f21..43f17774534 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -3,6 +3,8 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; +import jdk.nashorn.internal.ir.annotations.Immutable; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters; import org.broadinstitute.hellbender.utils.Utils; @@ -101,7 +103,7 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if(haplotypeToref){ //check for one indel int oneIndelIndex = -1; - int[] indelStartAndSize = null; + ImmutablePair indelStartAndSize = null; if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { //calculate allowed length for indel to be less of a penalty than 2 mismatches @@ -111,11 +113,11 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna int allowedLengthOfIndel = ((2 * mismatchScore) - indelOpenScore)/indelExtendScore; //returns int array of 2 elements - location, size indelStartAndSize = Utils.atMostOneIndel(reference, alternate, allowedLengthOfIndel); - oneIndelIndex = indelStartAndSize[0]; + oneIndelIndex = indelStartAndSize.getLeft(); } if (oneIndelIndex != -1){ - int indelLength = indelStartAndSize[1]; + int indelLength = indelStartAndSize.getRight(); State state = null; int cigarThirdElementLength = 0; diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 184469f7a67..570dafb7a69 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -756,34 +756,40 @@ public void atMostOneIndel(){ //deletion in middle of query String reference = "AGGATTTGGGATTAC"; String query = "AGGAGGGATTAC"; - int result = (Utils.atMostOneIndel(reference.getBytes(), query.getBytes(), 3))[0]; + int result = (Utils.atMostOneIndel(reference.getBytes(), query.getBytes(), 3)).getLeft(); Assert.assertEquals(result, 4); //deletion in end of query String reference2 = "AGGATTTGGGATTAC"; String query2 = "AGGATTTGGGAT"; - int result2 = (Utils.atMostOneIndel(reference2.getBytes(), query2.getBytes(), 3))[0]; + int result2 = (Utils.atMostOneIndel(reference2.getBytes(), query2.getBytes(), 3)).getLeft(); int expected2 = query2.length(); Assert.assertEquals(result2, expected2); //insertion in middle of query String reference3 = "ATTTAGTGGGATTA"; String query3 = "ATTTAGTAGTGGGATTA"; - int result3 = (Utils.atMostOneIndel(reference3.getBytes(), query3.getBytes(), 3))[0]; + int result3 = (Utils.atMostOneIndel(reference3.getBytes(), query3.getBytes(), 3)).getLeft(); Assert.assertEquals(result3, 3); //insertion in end of query String reference4 = "AGTAGTGTGCGTCA"; String query4 = "AGTAGTGTGCGTCAACT"; - int result4 = (Utils.atMostOneIndel(reference4.getBytes(), query4.getBytes(), 3))[0]; + int result4 = (Utils.atMostOneIndel(reference4.getBytes(), query4.getBytes(), 3)).getLeft(); int expected4 = reference4.length(); Assert.assertEquals(result4, expected4); //too long of an indel String reference5 = "AGTAGTGTGCGTCA"; String query5 = "AGTAGTGTGCGTCAACTA"; - int result5 = (Utils.atMostOneIndel(reference5.getBytes(), query5.getBytes(), 3))[0]; + int result5 = (Utils.atMostOneIndel(reference5.getBytes(), query5.getBytes(), 3)).getLeft(); Assert.assertEquals(result5, -1); + + //deletion in beginning of query + String reference6 = "AGTACCGTTTGAC"; + String query6 = "TACCGTTTGAC"; + int result6 = (Utils.atMostOneIndel(reference6.getBytes(), query6.getBytes(), 3)).getLeft(); + Assert.assertEquals(result6, 0); } @Test(expectedExceptions = IllegalArgumentException.class) From a7e5ef45e2ade6f282a39d6e25cbc3cab8388824 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Mon, 8 Jul 2019 14:53:25 -0400 Subject: [PATCH 17/23] edited for david Benjamin's comments on indel stage --- .../hellbender/utils/Utils.java | 1 - .../SmithWatermanJavaAligner.java | 86 +++++++++---------- 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index 7b116451f26..37aa57869f5 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -1092,7 +1092,6 @@ else if(query.length > reference.length){ } private static ImmutablePair findIndel(final byte[] shorter, final byte[] longer, int lengthOfIndel){ - int indelEnd = -1; //traverse backwards until you hit mismatch/end of indel for(int i = shorter.length - 1; i >= 0; i--){ diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 43f17774534..10b926febcb 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -90,10 +90,9 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna else { int singleMismatchIndex = -1; if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { - //look for one mismatch singleMismatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 1); } - + //if off-by-one if (singleMismatchIndex != -1) { // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); @@ -101,69 +100,66 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna } else{ if(haplotypeToref){ - //check for one indel int oneIndelIndex = -1; ImmutablePair indelStartAndSize = null; if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { //calculate allowed length for indel to be less of a penalty than 2 mismatches - int mismatchScore = parameters.getMismatchPenalty(); - int indelExtendScore = parameters.getGapExtendPenalty(); - int indelOpenScore = parameters.getGapOpenPenalty(); - int allowedLengthOfIndel = ((2 * mismatchScore) - indelOpenScore)/indelExtendScore; - //returns int array of 2 elements - location, size + int allowedLengthOfIndel = calculateAllowedLengthOfIndel(parameters); indelStartAndSize = Utils.atMostOneIndel(reference, alternate, allowedLengthOfIndel); oneIndelIndex = indelStartAndSize.getLeft(); } - + //if one indel if (oneIndelIndex != -1){ int indelLength = indelStartAndSize.getRight(); - State state = null; - int cigarThirdElementLength = 0; - - if(alternate.length < reference.length){ - state = State.DELETION; - cigarThirdElementLength = alternate.length - oneIndelIndex; - } - if(alternate.length > reference.length){ - state = State.INSERTION; - cigarThirdElementLength = alternate.length - indelLength - oneIndelIndex; - } - - final List lce = Arrays.asList(makeElement(State.MATCH, oneIndelIndex), - makeElement(state, indelLength), makeElement(State.MATCH, cigarThirdElementLength)); - alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); - } - else{ - //run full Smith-Waterman - - final int n = reference.length+1; - final int m = alternate.length+1; - final int[][] sw = new int[n][m]; - final int[][] btrack=new int[n][m]; + alignmentResult = calculateOneIndelCigar(indelLength, reference, alternate, oneIndelIndex); - calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); - alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; } } - else{ - // run full Smith-Waterman - final int n = reference.length+1; - final int m = alternate.length+1; - final int[][] sw = new int[n][m]; - final int[][] btrack=new int[n][m]; - - calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); - alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) - } + // run full Smith-Waterman + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) } } totalComputeTime += System.nanoTime() - startTime; - return alignmentResult; } + private static int calculateAllowedLengthOfIndel(final SWParameters parameters){ + //calculate allowed length for indel to be less of a penalty than 2 mismatches + int mismatchScore = parameters.getMismatchPenalty(); + int indelExtendScore = parameters.getGapExtendPenalty(); + int indelOpenScore = parameters.getGapOpenPenalty(); + int allowedLengthOfIndel = ((2 * mismatchScore) - indelOpenScore)/indelExtendScore; + return allowedLengthOfIndel; + } + + private static SWPairwiseAlignmentResult calculateOneIndelCigar(int indelLength, final byte[] reference, final byte[] alternate, int oneIndelIndex){ + State state = null; + int cigarThirdElementLength = 0; + + if(alternate.length < reference.length){ + state = State.DELETION; + cigarThirdElementLength = alternate.length - oneIndelIndex; + } + if(alternate.length > reference.length){ + state = State.INSERTION; + cigarThirdElementLength = alternate.length - indelLength - oneIndelIndex; + } + + final List lce = Arrays.asList(makeElement(State.MATCH, oneIndelIndex), + makeElement(state, indelLength), makeElement(State.MATCH, cigarThirdElementLength)); + return new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); + } + /** * Calculates the SW matrices for the given sequences * @param reference ref sequence From 056ac96a64e41ccc9d76a1d631f28191bc211eaa Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Mon, 8 Jul 2019 15:36:02 -0400 Subject: [PATCH 18/23] fixed allowed indel length --- .../smithwaterman/SmithWatermanJavaAligner.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 10b926febcb..13e9b824b20 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -3,7 +3,6 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; -import jdk.nashorn.internal.ir.annotations.Immutable; import org.apache.commons.lang3.tuple.ImmutablePair; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters; @@ -105,8 +104,8 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { //calculate allowed length for indel to be less of a penalty than 2 mismatches - int allowedLengthOfIndel = calculateAllowedLengthOfIndel(parameters); - indelStartAndSize = Utils.atMostOneIndel(reference, alternate, allowedLengthOfIndel); + int maxIndelLength = calculateAllowedLengthOfIndelHapToRef(parameters); + indelStartAndSize = Utils.atMostOneIndel(reference, alternate, maxIndelLength); oneIndelIndex = indelStartAndSize.getLeft(); } //if one indel @@ -133,13 +132,14 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna return alignmentResult; } - private static int calculateAllowedLengthOfIndel(final SWParameters parameters){ + //will likely have to be modified to incorporate reads and more complex indel cases + private static int calculateAllowedLengthOfIndelHapToRef(final SWParameters parameters){ //calculate allowed length for indel to be less of a penalty than 2 mismatches int mismatchScore = parameters.getMismatchPenalty(); int indelExtendScore = parameters.getGapExtendPenalty(); int indelOpenScore = parameters.getGapOpenPenalty(); - int allowedLengthOfIndel = ((2 * mismatchScore) - indelOpenScore)/indelExtendScore; - return allowedLengthOfIndel; + int maxIndelLength = (((2 * mismatchScore) - indelOpenScore)/indelExtendScore) + 1; + return maxIndelLength; } private static SWPairwiseAlignmentResult calculateOneIndelCigar(int indelLength, final byte[] reference, final byte[] alternate, int oneIndelIndex){ From 07fdbb3a1c9a9c01c6512e40823a4c891c913b93 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Thu, 11 Jul 2019 10:35:10 -0400 Subject: [PATCH 19/23] cleaned up align method --- .../SmithWatermanJavaAligner.java | 79 ++++++++----------- 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 13e9b824b20..3800713ec35 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -70,64 +70,53 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna Utils.nonNull(parameters); Utils.nonNull(overhangStrategy); - // avoid running full Smith-Waterman if there is an exact match of alternate in reference - int exactMatchIndex = -1; - if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { - // Use a substring search to find an exact match of the alternate in the reference - // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies - exactMatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 0); - } - final SmithWatermanAlignment alignmentResult; - //if exact match - if (exactMatchIndex != -1) { - // generate the alignment result when the substring search was successful - final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); - alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), exactMatchIndex); - } - else { - int singleMismatchIndex = -1; - if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { - singleMismatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 1); + if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE){ + //exact match + int exactMatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 0); + if (exactMatchIndex != -1) { + // generate the alignment result when the substring search was successful + final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), exactMatchIndex); + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; } - //if off-by-one + + //one mismatch + int singleMismatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 1); if (singleMismatchIndex != -1) { // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), singleMismatchIndex); + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; } - else{ - if(haplotypeToref){ - int oneIndelIndex = -1; - ImmutablePair indelStartAndSize = null; - - if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) { - //calculate allowed length for indel to be less of a penalty than 2 mismatches - int maxIndelLength = calculateAllowedLengthOfIndelHapToRef(parameters); - indelStartAndSize = Utils.atMostOneIndel(reference, alternate, maxIndelLength); - oneIndelIndex = indelStartAndSize.getLeft(); - } - //if one indel - if (oneIndelIndex != -1){ - int indelLength = indelStartAndSize.getRight(); - alignmentResult = calculateOneIndelCigar(indelLength, reference, alternate, oneIndelIndex); - totalComputeTime += System.nanoTime() - startTime; - return alignmentResult; - } + //one indel + if(haplotypeToref){ + //calculate allowed length for indel to be less of a penalty than 2 mismatches + int maxIndelLength = calculateAllowedLengthOfIndelHapToRef(parameters); + ImmutablePair indelStartAndSize = Utils.atMostOneIndel(reference, alternate, maxIndelLength); + int oneIndelIndex = indelStartAndSize.getLeft(); + //if one indel + if (oneIndelIndex != -1){ + int indelLength = indelStartAndSize.getRight(); + alignmentResult = calculateOneIndelCigar(indelLength, reference, alternate, oneIndelIndex); + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; } - // run full Smith-Waterman - final int n = reference.length+1; - final int m = alternate.length+1; - final int[][] sw = new int[n][m]; - final int[][] btrack=new int[n][m]; - - calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); - alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) } } + // run full Smith-Waterman + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions) totalComputeTime += System.nanoTime() - startTime; return alignmentResult; } From 6eef7be365ee41b9393e04b298738f7f8a4c7a2b Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Thu, 25 Jul 2019 13:57:11 -0400 Subject: [PATCH 20/23] added method that aligns read-bestHap given 1 indel --- .../hellbender/utils/Utils.java | 272 +++++++++++++++++- .../SmithWatermanJavaAligner.java | 93 +++++- .../hellbender/utils/UtilsUnitTest.java | 12 +- 3 files changed, 352 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java index 37aa57869f5..a7d0296e3e0 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/Utils.java @@ -13,6 +13,7 @@ import org.apache.commons.math3.random.Well19937c; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters; import org.broadinstitute.hellbender.engine.FeatureDataSource; import org.broadinstitute.hellbender.exceptions.GATKException; @@ -1034,19 +1035,27 @@ public static boolean xor(final boolean x, final boolean y) { return x != y; } + //overload method which calls lasIndexOf with the refIndexBoun param being 0 + public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final byte[] query, final int allowedMismatches) { + return lastIndexOfAtMostTwoMismatches(reference, query, allowedMismatches, 0); + } + /** - * Find the last occurrence, whether it be an exact match, off-by-one, or off-by-two, of the query sequence in the reference sequence + * Find the last occurrence of the query sequence in the reference sequence * * Returns the index of the last occurrence or -1 if the query sequence is not found * * @param reference the reference sequence * @param query the query sequence */ - public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final byte[] query, final int allowedMismatches) { + public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final byte[] query, final int allowedMismatches, int refIndexBound) { int queryLength = query.length; + if(refIndexBound < 0){ + refIndexBound = 0; + } // start search from the last possible matching position and search to the left - for (int refIndex = reference.length - queryLength; refIndex >= 0; refIndex--) { + for (int refIndex = reference.length - queryLength; refIndex >= refIndexBound; refIndex--) { int mismatchCount = 0; for (int queryIndex = 0; queryIndex < queryLength && mismatchCount <= allowedMismatches; queryIndex++) { if (reference[refIndex+queryIndex] != query[queryIndex]) { @@ -1071,7 +1080,7 @@ public static int lastIndexOfAtMostTwoMismatches(final byte[] reference, final b * @param query the query sequence * @param maxIndelLength the maximum length indel we look for */ - public static ImmutablePair atMostOneIndel(final byte[] reference, final byte[] query, int maxIndelLength){ + public static ImmutablePair oneIndelHapToRef(final byte[] reference, final byte[] query, int maxIndelLength){ int lengthOfIndel = Math.abs(reference.length - query.length); if(lengthOfIndel > maxIndelLength){ @@ -1114,6 +1123,261 @@ private static ImmutablePair findIndel(final byte[] shorter, f return new ImmutablePair(indelEnd, lengthOfIndel); } + + /** + * Finds the location of one indel in the query sequence in relation to the reference sequence + * local Alignment + * + * Returns an Indel object that contains all the necessary information for generating a cigar string + * + * @param reference the reference sequence + * @param query the query sequence + * @param parameters the SW parameters + * @param maxInsertionSize allowed size of insertion so that we can confidently say nothing can beat it + * @param maxDeletionSize allowed size of deletion so that we can confidently say nothing can beat it + */ + public static Indel oneIndelReadToHap(final byte[] reference, final byte[] query, SWParameters parameters, int maxInsertionSize, int maxDeletionSize){ + + Indel insertion = new Indel(-1, -1, maxInsertionSize, true); + Indel deletion = new Indel(-1, -1, maxDeletionSize, false); + + //set bounds for indices that are eligible to have indels + int refBackInsertionBound = query.length - 1 - maxInsertionSize; + int refBackDeletionBound = query.length; + + int refIndexBack = reference.length - 1; + + //traverse ref up to insertion bound + while(refIndexBack >= refBackInsertionBound){ + boolean skipDeletion = false; + + int queryIndexBack = query.length - 1; + + //iterate until you hit a match + while(reference[refIndexBack] != query[queryIndexBack] && refIndexBack >= refBackInsertionBound){ + if(refIndexBack < refBackDeletionBound){ + skipDeletion = true; + } + refIndexBack--; + } + + //keep track of potential starting position for alignment. this is needed in case the "traverse until mismatch found" step iterates over the correct alignment location + /* + e.g - the 1st while loop takes you to index 74, and the 2nd while loop takes you to index 41, so you passed over the index in the ref that represents the canonical alignment end position, 68 + 41 68 74 + ref AACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGT + + read AACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA CCCTAACCCTAACCCTAACCCTAA - happens if you don't use queue + + read AACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA CCCTAACCCTAACCCTAA + + */ + Queue startingPositions = new LinkedList<>(); + startingPositions.add(refIndexBack); + byte startingLetter = reference[refIndexBack]; + boolean inListOfStartingPositions = false; + int queryIndex = queryIndexBack; + + //while there are still potential starting indices, attempt alignments at each index + while(!startingPositions.isEmpty()){ + + //if element in queue is the last, begin new StartingPositionList + if (inListOfStartingPositions == true && startingPositions.size() == 1){ + inListOfStartingPositions = false; + } + refIndexBack = startingPositions.remove(); + if(refIndexBack < refBackInsertionBound){ + break; + } + queryIndexBack = queryIndex; + + //iterate until you hit mismatch + while(reference[refIndexBack] == query[queryIndexBack]){ + + if(refIndexBack == 1 || queryIndexBack == 2){ + refIndexBack--; + queryIndexBack--; + break; + } + + refIndexBack--; + queryIndexBack--; + + //if you've found a potential alignment starting position, add it to the queue + if(!inListOfStartingPositions){ + if(reference[refIndexBack] == startingLetter){ + startingPositions.add(refIndexBack); + } + } + } + + //once you've filled your queue, set this to true to make sure you don't create additional queues for the elements already in a queue + inListOfStartingPositions = true; + + //check for deletion code + //************************************ + if(!skipDeletion){ + byte[] ref = new byte[refIndexBack + 1]; + System.arraycopy(reference,0, ref, 0, refIndexBack + 1); + byte[] que = new byte[queryIndexBack + 1]; + System.arraycopy(query, 0, que, 0, queryIndexBack + 1); + int matchIndex = lastIndexOfAtMostTwoMismatches(ref, que, 0, ref.length - que.length - maxDeletionSize); + if(matchIndex != -1){ + int alignmentOffset = matchIndex; + int matchingBases = queryIndexBack + 1; + int indelSize = refIndexBack - matchIndex + 1 - matchingBases; + if(indelSize <= deletion.getIndelSize()){ + deletion.setAlignmentOffset(alignmentOffset); + deletion.setMatchingBases(matchingBases); + deletion.setIndelSize(indelSize); + deletion.setIndelType(false); + } + } + } + //************************************ + + int queryIndexFront = queryIndexBack - 1; + //set bound. this ensures that you don't look for insertions greater than the maxInsertionSize + int queryFrontBound = queryIndexBack - maxInsertionSize; + + //repeat the process, but this time you're determining the bases on the left side of the insertion + while(queryIndexFront >= queryFrontBound){ + + int refIndexFront = refIndexBack; + + //look for match + while(reference[refIndexFront] != query[queryIndexFront] && queryIndexFront >= queryFrontBound){ + queryIndexFront--; + + if(queryIndexFront < queryFrontBound){ + break; + } + } + + if(queryIndexFront < queryFrontBound){ + break; + } + + Queue overlap2 = new LinkedList<>(); + overlap2.add(queryIndexFront); + byte repeatLetter2 = query[queryIndexFront]; + boolean inListOfStartingPositions2 = false; + int refIndex = refIndexFront; + + while(!overlap2.isEmpty()){ + + queryIndexFront = overlap2.remove(); + if(queryIndexFront < queryFrontBound){ + break; + } + refIndexFront = refIndex; + + //iterate until mismatch or insertion is found + while(queryIndexFront >= 0 && refIndexFront >= 0 && reference[refIndexFront] == query[queryIndexFront]){ + queryIndexFront--; + refIndexFront--; + + //you've reached the end of the query, which means you have an insertion + if(queryIndexFront == -1){ + int alignmentOffset = refIndexFront + 1; + int matchingBases = refIndexBack - alignmentOffset + 1; + int indelSize = queryIndexBack - matchingBases + 1; + if(indelSize <= insertion.getIndelSize()){ + insertion.setAlignmentOffset(alignmentOffset); + insertion.setMatchingBases(matchingBases); + insertion.setIndelSize(indelSize); + insertion.setIndelType(true); + } + } + + if(!inListOfStartingPositions2 && queryIndexFront != -1){ + if(query[queryIndexFront] == repeatLetter2){ + overlap2.add(queryIndexFront); + } + } + } + + inListOfStartingPositions2 = true; + + if(queryIndexFront == -1){ + break; + } + } + } + } + } + + if(insertion.getAlignmentOffset() != -1 && deletion.getAlignmentOffset() == -1){ + return insertion; + } + else if(insertion.getAlignmentOffset() == -1 && deletion.getAlignmentOffset() != -1){ + return deletion; + } + else if(insertion.getAlignmentOffset() == -1 && deletion.getAlignmentOffset() == -1){ + return insertion; + } + else{ + int insertionBases = query.length - insertion.getIndelSize(); + int inScore = (insertionBases * parameters.getMatchValue()) + parameters.getGapOpenPenalty() + ((insertion.getIndelSize() - 1) * parameters.getGapExtendPenalty()); + + int delScore = (query.length * parameters.getMatchValue()) + parameters.getGapOpenPenalty() + ((deletion.getIndelSize() - 1) * parameters.getGapExtendPenalty()); + if(delScore == inScore){ + //tiebreaker is number of ref bases consumed - will need to be changed + return insertionBases > (query.length + deletion.getIndelSize()) ? insertion: deletion; + } + else{ + return delScore > inScore ? deletion : insertion; + } + } + } + + //inner class that stores all necessary information to produce cigar string + public static class Indel{ + int alignmentOffset; + int matchingBases; + int indelSize; + boolean insertion; + + public Indel(int alignmentOffset, int matchingBases, int indelSize, boolean insertion){ + this.alignmentOffset = alignmentOffset; + this.indelSize = indelSize; + this.matchingBases = matchingBases; + this.insertion = insertion; + } + + public int getIndelSize(){ + return indelSize; + } + + public int getAlignmentOffset(){ + return alignmentOffset; + } + + public int getMatchingBases(){ + return matchingBases; + } + + public boolean getIndelType(){ + return insertion; + } + + public void setIndelSize(int indelSize){ + this.indelSize = indelSize; + } + + public void setAlignmentOffset(int alignmentOffset){ + this.alignmentOffset = alignmentOffset; + } + + public void setMatchingBases(int matchingBases){ + this.matchingBases = matchingBases; + } + + public void setIndelType(boolean insertion){ + this.insertion = insertion; + } + } + /** * Simple wrapper for sticking elements of a int[] array into a List * @param ar - the array whose elements should be listified diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 3800713ec35..a4c7781f551 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -86,7 +86,6 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna //one mismatch int singleMismatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 1); if (singleMismatchIndex != -1) { - // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), singleMismatchIndex); totalComputeTime += System.nanoTime() - startTime; @@ -94,12 +93,10 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna } //one indel - if(haplotypeToref){ - //calculate allowed length for indel to be less of a penalty than 2 mismatches + if(this.haplotypeToref){ int maxIndelLength = calculateAllowedLengthOfIndelHapToRef(parameters); - ImmutablePair indelStartAndSize = Utils.atMostOneIndel(reference, alternate, maxIndelLength); + ImmutablePair indelStartAndSize = Utils.oneIndelHapToRef(reference, alternate, maxIndelLength); int oneIndelIndex = indelStartAndSize.getLeft(); - //if one indel if (oneIndelIndex != -1){ int indelLength = indelStartAndSize.getRight(); alignmentResult = calculateOneIndelCigar(indelLength, reference, alternate, oneIndelIndex); @@ -107,6 +104,52 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna return alignmentResult; } } + else{ + int maxInsertionSize = calculateMaxInsertionSizeReadToHap(parameters); + int maxDeletionSize = calculateMaxDeletionSizeReadToHap(parameters); + Utils.Indel indel = Utils.oneIndelReadToHap(reference, alternate, parameters, maxInsertionSize, maxDeletionSize); + int alignmentOffset = indel.getAlignmentOffset(); + if(alignmentOffset != -1){ + int matchingBases = indel.getMatchingBases(); + int indelSize = indel.getIndelSize(); + //construct cigar + State state; + int length; + if(indel.getIndelType()){ + state = State.INSERTION; + length = alternate.length - indelSize - matchingBases; + } + else{ + state = State.DELETION; + length = alternate.length - matchingBases; + + } + final List lce = Arrays.asList(makeElement(State.MATCH, matchingBases), + makeElement(state, indelSize), makeElement(State.MATCH, length)); + alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), alignmentOffset); + + Cigar cigar1 = alignmentResult.getCigar(); + final int n = reference.length+1; + final int m = alternate.length+1; + final int[][] sw = new int[n][m]; + final int[][] btrack=new int[n][m]; + calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters); + SWPairwiseAlignmentResult alignmentResult2 = calculateCigar(sw, btrack, overhangStrategy); + Cigar cigar2 = alignmentResult2.getCigar(); + if(!cigar1.equals(cigar2) && !cigar2.containsOperator(CigarOperator.S)){ + System.out.println("CIGARS NOT EQUAL"); + System.out.println(new String(reference)); + System.out.println(""); + System.out.println(new String(alternate)); + System.out.println("Heuristic:" + cigar1); + System.out.println("SW: " + cigar2); + } + + totalComputeTime += System.nanoTime() - startTime; + return alignmentResult; + } + } + } // run full Smith-Waterman @@ -121,16 +164,6 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna return alignmentResult; } - //will likely have to be modified to incorporate reads and more complex indel cases - private static int calculateAllowedLengthOfIndelHapToRef(final SWParameters parameters){ - //calculate allowed length for indel to be less of a penalty than 2 mismatches - int mismatchScore = parameters.getMismatchPenalty(); - int indelExtendScore = parameters.getGapExtendPenalty(); - int indelOpenScore = parameters.getGapOpenPenalty(); - int maxIndelLength = (((2 * mismatchScore) - indelOpenScore)/indelExtendScore) + 1; - return maxIndelLength; - } - private static SWPairwiseAlignmentResult calculateOneIndelCigar(int indelLength, final byte[] reference, final byte[] alternate, int oneIndelIndex){ State state = null; int cigarThirdElementLength = 0; @@ -149,6 +182,36 @@ private static SWPairwiseAlignmentResult calculateOneIndelCigar(int indelLength, return new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), 0); } + private static int calculateAllowedLengthOfIndelHapToRef(final SWParameters parameters){ + //calculate allowed length for indel to be less of a penalty than 2 mismatches + int mismatchScore = parameters.getMismatchPenalty(); + int indelExtendScore = parameters.getGapExtendPenalty(); + int indelOpenScore = parameters.getGapOpenPenalty(); + int maxIndelLength = (((2 * mismatchScore) - indelOpenScore)/indelExtendScore) + 1; + return maxIndelLength; + } + + //calculates insertion size whose score is better than any other scenario + private static int calculateMaxInsertionSizeReadToHap(final SWParameters parameters){ + int matchScore = parameters.getMatchValue(); + int mismatchPenalty = -1 * parameters.getMismatchPenalty(); + int gapOpenPenalty = -1 * parameters.getGapOpenPenalty(); + int gapExtensionPenalty = -1 * parameters.getGapExtendPenalty(); + + int insertionCapSize = (2*matchScore + 2*mismatchPenalty - gapOpenPenalty + gapExtensionPenalty - 1) / (matchScore + gapExtensionPenalty); + return insertionCapSize; + } + + private static int calculateMaxDeletionSizeReadToHap(final SWParameters parameters){ + int matchScore = parameters.getMatchValue(); + int mismatchPenalty = -1*parameters.getMismatchPenalty(); + int gapOpenPenalty = -1*parameters.getGapOpenPenalty(); + int gapExtensionPenalty = -1*parameters.getGapExtendPenalty(); + + int deletionCapSize = (2*matchScore + 2*mismatchPenalty - gapOpenPenalty + gapExtensionPenalty - 1) / (gapExtensionPenalty); + return deletionCapSize; + } + /** * Calculates the SW matrices for the given sequences * @param reference ref sequence diff --git a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java index 570dafb7a69..0dc46e016a4 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/UtilsUnitTest.java @@ -756,39 +756,39 @@ public void atMostOneIndel(){ //deletion in middle of query String reference = "AGGATTTGGGATTAC"; String query = "AGGAGGGATTAC"; - int result = (Utils.atMostOneIndel(reference.getBytes(), query.getBytes(), 3)).getLeft(); + int result = (Utils.oneIndelHapToRef(reference.getBytes(), query.getBytes(), 3)).getLeft(); Assert.assertEquals(result, 4); //deletion in end of query String reference2 = "AGGATTTGGGATTAC"; String query2 = "AGGATTTGGGAT"; - int result2 = (Utils.atMostOneIndel(reference2.getBytes(), query2.getBytes(), 3)).getLeft(); + int result2 = (Utils.oneIndelHapToRef(reference2.getBytes(), query2.getBytes(), 3)).getLeft(); int expected2 = query2.length(); Assert.assertEquals(result2, expected2); //insertion in middle of query String reference3 = "ATTTAGTGGGATTA"; String query3 = "ATTTAGTAGTGGGATTA"; - int result3 = (Utils.atMostOneIndel(reference3.getBytes(), query3.getBytes(), 3)).getLeft(); + int result3 = (Utils.oneIndelHapToRef(reference3.getBytes(), query3.getBytes(), 3)).getLeft(); Assert.assertEquals(result3, 3); //insertion in end of query String reference4 = "AGTAGTGTGCGTCA"; String query4 = "AGTAGTGTGCGTCAACT"; - int result4 = (Utils.atMostOneIndel(reference4.getBytes(), query4.getBytes(), 3)).getLeft(); + int result4 = (Utils.oneIndelHapToRef(reference4.getBytes(), query4.getBytes(), 3)).getLeft(); int expected4 = reference4.length(); Assert.assertEquals(result4, expected4); //too long of an indel String reference5 = "AGTAGTGTGCGTCA"; String query5 = "AGTAGTGTGCGTCAACTA"; - int result5 = (Utils.atMostOneIndel(reference5.getBytes(), query5.getBytes(), 3)).getLeft(); + int result5 = (Utils.oneIndelHapToRef(reference5.getBytes(), query5.getBytes(), 3)).getLeft(); Assert.assertEquals(result5, -1); //deletion in beginning of query String reference6 = "AGTACCGTTTGAC"; String query6 = "TACCGTTTGAC"; - int result6 = (Utils.atMostOneIndel(reference6.getBytes(), query6.getBytes(), 3)).getLeft(); + int result6 = (Utils.oneIndelHapToRef(reference6.getBytes(), query6.getBytes(), 3)).getLeft(); Assert.assertEquals(result6, 0); } From ccfdb9c7249431fcdd7e8350c3aadb4591f48eaf Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Mon, 5 Aug 2019 13:30:16 -0400 Subject: [PATCH 21/23] control code to measure runtime --- .../smithwaterman/SmithWatermanJavaAligner.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index a4c7781f551..595213b5f8e 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -54,6 +54,9 @@ public SmithWatermanJavaAligner(boolean haplotypeToref){ this.haplotypeToref = haplotypeToref; } + int noSW = 0; + int yesSW = 0; + long totalExactMatchTime = 0; /** * Aligns the alternate sequence to the reference sequence * @@ -74,8 +77,11 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE){ //exact match + long startTime1 = System.nanoTime(); int exactMatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 0); + totalExactMatchTime += System.nanoTime() - startTime1; if (exactMatchIndex != -1) { + noSW++; // generate the alignment result when the substring search was successful final List lce = Collections.singletonList(makeElement(State.MATCH, alternate.length)); alignmentResult = new SWPairwiseAlignmentResult(AlignmentUtils.consolidateCigar(new Cigar(lce)), exactMatchIndex); @@ -83,6 +89,7 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna return alignmentResult; } + /* //one mismatch int singleMismatchIndex = Utils.lastIndexOfAtMostTwoMismatches(reference, alternate, 1); if (singleMismatchIndex != -1) { @@ -149,9 +156,10 @@ public SmithWatermanAlignment align(final byte[] reference, final byte[] alterna return alignmentResult; } } - + */ } + yesSW++; // run full Smith-Waterman final int n = reference.length+1; final int m = alternate.length+1; @@ -506,5 +514,9 @@ private static CigarElement makeElement(final State state, final int length) { @Override public void close() { logger.info(String.format("Total compute time in java Smith-Waterman : %.2f sec", totalComputeTime * 1e-9)); + logger.info(String.format("Total compute time in exactMatch heuristic : %.2f sec", totalExactMatchTime * 1e-9)); + logger.info(String.format("Total calls to exactMatchHeuristic: %0.2f sec", noSW * 1e-9)); + logger.info(String.format("Total calls to SW: %0.2f sec", yesSW * 1e-9)); + } } From c8f4af83d0b10dc266b4427a62e5248327c68b18 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Mon, 5 Aug 2019 22:25:25 -0400 Subject: [PATCH 22/23] fixed formatting error --- .../utils/smithwaterman/SmithWatermanJavaAligner.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 595213b5f8e..79bd2668b9e 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -515,8 +515,8 @@ private static CigarElement makeElement(final State state, final int length) { public void close() { logger.info(String.format("Total compute time in java Smith-Waterman : %.2f sec", totalComputeTime * 1e-9)); logger.info(String.format("Total compute time in exactMatch heuristic : %.2f sec", totalExactMatchTime * 1e-9)); - logger.info(String.format("Total calls to exactMatchHeuristic: %0.2f sec", noSW * 1e-9)); - logger.info(String.format("Total calls to SW: %0.2f sec", yesSW * 1e-9)); + logger.info(String.format("Total calls to exactMatchHeuristic: %.2f sec", noSW * 1e-9)); + logger.info(String.format("Total calls to SW: %.2f sec", yesSW * 1e-9)); } } From 46d62993e0af9954cf359951efd36fb68c242c73 Mon Sep 17 00:00:00 2001 From: Jonathon Cohen Date: Tue, 6 Aug 2019 10:21:31 -0400 Subject: [PATCH 23/23] fixed totalCallsToSW count --- .../utils/smithwaterman/SmithWatermanJavaAligner.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java index 79bd2668b9e..9d4c4f3b4cf 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/smithwaterman/SmithWatermanJavaAligner.java @@ -515,8 +515,8 @@ private static CigarElement makeElement(final State state, final int length) { public void close() { logger.info(String.format("Total compute time in java Smith-Waterman : %.2f sec", totalComputeTime * 1e-9)); logger.info(String.format("Total compute time in exactMatch heuristic : %.2f sec", totalExactMatchTime * 1e-9)); - logger.info(String.format("Total calls to exactMatchHeuristic: %.2f sec", noSW * 1e-9)); - logger.info(String.format("Total calls to SW: %.2f sec", yesSW * 1e-9)); + logger.info("Total calls to exactMatchHeuristic: " + noSW); + logger.info("Total calls to SW: " + yesSW); } }