From e433fab8cda1d2cc15b5cab2e56bb31f4e52b7ee Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 29 Nov 2018 22:38:10 -0500 Subject: [PATCH] optimization in CigarUtils to shortcut to M-only CIGAR when provably optimal --- .../hellbender/utils/read/CigarUtils.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/CigarUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/read/CigarUtils.java index 33e7a8d0ebc..1c7c88a7cc6 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/CigarUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/CigarUtils.java @@ -289,10 +289,19 @@ public static Cigar calculateCigar(final byte[] refSeq, final byte[] altSeq, fin //Note: this is a performance optimization. // If two strings are equal (a O(n) check) then it's trivial to get CIGAR for them. - if (Arrays.equals(refSeq, altSeq)){ - final Cigar matching = new Cigar(); - matching.add(new CigarElement(refSeq.length, CigarOperator.MATCH_OR_MISMATCH)); - return matching; + // Furthermore, if their lengths are equal and their element-by-element comparison yields two or fewer mismatches + // it's also a trivial M-only CIGAR, because in order to have equal length one would need at least one insertion and + // one deletion, in which case two substitutions is a better alignment. + if (altSeq.length == refSeq.length){ + int mismatchCount = 0; + for (int n = 0; n < refSeq.length && mismatchCount <= 2; n++) { + mismatchCount += (altSeq[n] == refSeq[n] ? 0 : 1); + } + if (mismatchCount <= 2) { + final Cigar matching = new Cigar(); + matching.add(new CigarElement(refSeq.length, CigarOperator.MATCH_OR_MISMATCH)); + return matching; + } } final Cigar nonStandard;