Skip to content

Commit

Permalink
fixed a bug in the palindrome artifact read filter (#5241)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin authored and takutosato committed Oct 1, 2018
1 parent 51dd4bf commit 25c42db
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.commons.lang3.mutable.MutableInt;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.utils.BaseUtils;
Expand Down Expand Up @@ -53,12 +54,15 @@ public final class PalindromeArtifactClipReadTransformer implements ReadTransfor

private final ReferenceDataSource referenceDataSource;

private final SAMSequenceDictionary sequenceDictionary;

// the artifact requires a minimum number of palindromic bases to create the single-strand loop
private final int minPalindromeSize;

public PalindromeArtifactClipReadTransformer(final ReferenceDataSource referenceDataSource, final int minPalindromeSize) {
this.referenceDataSource = referenceDataSource;
this.minPalindromeSize = minPalindromeSize;
sequenceDictionary = referenceDataSource.getSequenceDictionary();
}

@Override
Expand All @@ -85,15 +89,22 @@ public GATKRead apply(final GATKRead read) {
final int numBasesToCompare = Math.min(potentialArtifactBaseCount + minPalindromeSize, read.getLength());

// the reference position of bases that are the reverse complement of the suspected artifact
final String contig = read.getContig();
final int refStart = readIsUpstreamOfMate ? adaptorBoundary - numBasesToCompare : adaptorBoundary + 1;
final int refEnd = readIsUpstreamOfMate ? adaptorBoundary - 1 : adaptorBoundary + numBasesToCompare;

// it's possible that the read's assigned fragment length / mate start were incompatible with the contig length,
// so we explicitly check for this case to avoid errors below. We can't rely on the alignment!
if (refStart < 1 || refEnd > sequenceDictionary.getSequence(contig).getSequenceLength()) {
return read;
}

// if the reference bases overlap the soft-clipped bases, it's not an artifact. Note that read.getStart() is the unclipped start
// this can happen, for a read with a huge soft-clip that overlaps its mate significantly.
if ( (readIsUpstreamOfMate && refStart < read.getStart()) || (!readIsUpstreamOfMate && read.getEnd() < refEnd)) {
return read;
}
final SimpleInterval refInterval = new SimpleInterval(read.getContig(), refStart, refEnd);
final SimpleInterval refInterval = new SimpleInterval(contig, refStart, refEnd);
final MutableInt numMatch = new MutableInt(0);

// we traverse the reference in the forward direction, hence the read in the reverse direction
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ public class PalindromeArtifactClipReadTransformerUnitTest {
@Test
public void test() {
final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
final String contig = header.getSequence(0).toString();
final String contig = header.getSequence(0).getSequenceName();
final int refStart = 1000;
final int contigLength = header.getSequence(0).getSequenceLength();

// some random reference sequence -- no ITR yet
final byte[] refBases = ("GATTCCCCAAGGGGGCTGCTCCCAGAGGGTGTGTTGCTGGGATTGCCCAGGACAGGGATGGCCCTCTCATCAGGTGGGGG" +
Expand Down Expand Up @@ -84,6 +85,13 @@ public void test() {
final GATKRead definitelyNotArtifactRead = makeRead(header, contig, readStart, fragmentLength, readBasesMatchingRef, quals, cigar);
Assert.assertTrue(transformer.apply(definitelyNotArtifactRead).getCigar().getFirstCigarElement().getOperator() == CigarOperator.SOFT_CLIP);

// reads would cause chromosome out-of-bounds error if we didn't check explicitly for this edge case
final GATKRead mateOffContigRead = makeRead(header, contig, readStart, contigLength, readBasesMatchingRef, quals, cigar);
Assert.assertTrue(transformer.apply(mateOffContigRead) == mateOffContigRead);

final GATKRead mateOffContigRead2 = makeRead(header, contig, readStart, -contigLength, readBasesMatchingRef, quals, cigar);
Assert.assertTrue(transformer.apply(mateOffContigRead2) == mateOffContigRead2);

}

private static GATKRead makeRead(final SAMFileHeader header, final String contig, final int readStart, final int fragmentLength, final byte[] bases, final byte[] qual, final String cigar) {
Expand Down

0 comments on commit 25c42db

Please sign in to comment.