Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a distinction between PCR orientation and Optical Duplicates orientation in MarkDuplicatesSpark #4752

Merged
merged 2 commits into from
May 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;
import org.broadinstitute.hellbender.utils.spark.SparkUtils;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import scala.Tuple2;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.metrics.MetricsFile;
import org.apache.parquet.Ints;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
Expand Down Expand Up @@ -325,10 +324,14 @@ private static JavaPairRDD<IndexPair<String>, Integer> markDuplicateRecords(fina

// Note, this uses bitshift operators in order to perform only a single groupBy operation for all the merged data
private static long getGroupKey(MarkDuplicatesSparkRecord record) {
return record.getClass()==Passthrough.class?-1:
(((long)((PairedEnds)record).getUnclippedStartPosition()) << 32 |
((PairedEnds)record).getFirstRefIndex() << 16 );
//| ((PairedEnds)pe).getLibraryIndex())).values();
if ( record.getClass()==Passthrough.class) {
return -1;
} else {
final PairedEnds pairedEnds = (PairedEnds) record;
return ((((long) pairedEnds.getUnclippedStartPosition()) << 32) |
(pairedEnds.getFirstRefIndex() << 16) |
pairedEnds.getOrientationForPCRDuplicates() );
}
}

/**
Expand Down Expand Up @@ -520,8 +523,8 @@ public int compare( PairedEnds first, PairedEnds second ) {
}

//This is done to mimic SAMRecordCoordinateComparator's behavior
if (first.isR1R() != second.isR1R()) {
return first.isR1R() ? -1: 1;
if (first.isRead1ReverseStrand() != second.isRead1ReverseStrand()) {
return first.isRead1ReverseStrand() ? -1: 1;
}

if ( first.getName() != null && second.getName() != null ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;

/**
Expand Down Expand Up @@ -33,7 +34,7 @@ public EmptyFragment(GATKRead read, SAMFileHeader header) {
this.R1R = read.isReverseStrand();
firstStartPosition = 0;
this.key = ReadsKey.hashKeyForFragment(firstUnclippedStartPosition,
isR1R(),
isRead1ReverseStrand(),
firstRefIndex,
ReadUtils.getLibrary(read, header));
}
Expand All @@ -60,10 +61,14 @@ public int getFirstStartPosition() {
return firstStartPosition;
}
@Override
public boolean isR1R() {
public boolean isRead1ReverseStrand() {
return R1R;
}
@Override
public byte getOrientationForPCRDuplicates() {
return (R1R)? ReadEnds.R : ReadEnds.F;
}
@Override
public int getFirstRefIndex() {
return firstRefIndex;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;

/**
Expand Down Expand Up @@ -31,7 +32,7 @@ public Fragment(final GATKRead first, final SAMFileHeader header, int partitionI
this.score = scoringStrategy.score(first);
this.R1R = first.isReverseStrand();
this.key = ReadsKey.hashKeyForFragment(firstUnclippedStartPosition,
isR1R(),
isRead1ReverseStrand(),
firstRefIndex,
ReadUtils.getLibrary(first, header));
}
Expand All @@ -58,10 +59,14 @@ public int getFirstStartPosition() {
return firstStartPosition;
}
@Override
public boolean isR1R() {
public boolean isRead1ReverseStrand() {
return R1R;
}
@Override
public byte getOrientationForPCRDuplicates() {
return (R1R)? ReadEnds.R : ReadEnds.F;
}
@Override
public int getFirstRefIndex() {
return firstRefIndex;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ public final class Pair extends PairedEnds implements PhysicalLocation {
private final int firstStartPosition;
private final int firstUnclippedStartPosition;
private final short firstRefIndex;
private final boolean R1R;
private final boolean isRead1ReverseStrand;

private final int secondUnclippedStartPosition;
private final short secondRefIndex;
private final boolean R2R;
private final boolean isRead2ReverseStrand;
private final int score;
private final boolean wasFlipped;

// Information used to detect optical dupes
private short readGroupIndex = -1;
Expand Down Expand Up @@ -67,23 +68,15 @@ public Pair(final GATKRead read1, final GATKRead read2, final SAMFileHeader head
firstUnclippedStartPosition = read2UnclippedStart;
secondUnclippedStartPosition = read1UnclippedStart;
}
// Keep track of the orientation of read1 and read2 as it is important for optical duplicate marking
wasFlipped = second.isFirstOfPair();

firstStartPosition = first.getAssignedStart();
firstRefIndex = (short)ReadUtils.getReferenceIndex(first, header);
secondRefIndex = (short)ReadUtils.getReferenceIndex(second, header);

final GATKRead firstOfPair;
final GATKRead secondOfPair;
if (read1.isFirstOfPair()){
firstOfPair = read1;
secondOfPair = read2;
} else {
firstOfPair = read2;
secondOfPair = read1;
}

R1R = firstOfPair.isReverseStrand();
R2R = secondOfPair.isReverseStrand();
isRead1ReverseStrand = first.isReverseStrand();
isRead2ReverseStrand = second.isReverseStrand();

this.key = ReadsKey.hashKeyForPair(header, first, second);
}
Expand All @@ -103,13 +96,14 @@ private Pair(Kryo kryo, Input input){
firstStartPosition = input.readInt();
firstUnclippedStartPosition = input.readInt();
firstRefIndex = input.readShort();
R1R = input.readBoolean();
isRead1ReverseStrand = input.readBoolean();

secondUnclippedStartPosition = input.readInt();
secondRefIndex = input.readShort();
R2R = input.readBoolean();
isRead2ReverseStrand = input.readBoolean();

readGroupIndex = input.readShort();
wasFlipped = input.readBoolean();
}

protected void serialize(Kryo kryo, Output output) {
Expand All @@ -121,13 +115,14 @@ protected void serialize(Kryo kryo, Output output) {
output.writeInt(firstStartPosition);
output.writeInt(firstUnclippedStartPosition);
output.writeShort(firstRefIndex);
output.writeBoolean(R1R);
output.writeBoolean(isRead1ReverseStrand);

output.writeInt(secondUnclippedStartPosition);
output.writeShort(secondRefIndex);
output.writeBoolean(R2R);
output.writeBoolean(isRead2ReverseStrand);

output.writeShort(readGroupIndex);
output.writeBoolean(wasFlipped);
}

@Override
Expand All @@ -152,8 +147,8 @@ public int getFirstStartPosition() {
return firstStartPosition;
}
@Override
public boolean isR1R() {
return R1R;
public boolean isRead1ReverseStrand() {
return isRead1ReverseStrand;
}
@Override
public int getFirstRefIndex() {
Expand All @@ -171,16 +166,25 @@ public String toString() {
* Returns one of {@link ReadEnds#RR}, {@link ReadEnds#RF}, {@link ReadEnds#FR}, {@link ReadEnds#FF}
*/
public byte getOrientationForOpticalDuplicates() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want unit tests for these two methods that assert the truth table for the two different type of orientations

if (R1R && R2R) {
return getOrientation(true);
}

@Override
public byte getOrientationForPCRDuplicates() {
return getOrientation(false);
}

private byte getOrientation(final boolean optical) {
if (isRead1ReverseStrand && isRead2ReverseStrand) {
return ReadEnds.RR;
}
if (R1R) {
return ReadEnds.RF; //at this point we know for sure R2R is false
if (isRead1ReverseStrand) {
return (optical && wasFlipped)? ReadEnds.FR : ReadEnds.RF; //at this point we know for sure isRead2ReverseStrand is false
}
if (R2R) {
return ReadEnds.FR; //at this point we know for sure R1R is false
if (isRead2ReverseStrand) {
return (optical && wasFlipped)? ReadEnds.RF :ReadEnds.FR; //at this point we know for sure isRead1ReverseStrand is false
}
return ReadEnds.FF; //at this point we know for sure R1R is false and R2R is false
return ReadEnds.FF; //at this point we know for sure isRead1ReverseStrand is false and isRead2ReverseStrand is false
}

// Methods for OpticalDuplicateFinder.PhysicalLocation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@ public abstract class PairedEnds extends MarkDuplicatesSparkRecord {
public abstract int getFirstStartPosition();
public abstract int getUnclippedStartPosition();
public abstract int getFirstRefIndex();
public abstract boolean isR1R();
public abstract int getScore();
public abstract boolean isRead1ReverseStrand();

/**
* This returns a byte summary spanning from 0-5 representing all combinations of single read or two read
* forward/reverse strand for the first and second read in the Pair. Note, for PCR Duplicates orientation
* that the 'first' read corresponds to the read that appears first by coordinate sorting order, which is
* distinct from optical sorting which considers the 'first' read to be the first in pair.
*/
public abstract byte getOrientationForPCRDuplicates();
}
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,6 @@ public void testOpticalDuplicatesDifferentReadGroups() {
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); // duplicate
tester.addMatePair("RUNID:7:1203:2886:82242", 19, 19, 485253, 485253, false, false, false, false, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.1");

SAMFileHeader header = tester.getHeader();
SAMReadGroupRecord readGroup1 = new SAMReadGroupRecord("H0164.2");
SAMReadGroupRecord readGroup2 = new SAMReadGroupRecord("H0164.1");
Expand All @@ -189,7 +188,7 @@ public void testOpticalDuplicatesDifferentReadGroups() {
public void testOpticalDuplicatesTheSameReadGroup() {
final AbstractMarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(1);
tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); // duplicate
tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 485253, false, false, true, true, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2"); // duplicate (tie broken by readname)
tester.addMatePair("RUNID:7:1203:2886:82242", 19, 19, 485253, 485253, false, false, false, false, "42M59S", "59S42M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "H0164.2");

SAMFileHeader header = tester.getHeader();
Expand All @@ -202,6 +201,16 @@ public void testOpticalDuplicatesTheSameReadGroup() {
tester.runTest();
}

@Test
public void testOpticalDuplicatesAndPCRDuplicatesOrientationDifference() {
final AbstractMarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 486253, false, false, true, true, "101M", "101M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // duplicate
tester.addMatePair("RUNID:7:1203:2886:16834", 19, 19, 486253, 485253, false, false, false, false, "101M", "101M", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1");
// Even though these reads are in a duplicate group together, we don't want to mark them as Optical Duplicates because their orientation is flipped (which doesn't matter for PCR duplicates)
tester.runTest();
}

@Test
public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicatesWithinPixelDistance() {
final AbstractMarkDuplicatesTester tester = getTester();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords;

import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadEnds;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

public class PairedEndsUnitTest extends GATKBaseTest {

@DataProvider
public Object[][] orientationTruthTable() {
return new Object[][]{
{false, false, false, ReadEnds.FF, ReadEnds.FF},
{false, false, true, ReadEnds.FR, ReadEnds.FR},
{false, true, false, ReadEnds.RF, ReadEnds.RF},
{false, true, true, ReadEnds.RR, ReadEnds.RR},
{true, false, false, ReadEnds.FF, ReadEnds.FF},
{true, false, true, ReadEnds.RF, ReadEnds.FR},
{true, true, false, ReadEnds.FR, ReadEnds.RF},
{true, true, true, ReadEnds.RR, ReadEnds.RR},};
}

@Test (dataProvider = "orientationTruthTable")
public void testGetOrientationForPCRDuplicates(boolean flipStarts, boolean firstReadReverse, boolean secondReadReverse,
byte PCROrientation, byte opticalOrientation) {
GATKRead primaryRead;
GATKRead secondaryRead;

if (flipStarts) {
secondaryRead = ArtificialReadUtils.createSamBackedRead("100M",100000,100);
primaryRead = ArtificialReadUtils.createSamBackedRead("100M",101000,100);
} else {
primaryRead = ArtificialReadUtils.createSamBackedRead("100M",100000,100);
secondaryRead = ArtificialReadUtils.createSamBackedRead("100M",101000,100);
}
primaryRead.setIsFirstOfPair();
primaryRead.setIsReverseStrand(firstReadReverse);
secondaryRead.setIsSecondOfPair();
secondaryRead.setIsReverseStrand(secondReadReverse);


// Creating a PairedEnds object and asserting the orientation is correct
Pair pair = PairedEnds.newPair(primaryRead, secondaryRead, hg19Header, 0, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES);
Assert.assertEquals(pair.getOrientationForPCRDuplicates(), PCROrientation);
Assert.assertEquals(pair.getOrientationForOpticalDuplicates(), opticalOrientation);
}
}