Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switched MarkDuplicatesSpark tile parsing code to use shorts in order to match picard #5165

Merged
merged 4 commits into from
Sep 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ public abstract class TransientFieldPhysicalLocation extends PairedEnds implemen
// Information used to detect optical dupes
protected short readGroupIndex = -1;
protected transient short tile = -1;
protected transient int x = -1;
protected transient int y = -1;
protected transient short x = -1;
protected transient short y = -1;
protected transient short libraryId = -1;

public TransientFieldPhysicalLocation(int partitionIndex, String name) {
Expand All @@ -36,14 +36,16 @@ public TransientFieldPhysicalLocation(int partitionIndex, String name) {
@Override
public int getX() { return this.x; }

// NOTE picard in practice compresses the pixel values to signed shorts for space purposes despite the api using an integer
@Override
public void setX(final int x) { this.x = x; }
public void setX(final int x) { this.x = (short)x; }

@Override
public int getY() { return this.y; }

// NOTE picard in practice compresses the pixel values to signed shorts for space purposes despite the api using an integer
@Override
public void setY(final int y) { this.y = y; }
public void setY(final int y) { this.y = (short)y; }

@Override
public short getLibraryId() { return this.libraryId; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,11 +267,11 @@ public void testOpticalDuplicatesTiebrokenByPhysicalLocationNotStartPosition(fin

int compare = position1.tile - position2.tile;
if (compare == 0) {
compare = position1.x - position2.x;
compare = (short)position1.x - (short)position2.x;
}

if (compare == 0) {
compare = position1.y - position2.y;
compare = (short)position1.y - (short)position2.y;
}

final boolean isDuplicate = compare < 0;
Expand All @@ -281,4 +281,65 @@ public void testOpticalDuplicatesTiebrokenByPhysicalLocationNotStartPosition(fin
tester.addMatePair(readName2, 1,2, 51, false, false, isDuplicate, isDuplicate, "6S42M28S", "8S68M", true, false, false, false, false, DEFAULT_BASE_QUALITY);
tester.runTest();
}

@DataProvider
public Object[][] readNameData(){
return new Object[][]{
{"RUNID:7:1203:2886:82292", "RUNID:7:1205:3886:16834"},

{"RUNID:7:1203:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1204:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1205:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1206:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1207:2886:16756", "RUNID:7:1205:3886:16756"},

{"RUNID:7:1203:2886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:3886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:4886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:5886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:6886:16756", "RUNID:7:1203:4886:26756"},

{"RUNID:7:1203:2886:34756", "RUNID:7:1203:2886:36756"},
{"RUNID:7:1203:2886:35756", "RUNID:7:1203:2886:36756"},
{"RUNID:7:1203:2886:37756", "RUNID:7:1203:2886:36756"},
{"RUNID:7:1203:2886:38756", "RUNID:7:1203:2886:36756"},

//Added a test for tiebreaking accounting for the short casting done in picard
{"HK3T5CCXX160204:3:1112:11586:37067", "HK3T5CCXX160204:3:1112:11586:32144"}
};
}

@Test(dataProvider = "readNameData")
public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicates(final String readName1, final String readName2) {
// This tests the readname based tiebreaking code in mark duplicates. Since it's ambiguous which read should be marked
// as duplicate or not if scores match we break ties by evaluating the readname for consistencies sake.

final ReadNameParser parser = new ReadNameParser();

final PhysicalLocationInt position1 = new PhysicalLocationInt();
final PhysicalLocationInt position2 = new PhysicalLocationInt();

parser.addLocationInformation(readName1, position1);
parser.addLocationInformation(readName2, position2);

final AbstractMarkDuplicatesTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(101);
tester.setExpectedOpticalDuplicate(0);

int compare = position1.tile - position2.tile;
if (compare == 0) {
compare = (short)position1.x - (short)position2.x;
}

if (compare == 0) {
compare = (short)position1.y - (short)position2.y;
}

final boolean isDuplicate = compare < 0;

tester.addMatePair(readName1, 1,485253, 485253, false, false, !isDuplicate, !isDuplicate, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
tester.addMatePair(readName2, 1,485253, 485253, false, false, isDuplicate, isDuplicate, "59S42M", "42M59S", true, false, false, false, false, DEFAULT_BASE_QUALITY);

tester.runTest();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -156,64 +156,6 @@ public void testOpticalDuplicateFinding() {
tester.runTest();
}

@DataProvider
public Object[][] readNameData(){
return new Object[][]{
{"RUNID:7:1203:2886:82292", "RUNID:7:1205:3886:16834"},

{"RUNID:7:1203:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1204:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1205:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1206:2886:16756", "RUNID:7:1205:3886:16756"},
{"RUNID:7:1207:2886:16756", "RUNID:7:1205:3886:16756"},

{"RUNID:7:1203:2886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:3886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:4886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:5886:16756", "RUNID:7:1203:4886:26756"},
{"RUNID:7:1203:6886:16756", "RUNID:7:1203:4886:26756"},

{"RUNID:7:1203:2886:34756", "RUNID:7:1203:2886:36756"},
{"RUNID:7:1203:2886:35756", "RUNID:7:1203:2886:36756"},
{"RUNID:7:1203:2886:37756", "RUNID:7:1203:2886:36756"},
{"RUNID:7:1203:2886:38756", "RUNID:7:1203:2886:36756"},
};
}

@Test(dataProvider = "readNameData")
public void testOpticalDuplicateClusterSamePositionNoOpticalDuplicates(final String readName1, final String readName2) {
// This tests the readname based tiebreaking code in mark duplicates. Since it's ambiguous which read should be marked
// as duplicate or not if scores match we break ties by evaluating the readname for consistencies sake.

final ReadNameParser parser = new ReadNameParser();

final PhysicalLocationInt position1 = new PhysicalLocationInt();
final PhysicalLocationInt position2 = new PhysicalLocationInt();

parser.addLocationInformation(readName1, position1);
parser.addLocationInformation(readName2, position2);

final AbstractMarkDuplicatesTester tester = getTester();
tester.getSamRecordSetBuilder().setReadLength(101);
tester.setExpectedOpticalDuplicate(0);

int compare = position1.tile - position2.tile;
if (compare == 0) {
compare = position1.x - position2.x;
}

if (compare == 0) {
compare = position1.y - position2.y;
}

final boolean isDuplicate = compare < 0;

tester.addMatePair(readName1, 1,485253, 485253, false, false, !isDuplicate, !isDuplicate, "42M59S", "59S42M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
tester.addMatePair(readName2, 1,485253, 485253, false, false, isDuplicate, isDuplicate, "59S42M", "42M59S", true, false, false, false, false, DEFAULT_BASE_QUALITY);

tester.runTest();
}

@Test
public void testOpticalDuplicatesDifferentReadGroups() {
final AbstractMarkDuplicatesTester tester = getTester();
Expand Down Expand Up @@ -251,7 +193,7 @@ public void testOpticalDuplicatesTheSameReadGroup() {
public void testOpticalDuplicatesAndPCRDuplicatesOrientationDifference() {
final AbstractMarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
tester.addMatePair("RUNID:7:1203:2886:82292", 19, 19, 485253, 486253, false, false, true, true, "101M", "101M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // duplicate
tester.addMatePair("RUNID:7:1203:2886:16838", 19, 19, 485253, 486253, false, false, true, true, "101M", "101M", true, false, false, false, false, DEFAULT_BASE_QUALITY, "1"); // duplicate
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did this one need to change? If it resolves differently now, how were both we and picard passing before? Or is this a gatk only test?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this test is different now because the tiebreaking has changed, specifically because of integer overflowing in the Y coordinate of this read. The point of this test is about orientation and optical duplicates (which it wasn't doing a good job of testing before anyway because it was failing for a different reason). The tiebreaking change caused a different read to be marked as duplicate. I could have reversed the value but it seems better to correct the test as now its actually testing what it purports to. Also, yes this does not exist in picard.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 Thanks for the explanation.

tester.addMatePair("RUNID:7:1203:2886:16834", 19, 19, 486253, 485253, false, false, false, false, "101M", "101M", false, true, false, false, false, DEFAULT_BASE_QUALITY, "1");
// Even though these reads are in a duplicate group together, we don't want to mark them as Optical Duplicates because their orientation is flipped (which doesn't matter for PCR duplicates)
tester.runTest();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.sam.util.PhysicalLocationInt;
import picard.sam.util.ReadNameParser;

import java.io.File;
import java.util.*;
Expand Down