Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed an optimization in KBestHaplotypeFinder that caused some bestPaths to be lost while still preserving some of the optimization. #5952

Merged
merged 1 commit into from
May 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,15 @@ public List<KBestHaplotype> findBestHaplotypes(final int maxNumberOfHaplotypes)
if (sinks.contains(vertexToExtend)) {
result.add(pathToExtend);
} else {
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(vertexToExtend);
int totalOutgoingMultiplicity = 0;
for (final BaseEdge edge : outgoingEdges) {
totalOutgoingMultiplicity += edge.getMultiplicity();
}
if (vertexCounts.get(vertexToExtend).getAndIncrement() < maxNumberOfHaplotypes) {
final Set<BaseEdge> outgoingEdges = graph.outgoingEdgesOf(vertexToExtend);
int totalOutgoingMultiplicity = 0;
for (final BaseEdge edge : outgoingEdges) {
totalOutgoingMultiplicity += edge.getMultiplicity();
}

for (final BaseEdge edge : outgoingEdges) {
final SeqVertex targetVertex = graph.getEdgeTarget(edge);
if (vertexCounts.get(targetVertex).getAndIncrement() < maxNumberOfHaplotypes) {
for (final BaseEdge edge : outgoingEdges) {
final SeqVertex targetVertex = graph.getEdgeTarget(edge);
queue.add(new KBestHaplotype(pathToExtend, edge, totalOutgoingMultiplicity));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,66 @@ public void testIntraNodeInsertionDeletion() {
Assert.assertEquals(altPath.calculateCigar(refString.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "1M3I5M3D1M");
}

@Test
/*
This is a test of what can go awry if path pruning is based on the number of incoming edges to a given vertex.
An illustration of the graph in this test:
/ ----- top \
/(33) \
refStart -(32) -- mid -- midExt ---------------------------- refEnd
\(34) / (1) /
\ ---- bot /- (33) botExt - (15) - botExtTop - /
\ (18) /
\ ------ botExtBot /

The expected best paths are (refStart->top->midExt->refEnd), and (refStart->mid->midExt->refEnd) because the bottom
path is penalized for two extra forks despite it greedily looking the best at the start.

Because the old behavior used to base pruning on the number of incoming edges < K, the edge (bot->midExt) would be created
first, then (top -> midExt) but (mid -> midExt) would never be created because midExt already has 2 incoming edges.
*/

public void testDegeneratePathPruningOptimizationCase() {
// Construct an assembly graph demonstrating this issue
final SeqGraph graph = new SeqGraph(11);
final SeqVertex top = new SeqVertex("T");
final SeqVertex mid = new SeqVertex("C");
final SeqVertex midAndTopExt = new SeqVertex("GGG");
final SeqVertex bot = new SeqVertex("G");
final SeqVertex botExt = new SeqVertex("AAA");
final SeqVertex botExtTop = new SeqVertex("A");
final SeqVertex botExtBot = new SeqVertex("T");

// Ref source and sink vertexes
final SeqVertex refStart = new SeqVertex("CCCCCGGG");
final SeqVertex refEnd = new SeqVertex("TTTT");

graph.addVertices(top, bot, mid, midAndTopExt, bot, botExt, botExtTop, botExtBot, refStart, refEnd);
// First "diamond" with 3 mostly equivalent cost paths
graph.addEdges(() -> new BaseEdge(false, 34), refStart, bot, botExt);
graph.addEdges(() -> new BaseEdge(true, 33), refStart, top, midAndTopExt);
graph.addEdges(() -> new BaseEdge(false, 32), refStart, mid, midAndTopExt);

// The best looking path reconnects with a very poor edge multiplicity to midAndTopExt (This is this is what casues the bug)
graph.addEdges(() -> new BaseEdge(false, 1), bot, midAndTopExt);
// There is another diamond at bot ext that will end up discounting that path from being in the k best
graph.addEdges(() -> new BaseEdge(false, 15), botExt, botExtTop, refEnd);
graph.addEdges(() -> new BaseEdge(false, 18), botExt, botExtBot, refEnd);

// Wheras the path is smooth sailing from refEnd
graph.addEdges(() -> new BaseEdge(true, 65), midAndTopExt, refEnd);

@SuppressWarnings("all")
final List<KBestHaplotype> bestPaths = new KBestHaplotypeFinder(graph,refStart,refEnd).findBestHaplotypes(2);
Assert.assertEquals(bestPaths.size(), 2);
final Path<SeqVertex,BaseEdge> refPath = bestPaths.get(0);
final Path<SeqVertex,BaseEdge> altPath = bestPaths.get(1);

Assert.assertEquals(refPath.getVertices().toArray(new SeqVertex[0]), new SeqVertex[]{refStart, top, midAndTopExt, refEnd});
Assert.assertEquals(altPath.getVertices().toArray(new SeqVertex[0]), new SeqVertex[]{refStart, mid, midAndTopExt, refEnd});
}


@Test
public void testHardSWPath() {
// Construct the assembly graph
Expand Down