Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle unmarked duplicates with mate MQ = 0 in Mutect2 #5734

Merged
merged 3 commits into from
Mar 1, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.math3.special.Beta;
import org.apache.commons.math3.special.Gamma;
import org.apache.commons.math3.util.FastMath;
Expand Down Expand Up @@ -73,6 +74,8 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator {
public static final int MAX_NORMAL_QUAL_SUM = 100;
public static final int MIN_PALINDROME_SIZE = 5;

public static final int HUGE_FRAGMENT_LENGTH = 1_000_000;

private M2ArgumentCollection MTAC;
private SAMFileHeader header;

Expand Down Expand Up @@ -198,6 +201,8 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
return emitReferenceConfidence() ? referenceModelForNoVariation(originalAssemblyRegion) : NO_CALLS; //TODD: does this need to be finalized?
}

removeUnmarkedDuplicates(originalAssemblyRegion);

final List<VariantContext> givenAlleles = MTAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ?
featureContext.getValues(MTAC.alleles).stream().filter(vc -> MTAC.genotypeFilteredAlleles || vc.isNotFiltered()).collect(Collectors.toList()) :
Collections.emptyList();
Expand All @@ -220,9 +225,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
}

final AssemblyRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
final List<GATKRead> readStubs = regionForGenotyping.getReads().stream()
.filter(r -> r.getLength() < AssemblyBasedCallerUtils.MINIMUM_READ_LENGTH_AFTER_TRIMMING).collect(Collectors.toList());
regionForGenotyping.removeAll(readStubs);
removeReadStubs(regionForGenotyping);

final Map<String,List<GATKRead>> reads = splitReadsBySample( regionForGenotyping.getReads() );

Expand Down Expand Up @@ -260,6 +263,30 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
}
}

private void removeReadStubs(final AssemblyRegion assemblyRegion) {
final List<GATKRead> readStubs = assemblyRegion.getReads().stream()
.filter(r -> r.getLength() < AssemblyBasedCallerUtils.MINIMUM_READ_LENGTH_AFTER_TRIMMING).collect(Collectors.toList());
assemblyRegion.removeAll(readStubs);
}

private void removeUnmarkedDuplicates(final AssemblyRegion assemblyRegion) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it considers:

  • SVs: This code checks whether the mates are clustered, even if on a different contig from the supporting reads of the variant.
  • Duplicate start and end positions of supporting reads.

Please tell me if I have misinterpreted. If I have not misinterpreted, then no action required.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's right.

// PCR duplicates whose mates have MQ = 0 won't necessarily be marked as duplicates because
// the mates map randomly to many different places
final Map<ImmutablePair<String, Integer>, List<GATKRead>> possibleDuplicates = assemblyRegion.getReads().stream()
.filter(read -> read.isPaired() && !read.mateIsUnmapped() &&
(!read.getMateContig().equals(read.getContig()) || Math.abs(read.getFragmentLength()) > HUGE_FRAGMENT_LENGTH))
.collect(Collectors.groupingBy(
read -> ImmutablePair.of(ReadUtils.getSampleName(read, header), (read.isFirstOfPair() ? 1 : -1) * read.getUnclippedStart())));

final List<GATKRead> duplicates = possibleDuplicates.values().stream().flatMap(list -> {
final Map<String, List<GATKRead>> readsByContig = list.stream().collect(Collectors.groupingBy(GATKRead::getMateContig));
// if no clear best contig, they're almost certainly all mapping errors (in addition to all but one being duplicates)
// so we toss them all. Otherwise we toss all but one
return readsByContig.values().stream().flatMap(contigReads -> contigReads.stream().skip(contigReads.size() > list.size() / 2 ? 1 : 0));
}).collect(Collectors.toList());
assemblyRegion.removeAll(duplicates);
}

//TODO: refactor this
private boolean containsCalls(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes) {
return calledHaplotypes.getCalls().stream()
Expand Down