Skip to content

Commit

Permalink
M2 modifies base quals instead of discarding overlapping read pairs (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin authored Mar 13, 2019
1 parent 4ba5170 commit 811e72a
Show file tree
Hide file tree
Showing 14 changed files with 121 additions and 140 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,6 @@ private static void updateTable(final int[] table, final Allele allele, final GA
// a normal read with an actual strand
final boolean isForward = !read.isReverseStrand();
table[offset + (isForward ? 0 : 1)]++;

// This read's mate got discarded by SomaticGenotypingEngine::clipOverlappingReads()
if (read.hasAttribute(SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG)){
// Note that if this read is forward, then we increment its mate which we assume is reverse
table[offset + (isForward ? 1 : 0)]++;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ public abstract class AssemblyBasedCallerArgumentCollection {

public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score";
public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman";
public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality";

public ReadThreadingAssembler createReadThreadingAssembler() {
final ReadThreadingAssembler assemblyEngine = assemblerArgs.makeReadThreadingAssembler();
Expand Down Expand Up @@ -105,9 +104,6 @@ public ReadThreadingAssembler createReadThreadingAssembler() {
@Argument(fullName = SMITH_WATERMAN_LONG_NAME, doc = "Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice", optional = true)
public SmithWatermanAligner.Implementation smithWatermanImplementation = SmithWatermanAligner.Implementation.JAVA;

@Argument(fullName = CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME)
public boolean doNotCorrectOverlappingBaseQualities = false;

/**
* (BETA feature) The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference.
* This is similar to the HaplotypeCaller reference confidence/GVCF mode. See https://software.broadinstitute.org/gatk/documentation/article.php?id=4017 for information about GVCFs.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,25 +128,40 @@ public static void finalizeRegion(final AssemblyRegion region,
Collections.sort(readsToUse, new ReadCoordinateComparator(readsHeader)); // TODO: sort may be unnecessary here

// handle overlapping read pairs from the same fragment
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader, correctOverlappingBaseQualities);
if (correctOverlappingBaseQualities) {
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader, true, OptionalInt.empty(), OptionalInt.empty());
}

region.clearReads();
region.addAll(readsToUse);
region.setFinalized(true);
}

/**
* Clean up reads/bases that overlap within read pairs
* Modify base qualities when paired reads overlap to account for the possibility of PCR error.
*
* Overlapping mates provded independent evidence as far as sequencing error is concerned, but their PCR errors
* are correlated. The base qualities are thus limited by the sequencing base quality as well as half of the PCR
* quality. We use half of the PCR quality because downstream we treat read pairs as independent, and summing two halves
* effectively gives the PCR quality of the pairs when taken together.
*
* @param reads the list of reads to consider
* @param correctOverlappingBaseQualities
* @param samplesList list of samples
* @param readsHeader bam header of reads' source
* @param setConflictingToZero if true, set base qualities to zero when mates have different base at overlapping position
* @param halfOfPcrSnvQual half of phred-scaled quality of substitution errors from PCR
* @param halfOfPcrIndelQual half of phred-scaled quality of indel errors from PCR
*/
private static void cleanOverlappingReadPairs(final List<GATKRead> reads, final SampleList samplesList, final SAMFileHeader readsHeader,
final boolean correctOverlappingBaseQualities) {
public static void cleanOverlappingReadPairs(final List<GATKRead> reads, final SampleList samplesList, final SAMFileHeader readsHeader,
final boolean setConflictingToZero, final OptionalInt halfOfPcrSnvQual, final OptionalInt halfOfPcrIndelQual) {
Utils.nonNull(reads);
Utils.nonNull(samplesList);
Utils.nonNull(halfOfPcrSnvQual);
Utils.nonNull(halfOfPcrSnvQual);
for ( final List<GATKRead> perSampleReadList : splitReadsBySample(samplesList, readsHeader, reads).values() ) {
final FragmentCollection<GATKRead> fragmentCollection = FragmentCollection.create(perSampleReadList);
for ( final List<GATKRead> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair, correctOverlappingBaseQualities);
for ( final Pair<GATKRead, GATKRead> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair, setConflictingToZero, halfOfPcrSnvQual, halfOfPcrIndelQual);
}
}
}
Expand Down Expand Up @@ -251,8 +266,9 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
final Logger logger,
final ReferenceSequenceFile referenceReader,
final ReadThreadingAssembler assemblyEngine,
final SmithWatermanAligner aligner){
finalizeRegion(region, argumentCollection.assemblerArgs.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList, ! argumentCollection.doNotCorrectOverlappingBaseQualities);
final SmithWatermanAligner aligner,
final boolean correctOverlappingBaseQualities){
finalizeRegion(region, argumentCollection.assemblerArgs.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList, correctOverlappingBaseQualities);
if( argumentCollection.assemblerArgs.debugAssembly) {
logger.info("Assembling " + region.getSpan() + " with " + region.size() + " reads: (with overlap region = " + region.getExtendedSpan() + ")");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";
public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality";


@ArgumentCollection
public StandardCallerArgumentCollection standardArgs = new StandardCallerArgumentCollection();
Expand Down Expand Up @@ -150,4 +152,7 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
@Advanced
@Argument(fullName= USE_FILTERED_READS_FOR_ANNOTATIONS_LONG_NAME, doc = "Use the contamination-filtered read maps for the purposes of annotating variants", optional=true)
public boolean useFilteredReadMapForAnnotations = false;

@Argument(fullName = CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME)
public boolean doNotCorrectOverlappingBaseQualities = false;
}
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
}

// run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities);

final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance);
// TODO - line bellow might be unnecessary : it might be that assemblyResult will always have those alleles anyway
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String EMISSION_LOG_SHORT_NAME = "emit-lod";
public static final String INITIAL_TUMOR_LOD_LONG_NAME = "initial-tumor-lod";
public static final String INITIAL_TUMOR_LOD_SHORT_NAME = "init-lod";
public static final String INITIAL_PCR_ERROR_QUAL = "initial-pcr-qual";
public static final String MAX_POPULATION_AF_LONG_NAME = "max-population-af";
public static final String MAX_POPULATION_AF_SHORT_NAME = "max-af";
public static final String DOWNSAMPLING_STRIDE_LONG_NAME = "downsampling-stride";
Expand All @@ -48,6 +47,8 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME = "median-autosomal-coverage";
public static final String MITOCHONDRIA_MODE_LONG_NAME = "mitochondria-mode";
public static final String CALLABLE_DEPTH_LONG_NAME = "callable-depth";
public static final String PCR_SNV_QUAL_LONG_NAME = "pcr-snv-qual";
public static final String PCR_INDEL_QUAL_LONG_NAME = "pcr-indel-qual";

public static final double DEFAULT_AF_FOR_TUMOR_ONLY_CALLING = 5e-8;
public static final double DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING = 1e-6;
Expand Down Expand Up @@ -173,11 +174,12 @@ public double getInitialLod() {
return mitochondria && initialLod == DEFAULT_INITIAL_LOD ? DEFAULT_MITO_INITIAL_LOD : initialLod;
}

/**
* PCR error rate for overlapping fragments in isActive()
*/
@Argument(fullName = INITIAL_PCR_ERROR_QUAL, optional = true, doc = "PCR error rate for overlapping fragments in isActive()")
public int initialPCRErrorQual = 40;

@Argument(fullName = PCR_SNV_QUAL_LONG_NAME, optional = true, doc = "Phred-scaled PCR SNV qual for overlapping fragments")
public int pcrSnvQual = 40;

@Argument(fullName = PCR_INDEL_QUAL_LONG_NAME, optional = true, doc = "Phred-scaled PCR SNV qual for overlapping fragments")
public int pcrIndelQual = 40;

/**
* In tumor-only mode, we discard variants with population allele frequencies greater than this threshold.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.StandardMutectAnnotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingGivenAllelesUtils;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingOutputMode;
import org.broadinstitute.hellbender.tools.walkers.genotyper.HomogeneousPloidyModel;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
Expand Down Expand Up @@ -209,6 +208,10 @@ public void writeHeader(final VariantContextWriter vcfWriter, final Set<VCFHeade
}

public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegion, final ReferenceContext referenceContext, final FeatureContext featureContext ) {
// divide PCR qual by two in order to get the correct total qual when treating paired reads as independent
AssemblyBasedCallerUtils.cleanOverlappingReadPairs(originalAssemblyRegion.getReads(), samplesList, header,
false, OptionalInt.of(MTAC.pcrSnvQual /2), OptionalInt.of(MTAC.pcrIndelQual /2));

if ( !originalAssemblyRegion.isActive() || originalAssemblyRegion.size() == 0 ) {
return emitReferenceConfidence() ? referenceModelForNoVariation(originalAssemblyRegion) : NO_CALLS; //TODD: does this need to be finalized?
}
Expand All @@ -219,7 +222,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
.filter(vc -> MTAC.genotypeFilteredAlleles || vc.isNotFiltered()).collect(Collectors.toList());

final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner, false);

final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(MTAC.maxMnpDistance);
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion, allVariationEvents);
Expand Down Expand Up @@ -369,14 +372,14 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
callableSites.increment();
}
final ReadPileup tumorPileup = pileup.makeFilteredPileup(pe -> isTumorSample(ReadUtils.getSampleName(pe.getRead(), header)));
final List<Byte> tumorAltQuals = altQuals(tumorPileup, refBase, MTAC.initialPCRErrorQual);
final List<Byte> tumorAltQuals = altQuals(tumorPileup, refBase, MTAC.pcrSnvQual);
final double tumorLog10Odds = MathUtils.logToLog10(lnLikelihoodRatio(tumorPileup.size() - tumorAltQuals.size(), tumorAltQuals));

if (tumorLog10Odds < MTAC.getInitialLod()) {
return new ActivityProfileState(refInterval, 0.0);
} else if (hasNormal() && !MTAC.genotypeGermlineSites) {
final ReadPileup normalPileup = pileup.makeFilteredPileup(pe -> isNormalSample(ReadUtils.getSampleName(pe.getRead(), header)));
final List<Byte> normalAltQuals = altQuals(normalPileup, refBase, MTAC.initialPCRErrorQual);
final List<Byte> normalAltQuals = altQuals(normalPileup, refBase, MTAC.pcrSnvQual);
final int normalAltCount = normalAltQuals.size();
final double normalQualSum = normalAltQuals.stream().mapToDouble(Byte::doubleValue).sum();
if (normalAltCount > normalPileup.size() * MAX_ALT_FRACTION_IN_NORMAL && normalQualSum > MAX_NORMAL_QUAL_SUM) {
Expand Down Expand Up @@ -433,7 +436,8 @@ public boolean emitReferenceConfidence() {
* @return a list of variant contexts (can be empty) to emit for this ref region
*/
private List<VariantContext> referenceModelForNoVariation(final AssemblyRegion region) {
AssemblyBasedCallerUtils.finalizeRegion(region, false, true, (byte)9, header, samplesList, ! MTAC.doNotCorrectOverlappingBaseQualities); //take off soft clips and low Q tails before we calculate likelihoods
// don't correct overlapping base qualities because we did that upstream
AssemblyBasedCallerUtils.finalizeRegion(region, false, true, (byte)9, header, samplesList, false); //take off soft clips and low Q tails before we calculate likelihoods
final SimpleInterval paddedLoc = region.getExtendedSpan();
final Haplotype refHaplotype = AssemblyBasedCallerUtils.createReferenceHaplotype(region, paddedLoc, referenceReader);
final List<Haplotype> haplotypes = Collections.singletonList(refHaplotype);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ public class SomaticGenotypingEngine {
private final M2ArgumentCollection MTAC;
private final Set<String> normalSamples;
final boolean hasNormal;
public static final String DISCARDED_MATE_READ_TAG = "DM";
protected VariantAnnotatorEngine annotationEngine;

public SomaticGenotypingEngine(final M2ArgumentCollection MTAC, final Set<String> normalSamples, final VariantAnnotatorEngine annotationEngine) {
Expand Down Expand Up @@ -103,7 +102,6 @@ public CalledHaplotypes callMutations(
final Map<Allele, List<Haplotype>> alleleMapper = AssemblyBasedCallerUtils.createAlleleMapper(mergedVC, loc, haplotypes, null);
final ReadLikelihoods<Allele> log10Likelihoods = log10ReadLikelihoods.marginalize(alleleMapper,
new SimpleInterval(mergedVC).expandWithinContig(HaplotypeCallerGenotypingEngine.ALLELE_EXTENSION, header.getSequenceDictionary()));
filterOverlappingReads(log10Likelihoods, mergedVC.getReference(), loc, false);

if (emitRefConf) {
mergedVC = ReferenceConfidenceUtils.addNonRefSymbolicAllele(mergedVC);
Expand Down Expand Up @@ -298,51 +296,6 @@ private int getRefIndex(LikelihoodMatrix<Allele> matrix) {
return optionalRefIndex.getAsInt();
}

private void filterOverlappingReads(final ReadLikelihoods<Allele> likelihoods, final Allele ref, final int location, final boolean retainMismatches) {
for (final String sample : likelihoods.samples()) {
// Get the best alleles of each read and group them by the read name.
// This puts paired reads from the same fragment together
final Map<String, List<ReadLikelihoods<Allele>.BestAllele>> fragments = likelihoods.bestAllelesBreakingTies(sample).stream()
.collect(Collectors.groupingBy(ba -> ba.read.getName()));

// We only potentially filter read pairs that overlap at this position
final List<Pair<ReadLikelihoods<Allele>.BestAllele, ReadLikelihoods<Allele>.BestAllele>> overlappingReadPairs =
fragments.values().stream()
.filter(l -> l.size() == 2)
.map(l -> new ImmutablePair<>(l.get(0), l.get(1)))
.filter(p -> ReadUtils.isInsideRead(p.getLeft().read, location) && ReadUtils.isInsideRead(p.getRight().read, location))
.collect(Collectors.toList());

final Set<GATKRead> readsToDiscard = new HashSet<>();

for (final Pair<ReadLikelihoods<Allele>.BestAllele, ReadLikelihoods<Allele>.BestAllele> pair : overlappingReadPairs) {
final ReadLikelihoods<Allele>.BestAllele read = pair.getLeft();
final ReadLikelihoods<Allele>.BestAllele mate = pair.getRight();

if (read.allele.equals(mate.allele)) {
// keep the higher-quality read
readsToDiscard.add(read.likelihood < mate.likelihood ? read.read : mate.read);

// mark the read to indicate that its mate was dropped - so that we can account for it in {@link StrandArtifactFilter}
// and {@link StrandBiasBySample}
if (MTAC.annotateBasedOnReads){
final GATKRead readToKeep = read.likelihood >= mate.likelihood ? read.read : mate.read;
readToKeep.setAttribute(DISCARDED_MATE_READ_TAG, 1);
}
} else if (retainMismatches) {
// keep the alt read
readsToDiscard.add(read.allele.equals(ref) ? read.read : mate.read);
} else {
// throw out both
readsToDiscard.add(read.read);
readsToDiscard.add(mate.read);
}
}

likelihoods.removeSampleReads(likelihoods.indexOfSample(sample), readsToDiscard, likelihoods.numberOfAlleles());
}
}

//convert a likelihood matrix of alleles x reads into a RealMatrix
public static RealMatrix getAsRealMatrix(final LikelihoodMatrix<Allele> matrix) {
final RealMatrix result = new Array2DRowRealMatrix(matrix.numberOfAlleles(), matrix.numberOfReads());
Expand Down
Loading

0 comments on commit 811e72a

Please sign in to comment.