Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changed flow allele filtering in Mutect2 to work separately on the tumor and the normal samples #8855

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.variant.variantcontext.Allele;
import org.apache.commons.lang3.tuple.ImmutablePair;
Expand All @@ -11,6 +13,7 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.StrandOddsRatio;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.InverseAllele;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Event;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
Expand Down Expand Up @@ -40,14 +43,19 @@
*/

public abstract class AlleleFiltering {

final protected static Logger logger = LogManager.getLogger(AlleleFiltering.class);
final protected AssemblyBasedCallerArgumentCollection assemblyArgs;
final private OutputStreamWriter assemblyDebugOutStream;
// Sequence dictionary from the input header; used to clamp per-allele interval
// expansion so it stays within contig bounds.
final private SAMSequenceDictionary sequenceDictionary;

/**
 * @param assemblyArgs           assembly-based caller arguments controlling allele filtering
 * @param assemblyDebugOutStream optional debug output stream (callers may pass null)
 * @param header                 SAM file header; its sequence dictionary bounds interval expansion
 */
AlleleFiltering(final AssemblyBasedCallerArgumentCollection assemblyArgs,
                final OutputStreamWriter assemblyDebugOutStream,
                final SAMFileHeader header){
    this.assemblyArgs = assemblyArgs;
    this.assemblyDebugOutStream = assemblyDebugOutStream;
    // Cache the dictionary once so per-allele expansion doesn't re-read the header.
    this.sequenceDictionary = header.getSequenceDictionary();
}
protected abstract double getStringentQuality();

/**
* Finds alleles that are likely not contributing much to explaining the data and remove the haplotypes
Expand Down Expand Up @@ -177,8 +185,6 @@ private AlleleLikelihoods<GATKRead, Haplotype> subsetHaplotypesByAlleles(final A
}
logger.debug("AHM::printout end");



final List<AlleleLikelihoods<GATKRead, Allele>> alleleLikelihoods =
activeAlleles.stream().map(al -> getAlleleLikelihoodMatrix(readLikelihoods, al,
haplotypeAlleleMap, activeHaplotypes)).collect(Collectors.toList());
Expand All @@ -199,8 +205,7 @@ private AlleleLikelihoods<GATKRead, Haplotype> subsetHaplotypesByAlleles(final A
// the very weak quality is hardcoded
final List<Event> filteringCandidatesStringent = identifyBadAlleles(collectedRPLs,
collectedSORs,
activeAlleles,
1,
activeAlleles, getStringentQuality(),
Integer.MAX_VALUE);


Expand Down Expand Up @@ -334,8 +339,10 @@ private AlleleLikelihoods<GATKRead, Allele> getAlleleLikelihoodMatrix(final Alle
final Map<Haplotype, Collection<Event>> haplotypeAlleleMap,
final Set<Haplotype> enabledHaplotypes
){

final Map<Allele,List<Haplotype>> alleleHaplotypeMap = new CollectionUtil.DefaultingMap<>((k) -> new ArrayList<>(), true);


final Allele notAllele= InverseAllele.of(allele.altAllele(), true);
readLikelihoods.alleles().stream().filter(enabledHaplotypes::contains)
.filter(h->haplotypeAlleleMap.get(h).contains(allele))
Expand All @@ -345,6 +352,9 @@ private AlleleLikelihoods<GATKRead, Allele> getAlleleLikelihoodMatrix(final Alle
.forEach(alleleHaplotypeMap.get(notAllele)::add);

final AlleleLikelihoods<GATKRead, Allele> alleleLikelihoods = readLikelihoods.marginalize(alleleHaplotypeMap);
final SimpleInterval variantCallingRelevantFragmentOverlap = new SimpleInterval(allele).expandWithinContig(assemblyArgs.informativeReadOverlapMargin, sequenceDictionary);
alleleLikelihoods.retainEvidence(variantCallingRelevantFragmentOverlap::overlaps);

logger.debug(() -> String.format("GALM: %s %d %d", allele, alleleHaplotypeMap.get(allele.altAllele()).size(), alleleHaplotypeMap.get(notAllele).size()));
return alleleLikelihoods;
}
Expand All @@ -358,7 +368,6 @@ private double getAlleleSOR(final AlleleLikelihoods<GATKRead, Allele> alleleLike
final double sor = StrandOddsRatio.calculateSOR(contingency_table);
logger.debug(() -> String.format("GAS:: %s: %f (%d %d %d %d)", allele.toString(), sor, contingency_table[0][0], contingency_table[0][1], contingency_table[1][0], contingency_table[1][1]));
return sor;

}

//filters pairs of alleles by distance
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.AFCalculationResult;
Expand Down Expand Up @@ -29,13 +30,15 @@ public class AlleleFilteringHC extends AlleleFiltering {
private HaplotypeCallerGenotypingEngine genotypingEngine;
private AlleleFrequencyCalculator afCalc;

/**
 * @param _hcargs             HaplotypeCaller argument collection, forwarded to the base class
 * @param assemblyDebugStream optional assembly debug output stream
 * @param _genotypingEngine   genotyping engine supplying the allele-frequency calculator configuration
 * @param header              SAM file header, forwarded for sequence-dictionary access
 */
public AlleleFilteringHC(HaplotypeCallerArgumentCollection _hcargs, OutputStreamWriter assemblyDebugStream,
                         HaplotypeCallerGenotypingEngine _genotypingEngine, final SAMFileHeader header){
    super(_hcargs, assemblyDebugStream, header);
    genotypingEngine = _genotypingEngine;
    GenotypeCalculationArgumentCollection config = genotypingEngine.getConfiguration().genotypeArgs;
    afCalc = AlleleFrequencyCalculator.makeCalculator(config);
}

// Germline (HaplotypeCaller) threshold for the stringent "very weak quality" allele filter.
protected double getStringentQuality() { return 1; }
/**
* Calculate genotype likelihood of requirement of an allele. Specifically, calculates the likelihood
* of the data given that allele versus the likelihood of the data when all haplotypes containing the allele are removed
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.InverseAllele;
import org.broadinstitute.hellbender.tools.walkers.mutect.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.AlleleList;
import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
import org.broadinstitute.hellbender.utils.haplotype.Event;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.io.OutputStreamWriter;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand All @@ -25,19 +30,27 @@

public class AlleleFilteringMutect extends AlleleFiltering {
private SomaticGenotypingEngine genotypingEngine;
private Set<String> normalSamples;

/**
 * @param _m2args             Mutect2 argument collection, forwarded to the base class
 * @param assemblyDebugStream optional assembly debug output stream (callers may pass null)
 * @param _genotypingEngine   somatic genotyping engine used to compute allele log odds
 * @param _normalSamples      names of the normal samples; tumor and normal log odds are computed separately
 * @param header              SAM file header, forwarded for sequence-dictionary access
 */
public AlleleFilteringMutect(M2ArgumentCollection _m2args,
                             OutputStreamWriter assemblyDebugStream,
                             SomaticGenotypingEngine _genotypingEngine,
                             Set<String> _normalSamples,
                             final SAMFileHeader header){
    super(_m2args, assemblyDebugStream, header);
    genotypingEngine = _genotypingEngine;
    normalSamples = _normalSamples;
}

// Somatic (Mutect2) threshold for the stringent allele filter; far more permissive
// than the germline value of 1 used in AlleleFilteringHC.
protected double getStringentQuality() { return -50; }

/**
* Calculates the genotype likelihood of requiring an allele. Specifically, calculates the likelihood
* of the data given that allele versus the likelihood of the data when all haplotypes containing the allele are removed
* This is very similar to what is done in the callMutations function in MutectEngine, but here the haplotypes that do
* not support the allele are all haplotypes that do not contain the allele rather than only the haplotypes that support reference
* etc.
* The log odds is calculated separately for the tumor and the normal samples and the smallest (the strongest) is returned
*
* @param alleleLikelihoods
* @param allele
Expand All @@ -48,13 +61,28 @@ public AlleleFilteringMutect(M2ArgumentCollection _m2args,
/**
 * Computes the log odds of the allele versus its inverse, evaluated separately on the
 * tumor samples and (when present) the normal samples; the smaller — i.e. stronger —
 * of the two log odds is used. Returns the value scaled by 10 and truncated to int.
 *
 * @param alleleLikelihoods per-sample read-by-allele likelihood matrices
 * @param allele            the allele being evaluated against its inverse
 * @return 10 * min(tumor log odds, normal log odds), truncated to int
 */
int getAlleleLikelihoodVsInverse(final AlleleLikelihoods<GATKRead, Allele> alleleLikelihoods, Allele allele) {

    // Combined matrix over the tumor samples only (everything not in normalSamples).
    // NOTE(review): assumes at least one tumor sample exists — allMatrices.get(0)
    // would throw otherwise; confirm callers guarantee this.
    final List<LikelihoodMatrix<GATKRead, Allele>> allMatrices = IntStream.range(0, alleleLikelihoods.numberOfSamples())
            .filter(i -> !normalSamples.contains(alleleLikelihoods.getSample(i)))
            .mapToObj(alleleLikelihoods::sampleMatrix)
            .collect(Collectors.toList());
    final AlleleList<Allele> alleleList = allMatrices.get(0);
    final LikelihoodMatrix<GATKRead, Allele> logAllMatrix = SomaticGenotypingEngine.combinedLikelihoodMatrix(allMatrices, alleleList);
    final double alleleLogOddsTumor = somaticAltLogOdds(logAllMatrix);
    logger.debug(() -> String.format("GAL:: Tumor %s: %f", allele.toString(), alleleLogOddsTumor));
    double alleleLogOdds = alleleLogOddsTumor;
    if (!normalSamples.isEmpty()) {
        // Combined matrix over the normal samples, marginalized against the same allele list.
        final List<LikelihoodMatrix<GATKRead, Allele>> allMatricesNormal = IntStream.range(0, alleleLikelihoods.numberOfSamples())
                .filter(i -> normalSamples.contains(alleleLikelihoods.getSample(i)))
                .mapToObj(alleleLikelihoods::sampleMatrix)
                .collect(Collectors.toList());
        final LikelihoodMatrix<GATKRead, Allele> logAllMatrixNormal = SomaticGenotypingEngine.combinedLikelihoodMatrix(allMatricesNormal, alleleList);
        final double alleleLogOddsNormal = somaticAltLogOdds(logAllMatrixNormal);
        logger.debug(() -> String.format("GAL:: Normal %s: %f", allele.toString(), alleleLogOddsNormal));
        // The smaller log odds is the stronger evidence; keep it.
        alleleLogOdds = Math.min(alleleLogOddsTumor, alleleLogOddsNormal);
    }
    return (int)(10*alleleLogOdds);
}


Expand All @@ -69,6 +97,7 @@ int getAlleleLikelihoodVsInverse(final AlleleLikelihoods<GATKRead, Allele> allel
private double somaticAltLogOdds(final LikelihoodMatrix<GATKRead, Allele> matrix) {

final LikelihoodMatrix<GATKRead, Allele> initialMatrix = matrix;

if (matrix.getAllele(1-matrix.indexOfReference()) instanceof InverseAllele){
throw new GATKException.ShouldNeverReachHereException("Inverse allele removed in filtering");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -926,14 +926,16 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
if (hcArgs.filterAlleles) {
logger.debug("Filtering alleles");

AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine);
AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine, readsHeader);
//need to update haplotypes to find the alleles
EventMap.buildEventMapsForHaplotypes(readLikelihoods.alleles(),
assemblyResult.getFullReferenceWithPadding(),
assemblyResult.getPaddedReferenceLoc(),
hcArgs.assemblerArgs.debugAssembly,
hcArgs.maxMnpDistance);
subsettedReadLikelihoodsFinal = alleleFilter.filterAlleles(readLikelihoods, assemblyResult.getPaddedReferenceLoc().getStart(), suspiciousLocations);
subsettedReadLikelihoodsFinal = alleleFilter.filterAlleles(readLikelihoods,
assemblyResult.getPaddedReferenceLoc().getStart(),
suspiciousLocations);

} else {
subsettedReadLikelihoodsFinal = readLikelihoods;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ private void filter(final CallRegionContext context) {
context.suspiciousLocations = new HashSet<>();
if (hcArgs.filterAlleles) {
logger.debug("Filtering alleles");
AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine);
AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine, readsHeader);
//need to update haplotypes to find the alleles
EventMap.buildEventMapsForHaplotypes(context.readLikelihoods.alleles(),
context.assemblyResult.getFullReferenceWithPadding(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
Set<Integer> suspiciousLocations = new HashSet<>();
if (MTAC.filterAlleles) {
logger.debug("Filtering alleles");
AlleleFilteringMutect alleleFilter = new AlleleFilteringMutect(MTAC, null, genotypingEngine);
AlleleFilteringMutect alleleFilter = new AlleleFilteringMutect(MTAC, null, genotypingEngine, normalSamples, header);
EventMap.buildEventMapsForHaplotypes(uncollapsedReadLikelihoods.alleles(),
assemblyResult.getFullReferenceWithPadding(),
assemblyResult.getPaddedReferenceLoc(),
Expand Down
Loading
Loading