-
Notifications
You must be signed in to change notification settings - Fork 589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
The rest of the mitochondria joint calling pipeline #5673
Changes from 6 commits
5cdebaa
9e33107
19d3261
a566ac8
09c9488
1a56b6e
b364b5c
ca3a2f3
84abe1e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,9 @@ | |
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality; | ||
import org.broadinstitute.hellbender.tools.walkers.genotyper.*; | ||
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider; | ||
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection; | ||
import org.broadinstitute.hellbender.tools.walkers.mutect.M2FiltersArgumentCollection; | ||
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; | ||
import org.broadinstitute.hellbender.utils.SimpleInterval; | ||
import org.broadinstitute.hellbender.utils.Utils; | ||
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; | ||
|
@@ -92,14 +95,38 @@ public final class GenotypeGVCFs extends VariantLocusWalker { | |
public static final String ALL_SITES_LONG_NAME = "include-non-variant-sites"; | ||
public static final String ALL_SITES_SHORT_NAME = "all-sites"; | ||
private static final String GVCF_BLOCK = "GVCFBlock"; | ||
private VCFHeader outputHeader; | ||
|
||
|
||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, | ||
doc="File to which variants should be written", optional=false) | ||
private File outputFile; | ||
|
||
@Argument(fullName=ALL_SITES_LONG_NAME, shortName="all-sites", doc="Include loci found to be non-variant after genotyping", optional=true) | ||
@Argument(fullName=ALL_SITES_LONG_NAME, shortName=ALL_SITES_SHORT_NAME, doc="Include loci found to be non-variant after genotyping", optional=true) | ||
private boolean includeNonVariants = false; | ||
|
||
/** | ||
* "Genotype" somatic GVCFs, outputting genotypes according to confidently called alt alleles, which may lead to inconsistent ploidy | ||
* Note that the Mutect2 reference confidence mode is in BETA -- the likelihoods model and output format are subject to change in subsequent versions. | ||
*/ | ||
@Argument(fullName= CombineGVCFs.SOMATIC_INPUT_LONG_NAME, doc = "Finalize input GVCF according to somatic (i.e. Mutect2) TLODs (BETA feature)") | ||
protected boolean somaticInput = false; | ||
|
||
/** | ||
* Only variants with tumor LODs exceeding this threshold will be written to the VCF, regardless of filter status. | ||
* Set to less than or equal to tumor_lod. Increase argument value to reduce false positives in the callset. | ||
*/ | ||
@Argument(fullName=M2ArgumentCollection.EMISSION_LOD_LONG_NAME, shortName = M2ArgumentCollection.EMISSION_LOG_SHORT_NAME, | ||
doc = "LOD threshold to emit variant to VCF.") | ||
protected double tlodThreshold = 0.8 * M2FiltersArgumentCollection.DEFAULT_TLOD_FILTER_THRESHOLD; //allow for some lower quality variants | ||
|
||
|
||
/** | ||
* Margin of error in allele fraction to consider a somatic variant homoplasmic, i.e. if there is less than a 0.1% reference allele fraction, those reads are likely errors | ||
*/ | ||
@Argument(fullName=CombineGVCFs.ALLELE_FRACTION_DELTA_LONG_NAME, doc = "Margin of error in allele fraction to consider a somatic variant homoplasmic") | ||
protected double afTolerance = 1e-3; //based on Q30 as a "good" base quality score | ||
|
||
@ArgumentCollection | ||
private GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection(); | ||
|
||
|
@@ -152,6 +179,10 @@ public List<Class<? extends Annotation>> getDefaultVariantAnnotationGroups() { | |
|
||
@Override | ||
public void onTraversalStart() { | ||
if (somaticInput) { | ||
logger.warn("Note that the Mutect2 reference confidence mode is in BETA -- the likelihoods model and output format are subject to change in subsequent versions."); | ||
} | ||
|
||
if (!includeNonVariants) { | ||
changeTraversalModeToByVariant(); | ||
} | ||
|
@@ -185,7 +216,7 @@ public void onTraversalStart() { | |
// We only want the engine to generate the AS_QUAL key if we are using AlleleSpecific annotations. | ||
genotypingEngine = new MinimalGenotypingEngine(createUAC(), samples, new GeneralPloidyFailOverAFCalculatorProvider(genotypeArgs), annotationEngine.isRequestedReducibleRawKey(GATKVCFConstants.AS_QUAL_KEY)); | ||
|
||
merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants()); | ||
merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput); | ||
|
||
if ( includeNonVariants ) { | ||
// Save INFO header names that require alt alleles | ||
|
@@ -205,7 +236,7 @@ private static boolean annotationShouldBeSkippedForHomRefSites(VariantAnnotation | |
return annotation instanceof RankSumTest || annotation instanceof RMSMappingQuality || annotation instanceof AS_RMSMappingQuality; | ||
} | ||
|
||
private void setupVCFWriter(VCFHeader inputVCFHeader, SampleList samples) { | ||
private void setupVCFWriter(final VCFHeader inputVCFHeader, final SampleList samples) { | ||
final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder()); | ||
headerLines.addAll(getDefaultToolVCFHeaderLines()); | ||
|
||
|
@@ -227,8 +258,8 @@ private void setupVCFWriter(VCFHeader inputVCFHeader, SampleList samples) { | |
vcfWriter = createVCFWriter(outputFile); | ||
|
||
final Set<String> sampleNameSet = samples.asSetOfSamples(); | ||
final VCFHeader vcfHeader = new VCFHeader(headerLines, new TreeSet<>(sampleNameSet)); | ||
vcfWriter.writeHeader(vcfHeader); | ||
outputHeader = new VCFHeader(headerLines, new TreeSet<>(sampleNameSet)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you inline this and remove it from the class's members? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nope, I need it for some of the per-allele annotation subsetting magic later on. |
||
vcfWriter.writeHeader(outputHeader); | ||
} | ||
|
||
@Override | ||
|
@@ -238,7 +269,8 @@ public void apply(final Locatable loc, List<VariantContext> variants, ReadsConte | |
|
||
ref.setWindow(10, 10); //TODO this matches the gatk3 behavior but may be unnecessary | ||
final VariantContext mergedVC = merger.merge(variantsToProcess, loc, includeNonVariants ? ref.getBase() : null, !includeNonVariants, false); | ||
final VariantContext regenotypedVC = regenotypeVC(mergedVC, ref, features, includeNonVariants); | ||
final VariantContext regenotypedVC = somaticInput ? regenotypeSomaticVC(mergedVC, ref, features, includeNonVariants) : | ||
regenotypeVC(mergedVC, ref, features, includeNonVariants); | ||
if (regenotypedVC != null) { | ||
final SimpleInterval variantStart = new SimpleInterval(regenotypedVC.getContig(), regenotypedVC.getStart(), regenotypedVC.getStart()); | ||
if (!GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC) && | ||
|
@@ -363,6 +395,102 @@ private VariantContext removeNonRefAlleles(final VariantContext vc) { | |
} | ||
} | ||
|
||
/** | ||
* Re-genotype (and re-annotate) a combined genomic VC | ||
* @return a new VariantContext or null if the site turned monomorphic and we don't want such sites | ||
*/ | ||
private VariantContext regenotypeSomaticVC(final VariantContext originalVC, final ReferenceContext ref, final FeatureContext features, boolean includeNonVariants) { | ||
Utils.nonNull(originalVC); | ||
|
||
final VariantContext result; | ||
if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My vote is to replace this all with
which might strike you as ghastly. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nope, nested ternaries violate my personal style code. |
||
result = callSomaticGenotypes(originalVC); | ||
} else if (includeNonVariants) { | ||
result = originalVC; | ||
} else { | ||
result = null; | ||
} | ||
return result; | ||
} | ||
|
||
|
||
/** | ||
* Drop low quality alleles and call genotypes | ||
* CombineGVCFs will convert calls to no-call (of varying ploidy, as is the case in somatic) | ||
* | ||
* @param vc input VariantContext with no-called genotypes | ||
* @return a VC with called genotypes and low quality alleles removed, may be null | ||
*/ | ||
private VariantContext callSomaticGenotypes(final VariantContext vc) { | ||
final List<Genotype> newGenotypes = new ArrayList<>(); | ||
final GenotypesContext genotypes = vc.getGenotypes(); | ||
final double[] perAlleleLikelihoodSums = new double[vc.getAlleles().size()]; //needs the ref for the subsetting utils | ||
|
||
for(final Genotype g : genotypes) { | ||
GenotypeBuilder gb = new GenotypeBuilder(g); | ||
final double[] tlodArray = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.TUMOR_LOD_KEY, () -> null, 0.0); | ||
final double[] variantAFArray = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 0.0); | ||
double variantAFtotal = 0; | ||
final List<Allele> calledAlleles = new ArrayList<>(); | ||
for(int i = 0; i < vc.getAlleles().size()-1; i++) { | ||
variantAFtotal += variantAFArray[i]; | ||
if (tlodArray[i] > tlodThreshold) { | ||
calledAlleles.add(vc.getAlternateAllele(i)); | ||
perAlleleLikelihoodSums[i+1] += tlodArray[i]; | ||
} | ||
} | ||
//hack for weird Mutect2 ploidy -- if the variant is non-homoplasmic, call the reference allele too | ||
if(variantAFtotal < 1-afTolerance && !(g.hasAD() && g.getAD()[0] > 0 )) { | ||
calledAlleles.add(0, vc.getReference()); | ||
} | ||
//"ploidy" gets set according to the size of the alleles List in the Genotype | ||
gb.alleles(calledAlleles); | ||
newGenotypes.add(gb.make()); | ||
} | ||
|
||
final VariantContextBuilder builder = new VariantContextBuilder(vc); | ||
final VariantContext regenotypedVC = builder.genotypes(newGenotypes).make(); | ||
|
||
final int maxAltAlleles = ((UnifiedArgumentCollection)genotypingEngine.getConfiguration()).genotypeArgs.MAX_ALTERNATE_ALLELES; | ||
List<Allele> allelesToKeep; | ||
|
||
//we need to make sure all alleles pass the tlodThreshold | ||
allelesToKeep = new ArrayList<>(perAlleleLikelihoodSums.length-1); | ||
allelesToKeep.add(vc.getReference()); | ||
for (int i = 1; i < perAlleleLikelihoodSums.length; i++) { | ||
if (perAlleleLikelihoodSums[i] > tlodThreshold) { | ||
allelesToKeep.add(vc.getAlternateAllele(i-1)); | ||
} | ||
} | ||
|
||
if (regenotypedVC.getAlternateAlleles().size() > maxAltAlleles) { | ||
allelesToKeep = AlleleSubsettingUtils.filterToMaxNumberOfAltAllelesBasedOnScores(maxAltAlleles, allelesToKeep, perAlleleLikelihoodSums); | ||
} | ||
|
||
if (allelesToKeep.size() == 1) { | ||
return null; | ||
} | ||
|
||
//if we didn't drop alleles then we're done! | ||
if (allelesToKeep.size() == regenotypedVC.getAlleles().size()) { | ||
return regenotypedVC; | ||
} | ||
|
||
final int[] relevantIndices = allelesToKeep.stream().mapToInt(a -> regenotypedVC.getAlleles().indexOf(a)).toArray(); | ||
|
||
//do another pass over genotypes to drop the alleles that aren't called | ||
final GenotypesContext reducedGenotypes = AlleleSubsettingUtils.subsetSomaticAlleles(outputHeader, regenotypedVC.getGenotypes(), allelesToKeep, relevantIndices); | ||
final VariantContext subsetVC = builder.alleles(allelesToKeep).genotypes(reducedGenotypes).make(); | ||
final VariantContext trimmedVC = GATKVariantContextUtils.trimAlleles(subsetVC, true, true); | ||
if (isProperlyPolymorphic(trimmedVC)) { | ||
return trimmedVC; | ||
} | ||
else { | ||
return null; | ||
} | ||
} | ||
|
||
|
||
/** | ||
* Determines whether the provided VariantContext has real alternate alleles. | ||
* | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So the current plan is to filter at the single sample level and then merge with no callset level filters, is that right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There will be callset-level filters. If any sample has a PASS variant the site will be PASS. If all variants are filtered the site will get all the filters. The original filters will go to the genotype filter (GF) field.