Skip to content

Commit

Permalink
Tools, filters and annotations for mitochodnria pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
meganshand committed Oct 15, 2018
1 parent 8103bde commit b158b82
Show file tree
Hide file tree
Showing 27 changed files with 646 additions and 87 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import htsjdk.samtools.Cigar;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.tools.AddOriginalAlignmentTags;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.CigarUtils;
Expand Down Expand Up @@ -263,6 +264,20 @@ public static class ValidAlignmentEndReadFilter extends ReadFilter {
@Override public boolean test(final GATKRead read) {
return read.isUnmapped() || (read.getEnd() - read.getStart() + 1) >= 0;}}

/**
* If original alignment and mate original alignment tags exist, filter reads that were originally chimeric (mates were on different contigs).
*/
@DocumentedFeature(groupName=HelpConstants.DOC_CAT_READFILTERS, groupSummary = HelpConstants.DOC_CAT_READFILTERS_SUMMARY, summary = "Filters reads whose original alignment was chimeric.")
public static class NonChimericOriginalAlignmentReadFilter extends ReadFilter {
private static final long serialVersionUID = 1L;
@Override public boolean test(final GATKRead read) {
if (read.hasAttribute(AddOriginalAlignmentTags.OA_TAG_NAME) && read.hasAttribute(AddOriginalAlignmentTags.MATE_CONTIG_TAG_NAME)) {
return AddOriginalAlignmentTags.getOAContig(read).equals(read.getAttributeAsString(AddOriginalAlignmentTags.MATE_CONTIG_TAG_NAME));
}
return true;
}
}

/**
* Static, stateless read filter instances
*/
Expand Down Expand Up @@ -291,5 +306,6 @@ public static class ValidAlignmentEndReadFilter extends ReadFilter {
public static final SeqIsStoredReadFilter SEQ_IS_STORED = new SeqIsStoredReadFilter();
public static final ValidAlignmentStartReadFilter VALID_ALIGNMENT_START = new ValidAlignmentStartReadFilter();
public static final ValidAlignmentEndReadFilter VALID_ALIGNMENT_END = new ValidAlignmentEndReadFilter();
public static final NonChimericOriginalAlignmentReadFilter NON_CHIMERIC_ORIGINAL_ALIGNMENT_READ_FILTER = new NonChimericOriginalAlignmentReadFilter();

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.SAMTag;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

@CommandLineProgramProperties(
summary = "Adds Original Alignment tag and original mate contig tag",
oneLineSummary = "Adds Original Alignment tag and original mate contig tag",
programGroup = ReadDataManipulationProgramGroup.class
)
@ExperimentalFeature
public class AddOriginalAlignmentTags extends ReadWalker {
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="Write output to this file")
public String output;
private SAMFileGATKReadWriter outputWriter;

public final static String MATE_CONTIG_TAG_NAME = "XM";
public final static String OA_TAG_NAME = "OA";
public final static String OA_SEPARATOR = ",";

@Override
public void onTraversalStart() {
outputWriter = createSAMWriter(IOUtils.getPath(output), true);
}
@Override
public void apply(GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext) {
addOATag(read);
addMateContigTag(read);
outputWriter.addRead(read);
}
@Override
public void closeTool() {
if ( outputWriter != null ) {
outputWriter.close();
}
}

private static void addMateContigTag(final GATKRead read) {
read.setAttribute(MATE_CONTIG_TAG_NAME, !read.mateIsUnmapped() ? read.getMateContig() : "*");
}

//TODO: Once the OA tag is merged into the spec (https://github.com/samtools/hts-specs/pull/193) this should be moved to htsjdk
private static void addOATag(final GATKRead read) {
String OAValue;
if(!read.isUnmapped()){
OAValue = String.format("%s,%s,%s,%s,%s,%s;",
read.getContig().replace(OA_SEPARATOR, "_"),
read.getStart(),
read.isReverseStrand() ? "-" : "+",
read.getCigar().toString(),
read.getMappingQuality(),
read.getAttributeAsString(SAMTag.NM.name()));
} else {
OAValue = "*,0,*,*,0,0;";
}

read.setAttribute(OA_TAG_NAME, OAValue);
}

public static String getOAContig (final GATKRead read) {
return(read.getAttributeAsString(OA_TAG_NAME).split(OA_SEPARATOR)[0]);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.AddOriginalAlignmentTags;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

/**
* Original Alignment annotation counts the number of alt reads where the original alignment contig doesn't match the current alignment contig
*
* <p>If reads were realigned to multiple references (for example the full human reference followed by just the
* mitochondrial contig) and the original alignment tag was recorded before realigning, then we can count the number of alt
* reads that have been realigned from other contigs to this one. This can be useful for downstream filtering if an alt
* allele has all or most of its support originally from a different contig. In the mitochondrial case this can be useful
* for filtering known NuMTs that are present in other contigs in the reference. </p>
*
* <h3>Caveat</h3>
* <p>This annotation can only be calculated if an OA tag is present in the bam and can only be run by Mutect2.</p>
*
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Number of alt reads with an OA tag that doesn't match the current alignment contig.")
public class OriginalAlignment extends GenotypeAnnotation implements StandardMutectAnnotation {
protected final OneShotLogger warning = new OneShotLogger(this.getClass());
public static final String KEY = GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY;

@Override
public void annotate(ReferenceContext ref, VariantContext vc, Genotype g, GenotypeBuilder gb, ReadLikelihoods<Allele> likelihoods) {
Utils.nonNull(gb);
Utils.nonNull(vc);
Utils.nonNull(likelihoods);

final double[] lods = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(vc, GATKVCFConstants.LOD_KEY);
if (lods==null) {
warning.warn(String.format("One or more variant contexts is missing the 'LOD' annotation, %s will not be computed for these VariantContexts", GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY));
return;
}
final int indexOfMaxLod = MathUtils.maxElementIndex(lods);
final Allele altAlelle = vc.getAlternateAllele(indexOfMaxLod);
final Collection<ReadLikelihoods<Allele>.BestAllele> bestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName());
final String currentContig = ref.getInterval().getContig();

final long nonChrMAlt = bestAlleles.stream()
.filter(ba -> ba.read.hasAttribute(AddOriginalAlignmentTags.OA_TAG_NAME) && ba.isInformative() && ba.allele.equals(altAlelle) &&
!AddOriginalAlignmentTags.getOAContig(ba.read).equals(currentContig))
.count();
gb.attribute(GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY, nonChrMAlt);
}

@Override
public List<VCFFormatHeaderLine> getDescriptions() {
return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(KEY));
}

@Override
public List<String> getKeyNames() {
return Collections.singletonList(KEY);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;

import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import org.apache.commons.math3.distribution.PoissonDistribution;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
import org.spark_project.guava.collect.Range;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

/**
* Potential polymorphic NuMT annotation compares the number of alts to the autosomal coverage
*
* <p>Polymorphic NuMTs (NuMT sequence that is not in the reference) will result in autosomal reads that map to the
* mitochondria. This annotation notes when the number of alt reads falls within 80% of the autosomal coverage
* distribution, assuming that autosomal coverage is a Poisson distribution given a central statistic of the depth
* (median is recommended). This will also include true low allele fraction mitochondrial variants and should be used
* as an annotation, rather than a filter.</p>
*
* <h3>Caveat</h3>
* <p>This annotation can only be calculated in Mutect2 if median-autosomal-coverage argument is provided.</p>
*
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Number of alts indicates it could be an autosomal false positive.")
public class PolymorphicNuMT extends GenotypeAnnotation implements Annotation {
protected final OneShotLogger warning = new OneShotLogger(this.getClass());
public static final String KEY = GATKVCFConstants.POTENTIAL_POLYMORPHIC_NUMT_KEY;
private static final double LOWER_BOUND_PROB = .1;
private Range<Long> autosomalHetRange;
private Range<Long> autosomalHomAltRange;

public PolymorphicNuMT(final double medianAutosomalCoverage){
final PoissonDistribution autosomalCoverage = new PoissonDistribution(medianAutosomalCoverage);
final long minAutosomalHomAlt = autosomalCoverage.inverseCumulativeProbability(LOWER_BOUND_PROB);
final long maxAutosomalHomAlt = autosomalCoverage.inverseCumulativeProbability(1 - LOWER_BOUND_PROB);
final long minAutosomalHet = minAutosomalHomAlt / 2;
final long maxAutosomalHet = maxAutosomalHomAlt / 2;
autosomalHetRange = Range.closed(minAutosomalHet, maxAutosomalHet);
autosomalHomAltRange = Range.closed(minAutosomalHomAlt, maxAutosomalHomAlt);
}

// Barclay requires that each annotation define a constructor that takes no arguments
public PolymorphicNuMT(){ }

@Override
public void annotate(ReferenceContext ref, VariantContext vc, Genotype g, GenotypeBuilder gb, ReadLikelihoods<Allele> likelihoods) {
Utils.nonNull(gb);
Utils.nonNull(vc);
Utils.nonNull(likelihoods);
final double[] lods = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(vc, GATKVCFConstants.LOD_KEY);
if (lods==null) {
warning.warn(String.format("One or more variant contexts is missing the 'LOD' annotation, %s will not be computed for these VariantContexts", GATKVCFConstants.POTENTIAL_POLYMORPHIC_NUMT_KEY));
return;
}
final int indexOfMaxLod = MathUtils.maxElementIndex(lods);
final Allele altAlelle = vc.getAlternateAllele(indexOfMaxLod);
Collection<ReadLikelihoods<Allele>.BestAllele> bestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName());
final long numAltReads = bestAlleles.stream().filter(ba -> ba.isInformative() && ba.allele.equals(altAlelle)).count();
if ( autosomalHetRange.contains(numAltReads) || autosomalHomAltRange.contains(numAltReads) ) {
gb.attribute(GATKVCFConstants.POTENTIAL_POLYMORPHIC_NUMT_KEY, "true");
}
}
@Override
public List<VCFFormatHeaderLine> getDescriptions() {
return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(KEY));
}
@Override
public List<String> getKeyNames() {
return Collections.singletonList(KEY);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public void annotate(final ReferenceContext ref,
return;
}

final double[] tumorLods = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(vc, GATKVCFConstants.TUMOR_LOD_KEY, () -> null, -1);
final double[] tumorLods = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(vc, GATKVCFConstants.LOD_KEY, () -> null, -1);
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);
final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod);
final Nucleotide altBase = Nucleotide.valueOf(altAllele.toString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,15 @@ public void annotate(final ReferenceContext ref,
pi.put(ART_FWD, PRIOR_PROBABILITY_OF_ARTIFACT);
pi.put(ART_REV, PRIOR_PROBABILITY_OF_ARTIFACT);

// We use the allele with highest LOD score
final double[] tumorLods = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(vc, GATKVCFConstants.TUMOR_LOD_KEY, () -> null, -1);
// We use the allele with highest log odds score
final double[] lods = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(vc, GATKVCFConstants.LOD_KEY, () -> null, -1);

if (tumorLods==null) {
warning.warn("One or more variant contexts is missing the 'TLOD' annotation, StrandArtifact will not be computed for these VariantContexts");
if (lods==null) {
warning.warn("One or more variant contexts is missing the 'LOD' annotation, StrandArtifact will not be computed for these VariantContexts");
return;
}
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);
final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod);
final int indexOfMaxLod = MathUtils.maxElementIndex(lods);
final Allele altAllele = vc.getAlternateAllele(indexOfMaxLod);

final Collection<ReadLikelihoods<Allele>.BestAllele> informativeBestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName()).stream().
filter(ba -> ba.isInformative()).collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.UserException;
import picard.cmdline.programgroups.VariantFilteringProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
Expand All @@ -21,6 +22,7 @@
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.io.File;
import java.util.ArrayList;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -143,7 +145,15 @@ public void closeTool() {
}
}

private String getTumorSampleName(){
return getHeaderForVariants().getMetaDataLine(Mutect2Engine.TUMOR_SAMPLE_KEY_IN_VCF_HEADER).getValue();
private String getTumorSampleName() {
return MTFAC.mitochondria ? getMitoSampleName() : getHeaderForVariants().getMetaDataLine(Mutect2Engine.TUMOR_SAMPLE_KEY_IN_VCF_HEADER).getValue();
}

private String getMitoSampleName() {
ArrayList<String> allSampleNames = getHeaderForVariants().getSampleNamesInOrder();
if(allSampleNames.size() != 1) {
throw new UserException(String.format("Expected single sample VCF in mitochondria mode, but there were %s samples.", allSampleNames.size()));
}
return allSampleNames.get(0);
}
}
Loading

0 comments on commit b158b82

Please sign in to comment.