Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to recalculate the allele fraction based on AD instead of … #5118

Merged
merged 2 commits into from
Aug 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;

import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.*;

/**
* Variant allele fraction for each sample.
*
* <p>This annotation describes the proportion of the sample's reads that support the variant allele(s). It uses only reads that are actually considered informative by HaplotypeCaller (HC) or Mutect2, using pre-read likelihoods that are produced internally by HC/Mutect2.</p>
* <p>In this context, an informative read is defined as one that allows the allele it carries to be easily distinguished. In contrast, a read might be considered uninformative if, for example, it only partially overlaps a short tandem repeat and it is not clear whether the read contains the reference allele or an extra repeat.</p>
*
* <p>See the method documentation on <a href="http://www.broadinstitute.org/gatk/guide/article?id=4721">using coverage information</a> for important interpretation details.</p>
*
* <h3>Caveats</h3>
* <ul>
* <li>If a genotype is already annotated with allele fraction (as by the SomaticGenotypingEngine if `--get-af-from-ad` is not specified), the value will not be recalculated.</li>
* </ul>
*
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample (AD).</li>
* </ul>
*
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Variant allele fraction for a genotype")

public final class AlleleFraction extends GenotypeAnnotation implements StandardMutectAnnotation {
@Override
public void annotate(final ReferenceContext ref,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final ReadLikelihoods<Allele> likelihoods) {
Utils.nonNull(gb, "gb is null");
Utils.nonNull(vc, "vc is null");

final GenotypesContext genotypes = vc.getGenotypes();
if ( g == null || !g.isCalled() || g.hasExtendedAttribute(getKeyNames().get(0))) { //don't overwrite AF based on Bayesian estimate if it already exists
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be clear in the class javadoc that this annotation does nothing if AF is already present.

return;
}

for ( final Genotype genotype : genotypes ) {
if ( genotype.hasAD() ) {
final int[] AD = genotype.getAD();
final double[] allAlleleFractions = MathUtils.normalizeFromRealSpace(Arrays.stream(AD).mapToDouble(x -> x).toArray());
gb.attribute(getKeyNames().get(0), Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length)); //omit the first entry of the array corresponding to the reference
}
// if there is no AD value calculate it now using likelihoods
else if (likelihoods != null) {
DepthPerAlleleBySample adCalc = new DepthPerAlleleBySample();
final int[] AD = adCalc.annotateWithLikelihoods(vc, g, new LinkedHashSet<>(vc.getAlleles()), likelihoods);
final double[] allAlleleFractions = MathUtils.normalizeFromRealSpace(Arrays.stream(AD).mapToDouble(x -> x*1.0).toArray());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

x -> x will work here without the *1.0

gb.attribute(getKeyNames().get(0), Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length)); //omit the first entry of the array corresponding to the reference
}
}
}

@Override
public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.ALLELE_FRACTION_KEY);}

@Override
public List<VCFFormatHeaderLine> getDescriptions() {
return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(getKeyNames().get(0)));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@
* <h3>Related annotations</h3>
* <ul>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalance.php">AlleleBalance</a></b> is a generalization of this annotation over all samples.</li>
* <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_AlleleBalanceBySample.php">AlleleBalanceBySample</a></b> calculates allele balance for each individual sample.</li>
* </ul>
*/
@DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Depth of coverage of each allele per sample (AD)")
Expand Down Expand Up @@ -95,7 +93,7 @@ private int[] annotateWithPileup(final VariantContext vc, List<PileupElement> pi
return counts;
}

private int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele> alleles, ReadLikelihoods<Allele> likelihoods) {
protected int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele> alleles, ReadLikelihoods<Allele> likelihoods) {

final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
for ( final Allele allele : vc.getAlleles() ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts";
public static final String ARTIFACT_PRIOR_TABLE_NAME = "orientation-bias-artifact-priors";
public static final String GET_AF_FROM_AD_LONG_NAME = "get-af-from-ad";

public static final double DEFAULT_AF_FOR_TUMOR_ONLY_CALLING = 5e-8;
public static final double DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING = 1e-6;
Expand Down Expand Up @@ -164,4 +165,8 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName= IGNORE_ITR_ARTIFACTS_LONG_NAME, doc="Turn off read transformer that clips artifacts associated with end repair insertions near inverted tandem repeats.", optional = true)
public boolean dontClipITRArtifacts = false;

@Advanced
@Argument(fullName = GET_AF_FROM_AD_LONG_NAME, doc="Use allelic depth to calculate tumor allele fraction; recommended for mitochondrial applications", optional = true)
public boolean calculateAFfromAD = false;

}
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,13 @@ private void addGenotypes(final LikelihoodMatrix<Allele> tumorLog10Matrix,
final Optional<LikelihoodMatrix<Allele>> normalLog10Matrix,
final VariantContextBuilder callVcb) {
final double[] tumorAlleleCounts = getEffectiveCounts(tumorLog10Matrix);
final Genotype tumorGenotype = new GenotypeBuilder(tumorSample, tumorLog10Matrix.alleles())
.AD(Arrays.stream(tumorAlleleCounts).mapToInt(x -> (int) FastMath.round(x)).toArray())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is AD still getting added to the genotype in the new code below?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the original code wrote out AD based on the Bayesian estimate, then overwrote it during annotation because DepthPerAlleleBySample is a StandardMutectAnnotation. So in theory, there didn't used to be a way to turn it off.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I confirmed that the output without the arg does still have AD and added coverage in the tumor normal test.

.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, getAltAlleleFractions(tumorAlleleCounts))
.make();
final int[] adArray = Arrays.stream(tumorAlleleCounts).mapToInt(x -> (int) FastMath.round(x)).toArray();
final int dp = (int) MathUtils.sum(adArray);
final GenotypeBuilder gb = new GenotypeBuilder(tumorSample, tumorLog10Matrix.alleles());
if (!MTAC.calculateAFfromAD) {
gb.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, getAltAlleleFractions(Arrays.stream(tumorAlleleCounts).map(x -> (double) FastMath.round(x)/dp).toArray()));
}
final Genotype tumorGenotype = gb.make();
final List<Genotype> genotypes = new ArrayList<>(Arrays.asList(tumorGenotype));

// TODO: We shouldn't always assume that the genotype in the normal is hom ref
Expand Down Expand Up @@ -269,7 +272,7 @@ public static double[] getEffectiveCounts(final LikelihoodMatrix<Allele> log10Li

private static double[] getAltAlleleFractions(final double[] alleleCounts) {
final double[] allAlleleFractions = MathUtils.normalizeFromRealSpace(alleleCounts);
return Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length);
return Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length); //omit the first entry of the array corresponding to the reference
}
/**
* Calculate the log10 likelihoods of the ref/alt het genotype for each alt allele, then subtracts
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import scala.tools.nsc.transform.patmat.ScalaLogic;

import java.io.File;
import java.nio.file.Files;
Expand Down Expand Up @@ -49,6 +50,8 @@ public class Mutect2IntegrationTest extends CommandLineProgramTest {

private static final File NA12878_MITO_BAM = new File(toolsTestDir, "mutect/mito/NA12878.bam");
private static final File MITO_REF = new File(toolsTestDir, "mutect/mito/Homo_sapiens_assembly38.mt_only.fasta");
private static final File DEEP_MITO_BAM = new File(largeFileTestDir, "mutect/highDPMTsnippet.bam");
private static final String DEEP_MITO_SAMPLE_NAME = "mixture";

/**
* Several DREAM challenge bams with synthetic truth data. In order to keep file sizes manageable, bams are restricted
Expand Down Expand Up @@ -177,7 +180,7 @@ public void testPon(final File tumorBam, final String tumorSample, final File no
Assert.assertEquals(numVariants, 0);
}

// run tumor-only using the original DREAM synthetic sample 1 tumor and normal restricted to
// run tumor-normal mode using the original DREAM synthetic sample 1 tumor and normal restricted to
// 1/3 of our dbSNP interval, in which there is only one true positive.
// we want to see that the number of false positives is small
@Test
Expand All @@ -201,6 +204,7 @@ public void testTumorNormal() throws Exception {
};

runCommandLine(args);
VariantContextTestUtils.streamVcf(outputVcf).forEach(a -> Assert.assertTrue(a.getGenotype(tumorName).hasAD()));
final long numVariants = VariantContextTestUtils.streamVcf(outputVcf).count();
Assert.assertTrue(numVariants < 4);
}
Expand Down Expand Up @@ -418,7 +422,9 @@ public void testBamWithRepeatedReads() {
false,
false
);
}/*
}

/*
* Test that the min_base_quality_score parameter works
*/
@Test
Expand Down Expand Up @@ -499,7 +505,34 @@ public void testMitochondria() throws Exception {
Assert.assertTrue(expectedKeys.stream().allMatch(variantKeys::contains));
}

@Test
@SuppressWarnings("deprecation")
public void testAFfromADoverHighDP() throws Exception {
Utils.resetRandomGenerator();
final File unfilteredVcf = createTempFile("unfiltered", ".vcf");

final List<String> args = Arrays.asList("-I", DEEP_MITO_BAM.getAbsolutePath(),
"-" + M2ArgumentCollection.TUMOR_SAMPLE_SHORT_NAME, DEEP_MITO_SAMPLE_NAME,
"-R", MITO_REF.getAbsolutePath(),
"-L", "chrM:1-1018",
"-ip", "300",
"-min-pruning", "4",
"--" + M2ArgumentCollection.GET_AF_FROM_AD_LONG_NAME,
"-O", unfilteredVcf.getAbsolutePath());
runCommandLine(args);

final List<VariantContext> variants = VariantContextTestUtils.streamVcf(unfilteredVcf).collect(Collectors.toList());

for (final VariantContext vc : variants) {
Assert.assertTrue(vc.isBiallelic()); //I do some lazy parsing below that won't hold for multiple alternate alleles
Genotype g = vc.getGenotype(DEEP_MITO_SAMPLE_NAME);
Assert.assertTrue(g.hasAD());
final int[] ADs = g.getAD();
Assert.assertTrue(g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY));
//Assert.assertEquals(Double.parseDouble(String.valueOf(vc.getGenotype(DEEP_MITO_SAMPLE_NAME).getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY,"0"))), (double)ADs[1]/(ADs[0]+ADs[1]), 1e-6);
Assert.assertEquals(Double.parseDouble(String.valueOf(vc.getGenotype(DEEP_MITO_SAMPLE_NAME).getAttributeAsString(GATKVCFConstants.ALLELE_FRACTION_KEY,"0"))), (double)ADs[1]/(ADs[0]+ADs[1]), 1e-6);
}
}

@DataProvider(name="bamoutVariations")
public Object[][] bamoutVariations() {
Expand Down
3 changes: 3 additions & 0 deletions src/test/resources/large/mutect/highDPMTsnippet.bai
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/test/resources/large/mutect/highDPMTsnippet.bam
Git LFS file not shown
Loading