diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
index 5a393e98d9c..263c4103f67 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
@@ -3,6 +3,7 @@
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.Genotype;
 import htsjdk.variant.variantcontext.VariantContext;
+import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
 import org.apache.commons.math3.util.MathArrays;
 import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
 import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
@@ -66,9 +67,8 @@ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int de
         double[] alleleCounts = new double[numAlleles];
         final double flatLog10AlleleFrequency = -MathUtils.log10(numAlleles); // log10(1/numAlleles)
         double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);
-        double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY;

-        while (alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE) {
+        for (double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; alleleCountsMaximumDifference > THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE; ) {
             final double[] newAlleleCounts = effectiveAlleleCounts(vc, log10AlleleFrequencies);
             alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble();
             alleleCounts = newAlleleCounts;
@@ -83,6 +83,8 @@ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int de
         double[] log10POfZeroCountsByAllele = new double[numAlleles];
         double log10PNoVariant = 0;
+        final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
+        final Map<Integer, int[]> nonVariantIndicesByPloidy = new Int2ObjectArrayMap<>();
         for (final Genotype g : vc.getGenotypes()) {
             if (!g.hasLikelihoods()) {
                 continue;
             }
@@ -93,7 +95,14 @@ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int de
             final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);

             //the total probability
-            log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
+            if (!spanningDeletionPresent) {
+                log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
+            } else {
+                nonVariantIndicesByPloidy.computeIfAbsent(ploidy, p -> genotypeIndicesWithOnlyRefAndSpanDel(p, alleles));
+                final int[] nonVariantIndices = nonVariantIndicesByPloidy.get(ploidy);
+                final double[] nonVariantLog10Posteriors = MathUtils.applyToArray(nonVariantIndices, n -> log10GenotypePosteriors[n]);
+                log10PNoVariant += MathUtils.log10SumLog10(nonVariantLog10Posteriors);
+            }

             // per allele non-log space probabilities of zero counts for this sample
             // for each allele calculate the total probability of genotypes containing at least one copy of the allele
@@ -168,6 +177,18 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
         return MathUtils.normalizeLog10(log10Posteriors);
     }

+    private static int[] genotypeIndicesWithOnlyRefAndSpanDel(final int ploidy, final List<Allele> alleles) {
+        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, alleles.size());
+        final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
+        if (!spanningDeletionPresent) {
+            return new int[] {HOM_REF_GENOTYPE_INDEX};
+        } else {
+            final int spanDelIndex = alleles.indexOf(Allele.SPAN_DEL);
+            // allele counts are in the GenotypeLikelihoodCalculator format of {ref index, ref count, span del index, span del count}
+            return new IndexRange(0, ploidy).mapToInteger(n -> glCalc.alleleCountsToIndex(new int[]{0, ploidy - n, spanDelIndex, n}));
+        }
+    }
+
     @Override //Note: unused
     protected AFCalculationResult getResultFromFinalState(final VariantContext vc, final double[] priors, final StateTracker st) { return null; }
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java
index a42a8a369d6..2abfcbfd45b 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java
@@ -1383,6 +1383,25 @@ public static double[] applyToArray(final double[] array, final DoubleUnaryOpera
         return result;
     }

+    /**
+     * The following method implements Arrays.stream(array).map(func).toArray(), which is concise but performs poorly due
+     * to the overhead of creating a stream, especially with small arrays. Thus we wrap the wordy but fast array code
+     * in the following method which permits concise Java 8 code.
+     *
+     * Returns a new array -- the original array is not modified.
+     *
+     * This method has been benchmarked and performs as well as array-only code.
+     */
+    public static double[] applyToArray(final int[] array, final IntToDoubleFunction func) {
+        Utils.nonNull(func);
+        Utils.nonNull(array);
+        final double[] result = new double[array.length];
+        for (int m = 0; m < result.length; m++) {
+            result[m] = func.applyAsDouble(array[m]);
+        }
+        return result;
+    }
+
     /**
      * The following method implements Arrays.stream(array).map(func).toArray(), which is concise but performs poorly due
      * to the overhead of creating a stream, especially with small arrays. Thus we wrap the wordy but fast array code
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java
index a86a088b2c2..9b79fcac481 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculatorUnitTest.java
@@ -170,6 +170,82 @@ public void testManyRefSamplesDontKillGoodVariant() {
         }
     }

+    // spanning deletion alleles should be treated as non-variant, so we make a few random PLs and test that the
+    // qual is invariant to swapping the ref/ref PL with the ref/SPAN_DEL PL
+    @Test
+    public void testSpanningDeletionIsNotConsideredVariant() {
+        final int ploidy = 2;
+        final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 0.1, 0.1, ploidy);
+        final List<Allele> alleles = Arrays.asList(A, B, Allele.SPAN_DEL);
+
+        //{AA}, {AB}, {BB}, {AC}, {BC}, {CC}
+
+        // some pls that have high likelihood for the span del allele but not for the SNP (B)
+        final int[] spanDelPls = new int[] {50, 100, 100, 0, 100, 100};
+
+        // some pls that weakly support the SNP
+        final int[] lowQualSnpPls = new int[] {10,0,40,100,70,300};
+
+
+        final Genotype spanDel = makeGenotype(ploidy, spanDelPls);
+        final Genotype lowQualSNP = makeGenotype(ploidy, lowQualSnpPls);
+
+        // first test the span del genotype alone. Its best PL containing the SNP is 100, so we expect a variant probability
+        // of about 10^(-100/10) -- a bit less due to the prior bias in favor of the reference
+        final VariantContext vcSpanDel = makeVC(alleles, Arrays.asList(spanDel));
+        final double log10PVariant = afCalc.getLog10PNonRef(vcSpanDel).getLog10LikelihoodOfAFGT0();
+        Assert.assertTrue(log10PVariant < -10);
+
+        // now test a realistic situation of two samples, one with a low-quality SNP and one with the spanning deletion
+        // we want to find that the spanning deletion has little effect on the qual
+        // In fact, we also want to check that it *decreases* the qual, because it's essentially one more hom ref sample
+        // Furthermore, to be precise it should really behave almost identically to a hom ref *haploid* sample,
+        // so we check that, too
+        final VariantContext vcLowQualSnp = makeVC(alleles, Arrays.asList(lowQualSNP));
+        final double lowQualSNPQualScore = afCalc.getLog10PNonRef(vcLowQualSnp).getLog10LikelihoodOfAFGT0();
+        final VariantContext vcBoth = makeVC(alleles, Arrays.asList(lowQualSNP, spanDel));
+        final double bothQualScore = afCalc.getLog10PNonRef(vcBoth).getLog10LikelihoodOfAFGT0();
+        Assert.assertEquals(lowQualSNPQualScore, bothQualScore, 0.1);
+        Assert.assertTrue(bothQualScore < lowQualSNPQualScore);
+
+        final int[] haploidRefPls = new int[] {0, 100, 100};
+        final Genotype haploidRef = makeGenotype(1, haploidRefPls);
+
+        final VariantContext vcLowQualSnpAndHaploidRef = makeVC(alleles, Arrays.asList(lowQualSNP, haploidRef));
+        final double lowQualSNPAndHaploidRefQualScore = afCalc.getLog10PNonRef(vcLowQualSnpAndHaploidRef).getLog10LikelihoodOfAFGT0();
+        Assert.assertEquals(bothQualScore, lowQualSNPAndHaploidRefQualScore, 1e-5);
+
+        // as a final test, we check that getting rid of the spanning deletion allele, in the sense that
+        // REF / SPAN_DEL --> haploid REF; REF / SNP --> REF / SNP
+        // does not affect the qual score
+
+        final int[] haploidRefPlsWithoutSpanDel = new int[] {0, 100};
+        final int[] snpPlsWithoutSpanDel = new int[] {10, 0, 40};
+        final VariantContext vcNoSpanDel = makeVC(Arrays.asList(A,B), Arrays.asList(makeGenotype(ploidy, snpPlsWithoutSpanDel),
+                makeGenotype(1, haploidRefPlsWithoutSpanDel)));
+        final double noSpanDelQualScore = afCalc.getLog10PNonRef(vcNoSpanDel).getLog10LikelihoodOfAFGT0();
+        Assert.assertEquals(bothQualScore, noSpanDelQualScore, 1e-6);
+    }
+
+    @Test
+    public void testPresenceOfUnlikelySpanningDeletionDoesntAffectResults() {
+        final int ploidy = 2;
+        final AlleleFrequencyCalculator afCalc = new AlleleFrequencyCalculator(1, 0.1, 0.1, ploidy);
+        final List<Allele> allelesWithoutSpanDel = Arrays.asList(A, B);
+        final List<Allele> allelesWithSpanDel = Arrays.asList(A, B, Allele.SPAN_DEL);
+
+        // make PLs that support an A/B genotype
+        final int[] plsWithoutSpanDel = new int[] {50, 0, 50};
+        final int[] plsWithSpanDel = new int[] {50, 0, 50, 100, 100, 100};
+        final Genotype genotypeWithoutSpanDel = makeGenotype(ploidy, plsWithoutSpanDel);
+        final Genotype genotypeWithSpanDel = makeGenotype(ploidy, plsWithSpanDel);
+        final VariantContext vcWithoutSpanDel = makeVC(allelesWithoutSpanDel, Arrays.asList(genotypeWithoutSpanDel));
+        final VariantContext vcWithSpanDel = makeVC(allelesWithSpanDel, Arrays.asList(genotypeWithSpanDel));
+        final double log10PVariantWithoutSpanDel = afCalc.getLog10PNonRef(vcWithoutSpanDel).getLog10LikelihoodOfAFGT0();
+        final double log10PVariantWithSpanDel = afCalc.getLog10PNonRef(vcWithSpanDel).getLog10LikelihoodOfAFGT0();
+        Assert.assertEquals(log10PVariantWithoutSpanDel, log10PVariantWithSpanDel, 0.0001);
+    }
+
     // make PLs that correspond to an obvious call i.e. one PL is relatively big and the rest are zero
     // alleleCounts is the GenotypeAlleleCounts format for the obvious genotype, with repeats but in no particular order
     private static int[] PLsForObviousCall(final int ploidy, final int numAlleles, final int[] alleleCounts, final int PL) {
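
For readers who want to see the genotype-index bookkeeping in isolation, here is a minimal, self-contained sketch (plain JDK only, no GATK classes; the class and helper names are invented for illustration). It shows why, for a diploid sample with alleles {REF, SNP, *}, the genotypes made of only REF and the spanning deletion sit at indices 0 (REF/REF) and 3 (REF/*) under the canonical VCF ordering AA, AB, BB, AC, BC, CC noted in the test above, and how the patch's else branch then folds those posteriors into log10PNoVariant, which in the real code is done by GenotypeLikelihoodCalculator.alleleCountsToIndex, MathUtils.applyToArray, and MathUtils.log10SumLog10.

```java
import java.util.Arrays;
import java.util.List;

// Illustration only (names invented): mirrors the patch's "sum the posteriors of the
// genotypes containing only REF and SPAN_DEL" step for the diploid case, with no GATK dependencies.
public class SpanDelNonVariantSketch {

    // Canonical VCF/GL ordering for unphased diploid genotypes: index(j, k) = k*(k+1)/2 + j for j <= k,
    // which yields AA, AB, BB, AC, BC, CC for three alleles A, B, C.
    static int diploidGenotypeIndex(final int j, final int k) {
        return k * (k + 1) / 2 + j;
    }

    // Stable log10 of a sum of values given in log10 space, analogous in spirit to MathUtils.log10SumLog10.
    static double log10SumLog10(final double[] log10Values) {
        final double max = Arrays.stream(log10Values).max().getAsDouble();
        final double sum = Arrays.stream(log10Values).map(x -> Math.pow(10.0, x - max)).sum();
        return max + Math.log10(sum);
    }

    public static void main(String[] args) {
        final List<String> alleles = Arrays.asList("REF", "SNP", "*");  // "*" is the spanning deletion allele
        final int refIndex = 0;
        final int spanDelIndex = alleles.indexOf("*");                  // 2

        // diploid genotypes made only of REF and the spanning deletion: REF/REF and REF/*
        final int[] nonVariantIndices = {
                diploidGenotypeIndex(refIndex, refIndex),       // 0 -> REF/REF
                diploidGenotypeIndex(refIndex, spanDelIndex)    // 3 -> REF/*
        };

        // made-up normalized log10 genotype posteriors in the order AA, AB, BB, AC, BC, CC,
        // roughly matching the spanDelPls sample in the test: REF/* is overwhelmingly favored
        final double[] log10GenotypePosteriors = {-5.0, -10.0, -10.0, -1e-5, -10.0, -10.0};

        // the analogue of MathUtils.applyToArray(nonVariantIndices, n -> log10GenotypePosteriors[n])
        final double[] nonVariantLog10Posteriors = Arrays.stream(nonVariantIndices)
                .mapToDouble(n -> log10GenotypePosteriors[n])
                .toArray();

        // this is the per-sample quantity the patch adds to log10PNoVariant in the else branch
        System.out.println("log10 P(no variant) for this sample = " + log10SumLog10(nonVariantLog10Posteriors));
    }
}
```

With these made-up posteriors the printed value is very close to 0 (a no-variant probability near 1), which is the "behaves like one more hom-ref sample" property the first new test asserts.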