Skip to content

Commit

Permalink
Add new max alt argument for GenomicsDB and require it to be >= GGVCFs (
Browse files Browse the repository at this point in the history
  • Loading branch information
ldgauthier authored Feb 4, 2022
1 parent d373a50 commit da7cd83
Show file tree
Hide file tree
Showing 17 changed files with 293 additions and 179 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,27 @@ public class GenomicsDBArgumentCollection implements Serializable {
public static final String USE_GCS_HDFS_CONNECTOR = "genomicsdb-use-gcs-hdfs-connector";

public static final String CALL_GENOTYPES_LONG_NAME = "call-genotypes";
public static final String MAX_ALTS_LONG_NAME = "genomicsdb-max-alternate-alleles";
private static final boolean DEFAULT_CALL_GENOTYPES = false;
private static final boolean DEFAULT_USE_BCF_CODEC = false;
private static final boolean DEFAULT_SHARED_POSIXFS_OPTIMIZATIONS = false;
private static final boolean DEFAULT_USE_GCS_HDFS_CONNECTOR = false;

/**
* Not full-fledged arguments for now.
* Maximum number of alternate alleles that will report likelihoods after being combined on reading from GenomicsDB (including <NON_REF>)
* Must be at least one greater than the maximum number of alternate alleles for genotyping.
* A typical value is 3 more than the --max-alternate-alleles value that's used by GenotypeGVCFs and larger differences
* result in more robustness to PCR-related indel errors.
* Note that GenotypeGVCFs will drop highly multi-allelic sites that are missing likelihoods.
*
* See also {@link org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection#MAX_ALTERNATE_ALLELES_LONG_NAME}
*/
@Argument(fullName = MAX_ALTS_LONG_NAME, doc = "Maximum number of alternate alleles that will be combined on reading from GenomicsDB")
public int maxDiploidAltAllelesThatCanBeGenotyped = GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;

/**
* Output called genotypes in the final VCF (otherwise no-call)
*/
@Argument(fullName = CALL_GENOTYPES_LONG_NAME, doc = "Output called genotypes in final VCF (otherwise no-call)", optional = true)
public boolean callGenotypes = DEFAULT_CALL_GENOTYPES;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.genomicsdb;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;

import java.nio.file.Path;
Expand Down Expand Up @@ -35,12 +36,16 @@ public GenomicsDBOptions(final Path reference, final GenomicsDBArgumentCollectio
this.useBCFCodec = genomicsdbArgs.useBCFCodec;
this.sharedPosixFSOptimizations = genomicsdbArgs.sharedPosixFSOptimizations;
this.useGcsHdfsConnector = genomicsdbArgs.useGcsHdfsConnector;
if (genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped - 1 < genotypeCalcArgs.maxAlternateAlleles) { //-1 for <NON_REF>
throw new UserException.BadInput("GenomicsDB max alternate alleles (" + GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME
+ ") must be at least one greater than genotype calculation max alternate alleles ("
+ GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_LONG_NAME + "), accounting for the non-ref allele");
}
this.maxDiploidAltAllelesThatCanBeGenotyped = genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped;
if (genotypeCalcArgs != null) {
this.maxDiploidAltAllelesThatCanBeGenotyped = genotypeCalcArgs.maxAlternateAlleles;
this.maxGenotypeCount = genotypeCalcArgs.maxGenotypeCount;
} else {
// Some defaults
this.maxDiploidAltAllelesThatCanBeGenotyped = genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped;
this.maxGenotypeCount = GenotypeCalculationArgumentCollection.DEFAULT_MAX_GENOTYPE_COUNT;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,12 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
MathUtils.scaleLogSpaceArrayForNumericalStability(Arrays.stream(subsettedLikelihoodIndices)
.mapToDouble(idx -> originalLikelihoods[idx]).toArray()) : null;
if (newLikelihoods != null) {
final int PLindex = MathUtils.maxElementIndex(newLikelihoods);
newLog10GQ = GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods);
if (newLikelihoods.length > 1) {
final int PLindex = MathUtils.maxElementIndex(newLikelihoods); //pick out the call (log10L = 0)
newLog10GQ = GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods);
} else { //if we subset to just ref allele, keep the GQ
newLog10GQ = g.getGQ()/-10.0; //-10 to go from Phred to log space
}
}

} else if (g.hasGQ()) {
Expand All @@ -99,8 +103,8 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
//TODO: remove other G-length attributes, although that may require header parsing
attributes.remove(VCFConstants.GENOTYPE_POSTERIORS_KEY);
attributes.remove(GATKVCFConstants.GENOTYPE_PRIOR_KEY);
gb.noAttributes().attributes(attributes);
if (newLog10GQ != Double.NEGATIVE_INFINITY) {
gb.noPL().noGQ().noAttributes().attributes(attributes); //if alleles are subset, old PLs and GQ are invalid
if (newLog10GQ != Double.NEGATIVE_INFINITY && g.hasGQ()) { //only put GQ if originally present
gb.log10PError(newLog10GQ);
}
if (useNewLikelihoods) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,18 +131,22 @@ public GenotypeCalculationArgumentCollection clone() {
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* scales exponentially based on the number of alternate alleles.
*
* Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
*
* See also {@link #maxGenotypeCount}.
* See also {@link #maxGenotypeCount} and {@link org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection#MAX_ALTS_LONG_NAME}.
* This value can be no greater than one less than the corresponding GenomicsDB argument. Sites that exceed the
* GenomicsDB alt allele max will not be output with likelihoods and will be dropped by GenotypeGVCFs.
*/
@Advanced
@Argument(fullName = MAX_ALTERNATE_ALLELES_LONG_NAME, doc = "Maximum number of alternate alleles to genotype", optional = true)
public int maxAlternateAlleles = DEFAULT_MAX_ALTERNATE_ALLELES;

/**
* If there are more than this number of genotypes at a locus presented to the genotyper, then only this many genotypes will be used.
* The possible genotypes are simply different ways of partitioning alleles given a specific ploidy asumption.
* The possible genotypes are simply different ways of partitioning alleles given a specific ploidy assumption.
* Therefore, we remove genotypes from consideration by removing alternate alleles that are the least well supported.
* The estimate of allele support is based on the ranking of the candidate haplotypes coming out of the graph building step.
* Note that the reference allele is always kept.
Expand All @@ -154,7 +158,8 @@ public GenotypeCalculationArgumentCollection clone() {
* 1. the largest number of alt alleles, given ploidy, that yields a genotype count no higher than {@link #maxGenotypeCount}
* 2. the value of {@link #maxAlternateAlleles}
*
* See also {@link #maxAlternateAlleles}.
* See also {@link #maxAlternateAlleles} and
* {@link org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection#maxDiploidAltAllelesThatCanBeGenotyped}
*/
@Advanced
@Argument(fullName = MAX_GENOTYPE_COUNT_LONG_NAME, doc = "Maximum number of genotypes to consider at any site", optional = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ public Config getConfiguration() {
}

/**
* Main entry function to calculate genotypes of a given VC with corresponding GL's that is shared across genotypers (namely UG and HC).
* Main entry function to calculate genotypes of a given VC with corresponding GLs that is shared across genotypers (namely GGVCFs and HC).
*
* Completes a variant context with genotype calls and associated annotations given the genotype likelihoods and
* the model that need to be applied.
Expand All @@ -122,7 +122,7 @@ public Config getConfiguration() {
*/
public VariantContext calculateGenotypes(final VariantContext vc, final GenotypePriorCalculator gpc, final List<VariantContext> givenAlleles) {
// if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call
if (hasTooManyAlternativeAlleles(vc) || vc.getNSamples() == 0) {
if (cannotBeGenotyped(vc) || vc.getNSamples() == 0) {
return null;
}

Expand Down Expand Up @@ -390,14 +390,21 @@ boolean isVcCoveredByDeletion(final VariantContext vc) {
* @return {@code true} iff there is too many alternative alleles based on
* {@link GenotypeLikelihoods#MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED}.
*/
protected final boolean hasTooManyAlternativeAlleles(final VariantContext vc) {
protected final boolean cannotBeGenotyped(final VariantContext vc) {
// protect against too many alternate alleles that we can't even run AF on:
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED
&& vc.getGenotypes().stream().anyMatch(GenotypeUtils::genotypeIsUsableForAFCalculation)) {
return false;
}
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location "+vc.getContig()+":"+vc.getStart());
return true;
if (vc.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart());
return true;
}else {
logger.warn("No genotype contained sufficient data to recalculate genotypes. Site will be skipped at location "
+ vc.getContig() + ":" + vc.getStart());
return true;
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ public static AlleleFrequencyCalculator makeCalculator(final DragstrParams drags

/**
*
* @param g must have likelihoods or (if approximateHomRefsFromGQ is true) GQ
* @param g must have likelihoods or (if approximateHomRefsFromGQ is true) be hom-ref with GQ
* (see {@link org.broadinstitute.hellbender.utils.GenotypeUtils#genotypeIsUsableForAFCalculation(Genotype)
* genotypeIsUsableForAFCalculation} )
* @param glCalc
* @param log10AlleleFrequencies
* @return
Expand All @@ -75,7 +77,7 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
final double[] log10Likelihoods;
if (g.hasLikelihoods()) {
log10Likelihoods = g.getLikelihoods().getAsVector();
} else if ( g.isHomRef() || g.isNoCall()) {
} else if (g.isHomRef()) {
if (g.getPloidy() != 2) {
throw new IllegalStateException("Likelihoods are required to calculate posteriors for hom-refs with ploidy != 2, " +
"but were not found for genotype " + g + " with ploidy " + g.getPloidy());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,8 @@ public final class SelectVariants extends VariantWalker {
* When this flag is enabled, all alternate alleles that are not present in the (output) samples will be removed.
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
* removed and the record will contain a '.' in the ALT column. Note also that sites-only VCFs, by definition, do
* not include the alternate allele in any genotype calls.
* not include the alternate allele in any genotype calls. Further note that PLs will be trimmed appropriately,
* removing likelihood information (even for homozygous reference calls).
*/
@Argument(fullName="remove-unused-alternates",
doc="Remove alternate alleles not present in any genotypes", optional=true)
Expand Down Expand Up @@ -1030,7 +1031,9 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean prese
// fix the PL and AD values if sub has fewer alleles than original vc
final GenotypesContext subGenotypesWithOldAlleles = sub.getGenotypes(); //we need sub for the right samples, but PLs still go with old alleles
newGC = sub.getNAlleles() == vc.getNAlleles() ? subGenotypesWithOldAlleles :
AlleleSubsettingUtils.subsetAlleles(subGenotypesWithOldAlleles, 0, vc.getAlleles(), sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true);
AlleleSubsettingUtils.subsetAlleles(subGenotypesWithOldAlleles, 0, vc.getAlleles(),
sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES,
vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false);
} else {
newGC = sub.getGenotypes();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,14 @@ public static GenotypeCounts computeDiploidGenotypeCounts(final VariantContext v
return new GenotypeCounts(genotypeWithTwoRefsCount, genotypesWithOneRefCount, genotypesWithNoRefsCount);
}

/**
* Do we have (or can we infer) likelihoods necessary for allele frequency calculation?
* Some reblocked and/or DRAGEN GVCFs omit likelihoods for ref blocks, but we can estimate them
* @param g a genotype of unknown call and ploidy
* @return true if we have enough info for AF calculation
*/
public static boolean genotypeIsUsableForAFCalculation(Genotype g) {
return g.hasLikelihoods() || g.hasGQ() || g.getAlleles().stream().anyMatch(a -> a.isCalled() && a.isNonReference() && !a.isSymbolic());
return g.hasLikelihoods() || (g.isHomRef() && g.hasGQ() && 2 == g.getPloidy());
}

/**
Expand Down
Loading

0 comments on commit da7cd83

Please sign in to comment.