From 2d50cf8b334f1e781f00ac1dae683355fee1fe99 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 26 Jan 2024 13:57:18 -0500 Subject: [PATCH] Improvements to Mutect2's Permutect training data mode (#8663) * include normal seq error log likelihood in Permutect dataset * handle different allele representations in multiallelic / indel variants for Permutect training data mode * set the default artifact to non-artifact ratio to 1 in Permutect training data mode --- .../FeaturizedReadSets.java | 134 +++++++----------- .../walkers/mutect/M2ArgumentCollection.java | 22 ++- .../walkers/mutect/Mutect3DatasetEngine.java | 53 ++++--- .../mutect/SomaticGenotypingEngine.java | 2 +- 4 files changed, 104 insertions(+), 107 deletions(-) rename src/main/java/org/broadinstitute/hellbender/tools/walkers/{annotator => mutect}/FeaturizedReadSets.java (53%) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java similarity index 53% rename from src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java rename to src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java index c2a37a5a31d..6a4d3f5c06a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java @@ -1,26 +1,22 @@ -package org.broadinstitute.hellbender.tools.walkers.annotator; +package org.broadinstitute.hellbender.tools.walkers.mutect; import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; -import org.apache.commons.lang3.StringUtils; -import org.broadinstitute.barclay.help.DocumentedFeature; +import org.apache.logging.log4j.LogManager; 
+import org.apache.logging.log4j.Logger; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; -import org.broadinstitute.hellbender.engine.FeatureContext; -import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality; +import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.haplotype.Haplotype; -import org.broadinstitute.hellbender.utils.help.HelpConstants; import org.broadinstitute.hellbender.utils.read.AlignmentUtils; import org.broadinstitute.hellbender.utils.read.Fragment; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.util.*; import java.util.stream.Collectors; @@ -31,69 +27,23 @@ * [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is * 1,2,3,4|5,6,7,8 */ -@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, - summary="Featurized read sets for Mutect3 training data") -public class FeaturizedReadSets implements JumboGenotypeAnnotation { - public static final int DEFAULT_BASE_QUALITY = 25; - - private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE; +public class FeaturizedReadSets { + private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class); - public static final int FEATURES_PER_READ = 11; + public static final int DEFAULT_BASE_QUALITY = 25; private static final SmithWatermanAligner aligner = 
SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA); - // downsample ref reads to this count if needed - private final int maxRefCount; - - public FeaturizedReadSets(final int maxRefCount) { - this.maxRefCount = maxRefCount; - } - - public FeaturizedReadSets() { - this(DEFAULT_MAX_REF_COUNT); - } - - @Override - public void annotate(final ReferenceContext ref, - final FeatureContext features, - final VariantContext vc, - final Genotype g, - final GenotypeBuilder gb, - final AlleleLikelihoods likelihoods, - final AlleleLikelihoods fragmentLikelihoods, - final AlleleLikelihoods haplotypeLikelihoods) { - Utils.nonNull(vc); - Utils.nonNull(g); - Utils.nonNull(gb); - - if ( likelihoods == null) { - return; - } - - final List>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()), - likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE); - - // flatten twice: over all reads supporting each allele and over all alleles - // we can partition by allele with the countsInAlleleOrder annotation - // and by read using the constant feature vector size - final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream() - .flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream)) - .mapToInt(n -> n) - .toArray(); - - final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray(); - - gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder); - gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder); - } + private FeaturizedReadSets() { } public static List>> getReadVectors(final VariantContext vc, final Collection samples, final AlleleLikelihoods likelihoods, final AlleleLikelihoods haplotypeLikelihoods, final int refDownsample, - final int altDownsample) { - return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap()); + final int altDownsample, + final 
M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { + return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode); } // returns Lists (in allele order) of lists of read vectors supporting each allele @@ -103,7 +53,8 @@ public static List>> getReadVectors(final VariantContext vc, final AlleleLikelihoods haplotypeLikelihoods, final int refDownsample, final int altDownsample, - final Map altDownsampleMap) { + final Map altDownsampleMap, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { final Map> readsByAllele = likelihoods.alleles().stream() .collect(Collectors.toMap(a -> a, a -> new ArrayList<>())); @@ -126,17 +77,14 @@ public static List>> getReadVectors(final VariantContext vc, .forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele))); return vc.getAlleles().stream() - .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList())) + .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList())) .collect(Collectors.toList()); } - @Override - public List getKeyNames() { - return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY); - } - - private static List featurize(final GATKRead read, final VariantContext vc, final Map bestHaplotypes) { + private static List featurize(final GATKRead read, final VariantContext vc, + final Map bestHaplotypes, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { final List result = new ArrayList<>(); result.add(read.getMappingQuality()); result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY)); @@ -151,23 +99,41 @@ private static List featurize(final GATKRead read, final VariantContext result.add(Math.abs(read.getFragmentLength())); - // 
distances from ends of fragment - final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart()); - final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength()); - result.add(vc.getStart() - fragmentStart); - result.add(fragmentEnd - vc.getEnd()); + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) { + // distances from ends of fragment + final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart()); + final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength()); + result.add(vc.getStart() - fragmentStart); + result.add(fragmentEnd - vc.getEnd()); + } + + // Ultima-specific read tags + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) { + result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s + result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float from 0 and 1, so we multiply by 1000 and round + } // mismatches versus best haplotype final Haplotype haplotype = bestHaplotypes.get(read); - final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); - final GATKRead copy = read.copy(); - copy.setCigar(readToHaplotypeAlignment.getCigar()); - final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches; - result.add(mismatchCount); - - final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count(); - result.add((int) indelsVsBestHaplotype); - Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features"); + + // TODO: fix this + // I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina + if (!bestHaplotypes.containsKey(read)) { 
+ logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(), + vc.getContig(), vc.getStart())); + result.add(3); + result.add(2); + } else { + final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); + final GATKRead copy = read.copy(); + copy.setCigar(readToHaplotypeAlignment.getCigar()); + final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches; + result.add(mismatchCount); + + final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count(); + result.add((int) indelsVsBestHaplotype); + } + Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features"); return result; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java index d25c7a33fc5..0087441ae1a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java @@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample"; public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset"; public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth"; + public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode"; public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10; public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20; - public static final 
int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20; + public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1; @Override protected int getDefaultMaxMnpDistance() { return 1; } @@ -205,6 +206,25 @@ public double getDefaultAlleleFrequency() { @Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection") public File mutect3Dataset; + @Advanced + @Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.") + public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA; + + public enum Mutect3DatasetMode { + ILLUMINA(11), + ULTIMA(11); + + final private int numReadFeatures; + + Mutect3DatasetMode(final int numReadFeatures) { + this.numReadFeatures = numReadFeatures; + } + + public int getNumReadFeatures() { + return numReadFeatures; + } + } + /** * VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field) * contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors. 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java index e6a080cd14f..942c2de15b2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java @@ -4,18 +4,15 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.tuple.Triple; -import org.apache.commons.math3.util.CombinatoricsUtils; import org.apache.commons.math3.util.FastMath; -import org.broadinstitute.hellbender.engine.FeatureContext; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger; import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity; -import org.broadinstitute.hellbender.tools.walkers.annotator.FeaturizedReadSets; import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases; import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine; import org.broadinstitute.hellbender.utils.IndexRange; import org.broadinstitute.hellbender.utils.MathUtils; -import org.broadinstitute.hellbender.utils.NaturalLogUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix; @@ -46,9 +43,6 @@ private enum Label { ARTIFACT, VARIANT, UNLABELED, IGNORE } - // number of features for each vectorized read - private static final int FEATURES_PER_READ = FeaturizedReadSets.FEATURES_PER_READ; - // number of additional variant features for assembly complexity, reference context private static final int 
NUM_EXTRA_FEATURES = 9; @@ -108,7 +102,8 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, public void addData(final ReferenceContext ref, final VariantContext vc, Optional> truthVCs, final AlleleLikelihoods likelihoods, final AlleleLikelihoods logFragmentLikelihoods, - final AlleleLikelihoods logFragmentAlleleLikelihoods) { + final AlleleLikelihoods logFragmentAlleleLikelihoods, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { final String refBases = ReferenceBases.annotate(ref, vc); final String refAllele = vc.getReference().getBaseString(); final String contig = vc.getContig(); @@ -132,6 +127,20 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona final int normalDepth = (int) MathUtils.sum(normalADs); final boolean hasNormal = normalDepth > 0; + final List allRefAlleles = new ArrayList<>(); + allRefAlleles.add(vc.getReference()); + truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference()))); + final Allele longestRef = allRefAlleles.stream().sorted(Comparator.comparingInt(Allele::length).reversed()).findFirst().get(); + + // skip(1) excludes the reference allele + final List remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream() + .skip(1).toList(); + + final Set truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream() + .filter(truthVC -> ! truthVC.isFiltered()) + .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream()) + .collect(Collectors.toSet()); + final List