Improvements to Mutect2's Permutect training data mode (#8663)
* include normal seq error log likelihood in Permutect dataset

* handle different allele representations in multiallelic / indel variants for Permutect training data mode (sketched below)

* set the default artifact-to-non-artifact ratio to 1 in Permutect training data mode
davidbenjamin authored Jan 26, 2024
1 parent e796d20 commit 2d50cf8
Showing 4 changed files with 104 additions and 107 deletions.
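
The allele-representation change mentioned in the commit message is not visible in the hunks excerpted below. For reference, a common way to compare indel alleles that are spelled differently in two VCFs is to reduce each ref/alt pair to a minimal representation by trimming shared bases. The sketch below illustrates that general idea only; the class and method names are invented and this is not necessarily how the commit itself implements it.

```java
import java.util.AbstractMap.SimpleEntry;
import java.util.Map.Entry;

// Hypothetical sketch, not from this commit: trim bases shared by ref and alt so that
// two spellings of the same indel reduce to the same minimal ref/alt pair.
public final class AlleleRepresentationSketch {

    public static Entry<String, String> minimalRepresentation(String ref, String alt) {
        // trim the common suffix, keeping at least one base of each allele
        while (ref.length() > 1 && alt.length() > 1
                && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
            ref = ref.substring(0, ref.length() - 1);
            alt = alt.substring(0, alt.length() - 1);
        }
        // trim the common prefix (a real implementation would also shift the variant position)
        while (ref.length() > 1 && alt.length() > 1 && ref.charAt(0) == alt.charAt(0)) {
            ref = ref.substring(1);
            alt = alt.substring(1);
        }
        return new SimpleEntry<>(ref, alt);
    }

    public static void main(String[] args) {
        // the same deletion written two ways reduces to the same minimal pair
        System.out.println(minimalRepresentation("ACTG", "AG")); // prints ACT=A
        System.out.println(minimalRepresentation("ACT", "A"));   // prints ACT=A
    }
}
```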
@@ -1,26 +1,22 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;
package org.broadinstitute.hellbender.tools.walkers.mutect;


import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;
import java.util.stream.Collectors;
@@ -31,69 +27,23 @@
* [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is
* 1,2,3,4|5,6,7,8
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
summary="Featurized read sets for Mutect3 training data")
public class FeaturizedReadSets implements JumboGenotypeAnnotation {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE;
public class FeaturizedReadSets {
private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class);

public static final int FEATURES_PER_READ = 11;
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);

// downsample ref reads to this count if needed
private final int maxRefCount;

public FeaturizedReadSets(final int maxRefCount) {
this.maxRefCount = maxRefCount;
}

public FeaturizedReadSets() {
this(DEFAULT_MAX_REF_COUNT);
}

@Override
public void annotate(final ReferenceContext ref,
final FeatureContext features,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Allele> fragmentLikelihoods,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods) {
Utils.nonNull(vc);
Utils.nonNull(g);
Utils.nonNull(gb);

if ( likelihoods == null) {
return;
}

final List<List<List<Integer>>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()),
likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE);

// flatten twice: over all reads supporting each allele and over all alleles
// we can partition by allele with the countsInAlleleOrder annotation
// and by read using the constant feature vector size
final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream()
.flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream))
.mapToInt(n -> n)
.toArray();

final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();

gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder);
gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder);
}
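
The removed annotate method above flattens the per-allele read vectors into a single array plus per-allele counts, as the class comment describes. A self-contained toy version of that flattening, using the example from the comment (allele 1 with read vectors [1,2] and [3,4], allele 2 with [5,6] and [7,8]):

```java
import java.util.Arrays;
import java.util.List;

// Toy illustration of the flattening in the removed annotate method above.
public class FlattenExample {
    public static void main(String[] args) {
        final List<List<List<Integer>>> readVectorsByAllele = Arrays.asList(
                Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3, 4)),   // allele 1
                Arrays.asList(Arrays.asList(5, 6), Arrays.asList(7, 8)));  // allele 2

        // flatten over reads within each allele, then over alleles
        final int[] flattened = readVectorsByAllele.stream()
                .flatMap(lists -> lists.stream().flatMap(List::stream))
                .mapToInt(n -> n)
                .toArray();

        // per-allele read counts let a consumer re-partition the flat array
        final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();

        System.out.println(Arrays.toString(flattened));            // [1, 2, 3, 4, 5, 6, 7, 8]
        System.out.println(Arrays.toString(countsInAlleleOrder));  // [2, 2]
    }
}
```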
private FeaturizedReadSets() { }

public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
final Collection<String> samples,
final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
final int refDownsample,
final int altDownsample) {
return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap());
final int altDownsample,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode);
}

// returns Lists (in allele order) of lists of read vectors supporting each allele
@@ -103,7 +53,8 @@ public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
final int refDownsample,
final int altDownsample,
final Map<Allele, Integer> altDownsampleMap) {
final Map<Allele, Integer> altDownsampleMap,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final Map<Allele, List<GATKRead>> readsByAllele = likelihoods.alleles().stream()
.collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));

@@ -126,17 +77,14 @@ public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
.forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));

return vc.getAlleles().stream()
.map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList()))
.map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList()))
.collect(Collectors.toList());
}


@Override
public List<String> getKeyNames() {
return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY);
}

private static List<Integer> featurize(final GATKRead read, final VariantContext vc, final Map<GATKRead, Haplotype> bestHaplotypes) {
private static List<Integer> featurize(final GATKRead read, final VariantContext vc,
final Map<GATKRead, Haplotype> bestHaplotypes,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final List<Integer> result = new ArrayList<>();
result.add(read.getMappingQuality());
result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
@@ -151,23 +99,41 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext

result.add(Math.abs(read.getFragmentLength()));

// distances from ends of fragment
final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
result.add(vc.getStart() - fragmentStart);
result.add(fragmentEnd - vc.getEnd());
if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) {
// distances from ends of fragment
final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
result.add(vc.getStart() - fragmentStart);
result.add(fragmentEnd - vc.getEnd());
}

// Ultima-specific read tags
if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) {
result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s
result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float between 0 and 1, so we multiply by 1000 and round
}

// mismatches versus best haplotype
final Haplotype haplotype = bestHaplotypes.get(read);
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);
Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features");

// TODO: fix this
// I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina
if (!bestHaplotypes.containsKey(read)) {
logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(),
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

return result;
}
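
In the new Ultima branch of featurize, the floating-point rq tag is packed into the integer feature vector by scaling. A tiny worked example of that arithmetic (the tag value is made up):

```java
// Worked example of the rq -> int scaling used in featurize; the sample value is invented.
public class RqScalingExample {
    public static void main(String[] args) {
        final float rq = 0.9987f;               // "rq" read tag: a float between 0 and 1
        final int feature = (int) (1000 * rq);  // scale by 1000; the int cast truncates
        System.out.println(feature);            // prints 998
    }
}
```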
@@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample";
public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset";
public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth";
public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode";

public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10;
public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20;
public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20;
public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1;

@Override
protected int getDefaultMaxMnpDistance() { return 1; }
@@ -205,6 +206,25 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection")
public File mutect3Dataset;

@Advanced
@Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.")
public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;

public enum Mutect3DatasetMode {
ILLUMINA(11),
ULTIMA(11);

final private int numReadFeatures;

Mutect3DatasetMode(final int numReadFeatures) {
this.numReadFeatures = numReadFeatures;
}

public int getNumReadFeatures() {
return numReadFeatures;
}
}

/**
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
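
The selected mode reaches featurize through the new --mutect3-dataset-mode argument (default ILLUMINA), and carrying the per-read feature count on the enum keeps the final validation in featurize tied to whichever mode was chosen. A hypothetical sketch of how a future platform with a different feature layout would slot in; the PACBIO entry and its count are invented, not part of this commit:

```java
// Hypothetical extension of Mutect3DatasetMode, for illustration only.
public enum Mutect3DatasetMode {
    ILLUMINA(11),
    ULTIMA(11),
    PACBIO(9);  // invented: a platform whose reads contribute a different number of features

    private final int numReadFeatures;

    Mutect3DatasetMode(final int numReadFeatures) {
        this.numReadFeatures = numReadFeatures;
    }

    public int getNumReadFeatures() {
        return numReadFeatures;
    }
}
```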