From 2d50cf8b334f1e781f00ac1dae683355fee1fe99 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 26 Jan 2024 13:57:18 -0500 Subject: [PATCH] Improvements to Mutect2's Permutect training data mode (#8663) * include normal seq error log likelihood in Permutect dataset * handle different allele representations in multiallelic / indel variants for Permutect training data mode * set the default artifact to non-artifact ratio to 1 in Permutect training data mode --- .../FeaturizedReadSets.java | 134 +++++++----------- .../walkers/mutect/M2ArgumentCollection.java | 22 ++- .../walkers/mutect/Mutect3DatasetEngine.java | 53 ++++--- .../mutect/SomaticGenotypingEngine.java | 2 +- 4 files changed, 104 insertions(+), 107 deletions(-) rename src/main/java/org/broadinstitute/hellbender/tools/walkers/{annotator => mutect}/FeaturizedReadSets.java (53%) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java similarity index 53% rename from src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java rename to src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java index c2a37a5a31d..6a4d3f5c06a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java @@ -1,26 +1,22 @@ -package org.broadinstitute.hellbender.tools.walkers.annotator; +package org.broadinstitute.hellbender.tools.walkers.mutect; import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; -import org.apache.commons.lang3.StringUtils; -import org.broadinstitute.barclay.help.DocumentedFeature; +import org.apache.logging.log4j.LogManager; 
+import org.apache.logging.log4j.Logger; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; -import org.broadinstitute.hellbender.engine.FeatureContext; -import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality; +import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.haplotype.Haplotype; -import org.broadinstitute.hellbender.utils.help.HelpConstants; import org.broadinstitute.hellbender.utils.read.AlignmentUtils; import org.broadinstitute.hellbender.utils.read.Fragment; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.util.*; import java.util.stream.Collectors; @@ -31,69 +27,23 @@ * [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is * 1,2,3,4|5,6,7,8 */ -@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, - summary="Featurized read sets for Mutect3 training data") -public class FeaturizedReadSets implements JumboGenotypeAnnotation { - public static final int DEFAULT_BASE_QUALITY = 25; - - private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE; +public class FeaturizedReadSets { + private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class); - public static final int FEATURES_PER_READ = 11; + public static final int DEFAULT_BASE_QUALITY = 25; private static final SmithWatermanAligner aligner = 
SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA); - // downsample ref reads to this count if needed - private final int maxRefCount; - - public FeaturizedReadSets(final int maxRefCount) { - this.maxRefCount = maxRefCount; - } - - public FeaturizedReadSets() { - this(DEFAULT_MAX_REF_COUNT); - } - - @Override - public void annotate(final ReferenceContext ref, - final FeatureContext features, - final VariantContext vc, - final Genotype g, - final GenotypeBuilder gb, - final AlleleLikelihoods likelihoods, - final AlleleLikelihoods fragmentLikelihoods, - final AlleleLikelihoods haplotypeLikelihoods) { - Utils.nonNull(vc); - Utils.nonNull(g); - Utils.nonNull(gb); - - if ( likelihoods == null) { - return; - } - - final List>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()), - likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE); - - // flatten twice: over all reads supporting each allele and over all alleles - // we can partition by allele with the countsInAlleleOrder annotation - // and by read using the constant feature vector size - final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream() - .flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream)) - .mapToInt(n -> n) - .toArray(); - - final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray(); - - gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder); - gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder); - } + private FeaturizedReadSets() { } public static List>> getReadVectors(final VariantContext vc, final Collection samples, final AlleleLikelihoods likelihoods, final AlleleLikelihoods haplotypeLikelihoods, final int refDownsample, - final int altDownsample) { - return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap()); + final int altDownsample, + final 
M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { + return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode); } // returns Lists (in allele order) of lists of read vectors supporting each allele @@ -103,7 +53,8 @@ public static List>> getReadVectors(final VariantContext vc, final AlleleLikelihoods haplotypeLikelihoods, final int refDownsample, final int altDownsample, - final Map altDownsampleMap) { + final Map altDownsampleMap, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { final Map> readsByAllele = likelihoods.alleles().stream() .collect(Collectors.toMap(a -> a, a -> new ArrayList<>())); @@ -126,17 +77,14 @@ public static List>> getReadVectors(final VariantContext vc, .forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele))); return vc.getAlleles().stream() - .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList())) + .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList())) .collect(Collectors.toList()); } - @Override - public List getKeyNames() { - return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY); - } - - private static List featurize(final GATKRead read, final VariantContext vc, final Map bestHaplotypes) { + private static List featurize(final GATKRead read, final VariantContext vc, + final Map bestHaplotypes, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { final List result = new ArrayList<>(); result.add(read.getMappingQuality()); result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY)); @@ -151,23 +99,41 @@ private static List featurize(final GATKRead read, final VariantContext result.add(Math.abs(read.getFragmentLength())); - // 
distances from ends of fragment - final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart()); - final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength()); - result.add(vc.getStart() - fragmentStart); - result.add(fragmentEnd - vc.getEnd()); + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) { + // distances from ends of fragment + final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart()); + final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength()); + result.add(vc.getStart() - fragmentStart); + result.add(fragmentEnd - vc.getEnd()); + } + + // Ultima-specific read tags + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) { + result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s + result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float from 0 and 1, so we multiply by 1000 and round + } // mismatches versus best haplotype final Haplotype haplotype = bestHaplotypes.get(read); - final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); - final GATKRead copy = read.copy(); - copy.setCigar(readToHaplotypeAlignment.getCigar()); - final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches; - result.add(mismatchCount); - - final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count(); - result.add((int) indelsVsBestHaplotype); - Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features"); + + // TODO: fix this + // I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina + if (!bestHaplotypes.containsKey(read)) { 
+ logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(), + vc.getContig(), vc.getStart())); + result.add(3); + result.add(2); + } else { + final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); + final GATKRead copy = read.copy(); + copy.setCigar(readToHaplotypeAlignment.getCigar()); + final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches; + result.add(mismatchCount); + + final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count(); + result.add((int) indelsVsBestHaplotype); + } + Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features"); return result; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java index d25c7a33fc5..0087441ae1a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java @@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample"; public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset"; public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth"; + public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode"; public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10; public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20; - public static final 
int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20; + public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1; @Override protected int getDefaultMaxMnpDistance() { return 1; } @@ -205,6 +206,25 @@ public double getDefaultAlleleFrequency() { @Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection") public File mutect3Dataset; + @Advanced + @Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.") + public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA; + + public enum Mutect3DatasetMode { + ILLUMINA(11), + ULTIMA(11); + + final private int numReadFeatures; + + Mutect3DatasetMode(final int numReadFeatures) { + this.numReadFeatures = numReadFeatures; + } + + public int getNumReadFeatures() { + return numReadFeatures; + } + } + /** * VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field) * contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors. 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java index e6a080cd14f..942c2de15b2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java @@ -4,18 +4,15 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.tuple.Triple; -import org.apache.commons.math3.util.CombinatoricsUtils; import org.apache.commons.math3.util.FastMath; -import org.broadinstitute.hellbender.engine.FeatureContext; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger; import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity; -import org.broadinstitute.hellbender.tools.walkers.annotator.FeaturizedReadSets; import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases; import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine; import org.broadinstitute.hellbender.utils.IndexRange; import org.broadinstitute.hellbender.utils.MathUtils; -import org.broadinstitute.hellbender.utils.NaturalLogUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix; @@ -46,9 +43,6 @@ private enum Label { ARTIFACT, VARIANT, UNLABELED, IGNORE } - // number of features for each vectorized read - private static final int FEATURES_PER_READ = FeaturizedReadSets.FEATURES_PER_READ; - // number of additional variant features for assembly complexity, reference context private static final int 
NUM_EXTRA_FEATURES = 9; @@ -108,7 +102,8 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, public void addData(final ReferenceContext ref, final VariantContext vc, Optional> truthVCs, final AlleleLikelihoods likelihoods, final AlleleLikelihoods logFragmentLikelihoods, - final AlleleLikelihoods logFragmentAlleleLikelihoods) { + final AlleleLikelihoods logFragmentAlleleLikelihoods, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { final String refBases = ReferenceBases.annotate(ref, vc); final String refAllele = vc.getReference().getBaseString(); final String contig = vc.getContig(); @@ -132,6 +127,20 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona final int normalDepth = (int) MathUtils.sum(normalADs); final boolean hasNormal = normalDepth > 0; + final List allRefAlleles = new ArrayList<>(); + allRefAlleles.add(vc.getReference()); + truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference()))); + final Allele longestRef = allRefAlleles.stream().sorted(Comparator.comparingInt(Allele::length).reversed()).findFirst().get(); + + // skip(1) excludes the reference allele + final List remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream() + .skip(1).toList(); + + final Set truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream() + .filter(truthVC -> ! truthVC.isFiltered()) + .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream()) + .collect(Collectors.toSet()); + final List