Improvements to Mutect2's Permutect training data mode (#8663)
* include normal seq error log likelihood in Permutect dataset

* handle different allele representations in multiallelic / indel variants for Permutect training data mode (sketched below)

* set the default artifact-to-non-artifact ratio to 1 in Permutect training data mode
davidbenjamin authored Jan 26, 2024
1 parent e796d20 commit 2d50cf8
Showing 4 changed files with 104 additions and 107 deletions.
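
The allele-representation change mentioned in the commit message is not visible in the hunks excerpted below. For reference, a common way to compare indel alleles that are spelled differently in two VCFs is to reduce each ref/alt pair to a minimal representation by trimming shared bases. The sketch below illustrates that general idea only; the class and method names are invented and this is not necessarily how the commit itself implements it.

```java
import java.util.AbstractMap.SimpleEntry;
import java.util.Map.Entry;

// Hypothetical sketch, not from this commit: trim bases shared by ref and alt so that
// two spellings of the same indel reduce to the same minimal ref/alt pair.
public final class AlleleRepresentationSketch {

    public static Entry<String, String> minimalRepresentation(String ref, String alt) {
        // trim the common suffix, keeping at least one base of each allele
        while (ref.length() > 1 && alt.length() > 1
                && ref.charAt(ref.length() - 1) == alt.charAt(alt.length() - 1)) {
            ref = ref.substring(0, ref.length() - 1);
            alt = alt.substring(0, alt.length() - 1);
        }
        // trim the common prefix (a real implementation would also shift the variant position)
        while (ref.length() > 1 && alt.length() > 1 && ref.charAt(0) == alt.charAt(0)) {
            ref = ref.substring(1);
            alt = alt.substring(1);
        }
        return new SimpleEntry<>(ref, alt);
    }

    public static void main(String[] args) {
        // the same deletion written two ways reduces to the same minimal pair
        System.out.println(minimalRepresentation("ACTG", "AG")); // prints ACT=A
        System.out.println(minimalRepresentation("ACT", "A"));   // prints ACT=A
    }
}
```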
@@ -1,26 +1,22 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;
package org.broadinstitute.hellbender.tools.walkers.mutect;


import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;
import java.util.stream.Collectors;
@@ -31,69 +27,23 @@
* [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is
* 1,2,3,4|5,6,7,8
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
summary="Featurized read sets for Mutect3 training data")
public class FeaturizedReadSets implements JumboGenotypeAnnotation {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE;
public class FeaturizedReadSets {
private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class);

public static final int FEATURES_PER_READ = 11;
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);

// downsample ref reads to this count if needed
private final int maxRefCount;

public FeaturizedReadSets(final int maxRefCount) {
this.maxRefCount = maxRefCount;
}

public FeaturizedReadSets() {
this(DEFAULT_MAX_REF_COUNT);
}

@Override
public void annotate(final ReferenceContext ref,
final FeatureContext features,
final VariantContext vc,
final Genotype g,
final GenotypeBuilder gb,
final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Allele> fragmentLikelihoods,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods) {
Utils.nonNull(vc);
Utils.nonNull(g);
Utils.nonNull(gb);

if ( likelihoods == null) {
return;
}

final List<List<List<Integer>>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()),
likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE);

// flatten twice: over all reads supporting each allele and over all alleles
// we can partition by allele with the countsInAlleleOrder annotation
// and by read using the constant feature vector size
final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream()
.flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream))
.mapToInt(n -> n)
.toArray();

final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();

gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder);
gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder);
}
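
The removed annotate method above flattens the per-allele read vectors into a single array plus per-allele counts, as the class comment describes. A self-contained toy version of that flattening, using the example from the comment (allele 1 with read vectors [1,2] and [3,4], allele 2 with [5,6] and [7,8]):

```java
import java.util.Arrays;
import java.util.List;

// Toy illustration of the flattening in the removed annotate method above.
public class FlattenExample {
    public static void main(String[] args) {
        final List<List<List<Integer>>> readVectorsByAllele = Arrays.asList(
                Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3, 4)),   // allele 1
                Arrays.asList(Arrays.asList(5, 6), Arrays.asList(7, 8)));  // allele 2

        // flatten over reads within each allele, then over alleles
        final int[] flattened = readVectorsByAllele.stream()
                .flatMap(lists -> lists.stream().flatMap(List::stream))
                .mapToInt(n -> n)
                .toArray();

        // per-allele read counts let a consumer re-partition the flat array
        final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();

        System.out.println(Arrays.toString(flattened));            // [1, 2, 3, 4, 5, 6, 7, 8]
        System.out.println(Arrays.toString(countsInAlleleOrder));  // [2, 2]
    }
}
```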
private FeaturizedReadSets() { }

public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
final Collection<String> samples,
final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
final int refDownsample,
final int altDownsample) {
return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap());
final int altDownsample,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode);
}

// returns Lists (in allele order) of lists of read vectors supporting each allele
@@ -103,7 +53,8 @@ public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
final int refDownsample,
final int altDownsample,
final Map<Allele, Integer> altDownsampleMap) {
final Map<Allele, Integer> altDownsampleMap,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final Map<Allele, List<GATKRead>> readsByAllele = likelihoods.alleles().stream()
.collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));

@@ -126,17 +77,14 @@ public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
.forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));

return vc.getAlleles().stream()
.map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList()))
.map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList()))
.collect(Collectors.toList());
}


@Override
public List<String> getKeyNames() {
return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY);
}

private static List<Integer> featurize(final GATKRead read, final VariantContext vc, final Map<GATKRead, Haplotype> bestHaplotypes) {
private static List<Integer> featurize(final GATKRead read, final VariantContext vc,
final Map<GATKRead, Haplotype> bestHaplotypes,
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final List<Integer> result = new ArrayList<>();
result.add(read.getMappingQuality());
result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
@@ -151,23 +99,41 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext

result.add(Math.abs(read.getFragmentLength()));

// distances from ends of fragment
final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
result.add(vc.getStart() - fragmentStart);
result.add(fragmentEnd - vc.getEnd());
if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) {
// distances from ends of fragment
final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
result.add(vc.getStart() - fragmentStart);
result.add(fragmentEnd - vc.getEnd());
}

// Ultima-specific read tags
if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) {
result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s
result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float between 0 and 1, so we multiply by 1000 and round
}

// mismatches versus best haplotype
final Haplotype haplotype = bestHaplotypes.get(read);
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);
Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features");

// TODO: fix this
// I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina
if (!bestHaplotypes.containsKey(read)) {
logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(),
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

return result;
}
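
In the new Ultima branch of featurize, the floating-point rq tag is packed into the integer feature vector by scaling. A tiny worked example of that arithmetic (the tag value is made up):

```java
// Worked example of the rq -> int scaling used in featurize; the sample value is invented.
public class RqScalingExample {
    public static void main(String[] args) {
        final float rq = 0.9987f;               // "rq" read tag: a float between 0 and 1
        final int feature = (int) (1000 * rq);  // scale by 1000; the int cast truncates
        System.out.println(feature);            // prints 998
    }
}
```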
@@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample";
public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset";
public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth";
public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode";

public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10;
public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20;
public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20;
public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1;

@Override
protected int getDefaultMaxMnpDistance() { return 1; }
@@ -205,6 +206,25 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection")
public File mutect3Dataset;

@Advanced
@Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.")
public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;

public enum Mutect3DatasetMode {
ILLUMINA(11),
ULTIMA(11);

final private int numReadFeatures;

Mutect3DatasetMode(final int numReadFeatures) {
this.numReadFeatures = numReadFeatures;
}

public int getNumReadFeatures() {
return numReadFeatures;
}
}

/**
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
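
The selected mode reaches featurize through the new --mutect3-dataset-mode argument (default ILLUMINA), and carrying the per-read feature count on the enum keeps the final validation in featurize tied to whichever mode was chosen. A hypothetical sketch of how a future platform with a different feature layout would slot in; the PACBIO entry and its count are invented, not part of this commit:

```java
// Hypothetical extension of Mutect3DatasetMode, for illustration only.
public enum Mutect3DatasetMode {
    ILLUMINA(11),
    ULTIMA(11),
    PACBIO(9);  // invented: a platform whose reads contribute a different number of features

    private final int numReadFeatures;

    Mutect3DatasetMode(final int numReadFeatures) {
        this.numReadFeatures = numReadFeatures;
    }

    public int getNumReadFeatures() {
        return numReadFeatures;
    }
}
```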