Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix integer overflow bug in RMSMappingQuality #5435

Merged
merged 3 commits into from
Nov 28, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;


Expand Down Expand Up @@ -85,7 +86,7 @@ public Map<String, Object> annotateRawData(final ReferenceContext ref,
}

final Map<String, Object> annotations = new HashMap<>();
final ReducibleAnnotationData<List<Integer>> myData = new ReducibleAnnotationData<>(null);
final ReducibleAnnotationData<List<Long>> myData = new ReducibleAnnotationData<>(null);
calculateRawData(vc, likelihoods, myData);
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
annotations.put(getRawKeyName(), annotationString);
Expand All @@ -108,8 +109,8 @@ public Map<String, Object> combineRawData(final List<Allele> vcAlleles, final Li
return annotations;
}

public String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Integer>> perAlleleData) {
return String.format("%d,%d", perAlleleData.get(Allele.NO_CALL).get(0), perAlleleData.get(Allele.NO_CALL).get(1));
private String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Long>> perAlleleData) {
return String.format("%d,%d", perAlleleData.get(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX), perAlleleData.get(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX));
}

@Override
Expand All @@ -127,17 +128,17 @@ else if (vc.hasAttribute(getDeprecatedRawKeyName())) {
"that may not have compatible annotations with this GATK version. Attempting to reformat MQ data.");
final String rawMQdepth = vc.getAttributeAsString("MQ_DP",null);
if (rawMQdepth == null) {
throw new UserException.BadInput("MQ annotation data is not properly formatted. This version expects an " +
"int tuple of sum of squared MQ values and total reads over variant genotypes.");
throw new UserException.BadInput("MQ annotation data is not properly formatted. This version expects a " +
"long tuple of sum of squared MQ values and total reads over variant genotypes.");
}
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + rawMQdepth; //deprecated format was double so it needs to be converted to int
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + rawMQdepth; //deprecated format was double so it needs to be converted to long
}
else {
logger.warn("MQ annotation data is not properly formatted. This GATK version expects key "
+ getRawKeyName() + " with an int tuple of sum of squared MQ values and total reads over variant "
+ getRawKeyName() + " with an long tuple of sum of squared MQ values and total reads over variant "
+ "genotypes as the value. Attempting to use deprecated MQ calculation.");
final int numOfReads = getNumOfReads(vc, null);
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + numOfReads; //deprecated format was double so it needs to be converted to int
final long numOfReads = getNumOfReads(vc, null);
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + numOfReads; //deprecated format was double so it needs to be converted to long
}
}
else {
Expand All @@ -147,43 +148,43 @@ else if (vc.hasAttribute(getDeprecatedRawKeyName())) {
return new HashMap<>();
}

ReducibleAnnotationData<List<Integer>> myData = new ReducibleAnnotationData(rawMQdata);
ReducibleAnnotationData<List<Long>> myData = new ReducibleAnnotationData(rawMQdata);
parseRawDataString(myData);

final String annotationString = makeFinalizedAnnotationString(myData.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX), myData.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX));
return Collections.singletonMap(getKeyNames().get(0), annotationString);
}

public String makeFinalizedAnnotationString(final int numOfReads, final int sumOfSquaredMQs) {
private String makeFinalizedAnnotationString(final long numOfReads, final long sumOfSquaredMQs) {
return String.format("%.2f", Math.sqrt(sumOfSquaredMQs/(double)numOfReads));
}


/**
* Combine the int tuples: sum of squared mapping quality and read counts over variant genotypes
* Combine the long tuples: sum of squared mapping quality and read counts over variant genotypes
* Since this is not an allele-specific annotation, store the result with the NO_CALL allele key.
* @param toAdd new data
* @param combined passed in as previous data, modified to store the combined result
*/
public void combineAttributeMap(ReducibleAnnotationData<List<Integer>> toAdd, ReducibleAnnotationData<List<Integer>> combined) {
private void combineAttributeMap(ReducibleAnnotationData<List<Long>> toAdd, ReducibleAnnotationData<List<Long>> combined) {
if (combined.getAttribute(Allele.NO_CALL) != null) {
combined.putAttribute(Allele.NO_CALL, Arrays.asList(combined.getAttribute(Allele.NO_CALL).get(0) + toAdd.getAttribute(Allele.NO_CALL).get(0),
combined.getAttribute(Allele.NO_CALL).get(1) + toAdd.getAttribute(Allele.NO_CALL).get(1)));
combined.putAttribute(Allele.NO_CALL, Arrays.asList(combined.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX) + toAdd.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX),
combined.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX) + toAdd.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX)));
} else {
combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
}
}

@SuppressWarnings({"unchecked", "rawtypes"})//FIXME
public void calculateRawData(final VariantContext vc,
private void calculateRawData(final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods,
final ReducibleAnnotationData rawAnnotations){
//GATK3.5 had a double, but change this to an int for the tuple representation (square sum, read count)
int squareSum = 0;
int numReadsUsed = 0;
//GATK3.5 had a double, but change this to an long for the tuple representation (square sum, read count)
long squareSum = 0;
long numReadsUsed = 0;
for (int i = 0; i < likelihoods.numberOfSamples(); i++) {
for (final GATKRead read : likelihoods.sampleReads(i)) {
int mq = read.getMappingQuality();
long mq = read.getMappingQuality();
if (mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) {
squareSum += mq * mq;
numReadsUsed++;
Expand All @@ -203,7 +204,7 @@ public Map<String, Object> annotate(final ReferenceContext ref,
}

final Map<String, Object> annotations = new HashMap<>();
final ReducibleAnnotationData<List<Integer>> myData = new ReducibleAnnotationData<>(null);
final ReducibleAnnotationData<List<Long>> myData = new ReducibleAnnotationData<>(null);
calculateRawData(vc, likelihoods, myData);
final String annotationString = makeFinalizedAnnotationString(myData.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX), myData.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX));
annotations.put(getKeyNames().get(0), annotationString);
Expand All @@ -227,7 +228,7 @@ public VariantContext finalizeRawMQ(final VariantContext vc) {
if (rawMQdata == null) {
return vc;
} else {
final List<Integer> SSQMQandDP = parseRawDataString(rawMQdata);
final List<Long> SSQMQandDP = parseRawDataString(rawMQdata);
final double rms = Math.sqrt(SSQMQandDP.get(SUM_OF_SQUARES_INDEX) / (double)SSQMQandDP.get(TOTAL_DEPTH_INDEX));
final String finalizedRMSMAppingQuality = formattedValue(rms);
return new VariantContextBuilder(vc)
Expand All @@ -237,25 +238,43 @@ public VariantContext finalizeRawMQ(final VariantContext vc) {
}
}

protected void parseRawDataString(ReducibleAnnotationData<List<Integer>> myData) {
private void parseRawDataString(ReducibleAnnotationData<List<Long>> myData) {
myData.putAttribute(Allele.NO_CALL, parseRawDataString(myData.getRawData()));
}

//TODO once the AS annotations have been added genotype gvcfs this can be removed for a more generic approach
private static List<Integer> parseRawDataString(String rawDataString) {
private static List<Long> parseRawDataString(String rawDataString) {
try {
final String[] parsed = rawDataString.split(",");
if (parsed.length != NUM_LIST_ENTRIES) {
throw new UserException.BadInput("Raw value for annotation has " + parsed.length + " values, expected " + NUM_LIST_ENTRIES);
}
final int squareSum = Integer.parseInt(parsed[SUM_OF_SQUARES_INDEX]);
final int totalDP = Integer.parseInt(parsed[TOTAL_DEPTH_INDEX]);
final long squareSum = Long.parseLong(parsed[SUM_OF_SQUARES_INDEX]);
final long totalDP = Long.parseLong(parsed[TOTAL_DEPTH_INDEX]);
return Arrays.asList(squareSum,totalDP);
} catch (final NumberFormatException e) {
throw new UserException.BadInput("malformed " + GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY + " annotation: " + rawDataString);
}
}

/**
* Private getter function to replace VariantContext::getAttributeAsIntList in instances where there is a chance
* that ints will overlow beyond Integer.MAX_VALUE
* @return VariantContext attribute indexed by key, as list of long.
*/
static private List<Long> getAttributeAsLongList(final VariantContext vc, final String key, final Long defaultValue) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with your reticence to put this into htsjdk. The type handling there is super awkward, but I don't think it's worth dealing with until it gets improved in 3.0. However, I might suggest making this method public and moving it to a utility class. GATKProtectedVariantContextUtils has a lot of similar methods

return vc.getAttributeAsList(key).stream().map(
x -> {
if (x == null || x == VCFConstants.MISSING_VALUE_v4) {
return defaultValue;
} else if (x instanceof Number) {
return ((Number) x).longValue();
} else {
return Long.valueOf((String)x); // throws an exception if this isn't a string
}
}
).collect(Collectors.toList());
}

/**
*
Expand All @@ -266,24 +285,24 @@ private static List<Integer> parseRawDataString(String rawDataString) {
* @throws UserException.BadInput if the {@link VCFConstants#DEPTH_KEY} is missing or if the calculated depth is <= 0
*/
@VisibleForTesting
static int getNumOfReads(final VariantContext vc,
static long getNumOfReads(final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods) {
if(vc.hasAttribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY)) {
List<Integer> MQtuple = vc.getAttributeAsIntList(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY,0);
List<Long> MQtuple = getAttributeAsLongList(vc, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY,0L);
if (MQtuple.get(TOTAL_DEPTH_INDEX) > 0) {
return MQtuple.get(TOTAL_DEPTH_INDEX);
}
}

int numOfReads = 0;
long numOfReads = 0;
if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) {
numOfReads = vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, -1);
if(vc.hasGenotypes()) {
for(final Genotype gt : vc.getGenotypes()) {
if(gt.isHomRef()) {
//site-level DP contribution will come from MIN_DP for gVCF-called reference variants or DP for BP resolution
if (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) {
numOfReads -= Integer.parseInt(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
numOfReads -= Long.parseLong(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
} else if (gt.hasDP()) {
numOfReads -= gt.getDP();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,32 @@ public void testCombineAndFinalize() {
Assert.assertEquals(Double.parseDouble((String)output.get("MQ")), 59.01);
}

@Test
public void testCombineAndFinalizeHighMQSquared() {
final List<Allele> vcAlleles = Arrays.asList(Allele.create("A", true), Allele.create("T", false));
final List<ReducibleAnnotationData<?>> combinedVCdata = new ArrayList<>();
// Test that updating the annotation works when Integer.MAX_VALUE is exceeded, both for small and large updates:
combinedVCdata.add(new ReducibleAnnotationData<>("10125000000,5000000")); //5,000,000 MQ45 reads
combinedVCdata.add(new ReducibleAnnotationData<>("2601,1")); //1 MQ51 read
combinedVCdata.add(new ReducibleAnnotationData<>("1800000000,500000")); //500,000 MQ60 reads

final RMSMappingQuality annotator = RMSMappingQuality.getInstance();

final Map<String, Object> combined = annotator.combineRawData(vcAlleles, combinedVCdata);
final String combinedListString = (String)combined.get(annotator.getRawKeyName());
Assert.assertEquals(combinedListString, "11925002601,5500001");

final VariantContext vc = new VariantContextBuilder(makeVC())
.attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, combinedListString)
.make();
final VariantContext originalVC = null;
final Map<String, Object> output = new RMSMappingQuality().finalizeRawData(vc, originalVC);
Assert.assertEquals(Double.parseDouble((String)output.get("MQ")), 46.56);
}

@Test
public void testFinalizeRawData(){
// NOTE: RMSMappingQuality should ignore homRef depth of 20 and use only the 13 variants to calculate score.
final VariantContext vc = new VariantContextBuilder(makeVC())
.attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "43732,13")
.attribute(VCFConstants.DEPTH_KEY, 20)
Expand All @@ -226,6 +250,17 @@ public void testFinalizeRawData(){
Assert.assertEquals(output.get("MQ"), "58.00");
}

@Test
public void testFinalizeHighMQSquaredRawData(){
// Test that RMS Mapping Quality is correctly computed when Integer.MAX_VALUE is exceeded.
final VariantContext vc = new VariantContextBuilder(makeVC())
.attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "3415207168,1749038")
.make();
final VariantContext originalVC = null;
final Map<String, Object> output = new RMSMappingQuality().finalizeRawData(vc, originalVC);
Assert.assertEquals(output.get("MQ"), "44.19");
}

@Test
public void testFinalizeRawMQ(){
final VariantContext vc = new VariantContextBuilder(makeVC())
Expand Down