-
Notifications
You must be signed in to change notification settings - Fork 589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CNNVariant Update models, validate scores, cleanup training #5175
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
package org.broadinstitute.hellbender.tools.walkers.validation; | ||
|
||
import java.nio.file.Paths; | ||
import java.io.IOException; | ||
|
||
import org.apache.commons.collections4.Predicate; | ||
|
||
import htsjdk.variant.variantcontext.VariantContext; | ||
|
||
import org.broadinstitute.barclay.argparser.Advanced; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.BetaFeature; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
|
||
import org.broadinstitute.hellbender.engine.ReadsContext; | ||
import org.broadinstitute.hellbender.engine.ReferenceContext; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.engine.AbstractConcordanceWalker; | ||
|
||
import picard.cmdline.programgroups.VariantEvaluationProgramGroup; | ||
|
||
/** | ||
* Compare INFO field values between two VCFs or compare two different INFO fields from one VCF. | ||
* We only evaluate sites that are in both VCFs. | ||
* Although we use the arguments eval and truth, we only compare the scores, we do not determine the correct score. | ||
* Either VCF can be used as eval or truth, or the same VCF can be used for both. | ||
* Differences greater than the epsilon argument will trigger a warning. | ||
* | ||
* <h3>Compare the CNN_2D info fields for the same sites from two different VCFs:</h3> | ||
* | ||
* <pre> | ||
* gatk EvaluateInfoFieldConcordance \ | ||
* -eval a.vcf \ | ||
* -truth another.vcf \ | ||
* -S summary.txt \ | ||
* -eval-info-key CNN_2D \ | ||
* -truth-info-key CNN_2D \ | ||
* -epsilon 0.01 | ||
* </pre> | ||
* | ||
* <h3>Compare the CNN_2D info field with the CNN_1D field from the same sites in one VCF:</h3> | ||
* | ||
* <pre> | ||
* gatk EvaluateInfoFieldConcordance \ | ||
* -eval my.vcf \ | ||
* -truth my.vcf \ | ||
* -S summary.txt \ | ||
* -eval-info-key CNN_2D \ | ||
* -truth-info-key CNN_1D \ | ||
* -epsilon 0.01 | ||
* </pre> | ||
*/ | ||
@CommandLineProgramProperties( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This tool needs javadoc, including a usage example. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The doc here, and the summary lines below, should reflect/mention that this handles one key at a time (currently, the doc uses plural "keys" so it sounds like it can evaluate more than one at a time). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doc added, we can use two different keys from 1 VCF or 1 or 2 keys for 2 different vcfs. Clarified in the doc. |
||
summary=EvaluateInfoFieldConcordance.USAGE_SUMMARY, | ||
oneLineSummary=EvaluateInfoFieldConcordance.USAGE_ONE_LINE_SUMMARY, | ||
programGroup=VariantEvaluationProgramGroup.class) | ||
@DocumentedFeature | ||
@BetaFeature | ||
public class EvaluateInfoFieldConcordance extends AbstractConcordanceWalker { | ||
static final String USAGE_ONE_LINE_SUMMARY = "Evaluate concordance of info fields in an input VCF against a validated truth VCF"; | ||
static final String USAGE_SUMMARY = "This tool evaluates info fields from an input VCF against a VCF that has been validated and is considered to represent ground truth.\n"; | ||
public static final String SUMMARY_LONG_NAME = "summary"; | ||
public static final String SUMMARY_SHORT_NAME = "S"; | ||
|
||
@Argument(doc="A table of summary statistics (true positives, sensitivity, etc.)", fullName=SUMMARY_LONG_NAME, shortName=SUMMARY_SHORT_NAME) | ||
protected String summary; | ||
|
||
@Argument(fullName="eval-info-key", shortName="eval-info-key", doc="Info key from eval vcf") | ||
protected String evalInfoKey; | ||
|
||
@Argument(fullName="truth-info-key", shortName="truth-info-key", doc="Info key from truth vcf") | ||
protected String truthInfoKey; | ||
|
||
@Advanced | ||
@Argument(fullName="warn-big-differences", | ||
shortName="warn-big-differences", | ||
doc="If set differences in the info key values greater than epsilon will trigger warnings.", | ||
optional=true) | ||
protected boolean warnBigDifferences = false; | ||
|
||
@Advanced | ||
@Argument(fullName="epsilon", shortName="epsilon", doc="Difference tolerance", optional=true) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doc should describe what this this is used for (currently its only used for messages written to the logger - see comment below). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
protected double epsilon = 0.1; | ||
|
||
private int snpCount = 0; | ||
private int indelCount = 0; | ||
|
||
private double snpSumDelta = 0.0; | ||
private double snpSumDeltaSquared = 0.0; | ||
private double indelSumDelta = 0.0; | ||
private double indelSumDeltaSquared = 0.0; | ||
|
||
@Override | ||
public void onTraversalStart() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of the inherited methods in here should have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
if(getEvalHeader().getInfoHeaderLine(evalInfoKey) == null){ | ||
throw new UserException("Missing key:"+evalInfoKey+" in Eval VCF:"+evalVariantsFile); | ||
} | ||
|
||
if(getTruthHeader().getInfoHeaderLine(truthInfoKey) == null){ | ||
throw new UserException("Missing key:"+truthInfoKey+" in Truth VCF:"+truthVariantsFile); | ||
} | ||
} | ||
|
||
@Override | ||
protected void apply(AbstractConcordanceWalker.TruthVersusEval truthVersusEval, ReadsContext readsContext, ReferenceContext refContext) { | ||
ConcordanceState concordanceState = truthVersusEval.getConcordance(); | ||
switch (concordanceState) { | ||
case TRUE_POSITIVE: { | ||
if(truthVersusEval.getEval().isSNP()){ | ||
snpCount++; | ||
} else if (truthVersusEval.getEval().isIndel()) { | ||
indelCount++; | ||
} | ||
this.infoDifference(truthVersusEval.getEval(), truthVersusEval.getTruth()); | ||
break; | ||
} | ||
case FALSE_POSITIVE: | ||
case FALSE_NEGATIVE: | ||
case FILTERED_TRUE_NEGATIVE: | ||
case FILTERED_FALSE_NEGATIVE: { | ||
break; | ||
} | ||
default: { | ||
throw new IllegalStateException("Unexpected ConcordanceState: " + concordanceState.toString()); | ||
} | ||
} | ||
} | ||
|
||
private void infoDifference(final VariantContext eval, final VariantContext truth) { | ||
if(eval.hasAttribute(this.evalInfoKey) && truth.hasAttribute(truthInfoKey)) { | ||
final double evalVal = Double.valueOf((String) eval.getAttribute(this.evalInfoKey)); | ||
final double truthVal = Double.valueOf((String) truth.getAttribute(this.truthInfoKey)); | ||
final double delta = evalVal - truthVal; | ||
final double deltaSquared = delta * delta; | ||
if (eval.isSNP()) { | ||
this.snpSumDelta += Math.sqrt(deltaSquared); | ||
this.snpSumDeltaSquared += deltaSquared; | ||
} else if (eval.isIndel()) { | ||
this.indelSumDelta += Math.sqrt(deltaSquared); | ||
this.indelSumDeltaSquared += deltaSquared; | ||
} | ||
if (warnBigDifferences && Math.abs(delta) > this.epsilon) { | ||
this.logger.warn(String.format("Difference (%f) greater than epsilon (%f) at %s:%d %s:", delta, this.epsilon, eval.getContig(), eval.getStart(), eval.getAlleles().toString())); | ||
this.logger.warn(String.format("\t\tTruth info: " + truth.getAttributes().toString())); | ||
this.logger.warn(String.format("\t\tEval info: " + eval.getAttributes().toString())); | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure it makes sense to send this to log output. It could result in a lot of output, and its a lot of conversion/formatting that will happen no matter what the log level. Maybe write them to a file ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since the intended use here is just for debugging I'd prefer not write them to a file, we could make this a oneTime logger or just remove, although I do think the current behavior could be helpful in the future. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We generally don't leave debugging code enabled. If its just for (your) debugging, I'd either remove it, or else make it conditional on an |
||
} | ||
|
||
@Override | ||
public Object onTraversalSuccess() { | ||
final double snpMean = this.snpSumDelta / snpCount; | ||
final double snpVariance = (this.snpSumDeltaSquared - this.snpSumDelta * this.snpSumDelta / snpCount) / snpCount; | ||
final double snpStd = Math.sqrt(snpVariance); | ||
final double indelMean = this.indelSumDelta / indelCount; | ||
final double indelVariance = (this.indelSumDeltaSquared - this.indelSumDelta * this.indelSumDelta / indelCount) / indelCount; | ||
final double indelStd = Math.sqrt(indelVariance); | ||
|
||
this.logger.info(String.format("SNP average delta %f and standard deviation: %f", snpMean, snpStd)); | ||
this.logger.info(String.format("INDEL average delta %f and standard deviation: %f", indelMean, indelStd)); | ||
|
||
try (final InfoConcordanceRecord.InfoConcordanceWriter | ||
concordanceWriter = InfoConcordanceRecord.getWriter(Paths.get(this.summary))){ | ||
concordanceWriter.writeRecord(new InfoConcordanceRecord(VariantContext.Type.SNP, this.evalInfoKey, this.truthInfoKey, snpMean, snpStd)); | ||
concordanceWriter.writeRecord(new InfoConcordanceRecord(VariantContext.Type.INDEL, this.evalInfoKey, this.truthInfoKey, indelMean, indelStd)); | ||
} catch (IOException e) { | ||
throw new UserException("Encountered an IO exception writing the concordance summary table", e); | ||
} | ||
|
||
return "SUCCESS"; | ||
} | ||
|
||
@Override | ||
protected boolean areVariantsAtSameLocusConcordant(VariantContext truth, VariantContext eval) { | ||
final boolean sameRefAllele = truth.getReference().equals(eval.getReference()); | ||
final boolean containsAltAllele = eval.getAlternateAlleles().contains(truth.getAlternateAllele(0)); | ||
return sameRefAllele && containsAltAllele; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it ok/correct to ignore the other truth alternate alleles ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since the info fields are site-level values the alt alleles it should be fine to grab the first allele only. |
||
} | ||
|
||
@Override | ||
protected Predicate<VariantContext> makeTruthVariantFilter() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this have an eval filter as well ? As it stands, even with using the same input file for truth and eval, the integration test is racking up a bunch of FILTERED_TRUE_NEGATIVEs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
return vc -> !vc.isFiltered() && !vc.isSymbolicOrSV(); | ||
} | ||
|
||
@Override | ||
protected Predicate<VariantContext> makeEvalVariantFilter() { | ||
return vc -> !vc.isFiltered() && !vc.isSymbolicOrSV(); | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
package org.broadinstitute.hellbender.tools.walkers.validation; | ||
|
||
import htsjdk.variant.variantcontext.VariantContext; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
|
||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
|
||
import org.broadinstitute.hellbender.utils.tsv.DataLine; | ||
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; | ||
import org.broadinstitute.hellbender.utils.tsv.TableWriter; | ||
import org.broadinstitute.hellbender.utils.tsv.TableReader; | ||
|
||
/** | ||
* Keeps track of concordance between two info fields. | ||
*/ | ||
public class InfoConcordanceRecord { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whole class needs javadoc. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
private static final String VARIANT_TYPE_COLUMN_NAME = "type"; | ||
private static final String EVAL_INFO_KEY = "eval_info_key"; | ||
private static final String TRUE_INFO_KEY = "true_info_key"; | ||
private static final String MEAN_DIFFERENCE = "mean_difference"; | ||
private static final String STD_DIFFERENCE = "std_difference"; | ||
private static final String[] INFO_CONCORDANCE_COLUMN_HEADER = | ||
{VARIANT_TYPE_COLUMN_NAME, EVAL_INFO_KEY, TRUE_INFO_KEY, MEAN_DIFFERENCE, STD_DIFFERENCE}; | ||
final VariantContext.Type type; | ||
private final String evalKey; | ||
private final String trueKey; | ||
private final double mean; | ||
private final double std; | ||
|
||
/** | ||
* Record keeps track of concordance between values from INFO-field keys of a VCF. | ||
* | ||
* @param type SNP or INDEL | ||
* @param evalKey The INFO field key from the eval VCF | ||
* @param trueKey The INFO field key from the truth VCF | ||
* @param mean The mean of the differences in values for these INFO fields. | ||
* @param std The standard deviation of the differences in values for these INFO fields. | ||
*/ | ||
public InfoConcordanceRecord(VariantContext.Type type, String evalKey, String trueKey, double mean, double std) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Public methods and classes need javadoc - it should be trivial to add. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
this.type = type; | ||
this.evalKey = evalKey; | ||
this.trueKey = trueKey; | ||
this.mean = mean; | ||
this.std = std; | ||
} | ||
|
||
/** | ||
* | ||
* @return Variant type (e.g. SNP or INDEL) | ||
*/ | ||
public VariantContext.Type getVariantType() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of the public methods and (embedded classes) still need javadoc. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||
return this.type; | ||
} | ||
|
||
/** | ||
* | ||
* @return The mean of the differences between two INFO fields | ||
*/ | ||
public double getMean() { | ||
return this.mean; | ||
} | ||
|
||
/** | ||
* | ||
* @return The Standard Deviation of the differences between two INFO fields | ||
*/ | ||
public double getStd() { | ||
return this.std; | ||
} | ||
|
||
/** | ||
* | ||
* @return The INFO field for the eval VCF | ||
*/ | ||
public String getEvalKey() { | ||
return this.evalKey; | ||
} | ||
|
||
/** | ||
* | ||
* @return The INFO field for the truth VCF | ||
*/ | ||
public String getTrueKey() { | ||
return this.trueKey; | ||
} | ||
|
||
/** | ||
* Get a table writer | ||
* @param outputTable A Path where the output table will be written | ||
* @return A Table writer for INFO field concordances | ||
*/ | ||
public static InfoConcordanceWriter getWriter(Path outputTable) { | ||
try { | ||
InfoConcordanceWriter writer = new InfoConcordanceWriter(outputTable); | ||
return writer; | ||
} | ||
catch (IOException e) { | ||
throw new UserException(String.format("Encountered an IO exception while writing from %s.", outputTable), e); | ||
} | ||
} | ||
|
||
/** | ||
* Table writing class for InfoConcordanceRecords | ||
*/ | ||
public static class InfoConcordanceWriter extends TableWriter<InfoConcordanceRecord> { | ||
private InfoConcordanceWriter(Path output) throws IOException { | ||
super(output.toFile(), new TableColumnCollection(INFO_CONCORDANCE_COLUMN_HEADER)); | ||
} | ||
|
||
@Override | ||
protected void composeLine(InfoConcordanceRecord record, DataLine dataLine) { | ||
dataLine.set(VARIANT_TYPE_COLUMN_NAME, record.getVariantType().toString()) | ||
.set(EVAL_INFO_KEY, record.getEvalKey()) | ||
.set(TRUE_INFO_KEY, record.getTrueKey()) | ||
.set(MEAN_DIFFERENCE, record.getMean()) | ||
.set(STD_DIFFERENCE, record.getStd()); | ||
} | ||
} | ||
|
||
/** | ||
* Table reading class for InfoConcordanceRecords | ||
*/ | ||
public static class InfoConcordanceReader extends TableReader<InfoConcordanceRecord> { | ||
public InfoConcordanceReader(Path summary) throws IOException { | ||
super(summary.toFile()); | ||
} | ||
|
||
@Override | ||
protected InfoConcordanceRecord createRecord(DataLine dataLine) { | ||
VariantContext.Type type = VariantContext.Type.valueOf(dataLine.get(VARIANT_TYPE_COLUMN_NAME)); | ||
String evalKey = dataLine.get(EVAL_INFO_KEY); | ||
String trueKey = dataLine.get(TRUE_INFO_KEY); | ||
double mean = Double.parseDouble(dataLine.get(MEAN_DIFFERENCE)); | ||
double std = Double.parseDouble(dataLine.get(STD_DIFFERENCE)); | ||
return new InfoConcordanceRecord(type, evalKey, trueKey, mean, std); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have/can you test these new versions with the intel environment, at least manually ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes I have tested and I did not see any issues.