Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed Concordance tool to allow for no variation alleles in the truth data. #5718

Merged
merged 2 commits into from
Feb 26, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import org.apache.commons.collections4.Predicate;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.commons.lang.mutable.MutableLong;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
Expand Down Expand Up @@ -291,8 +290,12 @@ public Object onTraversalSuccess() {
@Override
protected boolean areVariantsAtSameLocusConcordant(final VariantContext truth, final VariantContext eval) {
final boolean sameRefAllele = truth.getReference().equals(eval.getReference());

// we assume that the truth has a single alternate allele
final boolean containsAltAllele = eval.getAlternateAlleles().contains(truth.getAlternateAllele(0));
final boolean containsAltAllele =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to update the comment on line 294.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed!

(truth.getAlternateAlleles().size() == eval.getAlternateAlleles().size()) &&
((truth.getAlternateAlleles().size() > 0) &&
eval.getAlternateAlleles().contains(truth.getAlternateAllele(0)));

return sameRefAllele && containsAltAllele;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.engine.AbstractConcordanceWalker;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.util.*;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -47,7 +51,6 @@ public void testSimple() throws Exception {
Assert.assertEquals(indelRecord.getFalseNegatives(), 2);
Assert.assertEquals(snpRecord.getSensitivity(), 1.0/4, epsilon);
Assert.assertEquals(snpRecord.getPrecision(), 1.0, epsilon);

}

// Test going from an integer chromosome (22) to a character chromosome (X)
Expand Down Expand Up @@ -209,4 +212,35 @@ public void testDreamSensitivity() throws Exception {
// 78
Assert.assertEquals(snpRecord.getTruePositives() + snpRecord.getFalseNegatives(), 78);
}

@Test
public void testDoesNotCrashWithNO_VARIATIONAlleles() {
final String testDir = toolsTestDir + "concordance/";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears elsewhere in this test class, so perhaps extract a static constant for it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good! Fixed!

final File evalVcf = new File(testDir, "noVariationAlleles.vcf");
final File truthVcf = new File(testDir, "noVariationAlleles.vcf");
final File summary = createTempFile("summary", ".txt");

final String[] args = {
"--" + AbstractConcordanceWalker.EVAL_VARIANTS_LONG_NAME, evalVcf.toString(),
"--" + AbstractConcordanceWalker.TRUTH_VARIANTS_LONG_NAME, truthVcf.toString(),
"--" + Concordance.SUMMARY_LONG_NAME, summary.toString(),
};

runCommandLine(args);

try {
final ConcordanceSummaryRecord.Reader reader = new ConcordanceSummaryRecord.Reader(summary);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's some strange spacing here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah. I didn't reformat the code before I checked it in. I like to align my blocks of variable definitions.

Fixed!

final ConcordanceSummaryRecord snpRecord = reader.readRecord();
final ConcordanceSummaryRecord indelRecord = reader.readRecord();

// Some token validation:
Assert.assertEquals(snpRecord.getSensitivity(), 1, 0.005);
Assert.assertEquals(indelRecord.getSensitivity(), 0, 0.005);
Assert.assertEquals(snpRecord.getTruePositives() + snpRecord.getFalseNegatives(), 1);
Assert.assertEquals(indelRecord.getTruePositives() + indelRecord.getFalseNegatives(), 2);
}
catch (final IOException | java.lang.NullPointerException ex) {
throw new GATKException("Could not get summary file! ", ex);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
##fileformat=VCFv4.2
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##GATKCommandLine=<ID=HaplotypeCaller,CommandLine="HaplotypeCaller --output NexPond-412726.HC.vcf --intervals /cromwell_root/haplotypecallerspark-evaluation/inputData/whole_exome_illumina_coding_v1.Homo_sapiens_assembly19.targets.interval_list --input /cromwell_root/haplotypecallerspark-evaluation/inputData/NexPond-412726.bam --reference /cromwell_root/broad-references/hg19/v0/Homo_sapiens_assembly19.fasta --emit-ref-confidence NONE --gvcf-gq-bands 1 --gvcf-gq-bands 2 --gvcf-gq-bands 3 --gvcf-gq-bands 4 --gvcf-gq-bands 5 --gvcf-gq-bands 6 --gvcf-gq-bands 7 --gvcf-gq-bands 8 --gvcf-gq-bands 9 --gvcf-gq-bands 10 --gvcf-gq-bands 11 --gvcf-gq-bands 12 --gvcf-gq-bands 13 --gvcf-gq-bands 14 --gvcf-gq-bands 15 --gvcf-gq-bands 16 --gvcf-gq-bands 17 --gvcf-gq-bands 18 --gvcf-gq-bands 19 --gvcf-gq-bands 20 --gvcf-gq-bands 21 --gvcf-gq-bands 22 --gvcf-gq-bands 23 --gvcf-gq-bands 24 --gvcf-gq-bands 25 --gvcf-gq-bands 26 --gvcf-gq-bands 27 --gvcf-gq-bands 28 --gvcf-gq-bands 29 --gvcf-gq-bands 30 --gvcf-gq-bands 31 --gvcf-gq-bands 32 --gvcf-gq-bands 33 --gvcf-gq-bands 34 --gvcf-gq-bands 35 --gvcf-gq-bands 36 --gvcf-gq-bands 37 --gvcf-gq-bands 38 --gvcf-gq-bands 39 --gvcf-gq-bands 40 --gvcf-gq-bands 41 --gvcf-gq-bands 42 --gvcf-gq-bands 43 --gvcf-gq-bands 44 --gvcf-gq-bands 45 --gvcf-gq-bands 46 --gvcf-gq-bands 47 --gvcf-gq-bands 48 --gvcf-gq-bands 49 --gvcf-gq-bands 50 --gvcf-gq-bands 51 --gvcf-gq-bands 52 --gvcf-gq-bands 53 --gvcf-gq-bands 54 --gvcf-gq-bands 55 --gvcf-gq-bands 56 --gvcf-gq-bands 57 --gvcf-gq-bands 58 --gvcf-gq-bands 59 --gvcf-gq-bands 60 --gvcf-gq-bands 70 --gvcf-gq-bands 80 --gvcf-gq-bands 90 --gvcf-gq-bands 99 --indel-size-to-eliminate-in-ref-model 10 --use-alleles-trigger false --disable-optimizations false --just-determine-active-regions false --dont-genotype false --max-mnp-distance 0 --dont-trim-active-regions false --max-disc-ar-extension 25 --max-gga-ar-extension 300 --padding-around-indels 150 --padding-around-snps 20 --adaptive-pruning false --do-not-recover-dangling-branches false --recover-dangling-heads false --consensus false --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --min-dangling-branch-length 4 --max-num-haplotypes-in-population 128 --min-pruning 2 --adaptive-pruning-initial-error-rate 0.001 --pruning-lod-threshold 1.0 --max-unpruned-variants 100 --debug-graph-transformations false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --likelihood-calculation-engine PairHMM --base-quality-score-threshold 18 --pair-hmm-gap-continuation-penalty 10 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --debug false --use-filtered-reads-for-annotations false --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --capture-assembly-failure-bam false --error-correct-reads false --do-not-run-physical-phasing false --min-base-quality-score 10 --smith-waterman JAVA --correct-overlapping-quality false --use-new-qual-calculator false --annotate-with-num-discovered-alleles false --heterozygosity 0.001 --indel-heterozygosity 1.25E-4 --heterozygosity-stdev 0.01 --standard-min-confidence-threshold-for-calling 10.0 --max-alternate-alleles 6 --max-genotype-count 1024 --sample-ploidy 2 --num-reference-samples-if-no-call 0 --genotyping-mode DISCOVERY --genotype-filtered-alleles false --contamination-fraction-to-filter 0.0 --output-mode EMIT_VARIANTS_ONLY --all-site-pls false --min-assembly-region-size 50 --max-assembly-region-size 300 --assembly-region-padding 100 --max-reads-per-alignment-start 50 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false",Version=4.0.11.0-88-g2ae01ef-SNAPSHOT,Date="December 11, 2018 3:49:48 AM UTC">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##contig=<ID=1,length=249250621,assembly=GRCh37>
##contig=<ID=2,length=243199373,assembly=GRCh37>
##contig=<ID=3,length=198022430,assembly=GRCh37>
##contig=<ID=4,length=191154276,assembly=GRCh37>
##contig=<ID=5,length=180915260,assembly=GRCh37>
##contig=<ID=6,length=171115067,assembly=GRCh37>
##contig=<ID=7,length=159138663,assembly=GRCh37>
##contig=<ID=8,length=146364022,assembly=GRCh37>
##contig=<ID=9,length=141213431,assembly=GRCh37>
##contig=<ID=10,length=135534747,assembly=GRCh37>
##contig=<ID=11,length=135006516,assembly=GRCh37>
##contig=<ID=12,length=133851895,assembly=GRCh37>
##contig=<ID=13,length=115169878,assembly=GRCh37>
##contig=<ID=14,length=107349540,assembly=GRCh37>
##contig=<ID=15,length=102531392,assembly=GRCh37>
##contig=<ID=16,length=90354753,assembly=GRCh37>
##contig=<ID=17,length=81195210,assembly=GRCh37>
##contig=<ID=18,length=78077248,assembly=GRCh37>
##contig=<ID=19,length=59128983,assembly=GRCh37>
##contig=<ID=20,length=63025520,assembly=GRCh37>
##contig=<ID=21,length=48129895,assembly=GRCh37>
##contig=<ID=22,length=51304566,assembly=GRCh37>
##contig=<ID=X,length=155270560,assembly=GRCh37>
##contig=<ID=Y,length=59373566,assembly=GRCh37>
##contig=<ID=MT,length=16569,assembly=GRCh37>
##contig=<ID=GL000207.1,length=4262,assembly=GRCh37>
##contig=<ID=GL000226.1,length=15008,assembly=GRCh37>
##contig=<ID=GL000229.1,length=19913,assembly=GRCh37>
##contig=<ID=GL000231.1,length=27386,assembly=GRCh37>
##contig=<ID=GL000210.1,length=27682,assembly=GRCh37>
##contig=<ID=GL000239.1,length=33824,assembly=GRCh37>
##contig=<ID=GL000235.1,length=34474,assembly=GRCh37>
##contig=<ID=GL000201.1,length=36148,assembly=GRCh37>
##contig=<ID=GL000247.1,length=36422,assembly=GRCh37>
##contig=<ID=GL000245.1,length=36651,assembly=GRCh37>
##contig=<ID=GL000197.1,length=37175,assembly=GRCh37>
##contig=<ID=GL000203.1,length=37498,assembly=GRCh37>
##contig=<ID=GL000246.1,length=38154,assembly=GRCh37>
##contig=<ID=GL000249.1,length=38502,assembly=GRCh37>
##contig=<ID=GL000196.1,length=38914,assembly=GRCh37>
##contig=<ID=GL000248.1,length=39786,assembly=GRCh37>
##contig=<ID=GL000244.1,length=39929,assembly=GRCh37>
##contig=<ID=GL000238.1,length=39939,assembly=GRCh37>
##contig=<ID=GL000202.1,length=40103,assembly=GRCh37>
##contig=<ID=GL000234.1,length=40531,assembly=GRCh37>
##contig=<ID=GL000232.1,length=40652,assembly=GRCh37>
##contig=<ID=GL000206.1,length=41001,assembly=GRCh37>
##contig=<ID=GL000240.1,length=41933,assembly=GRCh37>
##contig=<ID=GL000236.1,length=41934,assembly=GRCh37>
##contig=<ID=GL000241.1,length=42152,assembly=GRCh37>
##contig=<ID=GL000243.1,length=43341,assembly=GRCh37>
##contig=<ID=GL000242.1,length=43523,assembly=GRCh37>
##contig=<ID=GL000230.1,length=43691,assembly=GRCh37>
##contig=<ID=GL000237.1,length=45867,assembly=GRCh37>
##contig=<ID=GL000233.1,length=45941,assembly=GRCh37>
##contig=<ID=GL000204.1,length=81310,assembly=GRCh37>
##contig=<ID=GL000198.1,length=90085,assembly=GRCh37>
##contig=<ID=GL000208.1,length=92689,assembly=GRCh37>
##contig=<ID=GL000191.1,length=106433,assembly=GRCh37>
##contig=<ID=GL000227.1,length=128374,assembly=GRCh37>
##contig=<ID=GL000228.1,length=129120,assembly=GRCh37>
##contig=<ID=GL000214.1,length=137718,assembly=GRCh37>
##contig=<ID=GL000221.1,length=155397,assembly=GRCh37>
##contig=<ID=GL000209.1,length=159169,assembly=GRCh37>
##contig=<ID=GL000218.1,length=161147,assembly=GRCh37>
##contig=<ID=GL000220.1,length=161802,assembly=GRCh37>
##contig=<ID=GL000213.1,length=164239,assembly=GRCh37>
##contig=<ID=GL000211.1,length=166566,assembly=GRCh37>
##contig=<ID=GL000199.1,length=169874,assembly=GRCh37>
##contig=<ID=GL000217.1,length=172149,assembly=GRCh37>
##contig=<ID=GL000216.1,length=172294,assembly=GRCh37>
##contig=<ID=GL000215.1,length=172545,assembly=GRCh37>
##contig=<ID=GL000205.1,length=174588,assembly=GRCh37>
##contig=<ID=GL000219.1,length=179198,assembly=GRCh37>
##contig=<ID=GL000224.1,length=179693,assembly=GRCh37>
##contig=<ID=GL000223.1,length=180455,assembly=GRCh37>
##contig=<ID=GL000195.1,length=182896,assembly=GRCh37>
##contig=<ID=GL000212.1,length=186858,assembly=GRCh37>
##contig=<ID=GL000222.1,length=186861,assembly=GRCh37>
##contig=<ID=GL000200.1,length=187035,assembly=GRCh37>
##contig=<ID=GL000193.1,length=189789,assembly=GRCh37>
##contig=<ID=GL000194.1,length=191469,assembly=GRCh37>
##contig=<ID=GL000225.1,length=211173,assembly=GRCh37>
##contig=<ID=GL000192.1,length=547496,assembly=GRCh37>
##contig=<ID=NC_007605,length=171823,assembly=NC_007605.1>
##source=HaplotypeCaller
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
7 100550837 . C . 258.77 . AN=2;DP=25;MQ=43.42 GT:AD:DP 0/0:25:25
7 100550841 . C . 246.77 . AN=2;DP=28;MQ=45.49 GT:AD:DP 0/0:28:28
7 100550873 . G C 479.77 . AC=1;AF=0.500;AN=2;BaseQRankSum=3.009;DP=38;ExcessHet=3.0103;FS=1.547;MLEAC=1;MLEAF=0.500;MQ=53.19;MQRankSum=0.000;QD=13.71;ReadPosRankSum=0.366;SOR=0.412 GT:AD:DP:GQ:PL 0/1:16,19:35:99:508,0,353