Skip to content

Commit

Permalink
Handle trailing reference blocks in CombineGVCFs. (#4615)
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad authored Apr 20, 2018
1 parent 3a8d9e3 commit 54295ab
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -412,14 +412,17 @@ public Object onTraversalSuccess() {
return null;
}

SimpleInterval interval = prevPos != null ? new SimpleInterval(prevPos.getContig(), prevPos.getStart(), variantContextsOverlappingCurrentMerge.stream().map(VariantContext::getEnd).max(Comparator.naturalOrder()).get()) :
storedReferenceContext.getInterval();

createIntermediateVariants(interval);

// there shouldn't be any state left unless the user cut in the middle of a gVCF block
if ( !variantContextsOverlappingCurrentMerge.isEmpty() ) {
logger.warn("You have asked for an interval that cuts in the middle of one or more gVCF blocks. Please note that this will cause you to lose records that don't end within your interval.");
// finish off the last blocks
final SimpleInterval lastInterval = new SimpleInterval(
variantContextsOverlappingCurrentMerge.get(0).getContig(),
variantContextsOverlappingCurrentMerge.get(0).getStart(),
variantContextsOverlappingCurrentMerge.stream().map(VariantContext::getEnd).max(Comparator.naturalOrder()).get());
createIntermediateVariants(lastInterval);
// there shouldn't be any state left unless the user cut in the middle of a gVCF block
if ( !variantContextsOverlappingCurrentMerge.isEmpty() ) {
logger.warn("You have asked for an interval that cuts in the middle of one or more gVCF blocks. Please note that this will cause you to lose records that don't end within your interval.");
}
}

return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,10 @@ public Object[][] gvcfsToCombine() {
//Testing allele-specific annotations
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotations.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21},
//Testing allele-specific annotations missing AS_Standard Group
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotationsNoGroup.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21},};
{new File[]{getTestFile("NA12878.AS.chr20snippet.g.vcf"), getTestFile("NA12892.AS.chr20snippet.g.vcf")}, getTestFile("testAlleleSpecificAnnotationsNoGroup.vcf"), Arrays.asList("-G", "Standard", "-G", "AS_Standard"), b37_reference_20_21},
//Test that trailing reference blocks are emitted with correct intervals
{new File[]{getTestFile("gvcfExample1WithTrailingReferenceBlocks.g.vcf"), getTestFile("gvcfExample2WithTrailingReferenceBlocks.g.vcf")}, getTestFile("gvcfWithTrailingReferenceBlocksExpected.g.vcf"), NO_EXTRA_ARGS, b38_reference_20_21},
};
}


Expand Down Expand Up @@ -118,11 +121,6 @@ public void compareToGATK3(File[] inputs, File outputFile, List<String> extraArg
} else {
System.out.println("Found precomputed gatk3Result");
}
final Path expectedResultsDir = Paths.get(getToolTestDataDir());
if ( !Files.exists(expectedResultsDir)) {
Files.createDirectory(expectedResultsDir);
}
Files.copy(gatk3Result.toPath(), expectedResultsDir.resolve(outputFile.getName()), StandardCopyOption.REPLACE_EXISTING);

assertVariantContextsMatch(Arrays.asList(inputs), gatk3Result, extraArgs, reference);
}
Expand Down Expand Up @@ -151,7 +149,7 @@ private void assertVariantContextsMatch(List<File> inputs, File expected, List<S
}

public void runCombineGVCFSandAssertSomething(List<File> inputs, File expected, List<String> additionalArguments, BiConsumer<VariantContext, VariantContext> assertion, String reference) throws IOException {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(reference))
Expand Down Expand Up @@ -195,7 +193,7 @@ private static VCFHeader getHeaderFromFile(final File vcfFile) throws IOExceptio

@Test
public void testOneStartsBeforeTwoAndEndsAfterwards() throws Exception {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand Down Expand Up @@ -227,7 +225,7 @@ public void testOneStartsBeforeTwoAndEndsAfterwards() throws Exception {

@Test()
public void testTetraploidRun() throws IOException {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand All @@ -248,7 +246,7 @@ public void testTetraploidRun() throws IOException {

@Test
public void testTwoSpansManyBlocksInOne() throws Exception {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand All @@ -267,7 +265,7 @@ public void testTwoSpansManyBlocksInOne() throws Exception {
// Ensuring that no exception is thrown and that the resulting VCF is empty
@Test
public void testNoDataInInterval() throws Exception {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand All @@ -286,7 +284,7 @@ public void testNoDataInInterval() throws Exception {

@Test
public void testOneHasAltAndTwoHasNothing() throws Exception {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand All @@ -307,7 +305,7 @@ public void testOneHasAltAndTwoHasNothing() throws Exception {

@Test
public void testOneHasAltAndTwoHasRefBlock() throws Exception {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand All @@ -332,7 +330,7 @@ public void testOneHasAltAndTwoHasRefBlock() throws Exception {

@Test
public void testOneHasDeletionAndTwoHasRefBlock() throws Exception {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand Down Expand Up @@ -369,7 +367,7 @@ public void testOneHasDeletionAndTwoHasRefBlock() throws Exception {
// This test asserts that the behavior is rational in the case of whole genome gvcfs which have variants starting at
// the first and ending on the last base of each contig.
public void testStartChromosome() throws Exception {
final File output = createTempFile("genotypegvcf", ".vcf");
final File output = createTempFile("combinegvcfs", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr20_GL383577v2_alt,length=128386>
##contig=<ID=chr20_KI270869v1_alt,length=118774>
##contig=<ID=chr20_KI270871v1_alt,length=58661>
##contig=<ID=chr20_KI270870v1_alt,length= 183433>
##contig=<ID=chr21_GL383578v2_alt,length=63917>
##contig=<ID=chr21_KI270874v1_alt,length= 166743>
##contig=<ID=chr21_KI270873v1_alt,length=143900>
##contig=<ID=chr21_GL383579v2_alt,length=201197>
##contig=<ID=chr21_GL383580v2_alt,length=74653>
##contig=<ID=chr21_GL383581v2_alt,length=116689>
##contig=<ID=chr21_KI270872v1_alt,length=82692>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1
chr20 69491 . G <NON_REF> . . BLOCK_SIZE=20;END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800
chr20 69511 . A G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33
chr20 69512 . C <NON_REF> . . BLOCK_SIZE=10;END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800
chr20 69522 . T <NON_REF> . . BLOCK_SIZE=27;END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:95:0:95:0:0,0,0
chr20 69549 . G <NON_REF> . . BLOCK_SIZE=86;END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:156:99:56:66:0,66,990
chr20 69635 . G T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3
chr20 69762 . T <NON_REF> . . BLOCK_SIZE=1;END=69762 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270
chr20 69763 . A <NON_REF> . . BLOCK_SIZE=4;END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:21:7:21:0,21,253
chr20 69767 . C <NON_REF> . . BLOCK_SIZE=5;END=69771 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:12:0,12,180
chr20 69772 . AAAGC A,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:89:89,0,119,101,128,229:0,4,0,3
chr20 69773 . A <NON_REF> . . BLOCK_SIZE=10;END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:0:3:0:0,0,0
chr21 1 . G <NON_REF> . . END=46709983 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0
chr20_GL383577v2_alt 1 . G <NON_REF> . . END=128386 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr20_GL383577v2_alt,length=128386>
##contig=<ID=chr20_KI270869v1_alt,length=118774>
##contig=<ID=chr20_KI270871v1_alt,length=58661>
##contig=<ID=chr20_KI270870v1_alt,length= 183433>
##contig=<ID=chr21_GL383578v2_alt,length=63917>
##contig=<ID=chr21_KI270874v1_alt,length= 166743>
##contig=<ID=chr21_KI270873v1_alt,length=143900>
##contig=<ID=chr21_GL383579v2_alt,length=201197>
##contig=<ID=chr21_GL383580v2_alt,length=74653>
##contig=<ID=chr21_GL383581v2_alt,length=116689>
##contig=<ID=chr21_KI270872v1_alt,length=82692>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA2
chr20 69498 . C <NON_REF> . . BLOCK_SIZE=9;END=69506 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800
chr20 69514 . T <NON_REF> . . BLOCK_SIZE=111;END=69624 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800
chr20 69625 . A <NON_REF> . . BLOCK_SIZE=51;END=69675 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270
chr20 69767 . C <NON_REF> . . BLOCK_SIZE=8;END=69774 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:15:0,12,180
chr20 69775 . C <NON_REF> . . BLOCK_SIZE=17;END=69791 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:15:7:15:0,15,160
chr21 1 . G <NON_REF> . . END=46709983 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0
chr20_GL383577v2_alt 1 . G <NON_REF> . . END=128386 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0
Loading

0 comments on commit 54295ab

Please sign in to comment.