-
Notifications
You must be signed in to change notification settings - Fork 589
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PrintBGZFBlockInformation: a tool to dump information about blocks in…
… a BGZF file (#4239) Added a new GATK tool called PrintBGZFBlockInformation, a tool to dump information about blocks in a BGZF file. This tool can detect various kinds of BGZF file corruption such as premature BGZF terminator blocks, truncated files, and files that were regular-GZIPPED by accident.
- Loading branch information
Showing
11 changed files
with
2,147 additions
and
0 deletions.
There are no files selected for viewing
250 changes: 250 additions & 0 deletions
250
src/main/java/org/broadinstitute/hellbender/tools/PrintBGZFBlockInformation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,250 @@ | ||
package org.broadinstitute.hellbender.tools; | ||
|
||
import htsjdk.samtools.util.BlockCompressedStreamConstants; | ||
import htsjdk.samtools.util.IOUtil; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.argparser.ExperimentalFeature; | ||
import org.broadinstitute.hellbender.cmdline.CommandLineProgram; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.io.IOUtils; | ||
import picard.cmdline.programgroups.OtherProgramGroup; | ||
|
||
import java.io.*; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* A diagnostic tool that prints information about the compressed blocks in a BGZF format file, | ||
* such as a .vcf.gz file. This tool can detect various kinds of BGZF file corruption such as | ||
* premature BGZF terminator blocks, truncated files, and files that were regular-GZIPPED by | ||
* accident. | ||
* <p> | ||
* The output looks like this: | ||
* </p> | ||
* <pre> | ||
* Block #1 at file offset 0 | ||
* - compressed size: 12409 | ||
* - uncompressed size: 65498 | ||
* | ||
* Block #2 at file offset 12409 | ||
* - compressed size: 6497 | ||
* - uncompressed size: 65498 | ||
* ... | ||
* etc. | ||
* </pre> | ||
* <p> | ||
* The output can be redirected to a file using the -O option. | ||
* </p> | ||
*/ | ||
@ExperimentalFeature | ||
@CommandLineProgramProperties( | ||
summary = "Print information about the compressed blocks in a BGZF format file", | ||
oneLineSummary = "Print information about the compressed blocks in a BGZF format file", | ||
programGroup = OtherProgramGroup.class | ||
) | ||
public class PrintBGZFBlockInformation extends CommandLineProgram { | ||
|
||
@Argument(fullName = "bgzf-file", doc = "The BGZF-format file for which to print block information", optional = false) | ||
private String bgzfPathString; | ||
|
||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which to write block information (if not specified, prints to standard output)", optional = true) | ||
private String output; | ||
|
||
private Path bgzfPath; | ||
|
||
private long streamOffset = 0l; | ||
|
||
private PrintStream outStream; | ||
|
||
@Override | ||
protected void onStartup() { | ||
super.onStartup(); | ||
|
||
bgzfPath = IOUtils.getPath(bgzfPathString); | ||
|
||
if ( ! Files.exists(bgzfPath) ) { | ||
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not exist"); | ||
} | ||
|
||
if ( ! IOUtil.hasBlockCompressedExtension(bgzfPathString) ) { | ||
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not end in a recognized BGZF file extension (" + | ||
StringUtils.join(IOUtil.BLOCK_COMPRESSED_EXTENSIONS, ",") + ")"); | ||
} | ||
|
||
try { | ||
// Check that the file is in BGZF format. This catches the "regular GZIP" case as well: | ||
if ( ! IOUtil.isBlockCompressed(bgzfPath) ) { | ||
throw new UserException.CouldNotReadInputFile(bgzfPath, "File is not a valid BGZF file. Could possibly be a regular GZIP file?"); | ||
} | ||
} | ||
catch ( IOException e ) { | ||
throw new UserException.CouldNotReadInputFile(bgzfPath, "Unable to determine whether file is a valid BGZF file", e); | ||
} | ||
|
||
if ( output != null ) { | ||
try { | ||
outStream = new PrintStream(output); | ||
} catch (FileNotFoundException e) { | ||
throw new UserException.CouldNotCreateOutputFile(output, "Unable to open output file", e); | ||
} | ||
} else { | ||
outStream = System.out; | ||
} | ||
} | ||
|
||
@Override | ||
protected Object doWork() { | ||
BGZFBlockMetadata previousBlockInfo = null; | ||
int blockNumber = 0; | ||
final List<Integer> nonFinalTerminatorBlockIndices = new ArrayList<>(); | ||
|
||
try ( InputStream bgzfInputStream = Files.newInputStream(bgzfPath) ) { | ||
outStream.printf("BGZF block information for file: %s\n\n", bgzfPath.getFileName()); | ||
|
||
BGZFBlockMetadata blockInfo; | ||
|
||
while ( (blockInfo = processNextBlock(bgzfInputStream, bgzfPathString)) != null ) { | ||
++blockNumber; | ||
|
||
// If we saw a 0-byte terminator block that was not the final block in the file, | ||
// emit an error message | ||
if ( previousBlockInfo != null && previousBlockInfo.uncompressedSize == 0 ) { | ||
nonFinalTerminatorBlockIndices.add(blockNumber - 1); | ||
|
||
outStream.println("*******************************************************"); | ||
outStream.println("ERROR: Premature BGZF 0-byte terminator block was found"); | ||
outStream.println("at block number: " + (blockNumber - 1)); | ||
outStream.println("*******************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
outStream.printf("Block #%d at file offset %d\n", blockNumber, blockInfo.blockOffset); | ||
outStream.printf("\t- compressed size: %d\n", blockInfo.compressedSize); | ||
outStream.printf("\t- uncompressed size: %d\n", blockInfo.uncompressedSize); | ||
outStream.println(); | ||
|
||
previousBlockInfo = blockInfo; | ||
} | ||
} catch ( IOException e ) { | ||
throw new UserException.CouldNotReadInputFile("Error while parsing BGZF file.", e); | ||
} | ||
|
||
// Check whether the last block in the file was a 0-byte BGZF terminator block | ||
if ( previousBlockInfo == null || previousBlockInfo.uncompressedSize != 0 ) { | ||
outStream.println("******************************************************"); | ||
outStream.println("ERROR: Final BGZF 0-byte terminator block was MISSING!"); | ||
outStream.println("******************************************************"); | ||
outStream.println(); | ||
} else { | ||
outStream.println("***************************************************************************"); | ||
outStream.println("Final BGZF 0-byte terminator block FOUND as expected at block number " + blockNumber); | ||
outStream.println("***************************************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
// Emit an error message at the end if we encountered any terminator blocks before the final block: | ||
if ( ! nonFinalTerminatorBlockIndices.isEmpty() ) { | ||
outStream.println("***********************************************************"); | ||
outStream.println("ERROR: Premature BGZF 0-byte terminator block(s) were found"); | ||
outStream.println("at block number(s): " + StringUtils.join(nonFinalTerminatorBlockIndices, ",")); | ||
outStream.println("***********************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
@Override | ||
protected void onShutdown() { | ||
if ( outStream != null && outStream != System.out ) { | ||
outStream.close(); | ||
} | ||
} | ||
|
||
// Code adapted from HTSJDK's BlockCompressedInputStream class | ||
private BGZFBlockMetadata processNextBlock(InputStream stream, String streamSource) throws IOException { | ||
final byte[] buffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; | ||
long blockAddress = streamOffset; | ||
|
||
final int headerByteCount = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); | ||
|
||
// Return null when we hit EOF | ||
if ( headerByteCount <= 0 ) { | ||
return null; | ||
} | ||
if (headerByteCount != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { | ||
throw new IOException("Incorrect header size for file: " + streamSource); | ||
} | ||
streamOffset += headerByteCount; | ||
|
||
final int blockLength = unpackInt16(buffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; | ||
|
||
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > buffer.length) { | ||
throw new IOException("Unexpected compressed block length: " + blockLength + " for " + streamSource); | ||
} | ||
|
||
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; | ||
final int dataByteCount = readBytes(stream, buffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, | ||
remaining); | ||
|
||
if (dataByteCount != remaining) { | ||
throw new IOException("Premature end of file: " + streamSource); | ||
} | ||
streamOffset += dataByteCount; | ||
|
||
final int uncompressedLength = unpackInt32(buffer, blockLength - 4); | ||
|
||
if (uncompressedLength < 0) { | ||
throw new IOException(streamSource + " has invalid uncompressed length: " + uncompressedLength); | ||
} | ||
|
||
return new BGZFBlockMetadata(blockAddress, blockLength, uncompressedLength); | ||
} | ||
|
||
private static int unpackInt16(final byte[] buffer, final int offset) { | ||
return ((buffer[offset] & 0xFF) | | ||
((buffer[offset+1] & 0xFF) << 8)); | ||
} | ||
|
||
private static int unpackInt32(final byte[] buffer, final int offset) { | ||
return ((buffer[offset] & 0xFF) | | ||
((buffer[offset+1] & 0xFF) << 8) | | ||
((buffer[offset+2] & 0xFF) << 16) | | ||
((buffer[offset+3] & 0xFF) << 24)); | ||
} | ||
|
||
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) throws IOException { | ||
int bytesRead = 0; | ||
while (bytesRead < length) { | ||
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); | ||
|
||
// Return EOF if we get EOF from read() and we haven't read any bytes | ||
if ( count < 0 && bytesRead == 0 ) { | ||
return count; | ||
// Otherwise if we hit EOF and we have read something, return the bytes read | ||
} else if (count <= 0) { | ||
break; | ||
} | ||
|
||
bytesRead += count; | ||
} | ||
return bytesRead; | ||
} | ||
|
||
private static final class BGZFBlockMetadata { | ||
private final long blockOffset; | ||
private final int compressedSize; | ||
private final int uncompressedSize; | ||
|
||
public BGZFBlockMetadata(final long blockOffset, final int compressedSize, final int uncompressedSize) { | ||
this.blockOffset = blockOffset; | ||
this.compressedSize = compressedSize; | ||
this.uncompressedSize = uncompressedSize; | ||
} | ||
} | ||
} |
103 changes: 103 additions & 0 deletions
103
...st/java/org/broadinstitute/hellbender/tools/PrintBGZFBlockInformationIntegrationTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
package org.broadinstitute.hellbender.tools; | ||
|
||
import org.broadinstitute.hellbender.CommandLineProgramTest; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
public class PrintBGZFBlockInformationIntegrationTest extends CommandLineProgramTest { | ||
|
||
/* Well-formed large BGZF file */ | ||
@Test | ||
public void testNormalLargeInput() throws IOException { | ||
final File input = new File(largeFileTestDir, "gvcfs/HG00096.g.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Well-formed small BGZF file */ | ||
@Test | ||
public void testNormalSmallInput() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Malformed BGZF file missing the final 0-byte terminator block */ | ||
@Test | ||
public void testMissingBGZFTerminatorBlock() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.NO_BGZF_TERMINATOR_BLOCK.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Malformed BGZF file with an incomplete (truncated) final block */ | ||
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class) | ||
public void testTruncatedFinalBlock() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.TRUNCATED_FINAL_BLOCK.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testTruncatedFinalBlock", ".out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
} | ||
|
||
/* Malformed BGZF file with an extra 0-byte terminator block in the middle */ | ||
@Test | ||
public void testExtraTerminatorBlockInMiddle() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Regular GZIP file masquerading as a BGZF file */ | ||
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class) | ||
public void testRegularGzipFile() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.REGULAR_GZIP.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testRegularGzipFile", ".out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
} | ||
} |
Binary file added
BIN
+24.6 KB
...ols/PrintBGZFBlockInformation/4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz
Binary file not shown.
Binary file added
BIN
+24.5 KB
...ender/tools/PrintBGZFBlockInformation/4featuresHG38Header.NO_BGZF_TERMINATOR_BLOCK.vcf.gz
Binary file not shown.
Binary file added
BIN
+25.1 KB
...titute/hellbender/tools/PrintBGZFBlockInformation/4featuresHG38Header.REGULAR_GZIP.vcf.gz
Binary file not shown.
Binary file added
BIN
+24.4 KB
...llbender/tools/PrintBGZFBlockInformation/4featuresHG38Header.TRUNCATED_FINAL_BLOCK.vcf.gz
Binary file not shown.
Binary file added
BIN
+24.6 KB
.../org/broadinstitute/hellbender/tools/PrintBGZFBlockInformation/4featuresHG38Header.vcf.gz
Binary file not shown.
36 changes: 36 additions & 0 deletions
36
...on/expected_PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
BGZF block information for file: 4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz | ||
|
||
Block #1 at file offset 0 | ||
- compressed size: 12409 | ||
- uncompressed size: 65498 | ||
|
||
Block #2 at file offset 12409 | ||
- compressed size: 28 | ||
- uncompressed size: 0 | ||
|
||
******************************************************* | ||
ERROR: Premature BGZF 0-byte terminator block was found | ||
at block number: 2 | ||
******************************************************* | ||
|
||
Block #3 at file offset 12437 | ||
- compressed size: 6497 | ||
- uncompressed size: 65498 | ||
|
||
Block #4 at file offset 18934 | ||
- compressed size: 6229 | ||
- uncompressed size: 46819 | ||
|
||
Block #5 at file offset 25163 | ||
- compressed size: 28 | ||
- uncompressed size: 0 | ||
|
||
*************************************************************************** | ||
Final BGZF 0-byte terminator block FOUND as expected at block number 5 | ||
*************************************************************************** | ||
|
||
*********************************************************** | ||
ERROR: Premature BGZF 0-byte terminator block(s) were found | ||
at block number(s): 2 | ||
*********************************************************** | ||
|
Oops, something went wrong.