-
Notifications
You must be signed in to change notification settings - Fork 589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PrintBGZFBlockInformation: a tool to dump information about blocks in a BGZF file #4239
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
4b90d90
PrintBGZFBlockInformation: a tool to dump information about blocks in…
droazen c57a265
Update tool and address comments
droazen 35ce7cd
Address more review comments
droazen f9e3288
Change tool package
droazen bef06f9
Update example in tool docs
droazen File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
250 changes: 250 additions & 0 deletions
250
src/main/java/org/broadinstitute/hellbender/tools/PrintBGZFBlockInformation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,250 @@ | ||
package org.broadinstitute.hellbender.tools; | ||
|
||
import htsjdk.samtools.util.BlockCompressedStreamConstants; | ||
import htsjdk.samtools.util.IOUtil; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.argparser.ExperimentalFeature; | ||
import org.broadinstitute.hellbender.cmdline.CommandLineProgram; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.io.IOUtils; | ||
import picard.cmdline.programgroups.OtherProgramGroup; | ||
|
||
import java.io.*; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* A diagnostic tool that prints information about the compressed blocks in a BGZF format file, | ||
* such as a .vcf.gz file. This tool can detect various kinds of BGZF file corruption such as | ||
* premature BGZF terminator blocks, truncated files, and files that were regular-GZIPPED by | ||
* accident. | ||
* <p> | ||
* The output looks like this: | ||
* </p> | ||
* <pre> | ||
* Block #1 at file offset 0 | ||
* - compressed size: 12409 | ||
* - uncompressed size: 65498 | ||
* | ||
* Block #2 at file offset 12409 | ||
* - compressed size: 6497 | ||
* - uncompressed size: 65498 | ||
* ... | ||
* etc. | ||
* </pre> | ||
* <p> | ||
* The output can be redirected to a file using the -O option. | ||
* </p> | ||
*/ | ||
@ExperimentalFeature | ||
@CommandLineProgramProperties( | ||
summary = "Print information about the compressed blocks in a BGZF format file", | ||
oneLineSummary = "Print information about the compressed blocks in a BGZF format file", | ||
programGroup = OtherProgramGroup.class | ||
) | ||
public class PrintBGZFBlockInformation extends CommandLineProgram { | ||
|
||
@Argument(fullName = "bgzf-file", doc = "The BGZF-format file for which to print block information", optional = false) | ||
private String bgzfPathString; | ||
|
||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which to write block information (if not specified, prints to standard output)", optional = true) | ||
private String output; | ||
|
||
private Path bgzfPath; | ||
|
||
private long streamOffset = 0l; | ||
|
||
private PrintStream outStream; | ||
|
||
@Override | ||
protected void onStartup() { | ||
super.onStartup(); | ||
|
||
bgzfPath = IOUtils.getPath(bgzfPathString); | ||
|
||
if ( ! Files.exists(bgzfPath) ) { | ||
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not exist"); | ||
} | ||
|
||
if ( ! IOUtil.hasBlockCompressedExtension(bgzfPathString) ) { | ||
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not end in a recognized BGZF file extension (" + | ||
StringUtils.join(IOUtil.BLOCK_COMPRESSED_EXTENSIONS, ",") + ")"); | ||
} | ||
|
||
try { | ||
// Check that the file is in BGZF format. This catches the "regular GZIP" case as well: | ||
if ( ! IOUtil.isBlockCompressed(bgzfPath) ) { | ||
throw new UserException.CouldNotReadInputFile(bgzfPath, "File is not a valid BGZF file. Could possibly be a regular GZIP file?"); | ||
} | ||
} | ||
catch ( IOException e ) { | ||
throw new UserException.CouldNotReadInputFile(bgzfPath, "Unable to determine whether file is a valid BGZF file", e); | ||
} | ||
|
||
if ( output != null ) { | ||
try { | ||
outStream = new PrintStream(output); | ||
} catch (FileNotFoundException e) { | ||
throw new UserException.CouldNotCreateOutputFile(output, "Unable to open output file", e); | ||
} | ||
} else { | ||
outStream = System.out; | ||
} | ||
} | ||
|
||
@Override | ||
protected Object doWork() { | ||
BGZFBlockMetadata previousBlockInfo = null; | ||
int blockNumber = 0; | ||
final List<Integer> nonFinalTerminatorBlockIndices = new ArrayList<>(); | ||
|
||
try ( InputStream bgzfInputStream = Files.newInputStream(bgzfPath) ) { | ||
outStream.printf("BGZF block information for file: %s\n\n", bgzfPath.getFileName()); | ||
|
||
BGZFBlockMetadata blockInfo; | ||
|
||
while ( (blockInfo = processNextBlock(bgzfInputStream, bgzfPathString)) != null ) { | ||
++blockNumber; | ||
|
||
// If we saw a 0-byte terminator block that was not the final block in the file, | ||
// emit an error message | ||
if ( previousBlockInfo != null && previousBlockInfo.uncompressedSize == 0 ) { | ||
nonFinalTerminatorBlockIndices.add(blockNumber - 1); | ||
|
||
outStream.println("*******************************************************"); | ||
outStream.println("ERROR: Premature BGZF 0-byte terminator block was found"); | ||
outStream.println("at block number: " + (blockNumber - 1)); | ||
outStream.println("*******************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
outStream.printf("Block #%d at file offset %d\n", blockNumber, blockInfo.blockOffset); | ||
outStream.printf("\t- compressed size: %d\n", blockInfo.compressedSize); | ||
outStream.printf("\t- uncompressed size: %d\n", blockInfo.uncompressedSize); | ||
outStream.println(); | ||
|
||
previousBlockInfo = blockInfo; | ||
} | ||
} catch ( IOException e ) { | ||
throw new UserException.CouldNotReadInputFile("Error while parsing BGZF file.", e); | ||
} | ||
|
||
// Check whether the last block in the file was a 0-byte BGZF terminator block | ||
if ( previousBlockInfo == null || previousBlockInfo.uncompressedSize != 0 ) { | ||
outStream.println("******************************************************"); | ||
outStream.println("ERROR: Final BGZF 0-byte terminator block was MISSING!"); | ||
outStream.println("******************************************************"); | ||
outStream.println(); | ||
} else { | ||
outStream.println("***************************************************************************"); | ||
outStream.println("Final BGZF 0-byte terminator block FOUND as expected at block number " + blockNumber); | ||
outStream.println("***************************************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
// Emit an error message at the end if we encountered any terminator blocks before the final block: | ||
if ( ! nonFinalTerminatorBlockIndices.isEmpty() ) { | ||
outStream.println("***********************************************************"); | ||
outStream.println("ERROR: Premature BGZF 0-byte terminator block(s) were found"); | ||
outStream.println("at block number(s): " + StringUtils.join(nonFinalTerminatorBlockIndices, ",")); | ||
outStream.println("***********************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
@Override | ||
protected void onShutdown() { | ||
if ( outStream != null && outStream != System.out ) { | ||
outStream.close(); | ||
} | ||
} | ||
|
||
// Code adapted from HTSJDK's BlockCompressedInputStream class | ||
private BGZFBlockMetadata processNextBlock(InputStream stream, String streamSource) throws IOException { | ||
final byte[] buffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; | ||
long blockAddress = streamOffset; | ||
|
||
final int headerByteCount = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); | ||
|
||
// Return null when we hit EOF | ||
if ( headerByteCount <= 0 ) { | ||
return null; | ||
} | ||
if (headerByteCount != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { | ||
throw new IOException("Incorrect header size for file: " + streamSource); | ||
} | ||
streamOffset += headerByteCount; | ||
|
||
final int blockLength = unpackInt16(buffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; | ||
|
||
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > buffer.length) { | ||
throw new IOException("Unexpected compressed block length: " + blockLength + " for " + streamSource); | ||
} | ||
|
||
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; | ||
final int dataByteCount = readBytes(stream, buffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, | ||
remaining); | ||
|
||
if (dataByteCount != remaining) { | ||
throw new IOException("Premature end of file: " + streamSource); | ||
} | ||
streamOffset += dataByteCount; | ||
|
||
final int uncompressedLength = unpackInt32(buffer, blockLength - 4); | ||
|
||
if (uncompressedLength < 0) { | ||
throw new IOException(streamSource + " has invalid uncompressed length: " + uncompressedLength); | ||
} | ||
|
||
return new BGZFBlockMetadata(blockAddress, blockLength, uncompressedLength); | ||
} | ||
|
||
private static int unpackInt16(final byte[] buffer, final int offset) { | ||
return ((buffer[offset] & 0xFF) | | ||
((buffer[offset+1] & 0xFF) << 8)); | ||
} | ||
|
||
private static int unpackInt32(final byte[] buffer, final int offset) { | ||
return ((buffer[offset] & 0xFF) | | ||
((buffer[offset+1] & 0xFF) << 8) | | ||
((buffer[offset+2] & 0xFF) << 16) | | ||
((buffer[offset+3] & 0xFF) << 24)); | ||
} | ||
|
||
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) throws IOException { | ||
int bytesRead = 0; | ||
while (bytesRead < length) { | ||
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); | ||
|
||
// Return EOF if we get EOF from read() and we haven't read any bytes | ||
if ( count < 0 && bytesRead == 0 ) { | ||
return count; | ||
// Otherwise if we hit EOF and we have read something, return the bytes read | ||
} else if (count <= 0) { | ||
break; | ||
} | ||
|
||
bytesRead += count; | ||
} | ||
return bytesRead; | ||
} | ||
|
||
private static final class BGZFBlockMetadata { | ||
private final long blockOffset; | ||
private final int compressedSize; | ||
private final int uncompressedSize; | ||
|
||
public BGZFBlockMetadata(final long blockOffset, final int compressedSize, final int uncompressedSize) { | ||
this.blockOffset = blockOffset; | ||
this.compressedSize = compressedSize; | ||
this.uncompressedSize = uncompressedSize; | ||
} | ||
} | ||
} |
103 changes: 103 additions & 0 deletions
103
...st/java/org/broadinstitute/hellbender/tools/PrintBGZFBlockInformationIntegrationTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
package org.broadinstitute.hellbender.tools; | ||
|
||
import org.broadinstitute.hellbender.CommandLineProgramTest; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
public class PrintBGZFBlockInformationIntegrationTest extends CommandLineProgramTest { | ||
|
||
/* Well-formed large BGZF file */ | ||
@Test | ||
public void testNormalLargeInput() throws IOException { | ||
final File input = new File(largeFileTestDir, "gvcfs/HG00096.g.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Well-formed small BGZF file */ | ||
@Test | ||
public void testNormalSmallInput() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Malformed BGZF file missing the final 0-byte terminator block */ | ||
@Test | ||
public void testMissingBGZFTerminatorBlock() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.NO_BGZF_TERMINATOR_BLOCK.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Malformed BGZF file with an incomplete (truncated) final block */ | ||
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class) | ||
public void testTruncatedFinalBlock() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.TRUNCATED_FINAL_BLOCK.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testTruncatedFinalBlock", ".out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
} | ||
|
||
/* Malformed BGZF file with an extra 0-byte terminator block in the middle */ | ||
@Test | ||
public void testExtraTerminatorBlockInMiddle() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Regular GZIP file masquerading as a BGZF file */ | ||
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class) | ||
public void testRegularGzipFile() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.REGULAR_GZIP.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testRegularGzipFile", ".out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
} | ||
} |
Binary file added
BIN
+24.6 KB
...ols/PrintBGZFBlockInformation/4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz
Binary file not shown.
Binary file added
BIN
+24.5 KB
...ender/tools/PrintBGZFBlockInformation/4featuresHG38Header.NO_BGZF_TERMINATOR_BLOCK.vcf.gz
Binary file not shown.
Binary file added
BIN
+25.1 KB
...titute/hellbender/tools/PrintBGZFBlockInformation/4featuresHG38Header.REGULAR_GZIP.vcf.gz
Binary file not shown.
Binary file added
BIN
+24.4 KB
...llbender/tools/PrintBGZFBlockInformation/4featuresHG38Header.TRUNCATED_FINAL_BLOCK.vcf.gz
Binary file not shown.
Binary file added
BIN
+24.6 KB
.../org/broadinstitute/hellbender/tools/PrintBGZFBlockInformation/4featuresHG38Header.vcf.gz
Binary file not shown.
36 changes: 36 additions & 0 deletions
36
...on/expected_PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
BGZF block information for file: 4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz | ||
|
||
Block #1 at file offset 0 | ||
- compressed size: 12409 | ||
- uncompressed size: 65498 | ||
|
||
Block #2 at file offset 12409 | ||
- compressed size: 28 | ||
- uncompressed size: 0 | ||
|
||
******************************************************* | ||
ERROR: Premature BGZF 0-byte terminator block was found | ||
at block number: 2 | ||
******************************************************* | ||
|
||
Block #3 at file offset 12437 | ||
- compressed size: 6497 | ||
- uncompressed size: 65498 | ||
|
||
Block #4 at file offset 18934 | ||
- compressed size: 6229 | ||
- uncompressed size: 46819 | ||
|
||
Block #5 at file offset 25163 | ||
- compressed size: 28 | ||
- uncompressed size: 0 | ||
|
||
*************************************************************************** | ||
Final BGZF 0-byte terminator block FOUND as expected at block number 5 | ||
*************************************************************************** | ||
|
||
*********************************************************** | ||
ERROR: Premature BGZF 0-byte terminator block(s) were found | ||
at block number(s): 2 | ||
*********************************************************** | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a brief comment explaining what this test means.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added explanatory comments to all tests