Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PrintBGZFBlockInformation: a tool to dump information about blocks in a BGZF file #4239

Merged
merged 5 commits into from
Jan 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.IOUtil;
import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import picard.cmdline.programgroups.OtherProgramGroup;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

/**
* A diagnostic tool that prints information about the compressed blocks in a BGZF format file,
* such as a .vcf.gz file. This tool can detect various kinds of BGZF file corruption such as
* premature BGZF terminator blocks, truncated files, and files that were regular-GZIPPED by
* accident.
* <p>
* The output looks like this:
* </p>
* <pre>
* Block #1 at file offset 0
* - compressed size: 12409
* - uncompressed size: 65498
*
* Block #2 at file offset 12409
* - compressed size: 6497
* - uncompressed size: 65498
* ...
* etc.
* </pre>
* <p>
* The output can be redirected to a file using the -O option.
* </p>
*/
@ExperimentalFeature
@CommandLineProgramProperties(
summary = "Print information about the compressed blocks in a BGZF format file",
oneLineSummary = "Print information about the compressed blocks in a BGZF format file",
programGroup = OtherProgramGroup.class
)
public class PrintBGZFBlockInformation extends CommandLineProgram {

@Argument(fullName = "bgzf-file", doc = "The BGZF-format file for which to print block information", optional = false)
private String bgzfPathString;

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which to write block information (if not specified, prints to standard output)", optional = true)
private String output;

private Path bgzfPath;

private long streamOffset = 0l;

private PrintStream outStream;

@Override
protected void onStartup() {
super.onStartup();

bgzfPath = IOUtils.getPath(bgzfPathString);

if ( ! Files.exists(bgzfPath) ) {
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not exist");
}

if ( ! IOUtil.hasBlockCompressedExtension(bgzfPathString) ) {
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not end in a recognized BGZF file extension (" +
StringUtils.join(IOUtil.BLOCK_COMPRESSED_EXTENSIONS, ",") + ")");
}

try {
// Check that the file is in BGZF format. This catches the "regular GZIP" case as well:
if ( ! IOUtil.isBlockCompressed(bgzfPath) ) {
throw new UserException.CouldNotReadInputFile(bgzfPath, "File is not a valid BGZF file. Could possibly be a regular GZIP file?");
}
}
catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile(bgzfPath, "Unable to determine whether file is a valid BGZF file", e);
}

if ( output != null ) {
try {
outStream = new PrintStream(output);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(output, "Unable to open output file", e);
}
} else {
outStream = System.out;
}
}

@Override
protected Object doWork() {
BGZFBlockMetadata previousBlockInfo = null;
int blockNumber = 0;
final List<Integer> nonFinalTerminatorBlockIndices = new ArrayList<>();

try ( InputStream bgzfInputStream = Files.newInputStream(bgzfPath) ) {
outStream.printf("BGZF block information for file: %s\n\n", bgzfPath.getFileName());

BGZFBlockMetadata blockInfo;

while ( (blockInfo = processNextBlock(bgzfInputStream, bgzfPathString)) != null ) {
++blockNumber;

// If we saw a 0-byte terminator block that was not the final block in the file,
// emit an error message
if ( previousBlockInfo != null && previousBlockInfo.uncompressedSize == 0 ) {
nonFinalTerminatorBlockIndices.add(blockNumber - 1);

outStream.println("*******************************************************");
outStream.println("ERROR: Premature BGZF 0-byte terminator block was found");
outStream.println("at block number: " + (blockNumber - 1));
outStream.println("*******************************************************");
outStream.println();
}

outStream.printf("Block #%d at file offset %d\n", blockNumber, blockInfo.blockOffset);
outStream.printf("\t- compressed size: %d\n", blockInfo.compressedSize);
outStream.printf("\t- uncompressed size: %d\n", blockInfo.uncompressedSize);
outStream.println();

previousBlockInfo = blockInfo;
}
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile("Error while parsing BGZF file.", e);
}

// Check whether the last block in the file was a 0-byte BGZF terminator block
if ( previousBlockInfo == null || previousBlockInfo.uncompressedSize != 0 ) {
outStream.println("******************************************************");
outStream.println("ERROR: Final BGZF 0-byte terminator block was MISSING!");
outStream.println("******************************************************");
outStream.println();
} else {
outStream.println("***************************************************************************");
outStream.println("Final BGZF 0-byte terminator block FOUND as expected at block number " + blockNumber);
outStream.println("***************************************************************************");
outStream.println();
}

// Emit an error message at the end if we encountered any terminator blocks before the final block:
if ( ! nonFinalTerminatorBlockIndices.isEmpty() ) {
outStream.println("***********************************************************");
outStream.println("ERROR: Premature BGZF 0-byte terminator block(s) were found");
outStream.println("at block number(s): " + StringUtils.join(nonFinalTerminatorBlockIndices, ","));
outStream.println("***********************************************************");
outStream.println();
}

return 0;
}

@Override
protected void onShutdown() {
if ( outStream != null && outStream != System.out ) {
outStream.close();
}
}

// Code adapted from HTSJDK's BlockCompressedInputStream class
private BGZFBlockMetadata processNextBlock(InputStream stream, String streamSource) throws IOException {
final byte[] buffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
long blockAddress = streamOffset;

final int headerByteCount = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);

// Return null when we hit EOF
if ( headerByteCount <= 0 ) {
return null;
}
if (headerByteCount != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
throw new IOException("Incorrect header size for file: " + streamSource);
}
streamOffset += headerByteCount;

final int blockLength = unpackInt16(buffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;

if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > buffer.length) {
throw new IOException("Unexpected compressed block length: " + blockLength + " for " + streamSource);
}

final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
final int dataByteCount = readBytes(stream, buffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH,
remaining);

if (dataByteCount != remaining) {
throw new IOException("Premature end of file: " + streamSource);
}
streamOffset += dataByteCount;

final int uncompressedLength = unpackInt32(buffer, blockLength - 4);

if (uncompressedLength < 0) {
throw new IOException(streamSource + " has invalid uncompressed length: " + uncompressedLength);
}

return new BGZFBlockMetadata(blockAddress, blockLength, uncompressedLength);
}

private static int unpackInt16(final byte[] buffer, final int offset) {
return ((buffer[offset] & 0xFF) |
((buffer[offset+1] & 0xFF) << 8));
}

private static int unpackInt32(final byte[] buffer, final int offset) {
return ((buffer[offset] & 0xFF) |
((buffer[offset+1] & 0xFF) << 8) |
((buffer[offset+2] & 0xFF) << 16) |
((buffer[offset+3] & 0xFF) << 24));
}

private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) throws IOException {
int bytesRead = 0;
while (bytesRead < length) {
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);

// Return EOF if we get EOF from read() and we haven't read any bytes
if ( count < 0 && bytesRead == 0 ) {
return count;
// Otherwise if we hit EOF and we have read something, return the bytes read
} else if (count <= 0) {
break;
}

bytesRead += count;
}
return bytesRead;
}

private static final class BGZFBlockMetadata {
private final long blockOffset;
private final int compressedSize;
private final int uncompressedSize;

public BGZFBlockMetadata(final long blockOffset, final int compressedSize, final int uncompressedSize) {
this.blockOffset = blockOffset;
this.compressedSize = compressedSize;
this.uncompressedSize = uncompressedSize;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package org.broadinstitute.hellbender.tools;

import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;

public class PrintBGZFBlockInformationIntegrationTest extends CommandLineProgramTest {

/**
 * Well-formed large BGZF file: the report written by the tool must match the expected output file.
 */
@Test
public void testNormalLargeInput() throws IOException {
    final File input = new File(largeFileTestDir, "gvcfs/HG00096.g.vcf.gz");
    final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput", ".out");
    // Resolve against the parent directory, as is done for other test inputs in this class,
    // so the path does not depend on toolsTestDir ending with a separator.
    final File expectedOutput = new File(toolsTestDir, "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput.out");

    final String[] args = {
        "--bgzf-file", input.getAbsolutePath(),
        "--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath()
    };
    runCommandLine(args);

    IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput);
}

/**
 * Well-formed small BGZF file: the report written by the tool must match the expected output file.
 */
@Test
public void testNormalSmallInput() throws IOException {
    final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.vcf.gz");
    final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput", ".out");
    // Built the same way as the input path above, so it does not depend on toolsTestDir
    // ending with a separator.
    final File expectedOutput = new File(toolsTestDir, "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput.out");

    final String[] args = {
        "--bgzf-file", input.getAbsolutePath(),
        "--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath()
    };
    runCommandLine(args);

    IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput);
}

/**
 * Malformed BGZF file missing the final 0-byte terminator block: the tool is expected to run
 * to completion and its report must match the expected output file.
 */
@Test
public void testMissingBGZFTerminatorBlock() throws IOException {
    final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.NO_BGZF_TERMINATOR_BLOCK.vcf.gz");
    final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock", ".out");
    // Built the same way as the input path above, so it does not depend on toolsTestDir
    // ending with a separator.
    final File expectedOutput = new File(toolsTestDir, "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock.out");

    final String[] args = {
        "--bgzf-file", input.getAbsolutePath(),
        "--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath()
    };
    runCommandLine(args);

    IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput);
}

/* Malformed BGZF file with an incomplete (truncated) final block */
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a brief comment explaining what this test means.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added explanatory comments to all tests

public void testTruncatedFinalBlock() throws IOException {
    // Input whose last block is cut short; no expected-output file is compared in this test.
    final File truncatedBgzf = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.TRUNCATED_FINAL_BLOCK.vcf.gz");
    final File reportFile = createTempFile("PrintBGZFBlockInformationIntegrationTest_testTruncatedFinalBlock", ".out");

    final String outputFlag = "--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME;

    runCommandLine(new String[] {
        "--bgzf-file", truncatedBgzf.getAbsolutePath(),
        outputFlag, reportFile.getAbsolutePath()
    });
}

/**
 * Malformed BGZF file with an extra 0-byte terminator block in the middle: the tool is expected
 * to run to completion and its report must match the expected output file.
 */
@Test
public void testExtraTerminatorBlockInMiddle() throws IOException {
    final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz");
    final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle", ".out");
    // Built the same way as the input path above, so it does not depend on toolsTestDir
    // ending with a separator.
    final File expectedOutput = new File(toolsTestDir, "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle.out");

    final String[] args = {
        "--bgzf-file", input.getAbsolutePath(),
        "--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath()
    };
    runCommandLine(args);

    IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput);
}

/**
 * Regular GZIP file masquerading as a BGZF file: running the tool on it must fail with
 * {@link UserException.CouldNotReadInputFile}.
 */
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class)
public void testRegularGzipFile() throws IOException {
    final File plainGzip = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.REGULAR_GZIP.vcf.gz");
    final File reportFile = createTempFile("PrintBGZFBlockInformationIntegrationTest_testRegularGzipFile", ".out");

    final String outputFlag = "--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME;

    runCommandLine(new String[] {
        "--bgzf-file", plainGzip.getAbsolutePath(),
        outputFlag, reportFile.getAbsolutePath()
    });
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
BGZF block information for file: 4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz

Block #1 at file offset 0
- compressed size: 12409
- uncompressed size: 65498

Block #2 at file offset 12409
- compressed size: 28
- uncompressed size: 0

*******************************************************
ERROR: Premature BGZF 0-byte terminator block was found
at block number: 2
*******************************************************

Block #3 at file offset 12437
- compressed size: 6497
- uncompressed size: 65498

Block #4 at file offset 18934
- compressed size: 6229
- uncompressed size: 46819

Block #5 at file offset 25163
- compressed size: 28
- uncompressed size: 0

***************************************************************************
Final BGZF 0-byte terminator block FOUND as expected at block number 5
***************************************************************************

***********************************************************
ERROR: Premature BGZF 0-byte terminator block(s) were found
at block number(s): 2
***********************************************************

Loading