All downloads are free. Search and download functionality uses the official Maven repository.

org.broadinstitute.hellbender.tools.PrintBGZFBlockInformation Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.IOUtil;
import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import picard.cmdline.programgroups.OtherProgramGroup;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

/**
 * A diagnostic tool that prints information about the compressed blocks in a BGZF format file,
 * such as a .vcf.gz file or a .bam file. This tool can detect various kinds of BGZF file corruption
 * such as premature BGZF terminator blocks, truncated files, and files that were regular-GZIPPED by
 * accident.
 * 

* The output looks like this: *

*
 *        Block #1 at file offset 0
 *            - compressed size: 12409
 *            - uncompressed size: 65498
 *
 *        Block #2 at file offset 12409
 *            - compressed size: 6497
 *            - uncompressed size: 65498
 *        ...
 *        etc.
 * 
*

* The output can be redirected to a file using the -O option. *

*/ @ExperimentalFeature @CommandLineProgramProperties( summary = "Print information about the compressed blocks in a BGZF format file", oneLineSummary = "Print information about the compressed blocks in a BGZF format file", programGroup = OtherProgramGroup.class ) public class PrintBGZFBlockInformation extends CommandLineProgram { @Argument(fullName = "bgzf-file", doc = "The BGZF-format file for which to print block information", optional = false) private String bgzfPathString; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which to write block information (if not specified, prints to standard output)", optional = true) private String output; private Path bgzfPath; private long streamOffset = 0l; private PrintStream outStream; @Override protected void onStartup() { super.onStartup(); bgzfPath = IOUtils.getPath(bgzfPathString); if ( ! Files.exists(bgzfPath) ) { throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not exist"); } try { // Check that the file is in BGZF format. This catches the "regular GZIP" case as well: if ( ! IOUtil.isBlockCompressed(bgzfPath) ) { throw new UserException.CouldNotReadInputFile(bgzfPath, "File is not a valid BGZF file. 
Could be a regular GZIP file, or some other non-BGZF format."); } } catch ( IOException e ) { throw new UserException.CouldNotReadInputFile(bgzfPath, "Unable to determine whether file is a valid BGZF file", e); } if ( output != null ) { try { outStream = new PrintStream(output); } catch (FileNotFoundException e) { throw new UserException.CouldNotCreateOutputFile(output, "Unable to open output file", e); } } else { outStream = System.out; } } @Override protected Object doWork() { BGZFBlockMetadata previousBlockInfo = null; int blockNumber = 0; final List nonFinalTerminatorBlockIndices = new ArrayList<>(); try ( InputStream bgzfInputStream = Files.newInputStream(bgzfPath) ) { outStream.printf("BGZF block information for file: %s\n\n", bgzfPath.getFileName()); BGZFBlockMetadata blockInfo; while ( (blockInfo = processNextBlock(bgzfInputStream, bgzfPathString)) != null ) { ++blockNumber; // If we saw a 0-byte terminator block that was not the final block in the file, // emit an error message if ( previousBlockInfo != null && previousBlockInfo.uncompressedSize == 0 ) { nonFinalTerminatorBlockIndices.add(blockNumber - 1); outStream.println("*******************************************************"); outStream.println("ERROR: Premature BGZF 0-byte terminator block was found"); outStream.println("at block number: " + (blockNumber - 1)); outStream.println("*******************************************************"); outStream.println(); } outStream.printf("Block #%d at file offset %d\n", blockNumber, blockInfo.blockOffset); outStream.printf("\t- compressed size: %d\n", blockInfo.compressedSize); outStream.printf("\t- uncompressed size: %d\n", blockInfo.uncompressedSize); outStream.println(); previousBlockInfo = blockInfo; } } catch ( IOException e ) { throw new UserException.CouldNotReadInputFile("Error while parsing BGZF file.", e); } // Check whether the last block in the file was a 0-byte BGZF terminator block if ( previousBlockInfo == null || 
previousBlockInfo.uncompressedSize != 0 ) { outStream.println("******************************************************"); outStream.println("ERROR: Final BGZF 0-byte terminator block was MISSING!"); outStream.println("******************************************************"); outStream.println(); } else { outStream.println("***************************************************************************"); outStream.println("Final BGZF 0-byte terminator block FOUND as expected at block number " + blockNumber); outStream.println("***************************************************************************"); outStream.println(); } // Emit an error message at the end if we encountered any terminator blocks before the final block: if ( ! nonFinalTerminatorBlockIndices.isEmpty() ) { outStream.println("***********************************************************"); outStream.println("ERROR: Premature BGZF 0-byte terminator block(s) were found"); outStream.println("at block number(s): " + StringUtils.join(nonFinalTerminatorBlockIndices, ",")); outStream.println("***********************************************************"); outStream.println(); } return 0; } @Override protected void onShutdown() { if ( outStream != null && outStream != System.out ) { outStream.close(); } } // Code adapted from HTSJDK's BlockCompressedInputStream class private BGZFBlockMetadata processNextBlock(InputStream stream, String streamSource) throws IOException { final byte[] buffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; long blockAddress = streamOffset; final int headerByteCount = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); // Return null when we hit EOF if ( headerByteCount <= 0 ) { return null; } if (headerByteCount != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { throw new IOException("Incorrect header size for file: " + streamSource); } streamOffset += headerByteCount; final int blockLength = unpackInt16(buffer, 
BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > buffer.length) { throw new IOException("Unexpected compressed block length: " + blockLength + " for " + streamSource); } final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; final int dataByteCount = readBytes(stream, buffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); if (dataByteCount != remaining) { throw new IOException("Premature end of file: " + streamSource); } streamOffset += dataByteCount; final int uncompressedLength = unpackInt32(buffer, blockLength - 4); if (uncompressedLength < 0) { throw new IOException(streamSource + " has invalid uncompressed length: " + uncompressedLength); } return new BGZFBlockMetadata(blockAddress, blockLength, uncompressedLength); } private static int unpackInt16(final byte[] buffer, final int offset) { return ((buffer[offset] & 0xFF) | ((buffer[offset+1] & 0xFF) << 8)); } private static int unpackInt32(final byte[] buffer, final int offset) { return ((buffer[offset] & 0xFF) | ((buffer[offset+1] & 0xFF) << 8) | ((buffer[offset+2] & 0xFF) << 16) | ((buffer[offset+3] & 0xFF) << 24)); } private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) throws IOException { int bytesRead = 0; while (bytesRead < length) { final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); // Return EOF if we get EOF from read() and we haven't read any bytes if ( count < 0 && bytesRead == 0 ) { return count; // Otherwise if we hit EOF and we have read something, return the bytes read } else if (count <= 0) { break; } bytesRead += count; } return bytesRead; } private static final class BGZFBlockMetadata { private final long blockOffset; private final int compressedSize; private final int uncompressedSize; public BGZFBlockMetadata(final long blockOffset, final int compressedSize, final 
int uncompressedSize) { this.blockOffset = blockOffset; this.compressedSize = compressedSize; this.uncompressedSize = uncompressedSize; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy