
org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.utils.io;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.seekablestream.SeekableBufferedStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.tribble.*;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.codecs.FeatureSink;
import org.broadinstitute.hellbender.tools.sv.SVFeaturesHeader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Path;
import java.util.*;
public class BlockCompressedIntervalStream {
// our final empty block adds to the extra information
// it has a virtual file pointer to the start of the index
public static final byte[] EMPTY_GZIP_BLOCK_WITH_INDEX_POINTER = {
BlockCompressedStreamConstants.GZIP_ID1,
(byte)BlockCompressedStreamConstants.GZIP_ID2,
BlockCompressedStreamConstants.GZIP_CM_DEFLATE,
BlockCompressedStreamConstants.GZIP_FLG,
0, 0, 0, 0, // modification time
BlockCompressedStreamConstants.GZIP_XFL,
(byte)BlockCompressedStreamConstants.GZIP_OS_UNKNOWN,
BlockCompressedStreamConstants.GZIP_XLEN + 12, 0,
BlockCompressedStreamConstants.BGZF_ID1,
BlockCompressedStreamConstants.BGZF_ID2,
BlockCompressedStreamConstants.BGZF_LEN, 0,
39, 0, // Total block size - 1 as little-endian short
(byte)'I', (byte)'P', 8, 0, // index pointer extra data
// 8-byte little-endian long representing file-pointer to beginning of index
// (this data gets overwritten each time a stream is closed)
1, 2, 3, 4, 5, 6, 7, 8,
3, 0, // empty payload
0, 0, 0, 0, // crc
0, 0, 0, 0, // uncompressedSize
};
public static final int FILE_POINTER_OFFSET = 22;
public static final String BCI_FILE_EXTENSION = ".bci";
// each compressed block of data will have (at least) one of these as a part of the index
// for each contig that appears in a compressed block, the SVInterval tracks the smallest
// starting coordinate and largest end coordinate of any object in the block
// the filePosition member is the virtual file offset of the first object in the block (or, the
// first object for a new contig, if there are multiple contigs represented within the block)
public final static class IndexEntry {
final SVInterval interval;
long filePosition;
public IndexEntry( final SVInterval interval, final long filePosition ) {
this.interval = interval;
this.filePosition = filePosition;
}
public IndexEntry( final DataInputStream dis ) throws IOException {
this.interval = new SVInterval(dis.readInt(), dis.readInt(), dis.readInt());
this.filePosition = dis.readLong();
}
public SVInterval getInterval() { return interval; }
public long getFilePosition() { return filePosition; }
public void write( final DataOutputStream dos ) throws IOException {
dos.writeInt(interval.getContig());
dos.writeInt(interval.getStart());
dos.writeInt(interval.getEnd());
dos.writeLong(filePosition);
}
}
@FunctionalInterface
public interface WriteFunc {
void write( F feature, Writer writer ) throws IOException;
}
// a class for writing arbitrary objects to a block compressed stream with a self-contained index
// the only restriction is that you must supply a lambda that writes enough state to a DataOutputStream
// to allow you to reconstitute the object when you read it back in later
public static class Writer implements FeatureSink {
final String path;
final SVFeaturesHeader header;
final WriteFunc writeFunc;
final OutputStream os;
final BlockCompressedOutputStream bcos;
final DataOutputStream dos;
Feature lastInterval;
final List indexEntries;
long blockFilePosition;
int blockContig;
int blockStart;
int blockEnd;
boolean firstBlockMember;
public final static int DEFAULT_COMPRESSION_LEVEL = 6;
public Writer( final GATKPath path,
final SVFeaturesHeader header,
final WriteFunc writeFunc ) {
this(path, header, writeFunc, DEFAULT_COMPRESSION_LEVEL);
}
public Writer( final GATKPath path,
final SVFeaturesHeader header,
final WriteFunc writeFunc,
final int compressionLevel ) {
this.path = path.toString();
this.header = header;
this.writeFunc = writeFunc;
this.os = path.getOutputStream();
this.bcos = new BlockCompressedOutputStream(os, (Path)null, compressionLevel);
this.dos = new DataOutputStream(bcos);
this.lastInterval = null;
this.indexEntries = new ArrayList<>();
this.firstBlockMember = true;
writeHeader();
}
@VisibleForTesting
public Writer( final String streamSource,
final OutputStream os,
final SVFeaturesHeader header,
final WriteFunc writeFunc ) {
this.path = streamSource;
this.header = header;
this.writeFunc = writeFunc;
this.os = os;
this.bcos = new BlockCompressedOutputStream(os, (Path)null, DEFAULT_COMPRESSION_LEVEL);
this.dos = new DataOutputStream(bcos);
this.lastInterval = null;
this.indexEntries = new ArrayList<>();
this.firstBlockMember = true;
writeHeader();
}
private void writeHeader() {
try {
writeClassAndVersion(header.getClassName(), header.getVersion());
writeSamples(header.getSampleNames());
writeDictionary(header.getDictionary());
dos.flush();
} catch ( final IOException ioe ) {
throw new UserException("can't write header to " + path, ioe);
}
}
private void writeClassAndVersion( final String className, final String version )
throws IOException {
dos.writeUTF(className);
dos.writeUTF(version);
}
private void writeSamples( final Collection sampleNames ) throws IOException {
final int nSampleNames = sampleNames.size();
if ( nSampleNames == 0 ) {
throw new UserException("Attempting to create " + path +
" with an empty list of sample names. It won't go well.");
}
dos.writeInt(nSampleNames);
for ( final String sampleName : sampleNames ) {
dos.writeUTF(sampleName);
}
}
private void writeDictionary( final SAMSequenceDictionary dictionary ) throws IOException {
dos.writeInt(dictionary.size());
for ( final SAMSequenceRecord rec : dictionary.getSequences() ) {
dos.writeInt(rec.getSequenceLength());
dos.writeUTF(rec.getSequenceName());
}
}
public DataOutputStream getStream() { return dos; }
public int getSampleIndex( final String sampleName ) {
final Integer sampleIndex = header.getSampleIndex(sampleName);
if ( sampleIndex == null ) {
throw new UserException("Can't find index for sampleName " + sampleName +
". Supply a complete list of sample names.");
}
return sampleIndex;
}
public int getContigIndex( final String contigName ) {
final SAMSequenceRecord contig = header.getDictionary().getSequence(contigName);
if ( contig == null ) {
throw new UserException("can't find contig name " + contigName +
". Please check your dictionaries for completeness and consistency.");
}
return contig.getSequenceIndex();
}
@Override
public void write( final F feature ) {
final long prevFilePosition = bcos.getPosition();
// write the object
try {
writeFunc.write(feature, this);
} catch ( final IOException ioe ) {
throw new UserException("can't write to " + path, ioe);
}
// if this is the first interval we've seen in a block, just capture the block-start data
if ( firstBlockMember || lastInterval == null ) {
startBlock(prevFilePosition, feature);
return;
}
// if the contig changes emit a new index entry (for the previous contig) and
// restart tracking of the block
if ( !feature.contigsMatch(lastInterval) ) {
addIndexEntry();
startBlock(prevFilePosition, feature);
return;
}
// extend the tracked interval, as necessary
blockEnd = Math.max(blockEnd, feature.getEnd());
lastInterval = feature;
// if writing this element caused a new block to be compressed and added to the file
if ( isNewBlock(prevFilePosition, bcos.getPosition()) ) {
addIndexEntry();
firstBlockMember = true;
}
}
@Override
public void close() {
// take care of any pending index entry, if necessary
if ( !firstBlockMember ) {
addIndexEntry();
}
try {
dos.flush(); // complete the data block
long indexPosition = bcos.getPosition(); // current position is the start of the index
// write the index entries
dos.writeInt(indexEntries.size());
for ( final IndexEntry indexEntry : indexEntries ) {
indexEntry.write(dos);
}
dos.flush(); // and complete the block
// write a 0-length terminator block at the end that captures the index position
final byte[] emptyBlockWithIndexPointer =
Arrays.copyOf(EMPTY_GZIP_BLOCK_WITH_INDEX_POINTER,
EMPTY_GZIP_BLOCK_WITH_INDEX_POINTER.length);
for ( int idx = FILE_POINTER_OFFSET; idx != FILE_POINTER_OFFSET + 8; ++idx ) {
emptyBlockWithIndexPointer[idx] = (byte)indexPosition;
indexPosition >>>= 8;
}
os.write(emptyBlockWithIndexPointer);
bcos.close(false); // we've already handled the terminator block
} catch ( final IOException ioe ) {
throw new UserException("unable to add index and close " + path, ioe);
}
}
private void startBlock( final long filePosition, final Feature interval ) {
blockFilePosition = filePosition;
lastInterval = interval;
blockContig = header.getDictionary().getSequenceIndex(interval.getContig());
blockStart = interval.getStart();
blockEnd = interval.getEnd();
firstBlockMember = false;
}
private void addIndexEntry() {
final SVInterval blockInterval = new SVInterval(blockContig, blockStart, blockEnd);
indexEntries.add(new IndexEntry(blockInterval, blockFilePosition));
}
}
// a class for reading arbitrary objects from a block compressed stream with a self-contained index
// the only restriction is that you must supply a lambda that reads from a DataInputStream
// to reconstitute the object.
public static final class Reader implements FeatureReader {
final String path;
final FeatureCodec> codec;
final long indexFilePointer;
final BlockCompressedInputStream bcis;
final DataInputStream dis;
final SVFeaturesHeader header;
final long dataFilePointer;
SVIntervalTree index;
boolean usedByIterator;
public Reader( final FeatureInput inputDescriptor, final FeatureCodec> codec ) {
this.path = inputDescriptor.getRawInputString();
this.codec = codec;
final SeekableStream ss;
try {
ss = SeekableStreamFactory.getInstance().getStreamFor(path);
} catch ( final IOException ioe ) {
throw new UserException("unable to open " + path, ioe);
}
this.indexFilePointer = findIndexFilePointer(ss);
this.bcis = new BlockCompressedInputStream(new SeekableBufferedStream(ss));
this.dis = new DataInputStream(bcis);
this.header = readHeader();
this.dataFilePointer = bcis.getPosition(); // having read header, we're pointing at the data
this.index = null;
this.usedByIterator = false;
final String expectedClassName = codec.getFeatureType().getSimpleName();
if ( !header.getClassName().equals(expectedClassName) ) {
throw new UserException("can't use " + path + " to read " + expectedClassName +
" features -- it contains " + header.getClassName() + " features");
}
}
public Reader( final Reader reader ) {
this.path = reader.path;
this.codec = reader.codec;
this.indexFilePointer = reader.indexFilePointer;
try {
this.bcis = new BlockCompressedInputStream(
new SeekableBufferedStream(
SeekableStreamFactory.getInstance().getStreamFor(path)));
} catch ( final IOException ioe ) {
throw new UserException("unable to clone stream for " + path, ioe);
}
this.dis = new DataInputStream(bcis);
this.header = reader.header;
this.dataFilePointer = reader.dataFilePointer;
this.index = reader.index;
this.usedByIterator = true;
}
@VisibleForTesting
public Reader( final String inputStreamName,
final SeekableStream ss,
final FeatureCodec> codec ) {
this.path = inputStreamName;
this.codec = codec;
this.indexFilePointer = findIndexFilePointer(ss);
this.bcis = new BlockCompressedInputStream(new SeekableBufferedStream(ss));
this.dis = new DataInputStream(bcis);
this.header = readHeader();
this.dataFilePointer = bcis.getPosition();
this.index = null;
this.usedByIterator = false;
}
public FeatureCodecHeader getFeatureCodecHeader() {
return new FeatureCodecHeader(header, dataFilePointer);
}
public DataInputStream getStream() { return dis; }
public List getSampleNames() { return header.getSampleNames(); }
public SAMSequenceDictionary getDictionary() { return header.getDictionary(); }
public String getVersion() { return header.getVersion(); }
public boolean hasNext() {
final long position = bcis.getPosition();
// A BlockCompressedInputStream returns a 0 position when closed
return position > 0 && position < indexFilePointer;
}
@Override
public CloseableTribbleIterator query( final String chr, final int start, final int end )
throws IOException {
if ( index == null ) {
loadIndex(bcis);
}
final SVInterval interval =
new SVInterval(getDictionary().getSequenceIndex(chr), start, end);
return new OverlapIterator<>(interval, this);
}
@Override public CloseableTribbleIterator iterator() {
return new CompleteIterator<>(this);
}
@Override public void close() {
try {
dis.close();
} catch ( final IOException ioe ) {
throw new UserException("unable to close " + path, ioe);
}
}
@Override public List getSequenceNames() {
final SAMSequenceDictionary dict = getDictionary();
final List names = new ArrayList<>(dict.size());
for ( final SAMSequenceRecord rec : dict.getSequences() ) {
names.add(rec.getSequenceName());
}
return names;
}
@Override public Object getHeader() { return header; }
@Override public boolean isQueryable() { return true; }
public long getPosition() { return bcis.getPosition(); }
public void seekStream( final long filePointer ) {
try {
bcis.seek(filePointer);
} catch ( final IOException ioe ) {
throw new UserException("unable to position stream for " + path, ioe);
}
}
public T readStream() {
try {
return codec.decode(this);
} catch ( final IOException ioe ) {
throw new GATKException("can't read " + path, ioe);
}
}
private long findIndexFilePointer( final SeekableStream ss ) {
final int finalBlockLen = EMPTY_GZIP_BLOCK_WITH_INDEX_POINTER.length;
final byte[] finalBlock = new byte[finalBlockLen];
try {
ss.seek(ss.length() - finalBlockLen);
ss.readFully(finalBlock);
ss.seek(0);
} catch ( final IOException ioe ) {
throw new UserException("unable to read final bgzip block from " + path, ioe);
}
for ( int idx = 0; idx != FILE_POINTER_OFFSET; ++idx ) {
if ( EMPTY_GZIP_BLOCK_WITH_INDEX_POINTER[idx] != finalBlock[idx] ) {
throw new UserException(
"unable to recover index pointer from final block of " + path);
}
}
for ( int idx = FILE_POINTER_OFFSET + 8; idx != finalBlockLen; ++idx ) {
if ( EMPTY_GZIP_BLOCK_WITH_INDEX_POINTER[idx] != finalBlock[idx] ) {
throw new UserException(
"unable to recover index pointer from final block of " + path);
}
}
long indexFilePointer = 0;
int idx = FILE_POINTER_OFFSET + 8;
while ( --idx >= FILE_POINTER_OFFSET ) {
indexFilePointer <<= 8;
indexFilePointer |= finalBlock[idx] & 0xFFL;
}
return indexFilePointer;
}
private SVFeaturesHeader readHeader() {
try {
final String className = dis.readUTF();
final String version = dis.readUTF();
final List sampleNames = readSampleNames(dis);
final SAMSequenceDictionary dictionary = readDictionary(dis);
return new SVFeaturesHeader(className, version, dictionary, sampleNames);
} catch ( final IOException ioe ) {
throw new UserException("can't read header from " + path, ioe);
}
}
private List readSampleNames( final DataInputStream dis ) throws IOException {
final int nSamples = dis.readInt();
final List sampleNames = new ArrayList<>(nSamples);
for ( int sampleId = 0; sampleId < nSamples; ++sampleId ) {
sampleNames.add(dis.readUTF());
}
return sampleNames;
}
private SAMSequenceDictionary readDictionary( final DataInputStream dis ) throws IOException {
final int nRecs = dis.readInt();
final List seqRecs = new ArrayList<>(nRecs);
for ( int idx = 0; idx != nRecs; ++idx ) {
final int contigSize = dis.readInt();
final String contigName = dis.readUTF();
seqRecs.add(new SAMSequenceRecord(contigName, contigSize));
}
return new SAMSequenceDictionary(seqRecs);
}
private void loadIndex( final BlockCompressedInputStream bcis ) {
final SVIntervalTree intervalTree = new SVIntervalTree<>();
try {
bcis.seek(indexFilePointer);
final DataInputStream dis = new DataInputStream(bcis);
int nEntries = dis.readInt();
while ( nEntries-- > 0 ) {
final IndexEntry entry = new IndexEntry(dis);
intervalTree.put(entry.getInterval(), entry.getFilePosition());
}
bcis.seek(dataFilePointer);
} catch ( final IOException ioe ) {
throw new UserException("unable to read index from " + path, ioe);
}
index = intervalTree;
}
private Reader getReaderForIterator() {
if ( !usedByIterator ) {
usedByIterator = true;
return this;
}
return new Reader<>(this);
}
private static class CompleteIterator
implements CloseableTribbleIterator {
final Reader reader;
public CompleteIterator( final Reader reader ) {
this.reader = reader.getReaderForIterator();
reader.seekStream(reader.dataFilePointer);
}
@Override public Iterator iterator() {
return new CompleteIterator<>(reader);
}
@Override public boolean hasNext() {
return reader.hasNext();
}
@Override public T next() {
if ( !hasNext() ) {
throw new NoSuchElementException("feature iterator has no next element");
}
return reader.readStream();
}
@Override public void close() { reader.close(); }
}
// find all the objects in the stream inflating just those blocks that might have relevant objects
private static class OverlapIterator
implements CloseableTribbleIterator {
final SVInterval interval;
final Reader reader;
final Iterator> indexEntryIterator;
long blockStartPosition;
T nextT;
public OverlapIterator( final SVInterval interval, final Reader reader ) {
this.interval = interval;
this.reader = reader.getReaderForIterator();
this.indexEntryIterator = reader.index.overlappers(interval);
this.blockStartPosition = -1;
advance();
}
@Override public boolean hasNext() { return nextT != null; }
@Override public T next() {
final T result = nextT;
if ( result == null ) {
throw new NoSuchElementException("overlapper iterator has no next element");
}
advance();
return result; }
@Override public void close() { reader.close(); nextT = null; }
@Override public CloseableTribbleIterator iterator() {
return new OverlapIterator<>(interval, reader);
}
private void advance() {
do {
if ( isNewBlock(blockStartPosition, reader.getPosition()) ) {
if ( !indexEntryIterator.hasNext() ) {
nextT = null;
return;
}
blockStartPosition = indexEntryIterator.next().getValue();
reader.seekStream(blockStartPosition);
}
nextT = reader.readStream();
final int nextContigId =
reader.getDictionary().getSequenceIndex(nextT.getContig());
if ( interval.getContig() != nextContigId ||
interval.getEnd() < nextT.getStart() ) {
nextT = null;
return;
}
} while ( nextT.getEnd() < interval.getStart() );
}
}
}
public static boolean isNewBlock( final long filePosition1, final long filePosition2 ) {
// upper 48 bits contain the block offset
// check to see if there are any bit differences in those upper 48 bits
return ((filePosition1 ^ filePosition2) & ~0xffffL) != 0;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy