org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec Maven / Gradle / Ivy
Show all versions of hadoop-bam Show documentation
package org.seqdoop.hadoop_bam.util;
import htsjdk.samtools.util.BlockCompressedInputStream;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
/**
 * A Hadoop {@link CompressionCodec} for the BGZF compression format,
 * which reads and writes files with a {@code .gz} suffix.
 * <p>
 * BGZF is a splittable extension of gzip, which means that all BGZF files are standard
 * gzip files; however, the reverse is not necessarily the case. BGZF files often have the
 * standard {@code .gz} suffix (such as those produced by the {@code bcftools} command),
 * which causes a difficulty since it is not immediately apparent from the filename alone
 * whether a file is a BGZF file or merely a regular gzip file. BGZFEnhancedGzipCodec
 * will read the start of the file to look for BGZF headers to detect the type of
 * compression.
 * </p>
 * <p>
 * BGZFEnhancedGzipCodec will read BGZF or gzip files, but currently always writes
 * regular gzip files.
 * </p>
 * <p>
 * To use BGZFEnhancedGzipCodec, set it on the configuration object as follows. This will
 * override the built-in GzipCodec that is mapped to the {@code .gz} suffix.
 * </p>
 * <pre>{@code
 * conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName())
 * }</pre>
 *
 * @see BGZFCodec
 */
public class BGZFEnhancedGzipCodec extends GzipCodec implements SplittableCompressionCodec {

  /**
   * Opens a decompressing stream for one split of {@code seekableIn}, choosing between
   * BGZF and plain gzip handling based on the header found at offset 0 of the stream.
   * <p>
   * If the data is BGZF, the split start is moved to the block start reported by
   * {@link BGZFSplitGuesser#guessNextBGZFBlockStart(long, long)} and a
   * {@link BGZFSplitCompressionInputStream} positioned there is returned. The
   * {@code decompressor} argument is not used on this path.
   * </p>
   * <p>
   * If the data is not BGZF, it is treated as regular gzip: the stream is rewound to
   * offset 0 and decompressed from there with the inherited gzip stream, regardless of
   * {@code start}. The {@code start} and {@code end} values are only handed to the
   * returned wrapper.
   * </p>
   *
   * @param seekableIn the raw compressed stream; must also implement {@link Seekable}
   * @param decompressor the decompressor used for the plain gzip path
   * @param start the requested start offset of the split in the compressed stream
   * @param end the requested end offset of the split in the compressed stream
   * @param readMode ignored by this implementation
   * @return a split-aware stream yielding decompressed data
   * @throws IOException if {@code seekableIn} is not {@link Seekable}, or if seeking or
   *     reading the underlying stream fails
   */
  @Override
  public SplitCompressionInputStream createInputStream(InputStream seekableIn,
      Decompressor decompressor, long start, long end, READ_MODE readMode)
      throws IOException {
    if (!(seekableIn instanceof Seekable)) {
      throw new IOException("seekableIn must be an instance of " +
          Seekable.class.getName());
    }
    final Seekable seekable = (Seekable) seekableIn;

    // The BGZF probe inspects the bytes at the current position, so make sure it really
    // looks at the start of the file even if the caller has already moved the stream.
    seekable.seek(0);

    // The buffered wrapper supplies mark/reset for the probe. It may read ahead on the
    // underlying stream, which is why both branches below seek explicitly afterwards.
    final boolean isBgzf =
        BlockCompressedInputStream.isValidFile(new BufferedInputStream(seekableIn));

    if (!isBgzf) {
      // data is regular gzip, not BGZF
      seekable.seek(0);
      return wrapPlainGzip(createInputStream(seekableIn, decompressor), start, end);
    }

    final BGZFSplitGuesser splitGuesser = new BGZFSplitGuesser(seekableIn);
    final long adjustedStart = splitGuesser.guessNextBGZFBlockStart(start, end);
    seekable.seek(adjustedStart);
    return new BGZFSplitCompressionInputStream(seekableIn, adjustedStart, end);
  }

  /**
   * Adapts a regular gzip stream to the {@link SplitCompressionInputStream} type by
   * delegating all reads and state resets to it.
   */
  private static SplitCompressionInputStream wrapPlainGzip(
      final CompressionInputStream gzipIn, long start, long end) throws IOException {
    return new SplitCompressionInputStream(gzipIn, start, end) {
      @Override
      public int read(byte[] b, int off, int len) throws IOException {
        return gzipIn.read(b, off, len);
      }

      @Override
      public int read() throws IOException {
        return gzipIn.read();
      }

      @Override
      public void resetState() throws IOException {
        gzipIn.resetState();
      }
    };
  }
}