org.apache.hadoop.io.compress.BZip2Codec Maven / Gradle / Ivy
The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyCompressor;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyDecompressor;
import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;
/**
* This class provides CompressionOutputStream and CompressionInputStream for
* compression and decompression. Currently we dont have an implementation of
* the Compressor and Decompressor interfaces, so those methods of
* CompressionCodec which have a Compressor or Decompressor type argument, throw
* UnsupportedOperationException.
*/
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class BZip2Codec implements SplittableCompressionCodec {
private static final String HEADER = "BZ";
private static final int HEADER_LEN = HEADER.length();
private static final String SUB_HEADER = "h9";
private static final int SUB_HEADER_LEN = SUB_HEADER.length();
/**
* Creates a new instance of BZip2Codec
*/
public BZip2Codec() { }
/**
* Creates CompressionOutputStream for BZip2
*
* @param out
* The output Stream
* @return The BZip2 CompressionOutputStream
* @throws java.io.IOException
* Throws IO exception
*/
public CompressionOutputStream createOutputStream(OutputStream out)
throws IOException {
return new BZip2CompressionOutputStream(out);
}
/**
* Creates a compressor using given OutputStream.
*
* @return CompressionOutputStream
@throws java.io.IOException
*/
public CompressionOutputStream createOutputStream(OutputStream out,
Compressor compressor) throws IOException {
return createOutputStream(out);
}
/**
* This functionality is currently not supported.
*
* @return BZip2DummyCompressor.class
*/
public Class extends org.apache.hadoop.io.compress.Compressor> getCompressorType() {
return BZip2DummyCompressor.class;
}
/**
* This functionality is currently not supported.
*
* @return Compressor
*/
public Compressor createCompressor() {
return new BZip2DummyCompressor();
}
/**
* Creates CompressionInputStream to be used to read off uncompressed data.
*
* @param in
* The InputStream
* @return Returns CompressionInputStream for BZip2
* @throws java.io.IOException
* Throws IOException
*/
public CompressionInputStream createInputStream(InputStream in)
throws IOException {
return new BZip2CompressionInputStream(in);
}
/**
* This functionality is currently not supported.
*
* @return CompressionInputStream
*/
public CompressionInputStream createInputStream(InputStream in,
Decompressor decompressor) throws IOException {
return createInputStream(in);
}
/**
* Creates CompressionInputStream to be used to read off uncompressed data
* in one of the two reading modes. i.e. Continuous or Blocked reading modes
*
* @param seekableIn The InputStream
* @param start The start offset into the compressed stream
* @param end The end offset into the compressed stream
* @param readMode Controls whether progress is reported continuously or
* only at block boundaries.
*
* @return CompressionInputStream for BZip2 aligned at block boundaries
*/
public SplitCompressionInputStream createInputStream(InputStream seekableIn,
Decompressor decompressor, long start, long end, READ_MODE readMode)
throws IOException {
if (!(seekableIn instanceof Seekable)) {
throw new IOException("seekableIn must be an instance of " +
Seekable.class.getName());
}
//find the position of first BZip2 start up marker
((Seekable)seekableIn).seek(0);
// BZip2 start of block markers are of 6 bytes. But the very first block
// also has "BZh9", making it 10 bytes. This is the common case. But at
// time stream might start without a leading BZ.
final long FIRST_BZIP2_BLOCK_MARKER_POSITION =
CBZip2InputStream.numberOfBytesTillNextMarker(seekableIn);
long adjStart = Math.max(0L, start - FIRST_BZIP2_BLOCK_MARKER_POSITION);
((Seekable)seekableIn).seek(adjStart);
SplitCompressionInputStream in =
new BZip2CompressionInputStream(seekableIn, adjStart, end, readMode);
// The following if clause handles the following case:
// Assume the following scenario in BZip2 compressed stream where
// . represent compressed data.
// .....[48 bit Block].....[48 bit Block].....[48 bit Block]...
// ........................[47 bits][1 bit].....[48 bit Block]...
// ................................^[Assume a Byte alignment here]
// ........................................^^[current position of stream]
// .....................^^[We go back 10 Bytes in stream and find a Block marker]
// ........................................^^[We align at wrong position!]
// ...........................................................^^[While this pos is correct]
if (in.getPos() <= start) {
((Seekable)seekableIn).seek(start);
in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
}
return in;
}
/**
* This functionality is currently not supported.
*
* @return BZip2DummyDecompressor.class
*/
public Class extends org.apache.hadoop.io.compress.Decompressor> getDecompressorType() {
return BZip2DummyDecompressor.class;
}
/**
* This functionality is currently not supported.
*
* @return Decompressor
*/
public Decompressor createDecompressor() {
return new BZip2DummyDecompressor();
}
/**
* .bz2 is recognized as the default extension for compressed BZip2 files
*
* @return A String telling the default bzip2 file extension
*/
public String getDefaultExtension() {
return ".bz2";
}
private static class BZip2CompressionOutputStream extends
CompressionOutputStream {
// class data starts here//
private CBZip2OutputStream output;
private boolean needsReset;
// class data ends here//
public BZip2CompressionOutputStream(OutputStream out)
throws IOException {
super(out);
needsReset = true;
}
private void writeStreamHeader() throws IOException {
if (super.out != null) {
// The compressed bzip2 stream should start with the
// identifying characters BZ. Caller of CBZip2OutputStream
// i.e. this class must write these characters.
out.write(HEADER.getBytes());
}
}
public void finish() throws IOException {
if (needsReset) {
// In the case that nothing is written to this stream, we still need to
// write out the header before closing, otherwise the stream won't be
// recognized by BZip2CompressionInputStream.
internalReset();
}
this.output.finish();
needsReset = true;
}
private void internalReset() throws IOException {
if (needsReset) {
needsReset = false;
writeStreamHeader();
this.output = new CBZip2OutputStream(out);
}
}
public void resetState() throws IOException {
// Cannot write to out at this point because out might not be ready
// yet, as in SequenceFile.Writer implementation.
needsReset = true;
}
public void write(int b) throws IOException {
if (needsReset) {
internalReset();
}
this.output.write(b);
}
public void write(byte[] b, int off, int len) throws IOException {
if (needsReset) {
internalReset();
}
this.output.write(b, off, len);
}
public void close() throws IOException {
if (needsReset) {
// In the case that nothing is written to this stream, we still need to
// write out the header before closing, otherwise the stream won't be
// recognized by BZip2CompressionInputStream.
internalReset();
}
this.output.flush();
this.output.close();
needsReset = true;
}
}// end of class BZip2CompressionOutputStream
/**
* This class is capable to de-compress BZip2 data in two modes;
* CONTINOUS and BYBLOCK. BYBLOCK mode makes it possible to
* do decompression starting any arbitrary position in the stream.
*
* So this facility can easily be used to parallelize decompression
* of a large BZip2 file for performance reasons. (It is exactly
* done so for Hadoop framework. See LineRecordReader for an
* example). So one can break the file (of course logically) into
* chunks for parallel processing. These "splits" should be like
* default Hadoop splits (e.g as in FileInputFormat getSplit metod).
* So this code is designed and tested for FileInputFormat's way
* of splitting only.
*/
private static class BZip2CompressionInputStream extends
SplitCompressionInputStream {
// class data starts here//
private CBZip2InputStream input;
boolean needsReset;
private BufferedInputStream bufferedIn;
private boolean isHeaderStripped = false;
private boolean isSubHeaderStripped = false;
private READ_MODE readMode = READ_MODE.CONTINUOUS;
private long startingPos = 0L;
// Following state machine handles different states of compressed stream
// position
// HOLD : Don't advertise compressed stream position
// ADVERTISE : Read 1 more character and advertise stream position
// See more comments about it before updatePos method.
private enum POS_ADVERTISEMENT_STATE_MACHINE {
HOLD, ADVERTISE
};
POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
long compressedStreamPosition = 0;
// class data ends here//
public BZip2CompressionInputStream(InputStream in) throws IOException {
this(in, 0L, Long.MAX_VALUE, READ_MODE.CONTINUOUS);
}
public BZip2CompressionInputStream(InputStream in, long start, long end,
READ_MODE readMode) throws IOException {
super(in, start, end);
needsReset = false;
bufferedIn = new BufferedInputStream(super.in);
this.startingPos = super.getPos();
this.readMode = readMode;
if (this.startingPos == 0) {
// We only strip header if it is start of file
bufferedIn = readStreamHeader();
}
input = new CBZip2InputStream(bufferedIn, readMode);
if (this.isHeaderStripped) {
input.updateReportedByteCount(HEADER_LEN);
}
if (this.isSubHeaderStripped) {
input.updateReportedByteCount(SUB_HEADER_LEN);
}
this.updatePos(false);
}
private BufferedInputStream readStreamHeader() throws IOException {
// We are flexible enough to allow the compressed stream not to
// start with the header of BZ. So it works fine either we have
// the header or not.
if (super.in != null) {
bufferedIn.mark(HEADER_LEN);
byte[] headerBytes = new byte[HEADER_LEN];
int actualRead = bufferedIn.read(headerBytes, 0, HEADER_LEN);
if (actualRead != -1) {
String header = new String(headerBytes);
if (header.compareTo(HEADER) != 0) {
bufferedIn.reset();
} else {
this.isHeaderStripped = true;
// In case of BYBLOCK mode, we also want to strip off
// remaining two character of the header.
if (this.readMode == READ_MODE.BYBLOCK) {
actualRead = bufferedIn.read(headerBytes, 0,
SUB_HEADER_LEN);
if (actualRead != -1) {
this.isSubHeaderStripped = true;
}
}
}
}
}
if (bufferedIn == null) {
throw new IOException("Failed to read bzip2 stream.");
}
return bufferedIn;
}// end of method
public void close() throws IOException {
if (!needsReset) {
input.close();
needsReset = true;
}
}
/**
* This method updates compressed stream position exactly when the
* client of this code has read off at least one byte passed any BZip2
* end of block marker.
*
* This mechanism is very helpful to deal with data level record
* boundaries. Please see constructor and next methods of
* org.apache.hadoop.mapred.LineRecordReader as an example usage of this
* feature. We elaborate it with an example in the following:
*
* Assume two different scenarios of the BZip2 compressed stream, where
* [m] represent end of block, \n is line delimiter and . represent compressed
* data.
*
* ............[m]......\n.......
*
* ..........\n[m]......\n.......
*
* Assume that end is right after [m]. In the first case the reading
* will stop at \n and there is no need to read one more line. (To see the
* reason of reading one more line in the next() method is explained in LineRecordReader.)
* While in the second example LineRecordReader needs to read one more line
* (till the second \n). Now since BZip2Codecs only update position
* at least one byte passed a maker, so it is straight forward to differentiate
* between the two cases mentioned.
*
*/
public int read(byte[] b, int off, int len) throws IOException {
if (needsReset) {
internalReset();
}
int result = 0;
result = this.input.read(b, off, len);
if (result == BZip2Constants.END_OF_BLOCK) {
this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
}
if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
result = this.input.read(b, off, off + 1);
// This is the precise time to update compressed stream position
// to the client of this code.
this.updatePos(true);
this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
}
return result;
}
public int read() throws IOException {
byte b[] = new byte[1];
int result = this.read(b, 0, 1);
return (result < 0) ? result : (b[0] & 0xff);
}
private void internalReset() throws IOException {
if (needsReset) {
needsReset = false;
BufferedInputStream bufferedIn = readStreamHeader();
input = new CBZip2InputStream(bufferedIn, this.readMode);
}
}
public void resetState() throws IOException {
// Cannot read from bufferedIn at this point because bufferedIn
// might not be ready
// yet, as in SequenceFile.Reader implementation.
needsReset = true;
}
public long getPos() {
return this.compressedStreamPosition;
}
/*
* As the comments before read method tell that
* compressed stream is advertised when at least
* one byte passed EOB have been read off. But
* there is an exception to this rule. When we
* construct the stream we advertise the position
* exactly at EOB. In the following method
* shouldAddOn boolean captures this exception.
*
*/
private void updatePos(boolean shouldAddOn) {
int addOn = shouldAddOn ? 1 : 0;
this.compressedStreamPosition = this.startingPos
+ this.input.getProcessedByteCount() + addOn;
}
}// end of BZip2CompressionInputStream
}