org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-apache Show documentation
Show all versions of hadoop-apache Show documentation
Shaded version of Apache Hadoop for Presto
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress.zlib;
import java.io.IOException;
import java.util.zip.Checksum;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.DoNotPool;
import org.apache.hadoop.util.DataChecksum;
/**
* A {@link Decompressor} based on the popular gzip compressed file format.
* http://www.gzip.org/
*
*/
@DoNotPool
public class BuiltInGzipDecompressor implements Decompressor {
private static final int GZIP_MAGIC_ID = 0x8b1f; // if read as LE short int
private static final int GZIP_DEFLATE_METHOD = 8;
private static final int GZIP_FLAGBIT_HEADER_CRC = 0x02;
private static final int GZIP_FLAGBIT_EXTRA_FIELD = 0x04;
private static final int GZIP_FLAGBIT_FILENAME = 0x08;
private static final int GZIP_FLAGBIT_COMMENT = 0x10;
private static final int GZIP_FLAGBITS_RESERVED = 0xe0;
// 'true' (nowrap) => Inflater will handle raw deflate stream only
private Inflater inflater = new Inflater(true);
private byte[] userBuf = null;
private int userBufOff = 0;
private int userBufLen = 0;
private byte[] localBuf = new byte[256];
private int localBufOff = 0;
private int headerBytesRead = 0;
private int trailerBytesRead = 0;
private int numExtraFieldBytesRemaining = -1;
private Checksum crc = DataChecksum.newCrc32();
private boolean hasExtraField = false;
private boolean hasFilename = false;
private boolean hasComment = false;
private boolean hasHeaderCRC = false;
private GzipStateLabel state;
/**
* The current state of the gzip decoder, external to the Inflater context.
* (Technically, the private variables localBuf through hasHeaderCRC are
* also part of the state, so this enum is merely the label for it.)
*/
private enum GzipStateLabel {
/**
* Immediately prior to or (strictly) within the 10-byte basic gzip header.
*/
HEADER_BASIC,
/**
* Immediately prior to or within the optional "extra field."
*/
HEADER_EXTRA_FIELD,
/**
* Immediately prior to or within the optional filename field.
*/
HEADER_FILENAME,
/**
* Immediately prior to or within the optional comment field.
*/
HEADER_COMMENT,
/**
* Immediately prior to or within the optional 2-byte header CRC value.
*/
HEADER_CRC,
/**
* Immediately prior to or within the main compressed (deflate) data stream.
*/
DEFLATE_STREAM,
/**
* Immediately prior to or (strictly) within the 4-byte uncompressed CRC.
*/
TRAILER_CRC,
/**
* Immediately prior to or (strictly) within the 4-byte uncompressed size.
*/
TRAILER_SIZE,
/**
* Immediately after the trailer (and potentially prior to the next gzip
* member/substream header), without reset() having been called.
*/
FINISHED;
}
/**
* Creates a new (pure Java) gzip decompressor.
*/
public BuiltInGzipDecompressor() {
state = GzipStateLabel.HEADER_BASIC;
crc.reset();
// FIXME? Inflater docs say: 'it is also necessary to provide an extra
// "dummy" byte as input. This is required by the ZLIB native
// library in order to support certain optimizations.' However,
// this does not appear to be true, and in any case, it's not
// entirely clear where the byte should go or what its value
// should be. Perhaps it suffices to have some deflated bytes
// in the first buffer load? (But how else would one do it?)
}
@Override
public synchronized boolean needsInput() {
if (state == GzipStateLabel.DEFLATE_STREAM) { // most common case
return inflater.needsInput();
}
// see userBufLen comment at top of decompress(); currently no need to
// verify userBufLen <= 0
return (state != GzipStateLabel.FINISHED);
}
/** {@inheritDoc} */
/*
* In our case, the input data includes both gzip header/trailer bytes (which
* we handle in executeState()) and deflate-stream bytes (which we hand off
* to Inflater).
*
* NOTE: This code assumes the data passed in via b[] remains unmodified
* until _we_ signal that it's safe to modify it (via needsInput()).
* The alternative would require an additional buffer-copy even for
* the bulk deflate stream, which is a performance hit we don't want
* to absorb. (Decompressor now documents this requirement.)
*/
@Override
public synchronized void setInput(byte[] b, int off, int len) {
if (b == null) {
throw new NullPointerException();
}
if (off < 0 || len < 0 || off > b.length - len) {
throw new ArrayIndexOutOfBoundsException();
}
userBuf = b;
userBufOff = off;
userBufLen = len; // note: might be zero
}
/**
* Decompress the data (gzip header, deflate stream, gzip trailer) in the
* provided buffer.
*
* @return the number of decompressed bytes placed into b
*/
/* From the caller's perspective, this is where the state machine lives.
* The code is written such that we never return from decompress() with
* data remaining in userBuf unless we're in FINISHED state and there was
* data beyond the current gzip member (e.g., we're within a concatenated
* gzip stream). If this ever changes, {@link #needsInput()} will also
* need to be modified (i.e., uncomment the userBufLen condition).
*
* The actual deflate-stream processing (decompression) is handled by
* Java's Inflater class. Unlike the gzip header/trailer code (execute*
* methods below), the deflate stream is never copied; Inflater operates
* directly on the user's buffer.
*/
@Override
public synchronized int decompress(byte[] b, int off, int len)
throws IOException {
int numAvailBytes = 0;
if (state != GzipStateLabel.DEFLATE_STREAM) {
executeHeaderState();
if (userBufLen <= 0) {
return numAvailBytes;
}
}
// "executeDeflateStreamState()"
if (state == GzipStateLabel.DEFLATE_STREAM) {
// hand off user data (or what's left of it) to Inflater--but note that
// Inflater may not have consumed all of previous bufferload (e.g., if
// data highly compressed or output buffer very small), in which case
// userBufLen will be zero
if (userBufLen > 0) {
inflater.setInput(userBuf, userBufOff, userBufLen);
userBufOff += userBufLen;
userBufLen = 0;
}
// now decompress it into b[]
try {
numAvailBytes = inflater.inflate(b, off, len);
} catch (DataFormatException dfe) {
throw new IOException(dfe.getMessage());
}
crc.update(b, off, numAvailBytes); // CRC-32 is on _uncompressed_ data
if (inflater.finished()) {
state = GzipStateLabel.TRAILER_CRC;
int bytesRemaining = inflater.getRemaining();
assert (bytesRemaining >= 0) :
"logic error: Inflater finished; byte-count is inconsistent";
// could save a copy of userBufLen at call to inflater.setInput() and
// verify that bytesRemaining <= origUserBufLen, but would have to
// be a (class) member variable...seems excessive for a sanity check
userBufOff -= bytesRemaining;
userBufLen = bytesRemaining; // or "+=", but guaranteed 0 coming in
} else {
return numAvailBytes; // minor optimization
}
}
executeTrailerState();
return numAvailBytes;
}
/**
* Parse the gzip header (assuming we're in the appropriate state).
* In order to deal with degenerate cases (e.g., user buffer is one byte
* long), we copy (some) header bytes to another buffer. (Filename,
* comment, and extra-field bytes are simply skipped.)
*
* See http://www.ietf.org/rfc/rfc1952.txt for the gzip spec. Note that
* no version of gzip to date (at least through 1.4.0, 2010-01-20) supports
* the FHCRC header-CRC16 flagbit; instead, the implementation treats it
* as a multi-file continuation flag (which it also doesn't support). :-(
* Sun's JDK v6 (1.6) supports the header CRC, however, and so do we.
*/
private void executeHeaderState() throws IOException {
// this can happen because DecompressorStream's decompress() is written
// to call decompress() first, setInput() second:
if (userBufLen <= 0) {
return;
}
// "basic"/required header: somewhere in first 10 bytes
if (state == GzipStateLabel.HEADER_BASIC) {
int n = Math.min(userBufLen, 10-localBufOff); // (or 10-headerBytesRead)
checkAndCopyBytesToLocal(n); // modifies userBufLen, etc.
if (localBufOff >= 10) { // should be strictly ==
processBasicHeader(); // sig, compression method, flagbits
localBufOff = 0; // no further need for basic header
state = GzipStateLabel.HEADER_EXTRA_FIELD;
}
}
if (userBufLen <= 0) {
return;
}
// optional header stuff (extra field, filename, comment, header CRC)
if (state == GzipStateLabel.HEADER_EXTRA_FIELD) {
if (hasExtraField) {
// 2 substates: waiting for 2 bytes => get numExtraFieldBytesRemaining,
// or already have 2 bytes & waiting to finish skipping specified length
if (numExtraFieldBytesRemaining < 0) {
int n = Math.min(userBufLen, 2-localBufOff);
checkAndCopyBytesToLocal(n);
if (localBufOff >= 2) {
numExtraFieldBytesRemaining = readUShortLE(localBuf, 0);
localBufOff = 0;
}
}
if (numExtraFieldBytesRemaining > 0 && userBufLen > 0) {
int n = Math.min(userBufLen, numExtraFieldBytesRemaining);
checkAndSkipBytes(n); // modifies userBufLen, etc.
numExtraFieldBytesRemaining -= n;
}
if (numExtraFieldBytesRemaining == 0) {
state = GzipStateLabel.HEADER_FILENAME;
}
} else {
state = GzipStateLabel.HEADER_FILENAME;
}
}
if (userBufLen <= 0) {
return;
}
if (state == GzipStateLabel.HEADER_FILENAME) {
if (hasFilename) {
boolean doneWithFilename = checkAndSkipBytesUntilNull();
if (!doneWithFilename) {
return; // exit early: used up entire buffer without hitting NULL
}
}
state = GzipStateLabel.HEADER_COMMENT;
}
if (userBufLen <= 0) {
return;
}
if (state == GzipStateLabel.HEADER_COMMENT) {
if (hasComment) {
boolean doneWithComment = checkAndSkipBytesUntilNull();
if (!doneWithComment) {
return; // exit early: used up entire buffer
}
}
state = GzipStateLabel.HEADER_CRC;
}
if (userBufLen <= 0) {
return;
}
if (state == GzipStateLabel.HEADER_CRC) {
if (hasHeaderCRC) {
assert (localBufOff < 2);
int n = Math.min(userBufLen, 2-localBufOff);
copyBytesToLocal(n);
if (localBufOff >= 2) {
long headerCRC = readUShortLE(localBuf, 0);
if (headerCRC != (crc.getValue() & 0xffff)) {
throw new IOException("gzip header CRC failure");
}
localBufOff = 0;
crc.reset();
state = GzipStateLabel.DEFLATE_STREAM;
}
} else {
crc.reset(); // will reuse for CRC-32 of uncompressed data
state = GzipStateLabel.DEFLATE_STREAM; // switching to Inflater now
}
}
}
/**
* Parse the gzip trailer (assuming we're in the appropriate state).
* In order to deal with degenerate cases (e.g., user buffer is one byte
* long), we copy trailer bytes (all 8 of 'em) to a local buffer.
*
* See http://www.ietf.org/rfc/rfc1952.txt for the gzip spec.
*/
private void executeTrailerState() throws IOException {
if (userBufLen <= 0) {
return;
}
// verify that the CRC-32 of the decompressed stream matches the value
// stored in the gzip trailer
if (state == GzipStateLabel.TRAILER_CRC) {
// localBuf was empty before we handed off to Inflater, so we handle this
// exactly like header fields
assert (localBufOff < 4); // initially 0, but may need multiple calls
int n = Math.min(userBufLen, 4-localBufOff);
copyBytesToLocal(n);
if (localBufOff >= 4) {
long streamCRC = readUIntLE(localBuf, 0);
if (streamCRC != crc.getValue()) {
throw new IOException("gzip stream CRC failure");
}
localBufOff = 0;
crc.reset();
state = GzipStateLabel.TRAILER_SIZE;
}
}
if (userBufLen <= 0) {
return;
}
// verify that the mod-2^32 decompressed stream size matches the value
// stored in the gzip trailer
if (state == GzipStateLabel.TRAILER_SIZE) {
assert (localBufOff < 4); // initially 0, but may need multiple calls
int n = Math.min(userBufLen, 4-localBufOff);
copyBytesToLocal(n); // modifies userBufLen, etc.
if (localBufOff >= 4) { // should be strictly ==
long inputSize = readUIntLE(localBuf, 0);
if (inputSize != (inflater.getBytesWritten() & 0xffffffffL)) {
throw new IOException(
"stored gzip size doesn't match decompressed size");
}
localBufOff = 0;
state = GzipStateLabel.FINISHED;
}
}
if (state == GzipStateLabel.FINISHED) {
return;
}
}
/**
* Returns the total number of compressed bytes input so far, including
* gzip header/trailer bytes.
*
* @return the total (non-negative) number of compressed bytes read so far
*/
public synchronized long getBytesRead() {
return headerBytesRead + inflater.getBytesRead() + trailerBytesRead;
}
/**
* Returns the number of bytes remaining in the input buffer; normally
* called when finished() is true to determine amount of post-gzip-stream
* data. Note that, other than the finished state with concatenated data
* after the end of the current gzip stream, this will never return a
* non-zero value unless called after {@link #setInput(byte[] b, int off,
* int len)} and before {@link #decompress(byte[] b, int off, int len)}.
* (That is, after {@link #decompress(byte[] b, int off, int len)} it
* always returns zero, except in finished state with concatenated data.)
*
* @return the total (non-negative) number of unprocessed bytes in input
*/
@Override
public synchronized int getRemaining() {
return userBufLen;
}
@Override
public synchronized boolean needsDictionary() {
return inflater.needsDictionary();
}
@Override
public synchronized void setDictionary(byte[] b, int off, int len) {
inflater.setDictionary(b, off, len);
}
/**
* Returns true if the end of the gzip substream (single "member") has been
* reached.
*/
@Override
public synchronized boolean finished() {
return (state == GzipStateLabel.FINISHED);
}
/**
* Resets everything, including the input buffer, regardless of whether the
* current gzip substream is finished.
*/
@Override
public synchronized void reset() {
// could optionally emit INFO message if state != GzipStateLabel.FINISHED
inflater.reset();
state = GzipStateLabel.HEADER_BASIC;
crc.reset();
userBufOff = userBufLen = 0;
localBufOff = 0;
headerBytesRead = 0;
trailerBytesRead = 0;
numExtraFieldBytesRemaining = -1;
hasExtraField = false;
hasFilename = false;
hasComment = false;
hasHeaderCRC = false;
}
@Override
public synchronized void end() {
inflater.end();
}
/**
* Check ID bytes (throw if necessary), compression method (throw if not 8),
* and flag bits (set hasExtraField, hasFilename, hasComment, hasHeaderCRC).
* Ignore MTIME, XFL, OS. Caller must ensure we have at least 10 bytes (at
* the start of localBuf).
*/
/*
* Flag bits (remainder are reserved and must be zero):
* bit 0 FTEXT
* bit 1 FHCRC (never implemented in gzip, at least through version
* 1.4.0; instead interpreted as "continuation of multi-
* part gzip file," which is unsupported through 1.4.0)
* bit 2 FEXTRA
* bit 3 FNAME
* bit 4 FCOMMENT
* [bit 5 encrypted]
*/
private void processBasicHeader() throws IOException {
if (readUShortLE(localBuf, 0) != GZIP_MAGIC_ID) {
throw new IOException("not a gzip file");
}
if (readUByte(localBuf, 2) != GZIP_DEFLATE_METHOD) {
throw new IOException("gzip data not compressed with deflate method");
}
int flg = readUByte(localBuf, 3);
if ((flg & GZIP_FLAGBITS_RESERVED) != 0) {
throw new IOException("unknown gzip format (reserved flagbits set)");
}
hasExtraField = ((flg & GZIP_FLAGBIT_EXTRA_FIELD) != 0);
hasFilename = ((flg & GZIP_FLAGBIT_FILENAME) != 0);
hasComment = ((flg & GZIP_FLAGBIT_COMMENT) != 0);
hasHeaderCRC = ((flg & GZIP_FLAGBIT_HEADER_CRC) != 0);
}
private void checkAndCopyBytesToLocal(int len) {
System.arraycopy(userBuf, userBufOff, localBuf, localBufOff, len);
localBufOff += len;
// alternatively, could call checkAndSkipBytes(len) for rest...
crc.update(userBuf, userBufOff, len);
userBufOff += len;
userBufLen -= len;
headerBytesRead += len;
}
private void checkAndSkipBytes(int len) {
crc.update(userBuf, userBufOff, len);
userBufOff += len;
userBufLen -= len;
headerBytesRead += len;
}
// returns true if saw NULL, false if ran out of buffer first; called _only_
// during gzip-header processing (not trailer)
// (caller can check before/after state of userBufLen to compute num bytes)
private boolean checkAndSkipBytesUntilNull() {
boolean hitNull = false;
if (userBufLen > 0) {
do {
hitNull = (userBuf[userBufOff] == 0);
crc.update(userBuf[userBufOff]);
++userBufOff;
--userBufLen;
++headerBytesRead;
} while (userBufLen > 0 && !hitNull);
}
return hitNull;
}
// this one doesn't update the CRC and does support trailer processing but
// otherwise is same as its "checkAnd" sibling
private void copyBytesToLocal(int len) {
System.arraycopy(userBuf, userBufOff, localBuf, localBufOff, len);
localBufOff += len;
userBufOff += len;
userBufLen -= len;
if (state == GzipStateLabel.TRAILER_CRC ||
state == GzipStateLabel.TRAILER_SIZE) {
trailerBytesRead += len;
} else {
headerBytesRead += len;
}
}
private int readUByte(byte[] b, int off) {
return ((int)b[off] & 0xff);
}
// caller is responsible for not overrunning buffer
private int readUShortLE(byte[] b, int off) {
return ((((b[off+1] & 0xff) << 8) |
((b[off] & 0xff) )) & 0xffff);
}
// caller is responsible for not overrunning buffer
private long readUIntLE(byte[] b, int off) {
return ((((long)(b[off+3] & 0xff) << 24) |
((long)(b[off+2] & 0xff) << 16) |
((long)(b[off+1] & 0xff) << 8) |
((long)(b[off] & 0xff) )) & 0xffffffffL);
}
}