org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of commons-compress Show documentation
Apache Commons Compress software defines an API for working with compression and archive formats. These include: bzip2, gzip, pack200, lzma, xz, Snappy, traditional Unix Compress, DEFLATE, DEFLATE64, LZ4, Brotli, Zstandard and ar, cpio, jar, tar, zip, dump, 7z, arj.
There is a newer version: 1.26.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.snappy;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;

/**
 * CompressorInputStream for the raw Snappy format.
 *
 * This implementation uses an internal buffer in order to handle
 * the back-references that are at the heart of the LZ77 algorithm.
 * The size of the buffer must be at least as big as the biggest
 * offset used in the compressed stream.  The current version of the
 * Snappy algorithm as defined by Google works on 32k blocks and
 * doesn't contain offsets bigger than 32k which is the default block
 * size used by this class.
 *
 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
 * @since 1.7
 */
public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {

    /** What kind of element, if any, {@link #read(byte[], int, int)} is currently draining. */
    private enum State {
        NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
    }

    /** Mask used to determine which type of "tag" is being processed */
    private static final int TAG_MASK = 0x03;

    /**
     * Maximum number of bytes accepted for the varint holding the uncompressed
     * size: the format limits the size to 2^32 - 1 and each varint byte carries
     * seven bits of payload.
     */
    private static final int MAX_SIZE_VARINT_BYTES = 5;

    /** Default block size */
    public static final int DEFAULT_BLOCK_SIZE = 32768;

    /** The size of the uncompressed data */
    private final int size;

    /** Number of uncompressed bytes still to be read. */
    private int uncompressedBytesRemaining;

    /** Current state of the stream */
    private State state = State.NO_BLOCK;

    /** Set by {@link #fill()} once all announced uncompressed bytes have been accounted for. */
    private boolean endReached;

    /**
     * Constructor using the default buffer size of 32k.
     *
     * @param is
     *            An InputStream to read compressed data from
     *
     * @throws IOException if reading fails
     */
    public SnappyCompressorInputStream(final InputStream is) throws IOException {
        this(is, DEFAULT_BLOCK_SIZE);
    }

    /**
     * Constructor using a configurable buffer size.
     *
     * <p>The uncompressed size is read from the head of the stream as part
     * of construction.</p>
     *
     * @param is
     *            An InputStream to read compressed data from
     * @param blockSize
     *            The block size used in compression
     *
     * @throws IOException if reading fails
     * @throws IllegalArgumentException if blockSize is not bigger than 0
     */
    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
            throws IOException {
        super(is, blockSize);
        // NOTE(review): the header may announce up to 2^32 - 1 bytes; values above
        // Integer.MAX_VALUE are narrowed by this cast (kept for backwards compatibility).
        uncompressedBytesRemaining = size = (int) readSize();
    }

    /**
     * Reads the tag of the next element and prepares the stream for
     * reading the literal or back-reference it describes.
     *
     * <p>Marks the end of the stream instead if no uncompressed bytes
     * remain to be read.</p>
     *
     * @throws IOException if the stream ends prematurely or the element
     *             is malformed
     */
    private void fill() throws IOException {
        if (uncompressedBytesRemaining == 0) {
            endReached = true;
            return;
        }

        final int tag = readOneByte();
        if (tag == -1) {
            throw new IOException("Premature end of stream reading block start");
        }
        int length = 0;
        int offset = 0;

        switch (tag & TAG_MASK) {

        case 0x00:

            length = readLiteralLength(tag);
            if (length < 0) {
                throw new IOException("Illegal block with a negative literal size found");
            }
            uncompressedBytesRemaining -= length;
            startLiteral(length);
            state = State.IN_LITERAL;
            break;

        case 0x01:

            /*
             * These elements can encode lengths between [4..11] bytes and
             * offsets between [0..2047] bytes. (len-4) occupies three bits
             * and is stored in bits [2..4] of the tag byte. The offset
             * occupies 11 bits, of which the upper three are stored in the
             * upper three bits ([5..7]) of the tag byte, and the lower
             * eight are stored in a byte following the tag byte.
             */

            length = 4 + ((tag >> 2) & 0x07);
            uncompressedBytesRemaining -= length;
            offset = (tag & 0xE0) << 3;
            final int lowOffsetByte = readOneByte();
            if (lowOffsetByte == -1) {
                throw new IOException("Premature end of stream reading back-reference length");
            }
            offset |= lowOffsetByte;

            startCopy(offset, length);
            break;

        case 0x02:

            /*
             * These elements can encode lengths between [1..64] and offsets
             * from [0..65535]. (len-1) occupies six bits and is stored in
             * the upper six bits ([2..7]) of the tag byte. The offset is
             * stored as a little-endian 16-bit integer in the two bytes
             * following the tag byte.
             */

            length = (tag >> 2) + 1;
            if (length < 0) {
                throw new IOException("Illegal block with a negative match length found");
            }
            uncompressedBytesRemaining -= length;

            offset = (int) ByteUtils.fromLittleEndian(supplier, 2);

            startCopy(offset, length);
            break;

        case 0x03:

            /*
             * These are like the copies with 2-byte offsets (see previous
             * subsection), except that the offset is stored as a 32-bit
             * integer instead of a 16-bit integer (and thus will occupy
             * four bytes).
             */

            length = (tag >> 2) + 1;
            if (length < 0) {
                throw new IOException("Illegal block with a negative match length found");
            }
            uncompressedBytesRemaining -= length;

            // the mask clears the sign bit so the offset is never negative
            offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;

            startCopy(offset, length);
            break;
        default:
            // impossible as TAG_MASK is two bits and all four possible cases have been covered
            break;
        }
    }

    /**
     * Starts a back-reference and switches the stream state accordingly.
     *
     * @param offset offset of the back-reference
     * @param length number of bytes the back-reference produces
     * @throws IOException if the back-reference is rejected with an
     *             {@code IllegalArgumentException}
     */
    private void startCopy(final int offset, final int length) throws IOException {
        try {
            startBackReference(offset, length);
        } catch (final IllegalArgumentException ex) {
            throw new IOException("Illegal block with bad offset found", ex);
        }
        state = State.IN_BACK_REFERENCE;
    }

    /**
     * Get the uncompressed size of the stream
     *
     * @return the uncompressed size
     */
    @Override
    public int getSize() {
        return size;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        // Iterates rather than recursing so that a long run of elements that
        // yield no bytes cannot exhaust the call stack.
        while (!endReached) {
            final int bytesRead;
            switch (state) {
            case NO_BLOCK:
                fill();
                continue;
            case IN_LITERAL:
                bytesRead = readLiteral(b, off, len);
                break;
            case IN_BACK_REFERENCE:
                bytesRead = readBackReference(b, off, len);
                break;
            default:
                throw new IOException("Unknown stream state " + state);
            }
            if (!hasMoreDataInBlock()) {
                state = State.NO_BLOCK;
            }
            if (bytesRead > 0) {
                return bytesRead;
            }
        }
        return -1;
    }

    /**
     * Determines the length of a literal from its tag byte, reading any
     * additional length bytes from the stream.
     *
     * <p>For literals up to and including 60 bytes in length, the
     * upper six bits of the tag byte contain (len-1). The literal
     * follows immediately thereafter in the bytestream. For
     * longer literals, the (len-1) value is stored after the tag
     * byte, little-endian. The upper six bits of the tag byte
     * describe how many bytes are used for the length; 60, 61, 62
     * or 63 for 1-4 bytes, respectively. The literal itself follows
     * after the length.</p>
     *
     * @param b the tag byte
     * @return the stored value plus one; the addition is performed in
     *         {@code int} arithmetic, so callers must check the result
     * @throws IOException if the stream ends while reading the length
     */
    private int readLiteralLength(final int b) throws IOException {
        final int length;
        switch (b >> 2) {
        case 60:
            length = readOneByte();
            if (length == -1) {
                throw new IOException("Premature end of stream reading literal length");
            }
            break;
        case 61:
            length = (int) ByteUtils.fromLittleEndian(supplier, 2);
            break;
        case 62:
            length = (int) ByteUtils.fromLittleEndian(supplier, 3);
            break;
        case 63:
            length = (int) ByteUtils.fromLittleEndian(supplier, 4);
            break;
        default:
            length = b >> 2;
            break;
        }

        return length + 1;
    }

    /**
     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
     * 1), stored as a little-endian varint. Varints consist of a series of
     * bytes, where the lower 7 bits are data and the upper bit is set iff there
     * are more bytes to be read. In other words, an uncompressed length of 64
     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
     * would be stored as 0xFE 0xFF 0x7F.
     *
     * @return The size of the uncompressed data
     *
     * @throws IOException
     *             Could not read a byte, or the varint is longer than
     *             five bytes
     */
    private long readSize() throws IOException {
        int index = 0;
        long sz = 0;
        int b = 0;

        do {
            if (index == MAX_SIZE_VARINT_BYTES) {
                throw new IOException("Illegal size found, varint is longer than "
                    + MAX_SIZE_VARINT_BYTES + " bytes");
            }
            b = readOneByte();
            if (b == -1) {
                throw new IOException("Premature end of stream reading size");
            }
            // widen before shifting: an int shift would overflow into the sign
            // bit for the fifth byte and wrap its shift distance beyond that
            sz |= (long) (b & 0x7f) << (index++ * 7);
        } while (0 != (b & 0x80));
        return sz;
    }
}