All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream Maven / Gradle / Ivy

Go to download

Apache Commons Compress software defines an API for working with compression and archive formats. These include: bzip2, gzip, pack200, lzma, xz, Snappy, traditional Unix Compress, DEFLATE, DEFLATE64, LZ4, Brotli, Zstandard and ar, cpio, jar, tar, zip, dump, 7z, arj.

There is a newer version: 1.26.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.snappy;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;

/**
 * CompressorInputStream for the raw Snappy format.
 *
 * 

This implementation uses an internal buffer in order to handle * the back-references that are at the heart of the LZ77 algorithm. * The size of the buffer must be at least as big as the biggest * offset used in the compressed stream. The current version of the * Snappy algorithm as defined by Google works on 32k blocks and * doesn't contain offsets bigger than 32k which is the default block * size used by this class.

* * @see Snappy compressed format description * @since 1.7 */ public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream { private enum State { NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE } /** Mask used to determine the type of "tag" is being processed */ private static final int TAG_MASK = 0x03; /** Default block size */ public static final int DEFAULT_BLOCK_SIZE = 32768; /** The size of the uncompressed data */ private final int size; /** Number of uncompressed bytes still to be read. */ private int uncompressedBytesRemaining; /** Current state of the stream */ private State state = State.NO_BLOCK; private boolean endReached; /** * Constructor using the default buffer size of 32k. * * @param is * An InputStream to read compressed data from * * @throws IOException if reading fails */ public SnappyCompressorInputStream(final InputStream is) throws IOException { this(is, DEFAULT_BLOCK_SIZE); } /** * Constructor using a configurable buffer size. * * @param is * An InputStream to read compressed data from * @param blockSize * The block size used in compression * * @throws IOException if reading fails * @throws IllegalArgumentException if blockSize is not bigger than 0 */ public SnappyCompressorInputStream(final InputStream is, final int blockSize) throws IOException { super(is, blockSize); uncompressedBytesRemaining = size = (int) readSize(); } /** * Try to fill the buffer with the next block of data. 
*/ private void fill() throws IOException { if (uncompressedBytesRemaining == 0) { endReached = true; return; } int b = readOneByte(); if (b == -1) { throw new IOException("Premature end of stream reading block start"); } int length = 0; int offset = 0; switch (b & TAG_MASK) { case 0x00: length = readLiteralLength(b); if (length < 0) { throw new IOException("Illegal block with a negative literal size found"); } uncompressedBytesRemaining -= length; startLiteral(length); state = State.IN_LITERAL; break; case 0x01: /* * These elements can encode lengths between [4..11] bytes and * offsets between [0..2047] bytes. (len-4) occupies three bits * and is stored in bits [2..4] of the tag byte. The offset * occupies 11 bits, of which the upper three are stored in the * upper three bits ([5..7]) of the tag byte, and the lower * eight are stored in a byte following the tag byte. */ length = 4 + ((b >> 2) & 0x07); uncompressedBytesRemaining -= length; offset = (b & 0xE0) << 3; b = readOneByte(); if (b == -1) { throw new IOException("Premature end of stream reading back-reference length"); } offset |= b; try { startBackReference(offset, length); } catch (final IllegalArgumentException ex) { throw new IOException("Illegal block with bad offset found", ex); } state = State.IN_BACK_REFERENCE; break; case 0x02: /* * These elements can encode lengths between [1..64] and offsets * from [0..65535]. (len-1) occupies six bits and is stored in * the upper six bits ([2..7]) of the tag byte. The offset is * stored as a little-endian 16-bit integer in the two bytes * following the tag byte. 
*/ length = (b >> 2) + 1; if (length < 0) { throw new IOException("Illegal block with a negative match length found"); } uncompressedBytesRemaining -= length; offset = (int) ByteUtils.fromLittleEndian(supplier, 2); try { startBackReference(offset, length); } catch (final IllegalArgumentException ex) { throw new IOException("Illegal block with bad offset found", ex); } state = State.IN_BACK_REFERENCE; break; case 0x03: /* * These are like the copies with 2-byte offsets (see previous * subsection), except that the offset is stored as a 32-bit * integer instead of a 16-bit integer (and thus will occupy * four bytes). */ length = (b >> 2) + 1; if (length < 0) { throw new IOException("Illegal block with a negative match length found"); } uncompressedBytesRemaining -= length; offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff; try { startBackReference(offset, length); } catch (final IllegalArgumentException ex) { throw new IOException("Illegal block with bad offset found", ex); } state = State.IN_BACK_REFERENCE; break; default: // impossible as TAG_MASK is two bits and all four possible cases have been covered break; } } /** * Get the uncompressed size of the stream * * @return the uncompressed size */ @Override public int getSize() { return size; } /** * {@inheritDoc} */ @Override public int read(final byte[] b, final int off, final int len) throws IOException { if (len == 0) { return 0; } if (endReached) { return -1; } switch (state) { case NO_BLOCK: fill(); return read(b, off, len); case IN_LITERAL: final int litLen = readLiteral(b, off, len); if (!hasMoreDataInBlock()) { state = State.NO_BLOCK; } return litLen > 0 ? litLen : read(b, off, len); case IN_BACK_REFERENCE: final int backReferenceLen = readBackReference(b, off, len); if (!hasMoreDataInBlock()) { state = State.NO_BLOCK; } return backReferenceLen > 0 ? 
backReferenceLen : read(b, off, len); default: throw new IOException("Unknown stream state " + state); } } /* * For literals up to and including 60 bytes in length, the * upper six bits of the tag byte contain (len-1). The literal * follows immediately thereafter in the bytestream. - For * longer literals, the (len-1) value is stored after the tag * byte, little-endian. The upper six bits of the tag byte * describe how many bytes are used for the length; 60, 61, 62 * or 63 for 1-4 bytes, respectively. The literal itself follows * after the length. */ private int readLiteralLength(final int b) throws IOException { final int length; switch (b >> 2) { case 60: length = readOneByte(); if (length == -1) { throw new IOException("Premature end of stream reading literal length"); } break; case 61: length = (int) ByteUtils.fromLittleEndian(supplier, 2); break; case 62: length = (int) ByteUtils.fromLittleEndian(supplier, 3); break; case 63: length = (int) ByteUtils.fromLittleEndian(supplier, 4); break; default: length = b >> 2; break; } return length + 1; } /** * The stream starts with the uncompressed length (up to a maximum of 2^32 - * 1), stored as a little-endian varint. Varints consist of a series of * bytes, where the lower 7 bits are data and the upper bit is set iff there * are more bytes to be read. In other words, an uncompressed length of 64 * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) * would be stored as 0xFE 0xFF 0x7F. * * @return The size of the uncompressed data * * @throws IOException * Could not read a byte */ private long readSize() throws IOException { int index = 0; long sz = 0; int b = 0; do { b = readOneByte(); if (b == -1) { throw new IOException("Premature end of stream reading size"); } sz |= (b & 0x7f) << (index++ * 7); } while (0 != (b & 0x80)); return sz; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy