org.xbib.io.compress.xz.LZMA2InputStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of archive Show documentation
Archive algorithms
The newest version!
package org.xbib.io.compress.xz;

import org.xbib.io.compress.xz.lz.LZDecoder;
import org.xbib.io.compress.xz.lzma.LZMADecoder;
import org.xbib.io.compress.xz.rangecoder.RangeDecoder;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Decompresses a raw LZMA2 stream (no XZ headers).
 */
public class LZMA2InputStream extends InputStream {
    /**
     * Smallest valid LZMA2 dictionary size.
     * Very tiny dictionaries would be a performance problem, so
     * the minimum is 4 KiB.
     */
    public static final int DICT_SIZE_MIN = 4096;

    /**
     * Largest dictionary size supported by this implementation.
     * The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB.
     * This implementation supports only 16 bytes less than 2 GiB for raw
     * LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This
     * limitation is due to Java using signed 32-bit integers for array
     * indexing. The limitation shouldn't matter much in practice since so
     * huge dictionaries are not normally used.
     */
    public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;

    private static final int COMPRESSED_SIZE_MAX = 1 << 16;

    private DataInputStream in;

    private final LZDecoder lz;
    private final RangeDecoder rc = new RangeDecoder(COMPRESSED_SIZE_MAX);
    private LZMADecoder lzma;

    private int uncompressedSize = 0;
    private boolean isLZMAChunk;

    private boolean needDictReset = true;
    private boolean needProps = true;
    private boolean endReached = false;

    private IOException exception = null;

    /**
     * Gets approximate decompressor memory requirements as kibibytes for
     * the given dictionary size.
     *
     * @param dictSize LZMA2 dictionary size as bytes, must be
     *                 in the range [DICT_SIZE_MIN,
     *                 DICT_SIZE_MAX]
     * @return approximate memory requirements as kibibytes (KiB)
     */
    public static int getMemoryUsage(int dictSize) {
        // The base state is aroudn 30-40 KiB (probabilities etc.),
        // range decoder needs COMPRESSED_SIZE_MAX bytes for buffering,
        // and LZ decoder needs a dictionary buffer.
        return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024;
    }

    private static int getDictSize(int dictSize) {
        if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX) {
            throw new IllegalArgumentException(
                    "Unsupported dictionary size " + dictSize);
        }

        // Round dictionary size upward to a multiple of 16. This way LZMA
        // can use LZDecoder.getPos() for calculating LZMA's posMask.
        // Note that this check is needed only for raw LZMA2 streams; it is
        // redundant with .xz.
        return (dictSize + 15) & ~15;
    }

    /**
     * Creates a new input stream that decompresses raw LZMA2 data
     * from in.
     * The caller needs to know the dictionary size used when compressing;
     * the dictionary size isn't stored as part of a raw LZMA2 stream.
     * Specifying a too small dictionary size will prevent decompressing
     * the stream. Specifying a too big dictionary is waste of memory but
     * decompression will work.
     * There is no need to specify a dictionary bigger than
     * the uncompressed size of the data even if a bigger dictionary
     * was used when compressing. If you know the uncompressed size
     * of the data, this might allow saving some memory.
     *
     * @param in       input stream from which LZMA2-compressed
     *                 data is read
     * @param dictSize LZMA2 dictionary size as bytes, must be
     *                 in the range [DICT_SIZE_MIN,
     *                 DICT_SIZE_MAX]
     */
    public LZMA2InputStream(InputStream in, int dictSize) {
        this(in, dictSize, null);
    }

    /**
     * Creates a new LZMA2 decompressor using a preset dictionary.
     * This is like LZMAInputStream(InputStream, int) except
     * that the dictionary may be initialized using a preset dictionary.
     * If a preset dictionary was used when compressing the data, the
     * same preset dictionary must be provided when decompressing.
     *
     * @param in         input stream from which LZMA2-compressed
     *                   data is read
     * @param dictSize   LZMA2 dictionary size as bytes, must be
     *                   in the range [DICT_SIZE_MIN,
     *                   DICT_SIZE_MAX]
     * @param presetDict preset dictionary or null
     *                   to use no preset dictionary
     */
    public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) {
        // Check for null because otherwise null isn't detect
        // in this constructor.
        if (in == null) {
            throw new NullPointerException();
        }

        this.in = new DataInputStream(in);
        this.lz = new LZDecoder(getDictSize(dictSize), presetDict);

        if (presetDict != null && presetDict.length > 0) {
            needDictReset = false;
        }
    }

    /**
     * Decompresses the next byte from this input stream.
     * Reading lots of data with read() from this input stream
     * may be inefficient. Wrap it in java.io.BufferedInputStream
     * if you need to read lots of data one byte at a time.
     *
     * @return the next decompressed byte, or -1
     * to indicate the end of the compressed stream
     * @throws CorruptedInputException
     * @throws XZIOException           if the stream has been closed
     * @throws java.io.IOException     may be thrown by in
     */
    public int read() throws IOException {
        byte[] buf = new byte[1];
        return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
    }

    /**
     * Decompresses into an array of bytes.
     * If len is zero, no bytes are read and 0
     * is returned. Otherwise this will block until len
     * bytes have been decompressed, the end of LZMA2 stream is reached,
     * or an exception is thrown.
     *
     * @param buf target buffer for uncompressed data
     * @param off start offset in buf
     * @param len maximum number of uncompressed bytes to read
     * @return number of bytes read, or -1 to indicate
     * the end of the compressed stream
     * @throws CorruptedInputException
     * @throws XZIOException           if the stream has been closed
     * @throws java.io.IOException     may be thrown by in
     */
    public int read(byte[] buf, int off, int len) throws IOException {
        if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) {
            throw new IndexOutOfBoundsException();
        }

        if (len == 0) {
            return 0;
        }

        if (in == null) {
            throw new XZIOException("Stream closed");
        }

        if (exception != null) {
            throw exception;
        }

        if (endReached) {
            return -1;
        }

        try {
            int size = 0;

            while (len > 0) {
                if (uncompressedSize == 0) {
                    decodeChunkHeader();
                    if (endReached) {
                        return size == 0 ? -1 : size;
                    }
                }

                int copySizeMax = Math.min(uncompressedSize, len);

                if (!isLZMAChunk) {
                    lz.copyUncompressed(in, copySizeMax);
                } else {
                    lz.setLimit(copySizeMax);
                    lzma.decode();
                }

                int copiedSize = lz.flush(buf, off);
                off += copiedSize;
                len -= copiedSize;
                size += copiedSize;
                uncompressedSize -= copiedSize;

                if (uncompressedSize == 0) {
                    if (!rc.isFinished() || lz.hasPending()) {
                        throw new CorruptedInputException();
                    }
                }
            }

            return size;

        } catch (IOException e) {
            exception = e;
            throw e;
        }
    }

    private void decodeChunkHeader() throws IOException {
        int control = in.readUnsignedByte();

        if (control == 0x00) {
            endReached = true;
            return;
        }

        if (control >= 0xE0 || control == 0x01) {
            needProps = true;
            needDictReset = false;
            lz.reset();
        } else if (needDictReset) {
            throw new CorruptedInputException();
        }

        if (control >= 0x80) {
            isLZMAChunk = true;

            uncompressedSize = (control & 0x1F) << 16;
            uncompressedSize += in.readUnsignedShort() + 1;

            int compressedSize = in.readUnsignedShort() + 1;

            if (control >= 0xC0) {
                needProps = false;
                decodeProps();

            } else if (needProps) {
                throw new CorruptedInputException();

            } else if (control >= 0xA0) {
                lzma.reset();
            }

            rc.prepareInputBuffer(in, compressedSize);

        } else if (control > 0x02) {
            throw new CorruptedInputException();

        } else {
            isLZMAChunk = false;
            uncompressedSize = in.readUnsignedShort() + 1;
        }
    }

    private void decodeProps() throws IOException {
        int props = in.readUnsignedByte();

        if (props > (4 * 5 + 4) * 9 + 8) {
            throw new CorruptedInputException();
        }

        int pb = props / (9 * 5);
        props -= pb * 9 * 5;
        int lp = props / 9;
        int lc = props - lp * 9;

        if (lc + lp > 4) {
            throw new CorruptedInputException();
        }

        lzma = new LZMADecoder(lz, rc, lc, lp, pb);
    }

    /**
     * Returns the number of uncompressed bytes that can be read
     * without blocking. The value is returned with an assumption
     * that the compressed input data will be valid. If the compressed
     * data is corrupt, CorruptedInputException may get
     * thrown before the number of bytes claimed to be available have
     * been read from this input stream.
     * In LZMAInputStream, the return value will be non-zero when the
     * decompressor is in the middle of an LZMA2 chunk. The return value
     * will then be the number of uncompressed bytes remaining from that
     * chunk.
     *
     * @return the number of uncompressed bytes that can be read
     * without blocking
     */
    public int available() throws IOException {
        if (in == null) {
            throw new XZIOException("Stream closed");
        }

        if (exception != null) {
            throw exception;
        }

        return uncompressedSize;
    }

    /**
     * Closes the stream and calls in.close().
     * If the stream was already closed, this does nothing.
     *
     * @throws java.io.IOException if thrown by in.close()
     */
    public void close() throws IOException {
        if (in != null) {
            try {
                in.close();
            } finally {
                in = null;
            }
        }
    }
}