
org.xbib.io.compress.lzf.ChunkEncoder


package org.xbib.io.compress.lzf;

import java.io.IOException;
import java.io.OutputStream;

/**
 * Class that handles actual encoding of individual chunks. Resulting chunks can
 * be compressed or non-compressed; compression is only used if it actually
 * reduces chunk size (including overhead of additional header bytes)
 */
public final class ChunkEncoder {
    // Below a certain size we cannot gain from compressing; use 16 bytes as the cut-off

    private static final int MIN_BLOCK_TO_COMPRESS = 16;
    private static final int MIN_HASH_SIZE = 256;
    // Not much point in bigger tables, with 8k window
    private static final int MAX_HASH_SIZE = 16384;
    private static final int MAX_OFF = 1 << 13; // 8k
    private static final int MAX_REF = (1 << 8) + (1 << 3); // 264
    // Encoding tables etc.
    private final BufferRecycler bufferRecycler;
    /**
     * Hash table for lookups based on 3-byte sequences: the key is the hash
     * of such a triplet, the value its offset in the input buffer.
     */
    private int[] hashTable;
    private final int hashModulo;
    /**
     * Buffer in which encoded content is stored during processing
     */
    private byte[] encodeBuffer;
    /**
     * Small buffer passed to LZFChunk, needed for writing chunk header
     */
    private byte[] headerBuffer;

    /**
     * @param totalLength Total length of content to encode; used for
     *                    calculating the size of the hash table to use
     */
    public ChunkEncoder(int totalLength) {
        int largestChunkLen = Math.max(totalLength, LZFChunk.MAX_CHUNK_LEN);

        int suggestedHashLen = calcHashLen(largestChunkLen);
        bufferRecycler = BufferRecycler.instance();
        hashTable = bufferRecycler.allocEncodingHash(suggestedHashLen);
        hashModulo = hashTable.length - 1;
        // Worst-case output buffer length: all literals, plus one
        // length byte per run of up to 32 literals:
        int bufferLen = largestChunkLen + ((largestChunkLen + 31) >> 5);
        encodeBuffer = bufferRecycler.allocEncodingBuffer(bufferLen);
    }
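    /*
     * Worked example of the sizing above; a sketch assuming
     * LZFChunk.MAX_CHUNK_LEN == 0xFFFF (65535), which is typical for LZF:
     *
     *   largestChunkLen = 65535
     *   bufferLen       = 65535 + ((65535 + 31) >> 5)
     *                   = 65535 + 2048 = 67583
     *
     * i.e. room for a chunk that compresses to nothing but literal runs,
     * including one length byte per run of up to 32 literals.
     */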

    /**
     * Alternate constructor used when we want to avoid allocating the encoding
     * buffer, for cases where the caller wants full control over allocations.
     */
    private ChunkEncoder(int totalLength, boolean bogus) {
        int largestChunkLen = Math.max(totalLength, LZFChunk.MAX_CHUNK_LEN);
        int suggestedHashLen = calcHashLen(largestChunkLen);
        bufferRecycler = BufferRecycler.instance();
        hashTable = bufferRecycler.allocEncodingHash(suggestedHashLen);
        hashModulo = hashTable.length - 1;
        encodeBuffer = null;
    }

    public static ChunkEncoder nonAllocatingEncoder(int totalLength) {
        return new ChunkEncoder(totalLength, true);
    }
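    // Note on the factory above: an encoder built this way has no internal
    // encodeBuffer, so the convenience methods below would fail for inputs of
    // MIN_BLOCK_TO_COMPRESS bytes or more. The expectation (inferred from the
    // fields in this file, not documented here) is that such callers invoke
    // tryCompress(...) directly with an output buffer they manage themselves.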

    /**
     * Method to call once the encoder is no longer in use. Note: after calling
     * this method, further calls to {@link #encodeChunk} will fail
     */
    public void close() {
        byte[] buf = encodeBuffer;
        if (buf != null) {
            encodeBuffer = null;
            bufferRecycler.releaseEncodeBuffer(buf);
        }
        int[] ibuf = hashTable;
        if (ibuf != null) {
            hashTable = null;
            bufferRecycler.releaseEncodingHash(ibuf);
        }
    }

    /**
     * Method for compressing (or not) individual chunks
     */
    public LZFChunk encodeChunk(byte[] data, int offset, int len) {
        if (len >= MIN_BLOCK_TO_COMPRESS) {
            /* If we have non-trivial block, and can compress it by at least
             * 2 bytes (since header is 2 bytes longer), let's compress:
             */
            int compLen = tryCompress(data, offset, offset + len, encodeBuffer, 0);
            if (compLen < (len - 2)) { // compression saved at least 2 bytes (the extra header cost); use it
                return LZFChunk.createCompressed(len, encodeBuffer, 0, compLen);
            }
        }
        // Otherwise leave uncompressed:
        return LZFChunk.createNonCompressed(data, offset, len);
    }
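    /*
     * A worked example of the trade-off above (hypothetical sizes, not taken
     * from the library): for len = 100, the compressed form is used only when
     * compLen <= 97, because the compressed chunk header is 2 bytes longer
     * than the non-compressed one; at compLen = 98 the total sizes would tie,
     * so the plain copy wins.
     */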

    /**
     * Method for encoding an individual chunk and writing it to the given
     * output stream.
     */
    public void encodeAndWriteChunk(byte[] data, int offset, int len, OutputStream out)
            throws IOException {
        byte[] headerBuf = headerBuffer;
        if (headerBuf == null) {
            headerBuffer = headerBuf = new byte[LZFChunk.MAX_HEADER_LEN];
        }
        if (len >= MIN_BLOCK_TO_COMPRESS) {
            /* If we have non-trivial block, and can compress it by at least
             * 2 bytes (since header is 2 bytes longer), let's compress:
             */
            int compLen = tryCompress(data, offset, offset + len, encodeBuffer, 0);
            if (compLen < (len - 2)) { // compression saved at least 2 bytes; write compressed chunk
                LZFChunk.writeCompressedHeader(len, compLen, out, headerBuf);
                out.write(encodeBuffer, 0, compLen);
                return;
            }
        }
        // Otherwise leave uncompressed:
        LZFChunk.writeNonCompressedHeader(len, out, headerBuf);
        out.write(data, offset, len);
    }

    /**
     * Main workhorse method that tries to compress the given chunk and returns
     * the end position in the output buffer (offset of the byte after the last
     * byte written)
     */
    protected int tryCompress(byte[] in, int inPos, int inEnd, byte[] out, int outPos) {
        final int[] hashTable = this.hashTable;
        ++outPos;
        int seen = first(in, inPos); // first 2 bytes of the rolling 3-byte window; newest byte in the LSB
        int literals = 0;
        inEnd -= 4;
        final int firstPos = inPos; // so that we won't have back references across block boundary

        while (inPos < inEnd) {
            byte p2 = in[inPos + 2];
            // next
            seen = (seen << 8) + (p2 & 255);
            int off = hash(seen);
            int ref = hashTable[off];
            hashTable[off] = inPos;

            // First expected common case: no back-ref (for whatever reason)
            if (ref >= inPos // can't refer forward (i.e. leftovers)
                    || ref < firstPos // or to previous block
                    || (off = inPos - ref) > MAX_OFF
                    || in[ref + 2] != p2 // must match hash
                    || in[ref + 1] != (byte) (seen >> 8)
                    || in[ref] != (byte) (seen >> 16)) {
                out[outPos++] = in[inPos++];
                literals++;
                if (literals == LZFChunk.MAX_LITERAL) {
                    out[outPos - 33] = (byte) 31; // back-patch the run length: out[outPos - literals - 1] = MAX_LITERAL - 1
                    literals = 0;
                    outPos++;
                }
                continue;
            }
            // match
            int maxLen = inEnd - inPos + 2;
            if (maxLen > MAX_REF) {
                maxLen = MAX_REF;
            }
            if (literals == 0) {
                outPos--;
            } else {
                out[outPos - literals - 1] = (byte) (literals - 1);
                literals = 0;
            }
            int len = 3;
            while (len < maxLen && in[ref + len] == in[inPos + len]) {
                len++;
            }
            len -= 2;
            --off; // was off by one earlier
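            /*
             * Worked example of the encoding below: after the adjustments
             * above, a 7-byte match at distance 100 has len = 5, off = 99.
             * Short form (len < 7): first byte = (99 >> 8) + (5 << 5) = 0xA0,
             * second byte = (byte) 99 = 0x63. Matches of 9+ bytes (len >= 7)
             * take the long form with an extra (len - 7) byte.
             */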
            if (len < 7) {
                out[outPos++] = (byte) ((off >> 8) + (len << 5));
            } else {
                out[outPos++] = (byte) ((off >> 8) + (7 << 5));
                out[outPos++] = (byte) (len - 7);
            }
            out[outPos++] = (byte) off;
            outPos++; // reserve a byte for the next literal run-length marker
            inPos += len;
            seen = first(in, inPos);
            seen = (seen << 8) + (in[inPos + 2] & 255);
            hashTable[hash(seen)] = inPos;
            ++inPos;
            seen = (seen << 8) + (in[inPos + 2] & 255); // hash = next(hash, in, inPos);
            hashTable[hash(seen)] = inPos;
            ++inPos;
        }
        // handle the tail in a separate method, to keep this hot loop compact
        return handleTail(in, inPos, inEnd + 4, out, outPos, literals);
    }

    private int handleTail(byte[] in, int inPos, int inEnd, byte[] out, int outPos,
                                 int literals) {
        while (inPos < inEnd) {
            out[outPos++] = in[inPos++];
            literals++;
            if (literals == LZFChunk.MAX_LITERAL) {
                out[outPos - literals - 1] = (byte) (literals - 1);
                literals = 0;
                outPos++;
            }
        }
        out[outPos - literals - 1] = (byte) (literals - 1);
        if (literals == 0) {
            outPos--;
        }
        return outPos;
    }

    private static int calcHashLen(int chunkSize) {
        // in general, aim for a hash table about 2x the input size
        chunkSize += chunkSize;
        // but no larger than max size:
        if (chunkSize >= MAX_HASH_SIZE) {
            return MAX_HASH_SIZE;
        }
        // otherwise just need to round up to nearest 2x
        int hashLen = MIN_HASH_SIZE;
        while (hashLen < chunkSize) {
            hashLen += hashLen;
        }
        return hashLen;
    }
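    /*
     * Example of the sizing above: chunkSize = 3000 doubles to 6000, and the
     * loop grows hashLen 256 -> 512 -> 1024 -> 2048 -> 4096 -> 8192, the first
     * power of two >= 6000. Anything of 8k or larger (the full back-reference
     * window) caps out at MAX_HASH_SIZE = 16384.
     */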

    private int first(byte[] in, int inPos) {
        // first 2 bytes, big-endian; sign extension from in[inPos] is harmless,
        // since consumers mask the hash or cast comparisons back down to byte
        return (in[inPos] << 8) + (in[inPos + 1] & 0xFF);
    }
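    /*
     * Note for the hash below: the table length is always a power of two (see
     * calcHashLen), so hashModulo == hashTable.length - 1 acts as a bit mask
     * equivalent to (value % hashTable.length).
     */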

    private int hash(int h) {
        return ((h * 57321) >> 9) & hashModulo;
    }
}
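
A minimal usage sketch, not part of the source above: it exercises only members visible in this file (the ChunkEncoder(int) constructor, encodeAndWriteChunk and close) plus standard java.io; the class name and payload are made up for illustration.

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

import org.xbib.io.compress.lzf.ChunkEncoder;

public class ChunkEncoderExample {
    public static void main(String[] args) throws Exception {
        // Repetitive payload, comfortably above MIN_BLOCK_TO_COMPRESS (16 bytes)
        byte[] input = "abcabcabcabcabcabcabcabcabcabcabcabc"
                .getBytes(StandardCharsets.US_ASCII);
        ChunkEncoder encoder = new ChunkEncoder(input.length);
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            // Writes one LZF chunk: compressed if that saves at least 2 bytes,
            // otherwise a non-compressed chunk with the data copied verbatim.
            encoder.encodeAndWriteChunk(input, 0, input.length, out);
        } finally {
            encoder.close(); // return pooled buffers to the BufferRecycler
        }
        System.out.println(input.length + " bytes in, "
                + out.size() + " chunk bytes out");
    }
}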



