/*
* Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
* and the EPL 1.0 (http://h2database.com/html/license.html).
*
* This code is based on the LZF algorithm from Marc Lehmann. It is a
* re-implementation of the C code:
* http://cvs.schmorp.de/liblzf/lzf_c.c?view=markup
* http://cvs.schmorp.de/liblzf/lzf_d.c?view=markup
*
* According to a mail from Marc Lehmann, it's OK to use his algorithm:
* Date: 2010-07-15 15:57
* Subject: Re: Question about LZF licensing
* ...
* The algorithm is not copyrighted (and cannot be copyrighted afaik) - as long
* as you wrote everything yourself, without copying my code, that's just fine
* (looking is of course fine too).
* ...
*
* Still I would like to keep his copyright info:
*
* Copyright (c) 2000-2005 Marc Alexander Lehmann
* Copyright (c) 2005 Oren J. Maurice
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.h2.compress;
import java.nio.ByteBuffer;
/**
* This class implements the LZF lossless data compression algorithm. LZF is a
* Lempel-Ziv variant with byte-aligned output, optimized for speed.
*
* Safety/Use Notes:
*
* - Each instance should be used by a single thread only.
* - The data buffers should be smaller than 1 GB.
* - For performance reasons, safety checks on expansion are omitted.
* - Invalid compressed data can cause an ArrayIndexOutOfBoundsException.
*
* The LZF compressed format knows literal runs and back-references:
*
* - Literal run: directly copy bytes from input to output.
* - Back-reference: copy previously seen data to the output, given an offset
* back from the current location and a length. The length is at least 3 bytes.
*
* The first byte of the compressed stream is the control byte. For literal
* runs, the highest three bits of the control byte are not set; the lower
* bits are the literal run length minus one, and the next bytes are data to
* copy directly into the output. For back-references, the highest three bits
* of the control byte are the back-reference length minus two. If all three
* bits are set, the next byte is added to the length. The lower bits of the
* control byte combined with the next byte form the offset of the
* back-reference.
*/
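/*
* A worked example of the encoding described above (the byte values are
* illustrative only, not taken from any particular input):
*
* 0x02 'a' 'b' 'c' : control byte 0x02 is a literal run of 0x02 + 1 = 3
* bytes, followed by the three literal bytes to copy.
*
* 0x40 0x09 : the high three bits of 0x40 are 2, so the match length is
* 2 + 2 = 4; the low five bits (0) combined with the next byte (0x09) give
* offset 9, i.e. copy 4 bytes starting 9 + 1 = 10 positions back in the
* already decompressed output.
*
* 0xE0 0x01 0x09 : the high three bits are all set (7), so the next byte
* (0x01) is added to the length, giving 7 + 1 + 2 = 10 bytes, again copied
* from 10 positions back.
*/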
public final class CompressLZF implements Compressor {
/**
* The number of entries in the hash table. The size is a trade-off between
* hash collisions (reduced compression) and speed (amount that fits in CPU
* cache).
*/
private static final int HASH_SIZE = 1 << 14;
/**
* The maximum number of literals in a chunk (32).
*/
private static final int MAX_LITERAL = 1 << 5;
/**
* The maximum offset allowed for a back-reference (8192).
*/
private static final int MAX_OFF = 1 << 13;
/**
* The maximum back-reference length (264).
*/
private static final int MAX_REF = (1 << 8) + (1 << 3);
/**
* Hash table for matching byte sequences (reused for performance).
*/
private int[] cachedHashTable;
@Override
public void setOptions(String options) {
// nothing to do
}
/**
* Combine the two bytes at the given index and at index + 1 into one int,
* with the byte at the index in the more significant position.
*/
private static int first(byte[] in, int inPos) {
return (in[inPos] << 8) | (in[inPos + 1] & 255);
}
/**
* Combine the two bytes at the given index and at index + 1 into one int,
* with the byte at the index in the more significant position.
*/
private static int first(ByteBuffer in, int inPos) {
return (in.get(inPos) << 8) | (in.get(inPos + 1) & 255);
}
/**
* Shift the value 1 byte left, and add the byte at index inPos+2.
*/
private static int next(int v, byte[] in, int inPos) {
return (v << 8) | (in[inPos + 2] & 255);
}
/**
* Shift the value 1 byte left, and add the byte at index inPos+2.
*/
private static int next(int v, ByteBuffer in, int inPos) {
return (v << 8) | (in.get(inPos + 2) & 255);
}
/**
* Compute the address in the hash table.
*/
private static int hash(int h) {
return ((h * 2777) >> 9) & (HASH_SIZE - 1);
}
@Override
public int compress(byte[] in, int inLen, byte[] out, int outPos) {
int inPos = 0;
if (cachedHashTable == null) {
cachedHashTable = new int[HASH_SIZE];
}
int[] hashTab = cachedHashTable;
int literals = 0;
outPos++;
int future = first(in, 0);
while (inPos < inLen - 4) {
byte p2 = in[inPos + 2];
// next
future = (future << 8) + (p2 & 255);
int off = hash(future);
int ref = hashTab[off];
hashTab[off] = inPos;
// if (ref < inPos
// && ref > 0
// && (off = inPos - ref - 1) < MAX_OFF
// && in[ref + 2] == p2
// && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) ==
// ((future >> 8) & 0xffff)) {
if (ref < inPos
&& ref > 0
&& (off = inPos - ref - 1) < MAX_OFF
&& in[ref + 2] == p2
&& in[ref + 1] == (byte) (future >> 8)
&& in[ref] == (byte) (future >> 16)) {
// match
int maxLen = inLen - inPos - 2;
if (maxLen > MAX_REF) {
maxLen = MAX_REF;
}
if (literals == 0) {
// multiple back-references,
// so there is no literal run control byte
outPos--;
} else {
// set the control byte at the start of the literal run
// to store the number of literals
out[outPos - literals - 1] = (byte) (literals - 1);
literals = 0;
}
int len = 3;
while (len < maxLen && in[ref + len] == in[inPos + len]) {
len++;
}
len -= 2;
if (len < 7) {
out[outPos++] = (byte) ((off >> 8) + (len << 5));
} else {
out[outPos++] = (byte) ((off >> 8) + (7 << 5));
out[outPos++] = (byte) (len - 7);
}
out[outPos++] = (byte) off;
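// e.g. (illustrative values): off = 9 and len = 2, a 4-byte match,
// emit the two bytes 0x40 0x09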
// move one byte forward to allow for a literal run control byte
outPos++;
inPos += len;
// rebuild the future, and store the last bytes in the
// hash table. Storing hashes of the last bytes of the
// back-reference improves the compression ratio and only
// reduces speed slightly.
future = first(in, inPos);
future = next(future, in, inPos);
hashTab[hash(future)] = inPos++;
future = next(future, in, inPos);
hashTab[hash(future)] = inPos++;
} else {
// copy one byte from input to output as part of literal
out[outPos++] = in[inPos++];
literals++;
// at the end of this literal chunk, write the length
// to the control byte and start a new chunk
if (literals == MAX_LITERAL) {
out[outPos - literals - 1] = (byte) (literals - 1);
literals = 0;
// move ahead one byte to allow for the
// literal run control byte
outPos++;
}
}
}
// write the remaining few bytes as literals
while (inPos < inLen) {
out[outPos++] = in[inPos++];
literals++;
if (literals == MAX_LITERAL) {
out[outPos - literals - 1] = (byte) (literals - 1);
literals = 0;
outPos++;
}
}
// writes the final literal run length to the control byte
out[outPos - literals - 1] = (byte) (literals - 1);
if (literals == 0) {
outPos--;
}
return outPos;
}
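// Usage sketch (illustrative only; the variable names and the buffer sizing
// below are assumptions, and the caller has to remember the uncompressed
// length itself, since the compressed stream does not store it):
//
// CompressLZF lzf = new CompressLZF();
// byte[] data = "to be or not to be, that is the question".getBytes();
// byte[] buff = new byte[data.length * 2]; // generous head-room: LZF can
//                                          // slightly expand incompressible input
// int compressedLen = lzf.compress(data, data.length, buff, 0);
// byte[] restored = new byte[data.length];
// lzf.expand(buff, 0, compressedLen, restored, 0, data.length);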
/**
* Compress a number of bytes.
*
* @param in the input data; the bytes from inPos up to the buffer's
* capacity are compressed
* @param inPos the offset in the input buffer
* @param out the output array
* @param outPos the offset in the output array
* @return the end position in the output array
*/
public int compress(ByteBuffer in, int inPos, byte[] out, int outPos) {
int inLen = in.capacity() - inPos;
if (cachedHashTable == null) {
cachedHashTable = new int[HASH_SIZE];
}
int[] hashTab = cachedHashTable;
int literals = 0;
outPos++;
int future = first(in, 0);
while (inPos < inLen - 4) {
byte p2 = in.get(inPos + 2);
// next
future = (future << 8) + (p2 & 255);
int off = hash(future);
int ref = hashTab[off];
hashTab[off] = inPos;
// if (ref < inPos
// && ref > 0
// && (off = inPos - ref - 1) < MAX_OFF
// && in[ref + 2] == p2
// && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) ==
// ((future >> 8) & 0xffff)) {
if (ref < inPos
&& ref > 0
&& (off = inPos - ref - 1) < MAX_OFF
&& in.get(ref + 2) == p2
&& in.get(ref + 1) == (byte) (future >> 8)
&& in.get(ref) == (byte) (future >> 16)) {
// match
int maxLen = inLen - inPos - 2;
if (maxLen > MAX_REF) {
maxLen = MAX_REF;
}
if (literals == 0) {
// multiple back-references,
// so there is no literal run control byte
outPos--;
} else {
// set the control byte at the start of the literal run
// to store the number of literals
out[outPos - literals - 1] = (byte) (literals - 1);
literals = 0;
}
int len = 3;
while (len < maxLen && in.get(ref + len) == in.get(inPos + len)) {
len++;
}
len -= 2;
if (len < 7) {
out[outPos++] = (byte) ((off >> 8) + (len << 5));
} else {
out[outPos++] = (byte) ((off >> 8) + (7 << 5));
out[outPos++] = (byte) (len - 7);
}
out[outPos++] = (byte) off;
// move one byte forward to allow for a literal run control byte
outPos++;
inPos += len;
// rebuild the future, and store the last bytes in the
// hash table. Storing hashes of the last bytes of the
// back-reference improves the compression ratio and only
// reduces speed slightly.
future = first(in, inPos);
future = next(future, in, inPos);
hashTab[hash(future)] = inPos++;
future = next(future, in, inPos);
hashTab[hash(future)] = inPos++;
} else {
// copy one byte from input to output as part of literal
out[outPos++] = in.get(inPos++);
literals++;
// at the end of this literal chunk, write the length
// to the control byte and start a new chunk
if (literals == MAX_LITERAL) {
out[outPos - literals - 1] = (byte) (literals - 1);
literals = 0;
// move ahead one byte to allow for the
// literal run control byte
outPos++;
}
}
}
// write the remaining few bytes as literals
while (inPos < inLen) {
out[outPos++] = in.get(inPos++);
literals++;
if (literals == MAX_LITERAL) {
out[outPos - literals - 1] = (byte) (literals - 1);
literals = 0;
outPos++;
}
}
// writes the final literal run length to the control byte
out[outPos - literals - 1] = (byte) (literals - 1);
if (literals == 0) {
outPos--;
}
return outPos;
}
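// Usage sketch for the ByteBuffer variants (illustrative only; srcBuffer,
// outArray and originalLen are assumed to be set up by the caller, and
// srcBuffer must contain exactly the bytes to compress, because the input
// length is taken from the buffer's capacity):
//
// int end = new CompressLZF().compress(srcBuffer, 0, outArray, 0);
// ByteBuffer compressed = ByteBuffer.wrap(outArray, 0, end);
// ByteBuffer restored = ByteBuffer.allocate(originalLen);
// CompressLZF.expand(compressed, restored);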
@Override
public void expand(byte[] in, int inPos, int inLen, byte[] out, int outPos,
int outLen) {
// if ((inPos | outPos | outLen) < 0) {
if (inPos < 0 || outPos < 0 || outLen < 0) {
throw new IllegalArgumentException();
}
do {
int ctrl = in[inPos++] & 255;
if (ctrl < MAX_LITERAL) {
// literal run of length = ctrl + 1,
ctrl++;
// copy to output and move forward this many bytes
// while (ctrl-- > 0) {
// out[outPos++] = in[inPos++];
// }
System.arraycopy(in, inPos, out, outPos, ctrl);
outPos += ctrl;
inPos += ctrl;
} else {
// back reference
// the highest 3 bits are the match length
int len = ctrl >> 5;
// if the length is maxed, add the next byte to the length
if (len == 7) {
len += in[inPos++] & 255;
}
// minimum back-reference is 3 bytes,
// so 2 was subtracted before storing size
len += 2;
// ctrl is now the offset for a back-reference...
// the logical AND operation removes the length bits
ctrl = -((ctrl & 0x1f) << 8) - 1;
// the next byte augments/increases the offset
ctrl -= in[inPos++] & 255;
// copy the back-reference bytes from the given
// location in output to current position
ctrl += outPos;
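// for example (values assumed for illustration): a control byte of 0x40
// followed by an offset byte of 0x09 gives len = (0x40 >> 5) + 2 = 4 and
// ctrl = outPos - ((0x00 << 8) + 0x09) - 1 = outPos - 10, i.e. copy 4
// bytes from 10 positions back in the output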
if (outPos + len >= out.length) {
// reduce array bounds checking
throw new ArrayIndexOutOfBoundsException();
}
for (int i = 0; i < len; i++) {
out[outPos++] = out[ctrl++];
}
}
} while (outPos < outLen);
}
/**
* Expand a number of compressed bytes. Decompression continues until the
* output buffer has been filled up to its capacity.
*
* @param in the compressed data
* @param out the output area
*/
public static void expand(ByteBuffer in, ByteBuffer out) {
do {
int ctrl = in.get() & 255;
if (ctrl < MAX_LITERAL) {
// literal run of length = ctrl + 1,
ctrl++;
// copy to output and move forward this many bytes
// (maybe slice would be faster)
for (int i = 0; i < ctrl; i++) {
out.put(in.get());
}
} else {
// back reference
// the highest 3 bits are the match length
int len = ctrl >> 5;
// if the length is maxed, add the next byte to the length
if (len == 7) {
len += in.get() & 255;
}
// minimum back-reference is 3 bytes,
// so 2 was subtracted before storing size
len += 2;
// ctrl is now the offset for a back-reference...
// the logical AND operation removes the length bits
ctrl = -((ctrl & 0x1f) << 8) - 1;
// the next byte augments/increases the offset
ctrl -= in.get() & 255;
// copy the back-reference bytes from the given
// location in output to current position
// (maybe slice would be faster)
ctrl += out.position();
for (int i = 0; i < len; i++) {
out.put(out.get(ctrl++));
}
}
} while (out.position() < out.capacity());
}
@Override
public int getAlgorithm() {
return Compressor.LZF;
}
}