All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.h2.compress.CompressLZF Maven / Gradle / Ivy

There is a newer version: 1.0.0-beta2
Show newest version
/*
 * Copyright 2004-2019 H2 Group. Multiple-Licensed under the MPL 2.0,
 * and the EPL 1.0 (https://h2database.com/html/license.html).
 *
 * This code is based on the LZF algorithm from Marc Lehmann. It is a
 * re-implementation of the C code:
 * http://cvs.schmorp.de/liblzf/lzf_c.c?view=markup
 * http://cvs.schmorp.de/liblzf/lzf_d.c?view=markup
 *
 * According to a mail from Marc Lehmann, it's OK to use his algorithm:
 * Date: 2010-07-15 15:57
 * Subject: Re: Question about LZF licensing
 * ...
 * The algorithm is not copyrighted (and cannot be copyrighted afaik) - as long
 * as you wrote everything yourself, without copying my code, that's just fine
 * (looking is of course fine too).
 * ...
 *
 * Still I would like to keep his copyright info:
 *
 * Copyright (c) 2000-2005 Marc Alexander Lehmann 
 * Copyright (c) 2005 Oren J. Maurice 
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1.  Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *   2.  Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 *   3.  The name of the author may not be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.h2.compress;

import java.nio.ByteBuffer;

/**
 * 

* This class implements the LZF lossless data compression algorithm. LZF is a * Lempel-Ziv variant with byte-aligned output, and optimized for speed. *

*

* Safety/Use Notes: *

*
    *
  • Each instance should be used by a single thread only.
  • *
  • The data buffers should be smaller than 1 GB.
  • *
  • For performance reasons, safety checks on expansion are omitted.
  • *
  • Invalid compressed data can cause an ArrayIndexOutOfBoundsException.
  • *
*

* The LZF compressed format knows literal runs and back-references: *

*
    *
  • Literal run: directly copy bytes from input to output.
  • *
  • Back-reference: copy previous data to output stream, with specified * offset from location and length. The length is at least 3 bytes.
  • *
*

* The first byte of the compressed stream is the control byte. For literal * runs, the highest three bits of the control byte are not set, the lower * bits are the literal run length, and the next bytes are data to copy directly * into the output. For back-references, the highest three bits of the control * byte are the back-reference length. If all three bits are set, then the * back-reference length is stored in the next byte. The lower bits of the * control byte combined with the next byte form the offset for the * back-reference. *

*/ public final class CompressLZF implements Compressor { /** * The number of entries in the hash table. The size is a trade-off between * hash collisions (reduced compression) and speed (amount that fits in CPU * cache). */ private static final int HASH_SIZE = 1 << 14; /** * The maximum number of literals in a chunk (32). */ private static final int MAX_LITERAL = 1 << 5; /** * The maximum offset allowed for a back-reference (8192). */ private static final int MAX_OFF = 1 << 13; /** * The maximum back-reference length (264). */ private static final int MAX_REF = (1 << 8) + (1 << 3); /** * Hash table for matching byte sequences (reused for performance). */ private int[] cachedHashTable; @Override public void setOptions(String options) { // nothing to do } /** * Return the integer with the first two bytes 0, then the bytes at the * index, then at index+1. */ private static int first(byte[] in, int inPos) { return (in[inPos] << 8) | (in[inPos + 1] & 255); } /** * Return the integer with the first two bytes 0, then the bytes at the * index, then at index+1. */ private static int first(ByteBuffer in, int inPos) { return (in.get(inPos) << 8) | (in.get(inPos + 1) & 255); } /** * Shift the value 1 byte left, and add the byte at index inPos+2. */ private static int next(int v, byte[] in, int inPos) { return (v << 8) | (in[inPos + 2] & 255); } /** * Shift the value 1 byte left, and add the byte at index inPos+2. */ private static int next(int v, ByteBuffer in, int inPos) { return (v << 8) | (in.get(inPos + 2) & 255); } /** * Compute the address in the hash table. */ private static int hash(int h) { return ((h * 2777) >> 9) & (HASH_SIZE - 1); } @Override public int compress(byte[] in, int inLen, byte[] out, int outPos) { int inPos = 0; if (cachedHashTable == null) { cachedHashTable = new int[HASH_SIZE]; } int[] hashTab = cachedHashTable; int literals = 0; outPos++; int future = first(in, 0); while (inPos < inLen - 4) { byte p2 = in[inPos + 2]; // next future = (future << 8) + (p2 & 255); int off = hash(future); int ref = hashTab[off]; hashTab[off] = inPos; // if (ref < inPos // && ref > 0 // && (off = inPos - ref - 1) < MAX_OFF // && in[ref + 2] == p2 // && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) == // ((future >> 8) & 0xffff)) { if (ref < inPos && ref > 0 && (off = inPos - ref - 1) < MAX_OFF && in[ref + 2] == p2 && in[ref + 1] == (byte) (future >> 8) && in[ref] == (byte) (future >> 16)) { // match int maxLen = inLen - inPos - 2; if (maxLen > MAX_REF) { maxLen = MAX_REF; } if (literals == 0) { // multiple back-references, // so there is no literal run control byte outPos--; } else { // set the control byte at the start of the literal run // to store the number of literals out[outPos - literals - 1] = (byte) (literals - 1); literals = 0; } int len = 3; while (len < maxLen && in[ref + len] == in[inPos + len]) { len++; } len -= 2; if (len < 7) { out[outPos++] = (byte) ((off >> 8) + (len << 5)); } else { out[outPos++] = (byte) ((off >> 8) + (7 << 5)); out[outPos++] = (byte) (len - 7); } out[outPos++] = (byte) off; // move one byte forward to allow for a literal run control byte outPos++; inPos += len; // rebuild the future, and store the last bytes to the // hashtable. Storing hashes of the last bytes in back-reference // improves the compression ratio and only reduces speed // slightly. future = first(in, inPos); future = next(future, in, inPos); hashTab[hash(future)] = inPos++; future = next(future, in, inPos); hashTab[hash(future)] = inPos++; } else { // copy one byte from input to output as part of literal out[outPos++] = in[inPos++]; literals++; // at the end of this literal chunk, write the length // to the control byte and start a new chunk if (literals == MAX_LITERAL) { out[outPos - literals - 1] = (byte) (literals - 1); literals = 0; // move ahead one byte to allow for the // literal run control byte outPos++; } } } // write the remaining few bytes as literals while (inPos < inLen) { out[outPos++] = in[inPos++]; literals++; if (literals == MAX_LITERAL) { out[outPos - literals - 1] = (byte) (literals - 1); literals = 0; outPos++; } } // writes the final literal run length to the control byte out[outPos - literals - 1] = (byte) (literals - 1); if (literals == 0) { outPos--; } return outPos; } /** * Compress a number of bytes. * * @param in the input data * @param inPos the offset at the input buffer * @param out the output area * @param outPos the offset at the output array * @return the end position */ public int compress(ByteBuffer in, int inPos, byte[] out, int outPos) { int inLen = in.capacity() - inPos; if (cachedHashTable == null) { cachedHashTable = new int[HASH_SIZE]; } int[] hashTab = cachedHashTable; int literals = 0; outPos++; int future = first(in, 0); while (inPos < inLen - 4) { byte p2 = in.get(inPos + 2); // next future = (future << 8) + (p2 & 255); int off = hash(future); int ref = hashTab[off]; hashTab[off] = inPos; // if (ref < inPos // && ref > 0 // && (off = inPos - ref - 1) < MAX_OFF // && in[ref + 2] == p2 // && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) == // ((future >> 8) & 0xffff)) { if (ref < inPos && ref > 0 && (off = inPos - ref - 1) < MAX_OFF && in.get(ref + 2) == p2 && in.get(ref + 1) == (byte) (future >> 8) && in.get(ref) == (byte) (future >> 16)) { // match int maxLen = inLen - inPos - 2; if (maxLen > MAX_REF) { maxLen = MAX_REF; } if (literals == 0) { // multiple back-references, // so there is no literal run control byte outPos--; } else { // set the control byte at the start of the literal run // to store the number of literals out[outPos - literals - 1] = (byte) (literals - 1); literals = 0; } int len = 3; while (len < maxLen && in.get(ref + len) == in.get(inPos + len)) { len++; } len -= 2; if (len < 7) { out[outPos++] = (byte) ((off >> 8) + (len << 5)); } else { out[outPos++] = (byte) ((off >> 8) + (7 << 5)); out[outPos++] = (byte) (len - 7); } out[outPos++] = (byte) off; // move one byte forward to allow for a literal run control byte outPos++; inPos += len; // rebuild the future, and store the last bytes to the // hashtable. Storing hashes of the last bytes in back-reference // improves the compression ratio and only reduces speed // slightly. future = first(in, inPos); future = next(future, in, inPos); hashTab[hash(future)] = inPos++; future = next(future, in, inPos); hashTab[hash(future)] = inPos++; } else { // copy one byte from input to output as part of literal out[outPos++] = in.get(inPos++); literals++; // at the end of this literal chunk, write the length // to the control byte and start a new chunk if (literals == MAX_LITERAL) { out[outPos - literals - 1] = (byte) (literals - 1); literals = 0; // move ahead one byte to allow for the // literal run control byte outPos++; } } } // write the remaining few bytes as literals while (inPos < inLen) { out[outPos++] = in.get(inPos++); literals++; if (literals == MAX_LITERAL) { out[outPos - literals - 1] = (byte) (literals - 1); literals = 0; outPos++; } } // writes the final literal run length to the control byte out[outPos - literals - 1] = (byte) (literals - 1); if (literals == 0) { outPos--; } return outPos; } @Override public void expand(byte[] in, int inPos, int inLen, byte[] out, int outPos, int outLen) { // if ((inPos | outPos | outLen) < 0) { if (inPos < 0 || outPos < 0 || outLen < 0) { throw new IllegalArgumentException(); } do { int ctrl = in[inPos++] & 255; if (ctrl < MAX_LITERAL) { // literal run of length = ctrl + 1, ctrl++; // copy to output and move forward this many bytes // while (ctrl-- > 0) { // out[outPos++] = in[inPos++]; // } System.arraycopy(in, inPos, out, outPos, ctrl); outPos += ctrl; inPos += ctrl; } else { // back reference // the highest 3 bits are the match length int len = ctrl >> 5; // if the length is maxed, add the next byte to the length if (len == 7) { len += in[inPos++] & 255; } // minimum back-reference is 3 bytes, // so 2 was subtracted before storing size len += 2; // ctrl is now the offset for a back-reference... // the logical AND operation removes the length bits ctrl = -((ctrl & 0x1f) << 8) - 1; // the next byte augments/increases the offset ctrl -= in[inPos++] & 255; // copy the back-reference bytes from the given // location in output to current position ctrl += outPos; if (outPos + len >= out.length) { // reduce array bounds checking throw new ArrayIndexOutOfBoundsException(); } for (int i = 0; i < len; i++) { out[outPos++] = out[ctrl++]; } } } while (outPos < outLen); } /** * Expand a number of compressed bytes. * * @param in the compressed data * @param out the output area */ public static void expand(ByteBuffer in, ByteBuffer out) { do { int ctrl = in.get() & 255; if (ctrl < MAX_LITERAL) { // literal run of length = ctrl + 1, ctrl++; // copy to output and move forward this many bytes // (maybe slice would be faster) for (int i = 0; i < ctrl; i++) { out.put(in.get()); } } else { // back reference // the highest 3 bits are the match length int len = ctrl >> 5; // if the length is maxed, add the next byte to the length if (len == 7) { len += in.get() & 255; } // minimum back-reference is 3 bytes, // so 2 was subtracted before storing size len += 2; // ctrl is now the offset for a back-reference... // the logical AND operation removes the length bits ctrl = -((ctrl & 0x1f) << 8) - 1; // the next byte augments/increases the offset ctrl -= in.get() & 255; // copy the back-reference bytes from the given // location in output to current position // (maybe slice would be faster) ctrl += out.position(); for (int i = 0; i < len; i++) { out.put(out.get(ctrl++)); } } } while (out.position() < out.capacity()); } @Override public int getAlgorithm() { return Compressor.LZF; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy