com.ning.compress.lzf.LZFEncoder

/* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 * file except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under
 * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
 * OF ANY KIND, either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */

package com.ning.compress.lzf;

import com.ning.compress.BufferRecycler;
import com.ning.compress.lzf.util.ChunkEncoderFactory;

/**
 * Encoder that handles splitting of input into chunks to encode,
 * calls {@link ChunkEncoder} to compress individual chunks and
 * combines resulting chunks into contiguous output byte array.
 * 
 * @author Tatu Saloranta
 */
public class LZFEncoder
{
    /* Approximate maximum size for a full chunk, in case where it does not compress
     * at all. Such chunks are converted to uncompressed chunks, but during compression
     * process this amount of space is still needed.
     */
    public final static int MAX_CHUNK_RESULT_SIZE = LZFChunk.MAX_HEADER_LEN + LZFChunk.MAX_CHUNK_LEN + (LZFChunk.MAX_CHUNK_LEN * 32 / 31);
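
    // For a sense of scale: assuming LZFChunk.MAX_CHUNK_LEN is 0xFFFF (65535) and
    // LZFChunk.MAX_HEADER_LEN is 7 (values defined in LZFChunk, not in this file), this
    // evaluates to 7 + 65535 + 67649 = 133191 bytes, i.e. roughly twice one full input chunk.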

    // Static methods only, no point in instantiating
    private LZFEncoder() { }
    
    /*
    ///////////////////////////////////////////////////////////////////////
    // Helper methods
    ///////////////////////////////////////////////////////////////////////
     */

    /**
     * Helper method that can be used to estimate maximum space needed to
     * try compression of given amount of data. This is slightly larger
     * than maximum resulting content since compressor has a choice of
     * uncompressed chunks to use, but that is only done after compression
     * fails to reduce size; and this temporary expansion of up to 3.3% or so
     * (1 indicator for every 31 bytes of uncompressed data)
     * is more than what eventual expansion would be (5 bytes header per
     * each uncompressed chunk, usually 0.01%).
     */
    public static int estimateMaxWorkspaceSize(int inputSize)
    {
        // single chunk; give a rough estimate with +5% (1 + 1/32 + 1/64)
        if (inputSize <= LZFChunk.MAX_CHUNK_LEN) {
            return LZFChunk.MAX_HEADER_LEN + inputSize + (inputSize >> 5) + (inputSize >> 6);
        }
        // one more special case, 2 chunks
        inputSize -= LZFChunk.MAX_CHUNK_LEN;
        if (inputSize <= LZFChunk.MAX_CHUNK_LEN) { // uncompressed chunk actually has 5 byte header, but use MAX_HEADER_LEN to be safe
            return MAX_CHUNK_RESULT_SIZE + inputSize + LZFChunk.MAX_HEADER_LEN;
        }
        // check number of chunks we should be creating (assuming use of full chunks)
        int chunkCount = 1 + ((inputSize + (LZFChunk.MAX_CHUNK_LEN-1)) / LZFChunk.MAX_CHUNK_LEN);
        return MAX_CHUNK_RESULT_SIZE + chunkCount * (LZFChunk.MAX_CHUNK_LEN + LZFChunk.MAX_HEADER_LEN);
    }
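
    // Usage sketch (illustrative): callers that compress into their own buffer via the
    // appendEncoded() variants below would typically size that buffer with this estimate:
    //
    //   byte[] out = new byte[LZFEncoder.estimateMaxWorkspaceSize(input.length)];
    //   int end = LZFEncoder.appendEncoded(input, 0, input.length, out, 0);
    //   // bytes [0, end) of 'out' now hold the encoded chunks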
    
    /*
    ///////////////////////////////////////////////////////////////////////
    // Encoding methods, blocks
    ///////////////////////////////////////////////////////////////////////
     */

    /**
     * Method for compressing given input data using LZF encoding and
     * block structure (compatible with lzf command line utility).
     * Result consists of a sequence of chunks.
     *<p>
     * Note that {@link ChunkEncoder} instance used is one produced by
     * {@link ChunkEncoderFactory#optimalInstance}, which typically
     * is "unsafe" instance if one can be used on current JVM.
     */
    public static byte[] encode(byte[] data) {
        return encode(data, 0, data.length);
    }

    /**
     * Method that will use "safe" {@link ChunkEncoder}, as produced by
     * {@link ChunkEncoderFactory#safeInstance}, for encoding. Safe here
     * means that it does not use any non-compliant features beyond core JDK.
     */
    public static byte[] safeEncode(byte[] data) {
        return safeEncode(data, 0, data.length);
    }
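
    // Usage sketch (illustrative): for most callers one of the two one-shot methods above is
    // enough; the "safe" variant avoids the sun.misc.Unsafe-backed encoder at some cost in
    // speed, while producing the same LZF chunk format:
    //
    //   byte[] compressed = LZFEncoder.encode(rawBytes);        // fastest encoder available on this JVM
    //   byte[] alsoCompressed = LZFEncoder.safeEncode(rawBytes); // pure-JDK encoder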

    /**
     * Method for compressing given input data using LZF encoding and
     * block structure (compatible with lzf command line utility).
     * Result consists of a sequence of chunks.
     *<p>
     * Note that {@link ChunkEncoder} instance used is one produced by
     * {@link ChunkEncoderFactory#optimalInstance}, which typically
     * is "unsafe" instance if one can be used on current JVM.
     */
    public static byte[] encode(byte[] data, int offset, int length) {
        ChunkEncoder enc = ChunkEncoderFactory.optimalInstance(length);
        byte[] result = encode(enc, data, offset, length);
        enc.close(); // important for buffer reuse!
        return result;
    }

    /**
     * Method that will use "safe" {@link ChunkEncoder}, as produced by
     * {@link ChunkEncoderFactory#safeInstance}, for encoding. Safe here
     * means that it does not use any non-compliant features beyond core JDK.
     */
    public static byte[] safeEncode(byte[] data, int offset, int length) {
        ChunkEncoder enc = ChunkEncoderFactory.safeInstance(length);
        byte[] result = encode(enc, data, offset, length);
        enc.close();
        return result;
    }

    /**
     * Method for compressing given input data using LZF encoding and
     * block structure (compatible with lzf command line utility).
     * Result consists of a sequence of chunks.
     *<p>
     * Note that {@link ChunkEncoder} instance used is one produced by
     * {@link ChunkEncoderFactory#optimalInstance}, which typically
     * is "unsafe" instance if one can be used on current JVM.
     */
    public static byte[] encode(byte[] data, int offset, int length, BufferRecycler bufferRecycler) {
        ChunkEncoder enc = ChunkEncoderFactory.optimalInstance(length, bufferRecycler);
        byte[] result = encode(enc, data, offset, length);
        enc.close(); // important for buffer reuse!
        return result;
    }

    /**
     * Method that will use "safe" {@link ChunkEncoder}, as produced by
     * {@link ChunkEncoderFactory#safeInstance}, for encoding. Safe here
     * means that it does not use any non-compliant features beyond core JDK.
     */
    public static byte[] safeEncode(byte[] data, int offset, int length, BufferRecycler bufferRecycler) {
        ChunkEncoder enc = ChunkEncoderFactory.safeInstance(length, bufferRecycler);
        byte[] result = encode(enc, data, offset, length);
        enc.close();
        return result;
    }

    /**
     * Compression method that uses specified {@link ChunkEncoder} for actual
     * encoding.
     */
    public static byte[] encode(ChunkEncoder enc, byte[] data, int length) {
        return encode(enc, data, 0, length);
    }

    /**
     * Method that encodes given input using provided {@link ChunkEncoder},
     * and aggregating it into a single byte array and returning that.
     *<p>
     * NOTE: method does NOT call {@link ChunkEncoder#close}; caller is responsible
     * for doing that after it is done using the encoder.
     */
    public static byte[] encode(ChunkEncoder enc, byte[] data, int offset, int length) {
        int left = length;
        int chunkLen = Math.min(LZFChunk.MAX_CHUNK_LEN, left);
        LZFChunk first = enc.encodeChunk(data, offset, chunkLen);
        left -= chunkLen;
        // shortcut: if it all fit in, no need to coalesce:
        if (left < 1) {
            return first.getData();
        }
        // otherwise need to get other chunks:
        int resultBytes = first.length();
        offset += chunkLen;
        LZFChunk last = first;

        do {
            chunkLen = Math.min(left, LZFChunk.MAX_CHUNK_LEN);
            LZFChunk chunk = enc.encodeChunk(data, offset, chunkLen);
            offset += chunkLen;
            left -= chunkLen;
            resultBytes += chunk.length();
            last.setNext(chunk);
            last = chunk;
        } while (left > 0);
        // and then coalesce results into single contiguous byte array
        byte[] result = new byte[resultBytes];
        int ptr = 0;
        for (; first != null; first = first.next()) {
            ptr = first.copyTo(result, ptr);
        }
        return result;
    }
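
    // Usage sketch (illustrative): because this method does not close the encoder, one
    // ChunkEncoder can be reused across many inputs and closed once at the end:
    //
    //   ChunkEncoder enc = ChunkEncoderFactory.optimalInstance(LZFChunk.MAX_CHUNK_LEN);
    //   try {
    //       for (byte[] doc : documents) {
    //           byte[] compressed = LZFEncoder.encode(enc, doc, 0, doc.length);
    //           // ... store or forward 'compressed' ...
    //       }
    //   } finally {
    //       enc.close();
    //   }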

    /*
    ///////////////////////////////////////////////////////////////////////
    // Encoding methods, append in caller-provided buffer(s)
    ///////////////////////////////////////////////////////////////////////
     */

    /**
     * Alternate version that accepts pre-allocated output buffer.
     *<p>
     * Note that {@link ChunkEncoder} instance used is one produced by
     * {@link ChunkEncoderFactory#optimalNonAllocatingInstance}, which typically
     * is "unsafe" instance if one can be used on current JVM.
     */
    public static int appendEncoded(byte[] input, int inputPtr, int inputLength, byte[] outputBuffer, int outputPtr) {
        ChunkEncoder enc = ChunkEncoderFactory.optimalNonAllocatingInstance(inputLength);
        int len = appendEncoded(enc, input, inputPtr, inputLength, outputBuffer, outputPtr);
        enc.close();
        return len;
    }

    /**
     * Alternate version that accepts pre-allocated output buffer.
     *<p>
     * Method that will use "safe" {@link ChunkEncoder}, as produced by
     * {@link ChunkEncoderFactory#safeInstance}, for encoding. Safe here
     * means that it does not use any non-compliant features beyond core JDK.
     */
    public static int safeAppendEncoded(byte[] input, int inputPtr, int inputLength, byte[] outputBuffer, int outputPtr) {
        ChunkEncoder enc = ChunkEncoderFactory.safeNonAllocatingInstance(inputLength);
        int len = appendEncoded(enc, input, inputPtr, inputLength, outputBuffer, outputPtr);
        enc.close();
        return len;
    }

    /**
     * Alternate version that accepts pre-allocated output buffer.
     *<p>
     * Note that {@link ChunkEncoder} instance used is one produced by
     * {@link ChunkEncoderFactory#optimalNonAllocatingInstance}, which typically
     * is "unsafe" instance if one can be used on current JVM.
     */
    public static int appendEncoded(byte[] input, int inputPtr, int inputLength, byte[] outputBuffer, int outputPtr, BufferRecycler bufferRecycler) {
        ChunkEncoder enc = ChunkEncoderFactory.optimalNonAllocatingInstance(inputLength, bufferRecycler);
        int len = appendEncoded(enc, input, inputPtr, inputLength, outputBuffer, outputPtr);
        enc.close();
        return len;
    }

    /**
     * Alternate version that accepts pre-allocated output buffer.
     *<p>
     * Method that will use "safe" {@link ChunkEncoder}, as produced by
     * {@link ChunkEncoderFactory#safeInstance}, for encoding. Safe here
     * means that it does not use any non-compliant features beyond core JDK.
     */
    public static int safeAppendEncoded(byte[] input, int inputPtr, int inputLength, byte[] outputBuffer, int outputPtr, BufferRecycler bufferRecycler) {
        ChunkEncoder enc = ChunkEncoderFactory.safeNonAllocatingInstance(inputLength, bufferRecycler);
        int len = appendEncoded(enc, input, inputPtr, inputLength, outputBuffer, outputPtr);
        enc.close();
        return len;
    }

    /**
     * Alternate version that accepts pre-allocated output buffer.
     */
    public static int appendEncoded(ChunkEncoder enc, byte[] input, int inputPtr, int inputLength, byte[] outputBuffer, int outputPtr) {
        int left = inputLength;
        int chunkLen = Math.min(LZFChunk.MAX_CHUNK_LEN, left);
        outputPtr = enc.appendEncodedChunk(input, inputPtr, chunkLen, outputBuffer, outputPtr);
        left -= chunkLen;
        // shortcut: if it all fit in, no need to coalesce:
        if (left < 1) {
            return outputPtr;
        }
        // otherwise need to keep on encoding...
        inputPtr += chunkLen;
        do {
            chunkLen = Math.min(left, LZFChunk.MAX_CHUNK_LEN);
            outputPtr = enc.appendEncodedChunk(input, inputPtr, chunkLen, outputBuffer, outputPtr);
            inputPtr += chunkLen;
            left -= chunkLen;
        } while (left > 0);
        return outputPtr;
    }
}
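
For context, the listing below is a minimal, self-contained usage sketch rather than part of the library source. The class name LZFEncoderExample is hypothetical, and the round-trip check assumes the companion com.ning.compress.lzf.LZFDecoder class with a static decode(byte[]) method.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import com.ning.compress.lzf.LZFDecoder;
import com.ning.compress.lzf.LZFEncoder;

public class LZFEncoderExample
{
    public static void main(String[] args) throws IOException
    {
        byte[] input = "example payload example payload example payload"
                .getBytes(StandardCharsets.UTF_8);

        // One-shot compression using the optimal (possibly Unsafe-backed) ChunkEncoder
        byte[] compressed = LZFEncoder.encode(input);

        // Round-trip via the companion decoder (assumed API: LZFDecoder.decode(byte[]))
        byte[] restored = LZFDecoder.decode(compressed);

        System.out.println("original=" + input.length
                + " bytes, compressed=" + compressed.length
                + " bytes, round-trip ok=" + Arrays.equals(input, restored));
    }
}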




