com.fasterxml.storemate.shared.util.UTF8Encoder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of storemate-shared Show documentation
Utility stuff shared by client and server modules
There is a newer version: 1.1.4
package com.fasterxml.storemate.shared.util;

import java.io.*;
import java.lang.ref.SoftReference;
import java.nio.charset.Charset;
import java.util.*;

/**
 * Helper class used for efficient encoding of JSON String values (including
 * JSON field names) into Strings or UTF-8 byte arrays.
 *
 * Note that methods in here are somewhat optimized, but not ridiculously so.
 * Reason is that conversion method results are expected to be cached so that
 * these methods will not be hot spots during normal operation.
 */
public final class UTF8Encoder
{
    // Size of the initial block, when starting from scratch
    private final static int INITIAL_BLOCK_SIZE = 500;
    
    private final static int SURR1_FIRST = 0xD800;
    private final static int SURR1_LAST = 0xDBFF;
    private final static int SURR2_FIRST = 0xDC00;
    private final static int SURR2_LAST = 0xDFFF;
    
    /**
     * Helper object used for efficient recycling of encoding buffers.
     */
    final protected static ThreadLocal> _threadEncoder
        = new ThreadLocal>();

    /**
     * We will reuse encoding buffer for performance; works because encoders are
     * reused with ThreadLocal
     */
    protected byte[] _encodingBuffer;
    
    /*
    /**********************************************************************
    /* Construction, instance access
    /**********************************************************************
     */
    
    public UTF8Encoder() {
        _encodingBuffer = new byte[INITIAL_BLOCK_SIZE];
    }

    /*
    /**********************************************************************
    /* Public API
    /**********************************************************************
     */
    
    /**
     * Will encode given String as UTF-8, return
     * resulting byte array.
     */
    public static byte[] encodeAsUTF8(String text)
    {
        if (text == null) {
            return null;
        }
        SoftReference ref = _threadEncoder.get();
        UTF8Encoder enc = (ref == null) ? null : ref.get();

        if (enc == null) {
            enc = new UTF8Encoder();
            _threadEncoder.set(new SoftReference(enc));
        }
        return enc._encodeAsUTF8(text);
    }

    /**
     * Will encode given String as UTF-8, prepend it with a given
     * prefix, and return resulting byte array.
     */
    public static byte[] encodeAsUTF8(byte[] prefix, String text)
    {
        SoftReference ref = _threadEncoder.get();
        UTF8Encoder enc = (ref == null) ? null : ref.get();

        if (enc == null) {
            enc = new UTF8Encoder();
            _threadEncoder.set(new SoftReference(enc));
        }
        return enc._encodeAsUTF8(prefix, text);
    }

    /**
     * Will encode given String as UTF-8, after allocation of a fixed
     * length of empty space (left as null bytes), and also add optional
     * suffix empty space;
     * and return resulting byte array.
     * 
     * @param fillWithNulls If true, will ensure that prefix and suffix sections
     *    are initialized to null bytes; if false, may contain any data
     */
    public static byte[] encodeAsUTF8(String text, int prefixLength, int suffixLength,
            boolean fillWithNulls)
    {
        SoftReference ref = _threadEncoder.get();
        UTF8Encoder enc = (ref == null) ? null : ref.get();

        if (enc == null) {
            enc = new UTF8Encoder();
            _threadEncoder.set(new SoftReference(enc));
        }
        // shoud
        return enc._encodeAsUTF8(text, prefixLength, suffixLength, fillWithNulls);
    }
    
    public static OutputStream encodeAsUTF8(String text, OutputStream out)
        throws IOException
    {
        if (text == null) {
            return null;
        }
        // !!! TODO: optimize
        byte[] bytes = encodeAsUTF8(text);
        out.write(bytes);
        return out;
    }
    
    private final static Charset UTF8 = Charset.forName("UTF-8");

    public static String decodeFromUTF8(byte[] bytes)
    {
        if (bytes == null) {
            return null;
        }
        return decodeFromUTF8(bytes, 0, bytes.length);
    }
    
    public static String decodeFromUTF8(byte[] bytes, int offset, int length)
    {
        // !!! TODO: optimize
        return new String(bytes, offset, length, UTF8);
    }

    /*
    /**********************************************************
    /* Internal methods, decoding from UTF-8
    /**********************************************************
     */
    
    /*
    /**********************************************************
    /* Internal methods, encoding as UTF-8
    /**********************************************************
     */
    
    private byte[] _encodeAsUTF8(String text)
    {
        int inputPtr = 0;
        int inputEnd = text.length();
        int outputPtr = 0;
        byte[] outputBuffer = _encodingBuffer;
        int outputEnd = outputBuffer.length;
        
        final ByteArrayBuilder byteBuilder = new ByteArrayBuilder(_encodingBuffer);
        
        main_loop:
        while (inputPtr < inputEnd) {
            int c = text.charAt(inputPtr++);

            // first tight loop for ascii
            while (c <= 0x7F) {
                if (outputPtr >= outputEnd) {
                    outputBuffer = byteBuilder.finishCurrentSegment();
                    outputEnd = outputBuffer.length;
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) c;
                if (inputPtr >= inputEnd) {
                    break main_loop;
                }
                c = text.charAt(inputPtr++);
            }

            // then multi-byte...
            if (outputPtr >= outputEnd) {
                outputBuffer = byteBuilder.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            if (c < 0x800) { // 2-byte
                outputBuffer[outputPtr++] = (byte) (0xc0 | (c >> 6));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (c < SURR1_FIRST || c > SURR2_LAST) { // nope
                    outputBuffer[outputPtr++] = (byte) (0xe0 | (c >> 12));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                } else { // yes, surrogate pair
                    if (c > SURR1_LAST) { // must be from first range
                        _throwIllegalSurrogate(c);
                    }
                    // and if so, followed by another from next range
                    if (inputPtr >= inputEnd) {
                        _throwIllegalSurrogate(c);
                    }
                    c = _convertSurrogate(c, text.charAt(inputPtr++));
                    if (c > 0x10FFFF) { // illegal, as per RFC 4627
                        _throwIllegalSurrogate(c);
                    }
                    outputBuffer[outputPtr++] = (byte) (0xf0 | (c >> 18));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                }
            }
            if (outputPtr >= outputEnd) {
                outputBuffer = byteBuilder.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (byte) (0x80 | (c & 0x3f));
        }
        
        // let's ensure we are reusing the largest chunk:
        _encodingBuffer = byteBuilder.getCurrentSegment();
        return byteBuilder.complete(outputPtr);
    }

    private byte[] _encodeAsUTF8(byte[] prefix, String text)
    {
        int inputPtr = 0;
        int inputEnd = text.length();
        int outputPtr = prefix.length;

        if (outputPtr > _encodingBuffer.length) {
            _encodingBuffer = new byte[outputPtr];
        }
        System.arraycopy(prefix, 0, _encodingBuffer, 0, outputPtr);
        
        final ByteArrayBuilder byteBuilder = new ByteArrayBuilder(_encodingBuffer);
        byte[] outputBuffer = byteBuilder.getCurrentSegment();
        int outputEnd = outputBuffer.length;
        
        main_loop:
        while (inputPtr < inputEnd) {
            int c = text.charAt(inputPtr++);

            // first tight loop for ascii
            while (c <= 0x7F) {
                if (outputPtr >= outputEnd) {
                    outputBuffer = byteBuilder.finishCurrentSegment();
                    outputEnd = outputBuffer.length;
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) c;
                if (inputPtr >= inputEnd) {
                    break main_loop;
                }
                c = text.charAt(inputPtr++);
            }

            // then multi-byte...
            if (outputPtr >= outputEnd) {
                outputBuffer = byteBuilder.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            if (c < 0x800) { // 2-byte
                outputBuffer[outputPtr++] = (byte) (0xc0 | (c >> 6));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (c < SURR1_FIRST || c > SURR2_LAST) { // nope
                    outputBuffer[outputPtr++] = (byte) (0xe0 | (c >> 12));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                } else { // yes, surrogate pair
                    if (c > SURR1_LAST) { // must be from first range
                        _throwIllegalSurrogate(c);
                    }
                    // and if so, followed by another from next range
                    if (inputPtr >= inputEnd) {
                        _throwIllegalSurrogate(c);
                    }
                    c = _convertSurrogate(c, text.charAt(inputPtr++));
                    if (c > 0x10FFFF) { // illegal, as per RFC 4627
                        _throwIllegalSurrogate(c);
                    }
                    outputBuffer[outputPtr++] = (byte) (0xf0 | (c >> 18));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                }
            }
            if (outputPtr >= outputEnd) {
                outputBuffer = byteBuilder.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (byte) (0x80 | (c & 0x3f));
        }
        
        // let's ensure we are reusing the largest chunk:
        _encodingBuffer = byteBuilder.getCurrentSegment();
        return byteBuilder.complete(outputPtr);
    }

    private byte[] _encodeAsUTF8(String text, int prefixLen, int suffixLen,
            boolean clearPrefix)
    {
        int inputPtr = 0;
        int inputEnd = text.length();
        int outputPtr = prefixLen;

        if (outputPtr >= _encodingBuffer.length) {
            _encodingBuffer = new byte[outputPtr + Math.min(128, inputEnd << 1)];
        } else if (clearPrefix) {
            Arrays.fill(_encodingBuffer, 0, outputPtr, (byte) 0);
        }
        
        final ByteArrayBuilder byteBuilder = new ByteArrayBuilder(_encodingBuffer);
        byte[] outputBuffer = byteBuilder.getCurrentSegment();
        int outputEnd = outputBuffer.length;
        
        main_loop:
        while (inputPtr < inputEnd) {
            int c = text.charAt(inputPtr++);

            // first tight loop for ascii
            while (c <= 0x7F) {
                if (outputPtr >= outputEnd) {
                    outputBuffer = byteBuilder.finishCurrentSegment();
                    outputEnd = outputBuffer.length;
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) c;
                if (inputPtr >= inputEnd) {
                    break main_loop;
                }
                c = text.charAt(inputPtr++);
            }

            // then multi-byte...
            if (outputPtr >= outputEnd) {
                outputBuffer = byteBuilder.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            if (c < 0x800) { // 2-byte
                outputBuffer[outputPtr++] = (byte) (0xc0 | (c >> 6));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (c < SURR1_FIRST || c > SURR2_LAST) { // nope
                    outputBuffer[outputPtr++] = (byte) (0xe0 | (c >> 12));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                } else { // yes, surrogate pair
                    if (c > SURR1_LAST) { // must be from first range
                        _throwIllegalSurrogate(c);
                    }
                    // and if so, followed by another from next range
                    if (inputPtr >= inputEnd) {
                        _throwIllegalSurrogate(c);
                    }
                    c = _convertSurrogate(c, text.charAt(inputPtr++));
                    if (c > 0x10FFFF) { // illegal, as per RFC 4627
                        _throwIllegalSurrogate(c);
                    }
                    outputBuffer[outputPtr++] = (byte) (0xf0 | (c >> 18));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                    if (outputPtr >= outputEnd) {
                        outputBuffer = byteBuilder.finishCurrentSegment();
                        outputEnd = outputBuffer.length;
                        outputPtr = 0;
                    }
                    outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                }
            }
            if (outputPtr >= outputEnd) {
                outputBuffer = byteBuilder.finishCurrentSegment();
                outputEnd = outputBuffer.length;
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (byte) (0x80 | (c & 0x3f));
        }
        // let's ensure we are reusing the largest chunk:
        _encodingBuffer = byteBuilder.getCurrentSegment();
        return byteBuilder.complete(outputPtr, suffixLen);
    }    
    /*
    /**********************************************************
    /* Internal helper methods
    /**********************************************************
     */

    /**
     * Method called to calculate UTF code point, from a surrogate pair.
     */
    private int _convertSurrogate(int firstPart, int secondPart)
    {
        // Ok, then, is the second part valid?
        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
            throw new IllegalArgumentException("Broken surrogate pair: first char 0x"+Integer.toHexString(firstPart)+", second 0x"+Integer.toHexString(secondPart)+"; illegal combination");
        }
        return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST);
    }
    
    private void _throwIllegalSurrogate(int code)
    {
        if (code > 0x10FFFF) { // over max?
            throw new IllegalArgumentException("Illegal character point (0x"+Integer.toHexString(code)+") to output; max is 0x10FFFF as per RFC 4627");
        }
        if (code >= SURR1_FIRST) {
            if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
                throw new IllegalArgumentException("Unmatched first part of surrogate pair (0x"+Integer.toHexString(code)+")");
            }
            throw new IllegalArgumentException("Unmatched second part of surrogate pair (0x"+Integer.toHexString(code)+")");
        }
        // should we ever get this?
        throw new IllegalArgumentException("Illegal character point (0x"+Integer.toHexString(code)+") to output");
    }

    /*
    ///////////////////////////////////////////////////////////////////////
    // Helper classes
    ///////////////////////////////////////////////////////////////////////
     */

    /**
     * Helper class that is similar to {@link java.io.ByteArrayOutputStream}
     * in usage, but more geared to specific use case we have here.
     * Specific changes include segment storage (no need to have linear
     * backing buffer, can avoid reallocs, copying), as well API
     * not necessarily based on {@link java.io.OutputStream}.
     * In short, this is a very much specialized builder object.
     */
    private final static class ByteArrayBuilder
    {
        private final static byte[] NO_BYTES = new byte[0];
        
        /**
         * Maximum block size we will use for individual non-aggregated
         * blocks. Let's limit to using 256k chunks.
         */
        private final static int MAX_BLOCK_SIZE = (1 << 18);

        private LinkedList _pastBlocks = null;
        
        /**
         * Number of bytes within byte arrays in {@link _pastBlocks}.
         */
        private int _pastLen;

        private byte[] _currBlock;
        
        public ByteArrayBuilder(byte[] firstBlock) {
            _currBlock = firstBlock;
        }
        
        /**
         * Method called when results are finalized and we can get the
         * full aggregated result buffer to return to the caller
         */
        public byte[] complete(int lastBlockLength)
        {
            int totalLen = _pastLen + lastBlockLength;

            if (totalLen == 0) { // quick check: nothing aggregated?
                return NO_BYTES;
            }
            byte[] result = new byte[totalLen];
            int offset = 0;

            if (_pastBlocks != null) {
                for (byte[] block : _pastBlocks) {
                    int len = block.length;
                    System.arraycopy(block, 0, result, offset, len);
                    offset += len;
                }
            }
            if (lastBlockLength > 0) {
                System.arraycopy(_currBlock, 0, result, offset, lastBlockLength);
                offset += lastBlockLength;
            }
            if (offset != totalLen) { // just a sanity check
                throw new RuntimeException("Internal error: total len assumed to be "+totalLen+", copied "+offset+" bytes");
            }
            return result;
        }

        /**
         * Alternative completion method that will add specific amount of 'fluff'
         * (null bytes) after encoding result.
         * 
         * @param fluffToAppend Number of null bytes to append after buffered
         *    content
         */
        public byte[] complete(int lastBlockLength, int fluffToAppend)
        {
            int totalLen = _pastLen + lastBlockLength + fluffToAppend;

            if (totalLen == 0) { // quick check: nothing aggregated?
                return NO_BYTES;
            }
            byte[] result = new byte[totalLen];
            int offset = 0;

            if (_pastBlocks != null) {
                for (byte[] block : _pastBlocks) {
                    int len = block.length;
                    System.arraycopy(block, 0, result, offset, len);
                    offset += len;
                }
            }
            if (lastBlockLength > 0) {
                System.arraycopy(_currBlock, 0, result, offset, lastBlockLength);
                offset += lastBlockLength;
            }
            offset += fluffToAppend;
            if (offset != totalLen) { // just a sanity check
                throw new RuntimeException("Internal error: total len assumed to be "+totalLen+", copied "+offset+" bytes");
            }
            return result;
        }
        
        public byte[] getCurrentSegment() {
            return _currBlock;
        }

        public byte[] finishCurrentSegment() {
            _allocMore();
            return _currBlock;
        }
        
        private void _allocMore()
        {
            _pastLen += _currBlock.length;

            /* Let's allocate block that's half the total size, except
             * never smaller than twice the initial block size.
             * The idea is just to grow with reasonable rate, to optimize
             * between minimal number of chunks and minimal amount of
             * wasted space.
             */
            int newSize = Math.max((_pastLen >> 1), (INITIAL_BLOCK_SIZE + INITIAL_BLOCK_SIZE));
            // plus not to exceed max we define...
            if (newSize > MAX_BLOCK_SIZE) {
                newSize = MAX_BLOCK_SIZE;
            }
            if (_pastBlocks == null) {
                _pastBlocks = new LinkedList();
            }
            _pastBlocks.add(_currBlock);
            _currBlock = new byte[newSize];
        }
    }
}