// com.yahoo.text.Utf8 (Maven / Gradle / Ivy)

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.ReadOnlyBufferException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Utility class with functions for handling UTF-8
 *
 * @author arnej27959
 * @author Steinar Knutsen
 * @author baldersheim
 */
public final class Utf8 {

    // Shared templates. These must never be handed out directly, since arrays are mutable;
    // the public methods below return copies.
    private static final byte[] TRUE = {(byte) 't', (byte) 'r', (byte) 'u', (byte) 'e'};
    private static final byte[] FALSE = {(byte) 'f', (byte) 'a', (byte) 'l', (byte) 's', (byte) 'e'};
    private static final byte[] LONG_MIN_VALUE_BYTES = String.valueOf(Long.MIN_VALUE).getBytes(UTF_8);

    /** Returns the Charset instance for UTF-8 */
    public static Charset getCharset() {
        return UTF_8;
    }

    /** To be used instead of String.String(byte[] bytes) */
    public static String toStringStd(byte[] data) {
        return new String(data, UTF_8);
    }

    /**
     * Utility method as toString(byte[]), decoding only a slice of the array.
     *
     * @param data
     *            bytes to decode
     * @param offset
     *            index of first byte to decode
     * @param length
     *            number of bytes to decode
     * @return String decoded from UTF-8
     */
    public static String toString(byte[] data, int offset, int length) {
        return toString(ByteBuffer.wrap(data, offset, length));
    }

    /**
     * Fetch a string from a ByteBuffer instance. ByteBuffer instances are
     * stateful, so it is assumed the caller manipulates the instance's limit if
     * the entire buffer is not a string. Decoding consumes the buffer, i.e.
     * its position is advanced.
     *
     * @param data
     *            The UTF-8 data source
     * @return a decoded String
     */
    public static String toString(ByteBuffer data) {
        CharBuffer c = UTF_8.decode(data);
        return c.toString();
    }

    /**
     * Uses String.getBytes directly.
     */
    public static byte[] toBytesStd(String str) {
        return str.getBytes(UTF_8);
    }

    /**
     * Encode a long as its decimal representation, i.e. toAsciiBytes(15L) will
     * return "15" encoded as UTF-8. In other words it is an optimized version
     * of String.valueOf() followed by UTF-8 encoding. Avoid going through
     * string in order to get a simple UTF-8 sequence.
     *
     * @param l
     *            value to represent as a decimal number encoded as UTF-8
     * @return a newly allocated byte array owned by the caller
     */
    public static byte[] toAsciiBytes(long l) {
        // Long.MIN_VALUE cannot be negated, so it is served from a precomputed template.
        // A copy is returned so callers cannot corrupt the shared template.
        if (Long.MIN_VALUE == l) {
            return LONG_MIN_VALUE_BYTES.clone();
        }
        boolean negative = l < 0;
        long magnitude = negative ? -l : l;

        int digits = 1;
        for (long v = magnitude; v >= 10; v /= 10) {
            digits++;
        }

        int offset = negative ? 1 : 0;
        byte[] buf = new byte[digits + offset];
        if (negative) {
            buf[0] = (byte) '-';
        }
        // Fill in the digits from least to most significant.
        for (int i = digits - 1; i >= 0; i--, magnitude /= 10) {
            buf[offset + i] = (byte) ('0' + magnitude % 10);
        }
        return buf;
    }

    /**
     * Encode a boolean as the ASCII text "true" or "false".
     *
     * @param v the value to encode
     * @return a newly allocated byte array owned by the caller
     */
    public static byte[] toAsciiBytes(boolean v) {
        // Copy, so that a caller modifying the result does not affect later calls.
        return (v ? TRUE : FALSE).clone();
    }

    /**
     * Encode a string as UTF-8.
     *
     * @param string The string to encode.
     * @return Utf8 encoded array
     */
    public static byte[] toBytes(String string) {
        // This is just a wrapper for String::getBytes. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
        return string.getBytes(UTF_8);
    }

    /**
     * Decode a UTF-8 string.
     *
     * @param utf8 the bytes to decode
     * @return the decoded String
     */
    public static String toString(byte[] utf8) {
        // This is just a wrapper for String::new. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
        return new String(utf8, UTF_8);
    }

    /**
     * Utility method as toBytes(String), encoding only a substring.
     *
     * @param str
     *            String to encode
     * @param offset
     *            index of first character to encode
     * @param length
     *            number of characters to encode
     * @return substring encoded as UTF-8
     */
    public static byte[] toBytes(String str, int offset, int length) {
        CharBuffer c = CharBuffer.wrap(str, offset, offset + length);
        ByteBuffer b = UTF_8.encode(c);
        byte[] result = new byte[b.remaining()];
        b.get(result);
        return result;
    }

    /**
     * Direct encoding of a String into an array.
     *
     * @param str
     *            string to encode
     * @param srcOffset
     *            index of first character in string to encode
     * @param srcLen
     *            number of characters in string to encode
     * @param dst
     *            destination for encoded data
     * @param dstOffset
     *            index of first position to write data
     * @return the number of bytes written to the array.
     */
    public static int toBytes(String str, int srcOffset, int srcLen, byte[] dst, int dstOffset) {
        CharBuffer c = CharBuffer.wrap(str, srcOffset, srcOffset + srcLen);
        ByteBuffer b = UTF_8.encode(c);
        int encoded = b.remaining();
        b.get(dst, dstOffset, encoded);
        return encoded;
    }

    /**
     * Encode a string directly into a ByteBuffer instance.
     *
     * <p>
     * This method is somewhat more cumbersome than the rest of the helper
     * methods in this library, as it is intended for use cases in the following
     * style, if extraneous copying is highly undesirable:
     *
     * <pre>{@code
     * String[] a = {"abc", "def", "ghi\u00e8"};
     * int[] aLens = {3, 3, 5};
     * CharsetEncoder ce = Utf8.getNewEncoder();
     * ByteBuffer forWire = ByteBuffer.allocate(someNumber);
     *
     * for (int i = 0; i < a.length; i++) {
     *     forWire.putInt(aLens[i]);
     *     Utf8.toBytes(a[i], 0, a[i].length(), forWire, ce);
     * }
     * }</pre>
     *
     * <p>
     * Note: the result of the encoding operation is not inspected, so if the
     * destination has too little room, the output is silently truncated. The
     * caller must make sure the destination is large enough.
     *
     * @see Utf8#getNewEncoder()
     *
     * @param src the string to encode
     * @param srcOffset index of first character to encode
     * @param srcLen number of characters to encode
     * @param dst the destination ByteBuffer
     * @param encoder the character encoder to use
     */
    public static void toBytes(String src, int srcOffset, int srcLen, ByteBuffer dst, CharsetEncoder encoder) {
        CharBuffer c = CharBuffer.wrap(src, srcOffset, srcOffset + srcLen);
        encoder.encode(c, dst, true);
    }

    /**
     * Create a new UTF-8 encoder which replaces malformed input and
     * unmappable characters instead of reporting them.
     *
     * @see Utf8#toBytes(String, int, int, ByteBuffer, CharsetEncoder)
     */
    public static CharsetEncoder getNewEncoder() {
        return UTF_8.newEncoder().onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
    }

    /**
     * Count the number of bytes needed to represent a given sequence of 16-bit
     * char values as a UTF-8 encoded array. This method is written to be cheap
     * to invoke.
     *
     * Note: It is strongly assumed the character sequence is valid.
     */
    public static int byteCount(CharSequence str) { return byteCount(str, 0, str.length()); }

    /**
     * Count the number of bytes needed to represent a given sequence of 16-bit
     * char values as a UTF-8 encoded array. This method is written to be cheap
     * to invoke.
     *
     * Note: It is strongly assumed the character sequence is valid. Any
     * surrogate is taken to be the start of a well-formed surrogate pair.
     *
     * @param str
     *            the character sequence to inspect
     * @param offset
     *            index of first char to count
     * @param length
     *            number of chars to count
     * @return the number of bytes in the UTF-8 representation
     */
    public static int byteCount(CharSequence str, int offset, int length) {
        int count = 0;
        int barrier = offset + length;
        int i = offset;
        while (i < barrier) {
            int codeUnit = str.charAt(i);
            if (codeUnit < 0x80) {
                count += 1;
                i += 1;
            } else if (codeUnit < 0x800) {
                count += 2;
                i += 1;
            } else if ((codeUnit & 0xF800) == 0xD800) {
                // bit masking to check (codeUnit >= 0xd800 && codeUnit < 0xe000):
                // a surrogate pair is two chars and becomes four bytes
                count += 4;
                i += 2;
            } else {
                count += 3;
                i += 1;
            }
        }
        return count;
    }

    /**
     * Count the number of Unicode code units ("UTF-16 characters") needed to
     * represent a given array of UTF-8 characters. This method is written to be
     * cheap to invoke.
     *
     * Note: It is strongly assumed the sequence is valid.
     */
    public static int unitCount(byte[] utf8) { return unitCount(utf8, 0, utf8.length); }

    /**
     * Count the number of Unicode code units ("UTF-16 characters") needed to
     * represent a given array of UTF-8 characters. This method is written to be
     * cheap to invoke.
     *
     * Note: It is strongly assumed the sequence is valid. Only the first byte
     * of each sequence is inspected.
     *
     * @param utf8
     *            raw data
     * @param offset
     *            index of first byte of UTF-8 sequence to check
     * @param length
     *            number of bytes in the UTF-8 sequence to check
     * @return the number of UTF-16 code units needed
     */
    public static int unitCount(byte[] utf8, int offset, int length) {
        int units = 0;
        int barrier = offset + length;
        int i = offset;
        while (i < barrier) {
            byte firstByte = utf8[i];
            if (firstByte >= 0) {
                // 0x00-0x7F: single byte, one code unit
                units += 1;
                i += 1;
            } else if (firstByte >= -16) {
                // 0xF0-0xFF: four bytes, a surrogate pair in UTF-16
                units += 2;
                i += 4;
            } else if (firstByte >= -32) {
                // 0xE0-0xEF: three bytes, one code unit
                units += 1;
                i += 3;
            } else {
                // everything else is treated as a two byte sequence, one code unit
                units += 1;
                i += 2;
            }
        }
        return units;
    }

    /**
     * Calculate the number of Unicode code units ("UTF-16 characters") needed
     * to represent a given UTF-8 encoded code point.
     *
     * @param firstByte
     *            the first byte of a character encoded as UTF-8
     * @return the number of UTF-16 code units needed to represent the given
     *         code point
     */
    public static int unitCount(byte firstByte) {
        // Only lead bytes 0xF0-0xFF (-16 to -1 as signed bytes) start a four byte
        // sequence, which is the only kind needing a surrogate pair.
        return (firstByte < 0 && firstByte >= -16) ? 2 : 1;
    }

    /**
     * Inspects a byte assumed to be the first byte in a UTF-8 sequence to check
     * how many bytes in total the sequence of bytes will use.
     *
     * @param firstByte
     *            the first byte of a UTF8 encoded character
     * @return the number of bytes used to encode the character
     */
    // To avoid code duplication, this function should be used by unitCount(),
    // but then unitCount(byte[], int, int) would not be as tight. This class is in general
    // meant to be safe to use in performance sensitive code.
    public static int totalBytes(byte firstByte) {
        if (firstByte >= 0) {
            return 1;
        }
        if (firstByte >= -16) {
            return 4;
        }
        if (firstByte >= -32) {
            return 3;
        }
        return 2;
    }

    /**
     * Returns an integer array with the same length as the input string plus
     * one. For every char index in the input, the corresponding value gives
     * the index into the UTF-8 byte sequence that can be created from the
     * input. The last element is the total number of UTF-8 bytes. Both chars
     * of a surrogate pair map to the byte index where the four byte sequence
     * starts.
     *
     * Note: It is strongly assumed the character sequence is valid.
     *
     * @param value
     *            a String to generate UTF-8 byte indexes from
     * @return an array containing corresponding UTF-8 byte indexes
     */
    public static int[] calculateBytePositions(CharSequence value) {
        int barrier = value.length();
        int[] positions = new int[barrier + 1];

        int bytePos = 0;
        int i = 0;
        int writeIndex = 0;
        positions[writeIndex++] = bytePos;
        while (i < barrier) {
            int codeUnit = value.charAt(i);
            if (codeUnit < 0x80) {
                bytePos += 1;
                i += 1;
            } else if (codeUnit < 0x800) {
                bytePos += 2;
                i += 1;
            } else if ((codeUnit & 0xF800) == 0xD800) {
                // bit masking to check (codeUnit >= 0xd800 && codeUnit < 0xe000):
                // double position write, as we have a surrogate pair
                positions[writeIndex++] = bytePos;
                bytePos += 4;
                i += 2;
            } else {
                bytePos += 3;
                i += 1;
            }
            positions[writeIndex++] = bytePos;
        }
        return positions;
    }

    /**
     * Returns an array of the same length as the input array plus one. For
     * every index in the array, the corresponding value gives the index into
     * the Java string (UTF-16 sequence) that can be created from the input.
     * All bytes of a multi-byte sequence map to the index of the first UTF-16
     * code unit of the character. The last element is the length of the
     * decoded string.
     *
     * @param utf8
     *            a byte array containing a string encoded as UTF-8. Note: It is
     *            strongly assumed that this sequence is correct.
     * @return an array containing corresponding UTF-16 character indexes. If input
     *            array is empty, returns an array containing a single zero.
     */
    public static int[] calculateStringPositions(byte[] utf8) {
        int[] positions = new int[utf8.length + 1];
        int utf8BytePos = 0;
        int charPos = 0;
        while (utf8BytePos < utf8.length) {
            byte firstByte = utf8[utf8BytePos];
            int utf8SequenceLen = totalBytes(firstByte);
            for (int k = 0; k < utf8SequenceLen; k++) {
                positions[utf8BytePos + k] = charPos;
            }
            utf8BytePos += utf8SequenceLen;
            // A four byte sequence decodes to a surrogate pair, i.e. two UTF-16 code
            // units. This must be accounted for at every sequence, not only the last.
            charPos += unitCount(firstByte);
        }
        positions[utf8.length] = charPos;
        return positions;
    }


    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into a newly allocated array.
     *
     * @param codepoint Unicode codepoint to encode
     * @return a new array holding exactly the UTF-8 encoding of the codepoint
     */
    public static byte[] encode(int codepoint) {
        byte[] destination = new byte[codePointAsUtf8Length(codepoint)];
        encode(codepoint, destination, 0);
        return destination;
    }

    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into an array.
     *
     * @param codepoint Unicode codepoint to encode
     * @param destination array to write into
     * @param offset index of first byte written
     * @return index of the first byte after the last byte written (i.e. offset plus number of bytes written)
     * @throws IndexOutOfBoundsException if there is insufficient room for the encoded data in the given array
     */
    public static int encode(int codepoint, byte[] destination, int offset) {
        int writeOffset = offset;
        byte firstByte = firstByte(codepoint);
        int leftToWrite = codePointAsUtf8Length(codepoint) - 1;
        destination[writeOffset++] = firstByte;
        while (leftToWrite-- > 0) {
            destination[writeOffset++] = trailingOctet(codepoint, leftToWrite);
        }
        return writeOffset;
    }

    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into a
     * ByteBuffer.
     *
     * @param codepoint
     *            Unicode codepoint to encode
     * @param destination
     *            buffer to write into
     * @throws BufferOverflowException
     *             if the buffer's limit is met while writing (propagated from
     *             the ByteBuffer)
     * @throws ReadOnlyBufferException
     *             if the buffer is read only (propagated from the ByteBuffer)
     */
    public static void encode(int codepoint, ByteBuffer destination) {
        byte firstByte = firstByte(codepoint);
        int leftToWrite = codePointAsUtf8Length(codepoint) - 1;
        destination.put(firstByte);
        while (leftToWrite-- > 0) {
            destination.put(trailingOctet(codepoint, leftToWrite));
        }
    }

    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into an
     * OutputStream.
     *
     * @param codepoint
     *            Unicode codepoint to encode
     * @param destination
     *            stream to write into
     * @return number of bytes written
     * @throws IOException
     *             propagated from stream
     */
    public static int encode(int codepoint, OutputStream destination) throws IOException {
        byte firstByte = firstByte(codepoint);
        int toWrite = codePointAsUtf8Length(codepoint);
        int leftToWrite = toWrite - 1;
        destination.write(firstByte);
        while (leftToWrite-- > 0) {
            destination.write(trailingOctet(codepoint, leftToWrite));
        }
        return toWrite;
    }


    /**
     * Builds a continuation byte (binary 10xxxxxx) carrying six bits of the codepoint.
     *
     * @param codepoint the codepoint being encoded
     * @param leftToWrite the number of continuation bytes which follow this one,
     *                    which decides which six bits of the codepoint to pick
     */
    private static byte trailingOctet(int codepoint, int leftToWrite) {
        return (byte) (0x80 | ((codepoint >> (6 * leftToWrite)) & 0x3F));
    }

    /** Builds the lead byte of the UTF-8 encoding of the given codepoint. */
    private static byte firstByte(int codepoint) {
        if (codepoint < 0x80) {
            return (byte) codepoint;
        }
        if (codepoint < 0x800) {
            return (byte) (0xC0 | codepoint >> 6);
        }
        if (codepoint < 0x10000) {
            return (byte) (0xE0 | codepoint >> 12);
        }
        return (byte) (0xF0 | codepoint >> 18);
    }

    /**
     * Return the number of octets needed to encode a valid Unicode codepoint as UTF-8.
     *
     * @param codepoint the Unicode codepoint to inspect
     * @return the number of bytes needed for UTF-8 representation
     */
    public static int codePointAsUtf8Length(int codepoint) {
        if (codepoint < 0x80) {
            return 1;
        }
        if (codepoint < 0x800) {
            return 2;
        }
        if (codepoint < 0x10000) {
            return 3;
        }
        return 4;
    }

}