All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.text.Utf8 Maven / Gradle / Ivy

Go to download

Library for use in Java components of Vespa. Shared code which does not fit anywhere else.

There is a newer version: 8.441.21
Show newest version
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.ReadOnlyBufferException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Utility class with functions for handling UTF-8: conversion between strings and
 * UTF-8 byte sequences, size calculations in both directions, and encoding of single
 * code points.
 *
 * <p>The counting and position methods assume their input is valid (well formed UTF-8,
 * or UTF-16 with correctly paired surrogates); they do not validate it.</p>
 *
 * @author arnej27959
 * @author Steinar Knutsen
 * @author baldersheim
 */
public final class Utf8 {

    // Templates only: these arrays are never handed out directly, callers get copies.
    private static final byte[] TRUE = {(byte) 't', (byte) 'r', (byte) 'u', (byte) 'e'};
    private static final byte[] FALSE = {(byte) 'f', (byte) 'a', (byte) 'l', (byte) 's', (byte) 'e'};
    private static final byte[] LONG_MIN_VALUE_BYTES = String.valueOf(Long.MIN_VALUE).getBytes(UTF_8);

    /** Returns the Charset instance for UTF-8 */
    public static Charset getCharset() {
        return UTF_8;
    }

    /** To be used instead of String.String(byte[] bytes) */
    public static String toStringStd(byte[] data) {
        return new String(data, UTF_8);
    }

    /**
     * Utility method as toString(byte[]), for a sub-range of an array.
     *
     * @param data
     *            bytes to decode
     * @param offset
     *            index of first byte to decode
     * @param length
     *            number of bytes to decode
     * @return String decoded from UTF-8
     */
    public static String toString(byte[] data, int offset, int length) {
        return toString(ByteBuffer.wrap(data, offset, length));
    }

    /**
     * Fetch a string from a ByteBuffer instance. ByteBuffer instances are
     * stateful, so it is assumed the caller manipulates the instance's limit if
     * the entire buffer is not a string.
     *
     * @param data
     *            The UTF-8 data source
     * @return a decoded String
     */
    public static String toString(ByteBuffer data) {
        CharBuffer c = UTF_8.decode(data);
        return c.toString();
    }

    /**
     * Uses String.getBytes directly.
     */
    public static byte[] toBytesStd(String str) {
        return str.getBytes(UTF_8);
    }

    /**
     * Encode a long as its decimal representation, i.e. toAsciiBytes(15L) will
     * return "15" encoded as UTF-8. In other words it is an optimized version
     * of String.valueOf() followed by UTF-8 encoding. Avoid going through
     * string in order to get a simple UTF-8 sequence.
     *
     * @param l
     *            value to represent as a decimal number encoded as UTF-8
     * @return a newly allocated byte array owned by the caller
     */
    public static byte[] toAsciiBytes(long l) {
        // Handle Long.MIN_VALUE specifically, since -l overflows for it.
        if (Long.MIN_VALUE == l) {
            // Copy, so a caller modifying the result cannot corrupt the shared constant.
            return LONG_MIN_VALUE_BYTES.clone();
        }
        // Count the decimal digits of |l|.
        int count = 1;
        for (long v = (l < 0) ? -l : l; v >= 10; v = v / 10) {
            count++;
        }
        byte[] buf = new byte[count + ((l < 0) ? 1 : 0)];
        int offset = 0;
        if (l < 0) {
            buf[offset++] = (byte) '-';
            l = -l;
        }
        // Fill in the digits from the least significant one and backwards.
        for (count--; count >= 0; l = l / 10, count--) {
            buf[count + offset] = (byte) (0x30 + l % 10);
        }
        return buf;
    }

    /**
     * Encode a boolean as the ASCII text "true" or "false".
     *
     * @param v
     *            the value to encode
     * @return a newly allocated byte array owned by the caller
     */
    public static byte[] toAsciiBytes(boolean v) {
        // Copy, so a caller modifying the result cannot corrupt the shared constants.
        return (v ? TRUE : FALSE).clone();
    }

    /**
     * Encode a string as UTF-8.
     *
     * @param string The string to encode.
     * @return Utf8 encoded array
     */
    public static byte[] toBytes(String string) {
        // This is just a wrapper for String::getBytes. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
        return string.getBytes(UTF_8);
    }

    /**
     * Decode a UTF-8 string.
     *
     * @param utf8 the bytes to decode
     * @return the decoded String
     */
    public static String toString(byte[] utf8) {
        // This is just a wrapper for String::new. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
        return new String(utf8, UTF_8);
    }

    /**
     * Utility method as toBytes(String), for a substring.
     *
     * @param str
     *            String to encode
     * @param offset
     *            index of first character to encode
     * @param length
     *            number of characters to encode
     * @return substring encoded as UTF-8
     */
    public static byte[] toBytes(String str, int offset, int length) {
        CharBuffer c = CharBuffer.wrap(str, offset, offset + length);
        ByteBuffer b = UTF_8.encode(c);
        byte[] result = new byte[b.remaining()];
        b.get(result);
        return result;
    }

    /**
     * Direct encoding of a String into an array.
     *
     * @param str
     *            string to encode
     * @param srcOffset
     *            index of first character in string to encode
     * @param srcLen
     *            number of characters in string to encode
     * @param dst
     *            destination for encoded data
     * @param dstOffset
     *            index of first position to write data
     * @return the number of bytes written to the array.
     */
    public static int toBytes(String str, int srcOffset, int srcLen, byte[] dst, int dstOffset) {
        CharBuffer c = CharBuffer.wrap(str, srcOffset, srcOffset + srcLen);
        ByteBuffer b = UTF_8.encode(c);
        int encoded = b.remaining();
        b.get(dst, dstOffset, encoded);
        return encoded;
    }

    /**
     * Encode a string directly into a ByteBuffer instance.
     *
     * <p>
     * This method is somewhat more cumbersome than the rest of the helper
     * methods in this library, as it is intended for use cases in the following
     * style, if extraneous copying is highly undesirable:
     * </p>
     *
     * <pre>
     * String[] a = {"abc", "def", "ghi\u00e8"};
     * int[] aLens = {3, 3, 5};
     * CharsetEncoder ce = Utf8.getNewEncoder();
     * ByteBuffer forWire = ByteBuffer.allocate(someNumber);
     *
     * for (int i = 0; i &lt; a.length; i++) {
     *     forWire.putInt(aLens[i]);
     *     Utf8.toBytes(a[i], 0, a[i].length(), forWire, ce);
     * }
     * </pre>
     *
     * <p>
     * Note that the result reported by the encoder is discarded by this method,
     * so the caller is responsible for making sure the destination has room
     * for the encoded data.
     * </p>
     *
     * @see Utf8#getNewEncoder()
     *
     * @param src the string to encode
     * @param srcOffset index of first character to encode
     * @param srcLen number of characters to encode
     * @param dst the destination ByteBuffer
     * @param encoder the character encoder to use
     */
    public static void toBytes(String src, int srcOffset, int srcLen, ByteBuffer dst, CharsetEncoder encoder) {
        CharBuffer c = CharBuffer.wrap(src, srcOffset, srcOffset + srcLen);
        encoder.encode(c, dst, true);
    }

    /**
     * Create a new UTF-8 encoder which replaces malformed input and
     * unmappable characters instead of reporting them.
     *
     * @see Utf8#toBytes(String, int, int, ByteBuffer, CharsetEncoder)
     */
    public static CharsetEncoder getNewEncoder() {
        return UTF_8.newEncoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
    }

    /**
     * Count the number of bytes needed to represent a given sequence of 16-bit
     * char values as a UTF-8 encoded array. This method is written to be cheap
     * to invoke.
     *
     * Note: It is strongly assumed the character sequence is valid.
     */
    public static int byteCount(CharSequence str) {
        return byteCount(str, 0, str.length());
    }

    /**
     * Count the number of bytes needed to represent a given sequence of 16-bit
     * char values as a UTF-8 encoded array. This method is written to be cheap
     * to invoke.
     *
     * Note: It is strongly assumed the character sequence is valid.
     *
     * @param str
     *            the character sequence to inspect
     * @param offset
     *            index of the first char to count
     * @param length
     *            number of chars to count
     * @return the number of bytes in the UTF-8 representation
     */
    public static int byteCount(CharSequence str, int offset, int length) {
        int count = 0;
        int barrier = offset + length;
        int i = offset;
        while (i < barrier) {
            int codePoint = (int) str.charAt(i);
            if (codePoint < 0x800) {
                if (codePoint < 0x80) {
                    ++count;
                } else {
                    count += 2;
                }
                ++i;
            } else {
                // bit masking to check (codePoint >= 0xd800 && codePoint < 0xe000)
                if ((codePoint & 0xF800) == 0xD800) {
                    // surrogate pair: two chars become one 4 byte sequence
                    count += 4;
                    i += 2;
                } else {
                    count += 3;
                    ++i;
                }
            }
        }
        return count;
    }

    /**
     * Count the number of Unicode code units ("UTF-16 characters") needed to
     * represent a given array of UTF-8 characters. This method is written to be
     * cheap to invoke.
     *
     * Note: It is strongly assumed the sequence is valid.
     */
    public static int unitCount(byte[] utf8) {
        return unitCount(utf8, 0, utf8.length);
    }

    /**
     * Count the number of Unicode code units ("UTF-16 characters") needed to
     * represent a given array of UTF-8 characters. This method is written to be
     * cheap to invoke.
     *
     * Note: It is strongly assumed the sequence is valid.
     *
     * @param utf8
     *            raw data
     * @param offset
     *            index of first byte of UTF-8 sequence to check
     * @param length
     *            number of bytes in the UTF-8 sequence to check
     * @return the number of UTF-16 code units needed
     */
    public static int unitCount(byte[] utf8, int offset, int length) {
        int units = 0;
        int barrier = offset + length;
        int i = offset;
        while (i < barrier) {
            byte firstByte = utf8[i];
            // Signed comparisons: -16 is 0xF0 and -32 is 0xE0.
            if (firstByte >= -16) {
                if (firstByte >= 0) {
                    ++units;
                    ++i;
                } else {
                    // 4 byte sequence, which needs a surrogate pair
                    units += 2;
                    i += 4;
                }
            } else {
                if (firstByte >= -32) {
                    ++units;
                    i += 3;
                } else {
                    ++units;
                    i += 2;
                }
            }
        }
        return units;
    }

    /**
     * Calculate the number of Unicode code units ("UTF-16 characters") needed
     * to represent a given UTF-8 encoded code point.
     *
     * @param firstByte
     *            the first byte of a character encoded as UTF-8
     * @return the number of UTF-16 code units needed to represent the given
     *         code point
     */
    public static int unitCount(byte firstByte) {
        // Only first bytes of 4 byte sequences (0xF0 and up, negative as signed bytes)
        // give two code units.
        if (firstByte >= -16 && firstByte < 0) {
            return 2;
        }
        return 1;
    }

    /**
     * Inspects a byte assumed to be the first byte in a UTF8 to check how many
     * bytes in total the sequence of bytes will use.
     *
     * @param firstByte
     *            the first byte of a UTF8 encoded character
     * @return the number of bytes used to encode the character
     */
    // To avoid code duplication, this function should be used by unitCount(),
    // but then unitCount(byte[], int, int) would not be as tight. This class is in general
    // meant to be safe to use in performance sensitive code.
    public static int totalBytes(byte firstByte) {
        if (firstByte >= -16) {
            if (firstByte >= 0) {
                return 1;
            } else {
                return 4;
            }
        } else {
            if (firstByte >= -32) {
                return 3;
            } else {
                return 2;
            }
        }
    }

    /**
     * Returns an integer array of the same length as the input string plus one. For
     * every index in the array, the corresponding value gives the index into
     * the UTF-8 byte sequence that can be created from the input.
     *
     * @param value
     *            a String to generate UTF-8 byte indexes from
     * @return an array containing corresponding UTF-8 byte indexes
     */
    public static int[] calculateBytePositions(CharSequence value) {
        int[] positions = new int[value.length() + 1];
        int bytePos = 0;
        int barrier = value.length();
        int i = 0;
        int codepointNo = 0;
        positions[codepointNo++] = bytePos;
        while (i < barrier) {
            int codePoint = (int) value.charAt(i);
            if (codePoint < 0x800) {
                if (codePoint < 0x80) {
                    ++bytePos;
                } else {
                    bytePos += 2;
                }
                ++i;
            } else {
                // bit masking to check (codePoint >= 0xd800 && codePoint < 0xe000)
                if ((codePoint & 0xF800) == 0xD800) {
                    // double position write, as we have a surrogate pair
                    positions[codepointNo++] = bytePos;
                    bytePos += 4;
                    i += 2;
                } else {
                    bytePos += 3;
                    ++i;
                }
            }
            positions[codepointNo++] = bytePos;
        }
        return positions;
    }

    /**
     * Returns an array of the same length as the input array plus one. For
     * every index in the array, the corresponding value gives the index into
     * the Java string (UTF-16 sequence) that can be created from the input.
     * All bytes of one UTF-8 sequence map to the index of the first code unit
     * of the character they encode; the last entry is the total number of
     * UTF-16 code units.
     *
     * @param utf8
     *            a byte array containing a string encoded as UTF-8. Note: It is
     *            strongly assumed that this sequence is correct.
     * @return an array containing corresponding UTF-16 character indexes. If input
     *         array is empty, returns an array containing a single zero.
     */
    public static int[] calculateStringPositions(byte[] utf8) {
        if (utf8.length == 0) {
            return new int[] { 0 };
        }
        int[] positions = new int[utf8.length + 1];
        int utf8BytePos = 0;
        int charPos = 0;
        while (utf8BytePos < utf8.length) {
            byte firstByte = utf8[utf8BytePos];
            int utf8SequenceLen = totalBytes(firstByte);
            for (int utf8SequenceCnt = 0; utf8SequenceCnt < utf8SequenceLen; utf8SequenceCnt++) {
                positions[utf8BytePos + utf8SequenceCnt] = charPos;
            }
            utf8BytePos += utf8SequenceLen;
            // A 4 byte sequence becomes a surrogate pair, i.e. two UTF-16 code units,
            // so the string index must advance by two for every such sequence,
            // not only when it is the last one in the input.
            charPos += unitCount(firstByte);
        }
        positions[utf8.length] = charPos;
        return positions;
    }

    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into a newly allocated array.
     *
     * @param codepoint Unicode codepoint to encode
     * @return a new array containing exactly the UTF-8 representation of the codepoint
     */
    public static byte[] encode(int codepoint) {
        byte[] destination = new byte[codePointAsUtf8Length(codepoint)];
        encode(codepoint, destination, 0);
        return destination;
    }

    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into an array.
     *
     * @param codepoint Unicode codepoint to encode
     * @param destination array to write into
     * @param offset index of first byte written
     * @return index of the first byte after the last byte written (i.e. offset plus number of bytes written)
     * @throws IndexOutOfBoundsException if there is insufficient room for the encoded data in the given array
     */
    public static int encode(int codepoint, byte[] destination, int offset) {
        int writeOffset = offset;
        byte firstByte = firstByte(codepoint);
        int leftToWrite = codePointAsUtf8Length(codepoint) - 1;
        destination[writeOffset++] = firstByte;
        // leftToWrite is decremented before use, so it is the number of
        // octets still to follow the one being written.
        while (leftToWrite-- > 0) {
            destination[writeOffset++] = trailingOctet(codepoint, leftToWrite);
        }
        return writeOffset;
    }

    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into a
     * ByteBuffer.
     *
     * @param codepoint
     *            Unicode codepoint to encode
     * @param destination
     *            buffer to write into
     * @throws BufferOverflowException
     *             if the buffer's limit is met while writing (propagated from
     *             the ByteBuffer)
     * @throws ReadOnlyBufferException
     *             if the buffer is read only (propagated from the ByteBuffer)
     */
    public static void encode(int codepoint, ByteBuffer destination) {
        byte firstByte = firstByte(codepoint);
        int leftToWrite = codePointAsUtf8Length(codepoint) - 1;
        destination.put(firstByte);
        while (leftToWrite-- > 0) {
            destination.put(trailingOctet(codepoint, leftToWrite));
        }
    }

    /**
     * Encode a valid Unicode codepoint as a sequence of UTF-8 bytes into an
     * OutputStream.
     *
     * @param codepoint
     *            Unicode codepoint to encode
     * @param destination
     *            stream to write into
     * @return number of bytes written
     * @throws IOException
     *             propagated from stream
     */
    public static int encode(int codepoint, OutputStream destination) throws IOException {
        byte firstByte = firstByte(codepoint);
        int toWrite = codePointAsUtf8Length(codepoint);
        int leftToWrite = toWrite - 1;
        destination.write(firstByte);
        while (leftToWrite-- > 0) {
            destination.write(trailingOctet(codepoint, leftToWrite));
        }
        return toWrite;
    }

    /**
     * Builds one continuation octet (binary 10xxxxxx) of a multi byte sequence.
     *
     * @param codepoint the codepoint being encoded
     * @param leftToWrite the number of octets following the one to build,
     *                    which selects the group of six bits to use
     */
    private static byte trailingOctet(int codepoint, int leftToWrite) {
        return (byte) (0x80 | ((codepoint >> (6 * leftToWrite)) & 0x3F));
    }

    /** Builds the leading octet of the UTF-8 representation of the given codepoint. */
    private static byte firstByte(int codepoint) {
        if (codepoint < 0x800) {
            if (codepoint < 0x80) {
                return (byte) codepoint;
            } else {
                return (byte) (0xC0 | codepoint >> 6);
            }
        } else {
            if (codepoint < 0x10000) {
                return (byte) (0xE0 | codepoint >> 12);
            } else {
                return (byte) (0xF0 | codepoint >> 18);
            }
        }
    }

    /**
     * Return the number of octets needed to encode a valid Unicode codepoint as UTF-8.
     *
     * @param codepoint the Unicode codepoint to inspect
     * @return the number of bytes needed for UTF-8 representation
     */
    public static int codePointAsUtf8Length(int codepoint) {
        if (codepoint < 0x800) {
            if (codepoint < 0x80) {
                return 1;
            } else {
                return 2;
            }
        } else {
            if (codepoint < 0x10000) {
                return 3;
            } else {
                return 4;
            }
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy