All downloads are free. Search and download functionality uses the official Maven repository.

io.socket.utf8.UTF8 Maven / Gradle / Ivy

There is a newer version: 1.1.28
Show newest version
package io.socket.utf8;

import java.util.ArrayList;
import java.util.List;

/**
 * UTF-8 encoder/decoder ported from utf8.js.
 *
 * <p>Both directions operate on "byte strings": strings in which every character stands
 * for a single UTF-8 byte. {@link #encode(String)} only ever emits characters in the range
 * {@code 0x00-0xFF}; {@link #decode(String)} looks only at the low eight bits of each
 * code point of its input.
 *
 * <p>The class keeps no mutable static state: each call works exclusively on its own
 * locals, so concurrent calls do not interfere with one another.
 *
 * @see <a href="https://github.com/mathiasbynens/utf8.js">utf8.js</a>
 */
public final class UTF8 {

    private static final String INVALID_CONTINUATION_BYTE = "Invalid continuation byte";
    private static final String INVALID_BYTE_INDEX = "Invalid byte index";

    /** Sentinel returned by {@link #decodeSymbol(ByteCursor)} once the input is exhausted. */
    private static final int END_OF_INPUT = -1;

    private UTF8() {}

    /**
     * Encodes a string as a UTF-8 byte string.
     *
     * @param string the text to encode; must not be {@code null}
     * @return a string with one character (value {@code 0x00-0xFF}) per UTF-8 byte
     * @throws UTF8Exception if {@code string} contains an unpaired surrogate
     */
    public static String encode(String string) throws UTF8Exception {
        StringBuilder byteString = new StringBuilder(string.length());
        for (int codePoint : ucs2decode(string)) {
            encodeCodePoint(byteString, codePoint);
        }
        return byteString.toString();
    }

    /**
     * Decodes a UTF-8 byte string back into text.
     *
     * @param byteString a string whose code points are read as bytes (only the low eight
     *                   bits of each are used); must not be {@code null}
     * @return the decoded text
     * @throws UTF8Exception if the input ends in the middle of a sequence, contains a
     *                       malformed lead or continuation byte, uses an overlong encoding,
     *                       encodes a surrogate, or encodes a value above U+10FFFF
     */
    public static String decode(String byteString) throws UTF8Exception {
        // The cursor is local to this call; nothing is shared between invocations.
        ByteCursor cursor = new ByteCursor(ucs2decode(byteString));
        StringBuilder output = new StringBuilder();
        int codePoint;
        while ((codePoint = decodeSymbol(cursor)) != END_OF_INPUT) {
            output.appendCodePoint(codePoint);
        }
        return output.toString();
    }

    /**
     * Splits a string into its code points; a valid surrogate pair yields one entry,
     * an unpaired surrogate is passed through unchanged.
     */
    private static int[] ucs2decode(String string) {
        int length = string.length();
        int[] output = new int[string.codePointCount(0, length)];
        int counter = 0;
        int value;
        for (int i = 0; i < length; i += Character.charCount(value)) {
            value = string.codePointAt(i);
            output[counter++] = value;
        }
        return output;
    }

    /**
     * Appends the UTF-8 bytes of {@code codePoint} to {@code out}, one character per byte.
     *
     * @throws UTF8Exception if {@code codePoint} is a surrogate (U+D800..U+DFFF)
     */
    private static void encodeCodePoint(StringBuilder out, int codePoint) throws UTF8Exception {
        if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence: plain ASCII
            out.append((char) codePoint);
            return;
        }
        if ((codePoint & 0xFFFFF800) == 0) { // 2-byte sequence
            out.append((char) (((codePoint >> 6) & 0x1F) | 0xC0));
        } else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence
            // Validate before emitting anything so no partial sequence is produced.
            checkScalarValue(codePoint);
            out.append((char) (((codePoint >> 12) & 0x0F) | 0xE0));
            out.append(createByte(codePoint, 6));
        } else if ((codePoint & 0xFFE00000) == 0) { // 4-byte sequence
            out.append((char) (((codePoint >> 18) & 0x07) | 0xF0));
            out.append(createByte(codePoint, 12));
            out.append(createByte(codePoint, 6));
        }
        // Every multi-byte form ends with a continuation byte carrying the low six bits.
        out.append((char) ((codePoint & 0x3F) | 0x80));
    }

    /** Builds the continuation byte {@code 10xxxxxx} from the six bits starting at {@code shift}. */
    private static char createByte(int codePoint, int shift) {
        return (char) (((codePoint >> shift) & 0x3F) | 0x80);
    }

    /**
     * Decodes the next code point from {@code cursor}.
     *
     * @return the decoded code point, or {@link #END_OF_INPUT} when no bytes remain
     * @throws UTF8Exception if the next sequence is truncated or malformed
     */
    private static int decodeSymbol(ByteCursor cursor) throws UTF8Exception {
        if (!cursor.hasNext()) {
            return END_OF_INPUT;
        }

        int byte1 = cursor.next();

        if ((byte1 & 0x80) == 0) { // 1-byte sequence
            return byte1;
        }

        if ((byte1 & 0xE0) == 0xC0) { // 2-byte sequence
            int byte2 = readContinuationByte(cursor);
            int codePoint = ((byte1 & 0x1F) << 6) | byte2;
            if (codePoint < 0x80) { // overlong encoding
                throw new UTF8Exception(INVALID_CONTINUATION_BYTE);
            }
            return codePoint;
        }

        if ((byte1 & 0xF0) == 0xE0) { // 3-byte sequence
            int byte2 = readContinuationByte(cursor);
            int byte3 = readContinuationByte(cursor);
            int codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
            if (codePoint < 0x0800) { // overlong encoding
                throw new UTF8Exception(INVALID_CONTINUATION_BYTE);
            }
            checkScalarValue(codePoint);
            return codePoint;
        }

        if ((byte1 & 0xF8) == 0xF0) { // 4-byte sequence
            int byte2 = readContinuationByte(cursor);
            int byte3 = readContinuationByte(cursor);
            int byte4 = readContinuationByte(cursor);
            // Bit 3 of the lead byte is known to be clear here, so 0x07 selects all payload bits.
            int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
            if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
                return codePoint;
            }
        }

        // Stray continuation byte, invalid lead byte, or out-of-range 4-byte value.
        throw new UTF8Exception(INVALID_CONTINUATION_BYTE);
    }

    /**
     * Reads one continuation byte ({@code 10xxxxxx}) and returns its six payload bits.
     *
     * @throws UTF8Exception if the input is exhausted or the byte is not a continuation byte
     */
    private static int readContinuationByte(ByteCursor cursor) throws UTF8Exception {
        if (!cursor.hasNext()) {
            throw new UTF8Exception(INVALID_BYTE_INDEX);
        }

        int continuationByte = cursor.next();
        if ((continuationByte & 0xC0) == 0x80) {
            return continuationByte & 0x3F;
        }

        throw new UTF8Exception(INVALID_CONTINUATION_BYTE);
    }

    /**
     * Rejects surrogate code points, which are not Unicode scalar values.
     *
     * @throws UTF8Exception if {@code codePoint} lies in U+D800..U+DFFF
     */
    private static void checkScalarValue(int codePoint) throws UTF8Exception {
        if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
            throw new UTF8Exception(
                    "Lone surrogate U+" + Integer.toHexString(codePoint).toUpperCase() +
                    " is not a scalar value"
            );
        }
    }

    /** Forward-only reader over the input of a single {@code decode} call. */
    private static final class ByteCursor {
        private final int[] units;
        private int position;

        ByteCursor(int[] units) {
            this.units = units;
        }

        /** Returns whether at least one more byte can be read. */
        boolean hasNext() {
            return position < units.length;
        }

        /** Returns the low eight bits of the next unit and advances past it. */
        int next() {
            return units[position++] & 0xFF;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy