All Downloads are FREE. Search and download functionalities are using the official Maven repository.

webit.script.util.charset.UTF8 Maven / Gradle / Ivy

// Copyright (c) 2013-2014, Webit Team. All Rights Reserved.
package webit.script.util.charset;

/**
 * Minimal array-to-array UTF-8 codec.
 *
 * <p>Both methods write into a caller-supplied destination array starting at
 * offset 0 and perform no capacity checks of their own; an undersized
 * destination results in an {@link ArrayIndexOutOfBoundsException}.
 */
public class UTF8 {

    /**
     * Upper bound on the number of bytes {@link #encode} emits per input
     * {@code char}: a BMP char needs at most 3 bytes, and a surrogate pair
     * (2 chars) needs 4 bytes.
     */
    public static final int MAX_BYTES_PER_CHAR = 3;

    /**
     * Decodes UTF-8 bytes into UTF-16 chars.
     *
     * <p>A malformed, overlong or truncated sequence yields one U+FFFD for its
     * lead byte; decoding then resumes at the byte right after that lead byte.
     * Code points above the BMP are written as a surrogate pair.
     *
     * <p>Note: the 3-byte form does not reject encoded surrogates
     * (U+D800..U+DFFF); such input is passed through as a lone surrogate char.
     *
     * @param sa    source bytes
     * @param index offset of the first byte to decode
     * @param len   number of bytes to decode
     * @param da    destination chars, filled from offset 0; never receives more
     *              than {@code len} chars
     * @return number of chars written to {@code da}
     */
    public static int decode(final byte[] sa, int index, int len, final char[] da) {
        final int end = index + len;
        int count = 0;

        while (index < end) {
            // Bytes are sign-extended on purpose: every non-ASCII byte is negative,
            // so the lead-byte class can be tested with an arithmetic shift.
            int b1 = sa[index++];
            if (b1 >= 0) {
                // 1 byte, 7 bits: 0xxxxxxx
                da[count++] = (char) b1;
                continue;
            } else if ((b1 >> 5) == -2 && index < end) {
                // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                int b2 = sa[index++];
                // (b1 & 0x1e) == 0 means lead byte C0/C1, i.e. an overlong encoding.
                if ((b1 & 0x1e) != 0x0 && (b2 & 0xc0) == 0x80) {
                    // XOR with the sign-extended marker bits strips them in one step.
                    da[count++] = (char) (((b1 << 6) ^ b2)
                            ^ (((byte) 0xC0 << 6)
                            ^ ((byte) 0x80)));
                    continue;
                }
                index--;
            } else if ((b1 >> 4) == -2 && index + 1 < end) {
                // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                int b2 = sa[index++];
                int b3 = sa[index++];
                // E0 followed by 80..9F would be an overlong encoding.
                if ((b1 != (byte) 0xe0 || (b2 & 0xe0) != 0x80)
                        && (b2 & 0xc0) == 0x80 && (b3 & 0xc0) == 0x80) {
                    da[count++] = (char) ((b1 << 12)
                            ^ (b2 << 6)
                            ^ (b3
                            ^ (((byte) 0xE0 << 12)
                            ^ ((byte) 0x80 << 6)
                            ^ ((byte) 0x80))));
                    continue;
                }
                index -= 2;
            } else if ((b1 >> 3) == -2 && index + 2 < end) {
                // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                int b2 = sa[index++];
                int b3 = sa[index++];
                int b4 = sa[index++];
                int uc = ((b1 << 18)
                        ^ (b2 << 12)
                        ^ (b3 << 6)
                        ^ (b4
                        ^ (((byte) 0xF0 << 18)
                        ^ ((byte) 0x80 << 12)
                        ^ ((byte) 0x80 << 6)
                        ^ ((byte) 0x80))));
                // All three trailing bytes must be continuation bytes; without this
                // check a byte like 0xC1 leaks a 7th bit into the code point and a
                // malformed sequence is accepted as a valid supplementary character.
                // The range check rejects overlong forms and values above U+10FFFF.
                if ((b2 & 0xc0) == 0x80 && (b3 & 0xc0) == 0x80 && (b4 & 0xc0) == 0x80
                        && Character.isSupplementaryCodePoint(uc)) {
                    da[count++] = (char) ((uc >>> 10) + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
                    da[count++] = (char) ((uc & 0x3ff) + Character.MIN_LOW_SURROGATE);
                    continue;
                }
                index -= 3;
            }
            // Invalid lead byte, stray continuation byte, truncated or malformed sequence.
            da[count++] = '\uFFFD';
        }
        return count;
    }

    /**
     * Encodes UTF-16 chars as UTF-8 bytes.
     *
     * <p>A valid surrogate pair becomes one 4-byte sequence. An unpaired
     * surrogate (a high surrogate not followed by a low one, or a lone low
     * surrogate) is replaced by a single {@code '?'} byte.
     *
     * @param da   destination bytes, filled from offset 0; needs room for up to
     *             {@link #MAX_BYTES_PER_CHAR} bytes per input char
     * @param sa   source chars
     * @param from index of the first char to encode (inclusive)
     * @param to   index after the last char to encode (exclusive)
     * @return number of bytes written to {@code da}
     */
    public static int encode(final byte[] da, final char[] sa, int from, final int to) {
        int dp = 0;
        char c;
        char d;
        int uc;
        while (from < to) {
            if ((c = sa[from++]) < 0x80) {
                // Have at most seven bits
                da[dp++] = (byte) c;
                continue;
            }
            if (c < 0x800) {
                // 2 bytes, 11 bits
                da[dp++] = (byte) (0xc0 | (c >> 6));
                da[dp++] = (byte) (0x80 | (c & 0x3f));
                continue;
            }
            if (c >>> 11 != 0x1B) {
                // Not a surrogate: c < Character.MIN_HIGH_SURROGATE || c > Character.MAX_LOW_SURROGATE
                // 3 bytes, 16 bits
                da[dp++] = (byte) (0xe0 | ((c >> 12)));
                da[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                da[dp++] = (byte) (0x80 | (c & 0x3f));
                continue;
            }
            if (c <= Character.MAX_HIGH_SURROGATE && from < to) {
                // High surrogate with a following char available
                if ((d = sa[from++]) >>> 10 == 0x37) {
                    // Low surrogate: Character.MIN_LOW_SURROGATE <= d <= Character.MAX_LOW_SURROGATE
                    uc = Character.toCodePoint(c, d);
                    da[dp++] = (byte) (0xf0 | ((uc >> 18)));
                    da[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f));
                    da[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f));
                    da[dp++] = (byte) (0x80 | (uc & 0x3f));
                    continue;
                }
                --from; // not a low surrogate: push it back so it is encoded on its own
            }
            // Unpaired surrogate cannot be represented in UTF-8.
            da[dp++] = '?';
        }
        return dp;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy