org.bouncycastle.util.encoders.UTF8 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bcprov-jdk15to18 Show documentation
The Bouncy Castle Crypto package is a Java implementation of cryptographic algorithms. This jar contains the JCE provider and the lightweight API for the Bouncy Castle Cryptography APIs for JDK 1.5 to JDK 1.8.
There is a newer version: 1.79
Show newest version
package org.bouncycastle.util.encoders;

/**
 * Utilities for working with UTF-8 encodings.
 * <p>
 * Decoding of UTF-8 is based on a presentation by Bob Steagall at CppCon2018 (see
 * https://github.com/BobSteagall/CppCon2018). It uses a Deterministic Finite Automaton (DFA) to
 * recognize and decode multi-byte code points.
 * <p>
 * Two lookup tables, built once in the static initializer, drive the decoder:
 * <ul>
 * <li>{@code firstUnitTable} is indexed by the low 7 bits of a non-ASCII leading byte. Each entry
 * packs the payload bits of that leading byte into the upper 8 bits and the DFA state entered
 * after it into the lower 8 bits.</li>
 * <li>{@code transitionTable} is indexed by {@code state + (nextByte >>> 4)}. State constants are
 * multiples of 16 so that each state owns a 16-entry row selected by the high nibble of the next
 * byte.</li>
 * </ul>
 */
public class UTF8
{
    // Constants for the categorization of code units (used as indices into the per-category
    // arrays in the static initializer)
    private static final byte C_ILL = 0;            //- C0..C1, F5..FF  ILLEGAL octets that should never appear in a UTF-8 sequence
    private static final byte C_CR1 = 1;            //- 80..8F          Continuation range 1
    private static final byte C_CR2 = 2;            //- 90..9F          Continuation range 2
    private static final byte C_CR3 = 3;            //- A0..BF          Continuation range 3
    private static final byte C_L2A = 4;            //- C2..DF          Leading byte range A / 2-byte sequence
    private static final byte C_L3A = 5;            //- E0              Leading byte range A / 3-byte sequence
    private static final byte C_L3B = 6;            //- E1..EC, EE..EF  Leading byte range B / 3-byte sequence
    private static final byte C_L3C = 7;            //- ED              Leading byte range C / 3-byte sequence
    private static final byte C_L4A = 8;            //- F0              Leading byte range A / 4-byte sequence
    private static final byte C_L4B = 9;            //- F1..F3          Leading byte range B / 4-byte sequence
    private static final byte C_L4C = 10;           //- F4              Leading byte range C / 4-byte sequence
    // There is no category for the ASCII range 00..7F: the decoder copies those bytes directly
    // without consulting any table.

    // Constants for the states of a DFA. Non-negative states double as row offsets into
    // transitionTable; negative states terminate the inner decoding loop.
    private static final byte S_ERR = -2;           //- Error state
    private static final byte S_END = -1;           //- End (or Accept) state
    private static final byte S_CS1 = 0x00;         //- Continuation state 1
    private static final byte S_CS2 = 0x10;         //- Continuation state 2
    private static final byte S_CS3 = 0x20;         //- Continuation state 3
    private static final byte S_P3A = 0x30;         //- Partial 3-byte sequence state A
    private static final byte S_P3B = 0x40;         //- Partial 3-byte sequence state B
    private static final byte S_P4A = 0x50;         //- Partial 4-byte sequence state A
    private static final byte S_P4B = 0x60;         //- Partial 4-byte sequence state B

    // One entry per byte value 80..FF (indexed by the byte's low 7 bits).
    private static final short[] firstUnitTable = new short[128];
    // One 16-entry row per non-negative state, the last row starting at S_P4B.
    private static final byte[] transitionTable = new byte[S_P4B + 16];

    /**
     * Set every element of {@code table} in the inclusive index range {@code first..last} to
     * {@code b}.
     */
    private static void fill(byte[] table, int first, int last, byte b)
    {
        for (int i = first; i <= last; ++i)
        {
            table[i] = b;
        }
    }

    static
    {
        // Category of each byte 80..FF, indexed by (byte & 0x7F).
        byte[] categories = new byte[128];
        fill(categories, 0x00, 0x0F, C_CR1);
        fill(categories, 0x10, 0x1F, C_CR2);
        fill(categories, 0x20, 0x3F, C_CR3);
        fill(categories, 0x40, 0x41, C_ILL);
        fill(categories, 0x42, 0x5F, C_L2A);
        fill(categories, 0x60, 0x60, C_L3A);
        fill(categories, 0x61, 0x6C, C_L3B);
        fill(categories, 0x6D, 0x6D, C_L3C);
        fill(categories, 0x6E, 0x6F, C_L3B);
        fill(categories, 0x70, 0x70, C_L4A);
        fill(categories, 0x71, 0x73, C_L4B);
        fill(categories, 0x74, 0x74, C_L4C);
        fill(categories, 0x75, 0x7F, C_ILL);

        // Everything is an error unless explicitly allowed below. Within a row the column is the
        // high nibble of the next byte, so columns 0x8..0xB correspond to bytes 80..BF.
        fill(transitionTable, 0, transitionTable.length - 1, S_ERR);
        fill(transitionTable, S_CS1 + 0x8, S_CS1 + 0xB, S_END);
        fill(transitionTable, S_CS2 + 0x8, S_CS2 + 0xB, S_CS1);
        fill(transitionTable, S_CS3 + 0x8, S_CS3 + 0xB, S_CS2);
        fill(transitionTable, S_P3A + 0xA, S_P3A + 0xB, S_CS1);     // after E0 only A0..BF is accepted
        fill(transitionTable, S_P3B + 0x8, S_P3B + 0x9, S_CS1);     // after ED only 80..9F is accepted
        fill(transitionTable, S_P4A + 0x9, S_P4A + 0xB, S_CS2);     // after F0 only 90..BF is accepted
        fill(transitionTable, S_P4B + 0x8, S_P4B + 0x8, S_CS2);     // after F4 only 80..8F is accepted

        // Both arrays are indexed by category (C_ILL .. C_L4C).
        byte[] firstUnitMasks = {0x00, 0x00, 0x00, 0x00, 0x1F, 0x0F, 0x0F, 0x0F, 0x07, 0x07, 0x07};
        byte[] firstUnitTransitions = {S_ERR, S_ERR, S_ERR, S_ERR, S_CS1, S_P3A, S_CS2, S_P3B, S_P4A, S_CS3, S_P4B};

        for (int i = 0x00; i < 0x80; ++i)
        {
            byte category = categories[i];

            int codePoint = i & firstUnitMasks[category];
            byte state = firstUnitTransitions[category];

            // Upper 8 bits: payload bits of the leading byte; lower 8 bits: state after it.
            firstUnitTable[i] = (short)((codePoint << 8) | state);
        }
    }

    /**
     * Transcode a UTF-8 encoding into a UTF-16 representation. In the general case the output
     * {@code utf16} array should be at least as long as the input {@code utf8} one to handle
     * arbitrary inputs. The number of output UTF-16 code units is returned, or -1 if any errors are
     * encountered (in which case an arbitrary amount of data may have been written into the output
     * array). Errors that will be detected are malformed UTF-8, including incomplete, truncated or
     * "overlong" encodings, and unmappable code points. In particular, no unmatched surrogates will
     * be produced. An error will also result if {@code utf16} is found to be too small to store the
     * complete output.
     *
     * @param utf8  A non-null array containing the UTF-8 encoding to transcode.
     * @param utf16 A non-null array, at least as long as the {@code utf8} array in order to ensure
     *              the output will fit.
     * @return The number of UTF-16 code units written to {@code utf16} (beginning from index 0), or
     * else -1 if the input was either malformed or encoded any unmappable characters, or if
     * the {@code utf16} is too small.
     */
    public static int transcodeToUTF16(byte[] utf8, char[] utf16)
    {
        return transcodeToUTF16(utf8, 0, utf8.length, utf16);
    }

    /**
     * Transcode a UTF-8 encoding into a UTF-16 representation. In the general case the output
     * {@code utf16} array should be at least {@code utf8Length} long to handle arbitrary inputs.
     * The number of output UTF-16 code units is returned, or -1 if any errors are encountered (in
     * which case an arbitrary amount of data may have been written into the output array). Errors
     * that will be detected are malformed UTF-8, including incomplete, truncated or "overlong"
     * encodings, and unmappable code points. In particular, no unmatched surrogates will be
     * produced. An error will also result if {@code utf16} is found to be too small to store the
     * complete output.
     *
     * @param utf8  A non-null array containing the UTF-8 encoding to transcode.
     * @param utf8Off start position in the array of the encoding.
     * @param utf8Length length in bytes of the encoding.
     * @param utf16 A non-null array, at least {@code utf8Length} long in order to ensure the
     *              output will fit.
     * @return The number of UTF-16 code units written to {@code utf16} (beginning from index 0), or
     * else -1 if the input was either malformed or encoded any unmappable characters, or if
     * the {@code utf16} is too small.
     * @throws ArrayIndexOutOfBoundsException if {@code utf8Off} or {@code utf8Length} is negative,
     * or if the range they describe extends beyond the end of {@code utf8}.
     */
    public static int transcodeToUTF16(byte[] utf8, int utf8Off, int utf8Length, char[] utf16)
    {
        // Validate the range up front. The comparison is written as a subtraction so that it
        // cannot overflow; computing utf8Off + utf8Length unchecked could wrap to a negative
        // bound and make an invalid request look like a successful empty decode.
        if (utf8Off < 0 || utf8Length < 0 || utf8Length > utf8.length - utf8Off)
        {
            throw new ArrayIndexOutOfBoundsException("invalid UTF-8 input range: offset " + utf8Off
                + ", length " + utf8Length + ", array length " + utf8.length);
        }

        int i = utf8Off, j = 0;
        int maxI = utf8Off + utf8Length;

        while (i < maxI)
        {
            byte codeUnit = utf8[i++];
            if (codeUnit >= 0)
            {
                // ASCII fast path: bytes 00..7F map directly to a single UTF-16 code unit.
                if (j >= utf16.length)
                {
                    return -1;
                }

                utf16[j++] = (char)codeUnit;
                continue;
            }

            // Leading byte of a multi-byte sequence (or an illegal byte). When the packed state
            // is S_ERR the extracted codePoint value is meaningless, but it is never used.
            short first = firstUnitTable[codeUnit & 0x7F];
            int codePoint = first >>> 8;
            byte state = (byte)first;

            // Consume continuation bytes until the DFA accepts (S_END) or rejects (S_ERR).
            while (state >= 0)
            {
                if (i >= maxI)
                {
                    // Input ended in the middle of a multi-byte sequence.
                    return -1;
                }

                codeUnit = utf8[i++];
                codePoint = (codePoint << 6) | (codeUnit & 0x3F);
                state = transitionTable[state + ((codeUnit & 0xFF) >>> 4)];
            }

            if (state == S_ERR)
            {
                return -1;
            }

            if (codePoint <= 0xFFFF)
            {
                if (j >= utf16.length)
                {
                    return -1;
                }

                // Code points from U+D800 to U+DFFF are caught by the DFA
                utf16[j++] = (char)codePoint;
            }
            else
            {
                // A surrogate pair needs two free slots in the output.
                if (j >= utf16.length - 1)
                {
                    return -1;
                }

                // Code points above U+10FFFF are caught by the DFA
                // 0xD7C0 == 0xD800 - (0x10000 >>> 10), folding the 0x10000 bias into the base.
                utf16[j++] = (char)(0xD7C0 + (codePoint >>> 10));
                utf16[j++] = (char)(0xDC00 | (codePoint & 0x3FF));
            }
        }

        return j;
    }
}