// org.bouncycastle.util.encoders.UTF8
// (from the bcprov-jdk15to18 artifact)
package org.bouncycastle.util.encoders;
/**
 * Helpers for data encoded as UTF-8.
 * <p>
 * UTF-8 decoding follows the approach Bob Steagall presented at CppCon2018 (see
 * https://github.com/BobSteagall/CppCon2018): a table-driven Deterministic Finite Automaton (DFA)
 * recognizes and decodes multi-byte code points.
 */
public class UTF8
{
    // Categories assigned to the non-ASCII code units 80..FF. ASCII code units (00..7F) need no
    // category because the decoder copies them straight through before consulting any table.
    private static final byte C_ILL = 0;  // C0..C1, F5..FF  illegal octets, never valid in a UTF-8 sequence
    private static final byte C_CR1 = 1;  // 80..8F          continuation range 1
    private static final byte C_CR2 = 2;  // 90..9F          continuation range 2
    private static final byte C_CR3 = 3;  // A0..BF          continuation range 3
    private static final byte C_L2A = 4;  // C2..DF          leading byte range A / 2-byte sequence
    private static final byte C_L3A = 5;  // E0              leading byte range A / 3-byte sequence
    private static final byte C_L3B = 6;  // E1..EC, EE..EF  leading byte range B / 3-byte sequence
    private static final byte C_L3C = 7;  // ED              leading byte range C / 3-byte sequence
    private static final byte C_L4A = 8;  // F0              leading byte range A / 4-byte sequence
    private static final byte C_L4B = 9;  // F1..F3          leading byte range B / 4-byte sequence
    private static final byte C_L4C = 10; // F4              leading byte range C / 4-byte sequence

    // DFA states. The two negative values terminate decoding of a code point; every non-negative
    // value is the base index of that state's 16-entry row in transitionTable, to which the high
    // nibble of the next code unit is added.
    private static final byte S_ERR = -2;   // error state
    private static final byte S_END = -1;   // end (accept) state
    private static final byte S_CS1 = 0x00; // continuation state 1
    private static final byte S_CS2 = 0x10; // continuation state 2
    private static final byte S_CS3 = 0x20; // continuation state 3
    private static final byte S_P3A = 0x30; // partial 3-byte sequence state A
    private static final byte S_P3B = 0x40; // partial 3-byte sequence state B
    private static final byte S_P4A = 0x50; // partial 4-byte sequence state A
    private static final byte S_P4B = 0x60; // partial 4-byte sequence state B

    // Indexed by (first code unit & 0x7F); each entry packs the payload bits of that code unit
    // into the upper byte and the state entered after it into the lower byte.
    private static final short[] firstUnitTable = new short[128];

    // One row of 16 entries per non-negative state, indexed by (state + high nibble of code unit).
    private static final byte[] transitionTable = new byte[S_P4B + 16];

    static
    {
        populateTransitionTable();
        populateFirstUnitTable(categorizeNonAsciiUnits());
    }

    /**
     * Store {@code b} into every element of {@code table} from index {@code first} up to and
     * including index {@code last}.
     */
    private static void fill(byte[] table, int first, int last, byte b)
    {
        int pos = first;
        while (pos <= last)
        {
            table[pos++] = b;
        }
    }

    /**
     * Build the category lookup for code units 80..FF; entry {@code k} describes code unit
     * {@code 0x80 + k}.
     */
    private static byte[] categorizeNonAsciiUnits()
    {
        byte[] categories = new byte[128];

        fill(categories, 0x00, 0x0F, C_CR1); // 80..8F
        fill(categories, 0x10, 0x1F, C_CR2); // 90..9F
        fill(categories, 0x20, 0x3F, C_CR3); // A0..BF
        fill(categories, 0x40, 0x41, C_ILL); // C0..C1
        fill(categories, 0x42, 0x5F, C_L2A); // C2..DF
        fill(categories, 0x60, 0x60, C_L3A); // E0
        fill(categories, 0x61, 0x6C, C_L3B); // E1..EC
        fill(categories, 0x6D, 0x6D, C_L3C); // ED
        fill(categories, 0x6E, 0x6F, C_L3B); // EE..EF
        fill(categories, 0x70, 0x70, C_L4A); // F0
        fill(categories, 0x71, 0x73, C_L4B); // F1..F3
        fill(categories, 0x74, 0x74, C_L4C); // F4
        fill(categories, 0x75, 0x7F, C_ILL); // F5..FF

        return categories;
    }

    /**
     * Populate {@code transitionTable}: everything is an error unless the code unit's high
     * nibble falls in the range accepted by the current state.
     */
    private static void populateTransitionTable()
    {
        fill(transitionTable, 0, transitionTable.length - 1, S_ERR);

        // Plain continuation states accept any of 80..BF
        fill(transitionTable, S_CS1 + 0x8, S_CS1 + 0xB, S_END);
        fill(transitionTable, S_CS2 + 0x8, S_CS2 + 0xB, S_CS1);
        fill(transitionTable, S_CS3 + 0x8, S_CS3 + 0xB, S_CS2);

        // Partial states restrict the second code unit of a sequence
        fill(transitionTable, S_P3A + 0xA, S_P3A + 0xB, S_CS1); // only A0..BF
        fill(transitionTable, S_P3B + 0x8, S_P3B + 0x9, S_CS1); // only 80..9F
        fill(transitionTable, S_P4A + 0x9, S_P4A + 0xB, S_CS2); // only 90..BF
        fill(transitionTable, S_P4B + 0x8, S_P4B + 0x8, S_CS2); // only 80..8F
    }

    /**
     * Populate {@code firstUnitTable} from the per-category payload masks and initial states.
     *
     * @param categories category of each code unit 80..FF, indexed by (code unit & 0x7F).
     */
    private static void populateFirstUnitTable(byte[] categories)
    {
        // Both arrays are indexed by category (C_ILL .. C_L4C)
        byte[] payloadMasks = {0x00, 0x00, 0x00, 0x00, 0x1F, 0x0F, 0x0F, 0x0F, 0x07, 0x07, 0x07};
        byte[] initialStates = {S_ERR, S_ERR, S_ERR, S_ERR, S_CS1, S_P3A, S_CS2, S_P3B, S_P4A, S_CS3, S_P4B};

        for (int index = 0; index < firstUnitTable.length; index++)
        {
            int cat = categories[index];
            int payloadBits = index & payloadMasks[cat];

            // A negative (error) state sign-extends over the payload bits; the decoder only
            // looks at the low byte in that case.
            firstUnitTable[index] = (short)((payloadBits << 8) | initialStates[cat]);
        }
    }

    /**
     * Convert an entire array of UTF-8 code units to UTF-16.
     * <p>
     * This is equivalent to calling
     * {@link #transcodeToUTF16(byte[], int, int, char[])} with offset 0 and length
     * {@code utf8.length}. An output array at least as long as the input is always large enough.
     * On failure -1 is returned and an arbitrary amount of data may already have been written to
     * {@code utf16}. Failures are: malformed UTF-8 (including incomplete, truncated or "overlong"
     * encodings), unmappable code points (no unmatched surrogates are ever produced), and an
     * output array that is too small for the complete result.
     *
     * @param utf8  A non-null array holding the UTF-8 encoding to convert.
     * @param utf16 A non-null array receiving the output from index 0; making it at least as
     *              long as {@code utf8} guarantees the output will fit.
     * @return The number of UTF-16 code units written to {@code utf16}, or -1 if the input was
     *         malformed, encoded an unmappable character, or did not fit in {@code utf16}.
     */
    public static int transcodeToUTF16(byte[] utf8, char[] utf16)
    {
        int wholeLength = utf8.length;
        return transcodeToUTF16(utf8, 0, wholeLength, utf16);
    }

    /**
     * Convert a range of UTF-8 code units to UTF-16.
     * <p>
     * An output array with at least {@code utf8Length} elements is always large enough. On
     * failure -1 is returned and an arbitrary amount of data may already have been written to
     * {@code utf16}. Failures are: malformed UTF-8 (including incomplete, truncated or "overlong"
     * encodings), unmappable code points (no unmatched surrogates are ever produced), and an
     * output array that is too small for the complete result.
     *
     * @param utf8       A non-null array holding the UTF-8 encoding to convert.
     * @param utf8Off    Index in {@code utf8} of the first code unit to convert.
     * @param utf8Length Number of code units (bytes) to convert.
     * @param utf16      A non-null array receiving the output from index 0; making it at least
     *                   {@code utf8Length} long guarantees the output will fit.
     * @return The number of UTF-16 code units written to {@code utf16}, or -1 if the input was
     *         malformed, encoded an unmappable character, or did not fit in {@code utf16}.
     */
    public static int transcodeToUTF16(byte[] utf8, int utf8Off, int utf8Length, char[] utf16)
    {
        final int inEnd = utf8Off + utf8Length;
        int inPos = utf8Off;
        int outPos = 0;

        while (inPos < inEnd)
        {
            byte unit = utf8[inPos++];

            if (unit < 0)
            {
                // Non-ASCII: look up the payload bits and the starting state for this first unit
                short entry = firstUnitTable[unit & 0x7F];
                int codePoint = entry >>> 8;
                byte state = (byte)entry;

                // Consume following code units until the DFA accepts or rejects
                while (state >= 0)
                {
                    if (inPos >= inEnd)
                    {
                        // Input ended in the middle of a sequence
                        return -1;
                    }

                    unit = utf8[inPos++];
                    codePoint = (codePoint << 6) | (unit & 0x3F);
                    state = transitionTable[state + ((unit & 0xFF) >>> 4)];
                }

                if (state == S_ERR)
                {
                    return -1;
                }

                boolean needsSurrogatePair = codePoint > 0xFFFF;
                int unitsNeeded = needsSurrogatePair ? 2 : 1;
                if (utf16.length - outPos < unitsNeeded)
                {
                    return -1;
                }

                if (needsSurrogatePair)
                {
                    // Code points above U+10FFFF are caught by the DFA.
                    // 0xD7C0 is 0xD800 - (0x10000 >>> 10), folding the offset subtraction in.
                    utf16[outPos++] = (char)(0xD7C0 + (codePoint >>> 10));
                    utf16[outPos++] = (char)(0xDC00 | (codePoint & 0x3FF));
                }
                else
                {
                    // Code points from U+D800 to U+DFFF are caught by the DFA
                    utf16[outPos++] = (char)codePoint;
                }
            }
            else
            {
                // ASCII: one code unit in, the same value out
                if (outPos >= utf16.length)
                {
                    return -1;
                }

                utf16[outPos++] = (char)unit;
            }
        }

        return outPos;
    }
}