All downloads are free. Search and download functionality uses the official Maven repository.

org.bouncycastle.util.encoders.UTF8 Maven / Gradle / Ivy

Go to download

The Bouncy Castle Crypto package is a Java implementation of cryptographic algorithms. This jar contains JCE provider and lightweight API for the Bouncy Castle Cryptography APIs for Java 1.8 and later with debug enabled.

The newest version!
package org.bouncycastle.util.encoders;

/**
 * Utilities for working with UTF-8 encodings.
 * <p>
 * Decoding of UTF-8 is based on a presentation by Bob Steagall at CppCon2018 (see
 * https://github.com/BobSteagall/CppCon2018). It uses a Deterministic Finite Automaton (DFA) to
 * recognize and decode multi-byte code points.
 */
public class UTF8
{
    // Constants for the categorization of code units
    private static final byte C_ILL = 0;            //- C0..C1, F5..FF  ILLEGAL octets that should never appear in a UTF-8 sequence
    private static final byte C_CR1 = 1;            //- 80..8F          Continuation range 1
    private static final byte C_CR2 = 2;            //- 90..9F          Continuation range 2
    private static final byte C_CR3 = 3;            //- A0..BF          Continuation range 3
    private static final byte C_L2A = 4;            //- C2..DF          Leading byte range A / 2-byte sequence
    private static final byte C_L3A = 5;            //- E0              Leading byte range A / 3-byte sequence
    private static final byte C_L3B = 6;            //- E1..EC, EE..EF  Leading byte range B / 3-byte sequence
    private static final byte C_L3C = 7;            //- ED              Leading byte range C / 3-byte sequence
    private static final byte C_L4A = 8;            //- F0              Leading byte range A / 4-byte sequence
    private static final byte C_L4B = 9;            //- F1..F3          Leading byte range B / 4-byte sequence
    private static final byte C_L4C = 10;           //- F4              Leading byte range C / 4-byte sequence
    // ASCII lead bytes (00..7F) need no category: the decoder copies them directly.

    // Constants for the states of a DFA
    private static final byte S_ERR = -2;           //- Error state
    private static final byte S_END = -1;           //- End (or Accept) state
    private static final byte S_CS1 = 0x00;         //- Continuation state 1
    private static final byte S_CS2 = 0x10;         //- Continuation state 2
    private static final byte S_CS3 = 0x20;         //- Continuation state 3
    private static final byte S_P3A = 0x30;         //- Partial 3-byte sequence state A
    private static final byte S_P3B = 0x40;         //- Partial 3-byte sequence state B
    private static final byte S_P4A = 0x50;         //- Partial 4-byte sequence state A
    private static final byte S_P4B = 0x60;         //- Partial 4-byte sequence state B

    // Indexed by (lead byte & 0x7F) for lead bytes 80..FF. Each entry packs the code point bits
    // carried by the lead byte into the high 8 bits and the initial DFA state into the low 8 bits.
    private static final short[] firstUnitTable = new short[128];

    // Indexed by (state + high nibble of the next byte); each state owns a 16-entry row.
    private static final byte[] transitionTable = new byte[S_P4B + 16];

    /**
     * Set {@code table[first..last]} (both bounds inclusive) to {@code b}.
     */
    private static void fill(byte[] table, int first, int last, byte b)
    {
        for (int i = first; i <= last; ++i)
        {
            table[i] = b;
        }
    }

    static
    {
        // Category of every non-ASCII byte, indexed by (byte & 0x7F).
        byte[] categories = new byte[128];
        fill(categories, 0x00, 0x0F, C_CR1);
        fill(categories, 0x10, 0x1F, C_CR2);
        fill(categories, 0x20, 0x3F, C_CR3);
        fill(categories, 0x40, 0x41, C_ILL);
        fill(categories, 0x42, 0x5F, C_L2A);
        fill(categories, 0x60, 0x60, C_L3A);
        fill(categories, 0x61, 0x6C, C_L3B);
        fill(categories, 0x6D, 0x6D, C_L3C);
        fill(categories, 0x6E, 0x6F, C_L3B);
        fill(categories, 0x70, 0x70, C_L4A);
        fill(categories, 0x71, 0x73, C_L4B);
        fill(categories, 0x74, 0x74, C_L4C);
        fill(categories, 0x75, 0x7F, C_ILL);

        // Every transition is an error unless explicitly enabled below. Columns 0x8..0xB are the
        // high nibbles of the continuation bytes 80..BF; the partial states accept only a subset.
        fill(transitionTable, 0, transitionTable.length - 1, S_ERR);
        fill(transitionTable, S_CS1 + 0x8, S_CS1 + 0xB, S_END);
        fill(transitionTable, S_CS2 + 0x8, S_CS2 + 0xB, S_CS1);
        fill(transitionTable, S_CS3 + 0x8, S_CS3 + 0xB, S_CS2);
        fill(transitionTable, S_P3A + 0xA, S_P3A + 0xB, S_CS1);
        fill(transitionTable, S_P3B + 0x8, S_P3B + 0x9, S_CS1);
        fill(transitionTable, S_P4A + 0x9, S_P4A + 0xB, S_CS2);
        fill(transitionTable, S_P4B + 0x8, S_P4B + 0x8, S_CS2);

        // Both arrays below are indexed by category (C_ILL .. C_L4C).
        byte[] firstUnitMasks = {0x00, 0x00, 0x00, 0x00, 0x1F, 0x0F, 0x0F, 0x0F, 0x07, 0x07, 0x07};
        byte[] firstUnitTransitions = {S_ERR, S_ERR, S_ERR, S_ERR, S_CS1, S_P3A, S_CS2, S_P3B, S_P4A, S_CS3, S_P4B};

        for (int i = 0x00; i < 0x80; ++i)
        {
            byte category = categories[i];

            int codePoint = i & firstUnitMasks[category];
            byte state = firstUnitTransitions[category];

            firstUnitTable[i] = (short)((codePoint << 8) | state);
        }
    }

    /**
     * Transcode a UTF-8 encoding into a UTF-16 representation. In the general case the output
     * {@code utf16} array should be at least as long as the input {@code utf8} one to handle
     * arbitrary inputs. The number of output UTF-16 code units is returned, or -1 if any errors are
     * encountered (in which case an arbitrary amount of data may have been written into the output
     * array). Errors that will be detected are malformed UTF-8, including incomplete, truncated or
     * "overlong" encodings, and unmappable code points. In particular, no unmatched surrogates will
     * be produced. An error will also result if {@code utf16} is found to be too small to store the
     * complete output.
     *
     * @param utf8  A non-null array containing a well-formed UTF-8 encoding.
     * @param utf16 A non-null array, at least as long as the {@code utf8} array in order to ensure
     *              the output will fit.
     * @return The number of UTF-16 code units written to {@code utf16} (beginning from index 0), or
     *         else -1 if the input was either malformed or encoded any unmappable characters, or if
     *         the {@code utf16} is too small.
     */
    public static int transcodeToUTF16(byte[] utf8, char[] utf16)
    {
        return transcodeToUTF16(utf8, 0, utf8.length, utf16);
    }

    /**
     * Transcode a UTF-8 encoding into a UTF-16 representation. In the general case the output
     * {@code utf16} array should be at least {@code utf8Length} long to handle arbitrary inputs.
     * The number of output UTF-16 code units is returned, or -1 if any errors are encountered (in
     * which case an arbitrary amount of data may have been written into the output array). Errors
     * that will be detected are malformed UTF-8, including incomplete, truncated or "overlong"
     * encodings, and unmappable code points. In particular, no unmatched surrogates will be
     * produced. An error will also result if {@code utf16} is found to be too small to store the
     * complete output.
     *
     * @param utf8       A non-null array containing a well-formed UTF-8 encoding.
     * @param utf8Off    start position in the array for the well-formed encoding.
     * @param utf8Length length in bytes of the well-formed encoding.
     * @param utf16      A non-null array, at least {@code utf8Length} long in order to ensure the
     *                   output will fit.
     * @return The number of UTF-16 code units written to {@code utf16} (beginning from index 0), or
     *         else -1 if the input was either malformed or encoded any unmappable characters, or if
     *         the {@code utf16} is too small.
     * @throws ArrayIndexOutOfBoundsException if {@code utf8Off} or {@code utf8Length} is negative,
     *         or if the range they describe extends beyond the end of {@code utf8}.
     */
    public static int transcodeToUTF16(byte[] utf8, int utf8Off, int utf8Length, char[] utf16)
    {
        // Reject invalid ranges before decoding anything. The comparison is arranged so that it
        // cannot overflow; computing utf8Off + utf8Length first could wrap to a negative limit and
        // make the method report success (0 code units) without reading any input.
        if (utf8Off < 0 || utf8Length < 0 || utf8Off > utf8.length - utf8Length)
        {
            throw new ArrayIndexOutOfBoundsException("invalid range: offset=" + utf8Off
                + ", length=" + utf8Length + ", array length=" + utf8.length);
        }

        int i = utf8Off, j = 0;
        int maxI = utf8Off + utf8Length;

        while (i < maxI)
        {
            byte codeUnit = utf8[i++];

            // Fast path: ASCII (00..7F) maps to a single UTF-16 code unit unchanged.
            if (codeUnit >= 0)
            {
                if (j >= utf16.length)
                {
                    return -1;
                }

                utf16[j++] = (char)codeUnit;
                continue;
            }

            // Unpack the lead byte's payload bits and the initial DFA state.
            short first = firstUnitTable[codeUnit & 0x7F];
            int codePoint = first >>> 8;
            byte state = (byte)first;

            // Non-negative states expect another byte; S_END and S_ERR are both negative.
            while (state >= 0)
            {
                if (i >= maxI)
                {
                    // Input ended in the middle of a multi-byte sequence.
                    return -1;
                }

                codeUnit = utf8[i++];
                codePoint = (codePoint << 6) | (codeUnit & 0x3F);
                state = transitionTable[state + ((codeUnit & 0xFF) >>> 4)];
            }

            if (state == S_ERR)
            {
                return -1;
            }

            if (codePoint <= 0xFFFF)
            {
                if (j >= utf16.length)
                {
                    return -1;
                }

                // Code points from U+D800 to U+DFFF are caught by the DFA
                utf16[j++] = (char)codePoint;
            }
            else
            {
                // A surrogate pair needs two free output slots.
                if (j >= utf16.length - 1)
                {
                    return -1;
                }

                // Code points above U+10FFFF are caught by the DFA
                // 0xD7C0 == 0xD800 - (0x10000 >>> 10), folding the 0x10000 bias into the constant.
                utf16[j++] = (char)(0xD7C0 + (codePoint >>> 10));
                utf16[j++] = (char)(0xDC00 | (codePoint & 0x3FF));
            }
        }

        return j;
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy