All downloads are free. The search and download functionality uses the official Maven repository.

com.upokecenter.util.DataUtilities Maven / Gradle / Ivy

Go to download

A Java implementation of Concise Binary Object Representation (CBOR), a general-purpose binary data format defined in RFC 7049.

There is a newer version: 5.0.0-alpha1
Show newest version
package com.upokecenter.util;
/*
Written by Peter O. in 2013.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://peteroupc.github.io/
 */

import java.io.*;

    /**
     * Contains methods useful for reading and writing text strings. It is
     * designed to have no dependencies other than the basic runtime class
     * library.
     *
     * <p>Many of these methods work with text encoded in UTF-8, an encoding
     * form of the Unicode Standard which uses one byte to encode the most
     * basic characters and two to four bytes to encode other characters. For
     * example, the {@code GetUtf8Bytes} method converts a text string to an
     * array of bytes in UTF-8.</p>
     *
     * <p>In C# and Java, text strings are represented as sequences of 16-bit
     * values called {@code char}s. These sequences are well-formed under
     * UTF-16, a 16-bit encoding form of Unicode, except if they contain
     * unpaired surrogate code points. (A surrogate code point is used to
     * encode supplementary characters, those with code points U+10000 or
     * higher, in UTF-16. A surrogate pair is a high surrogate, U+D800 to
     * U+DBFF, followed by a low surrogate, U+DC00 to U+DFFF. An unpaired
     * surrogate code point is a surrogate not appearing in a surrogate pair.)
     * Many of the methods in this class allow setting the behavior to follow
     * when unpaired surrogate code points are found in text strings, such as
     * throwing an error or treating the unpaired surrogate as a replacement
     * character (U+FFFD).</p>
     */
public final class DataUtilities {
  private DataUtilities() {
  }

  /** Size, in bytes, of the scratch buffer used by {@code WriteUtf8}. */
  private static final int StreamedStringBufferLength = 4096;

  /**
   * Generates a text string from a UTF-8 byte array.
   * @param bytes A byte array containing text encoded in UTF-8.
   * @param replace If true, replaces invalid encoding with the replacement
   * character (U+FFFD). If false, stops processing when invalid UTF-8 is
   * seen.
   * @return A string represented by the UTF-8 byte array.
   * @throws NullPointerException The parameter {@code bytes} is null.
   * @throws IllegalArgumentException The string is not valid UTF-8 and
   * {@code replace} is false.
   */
  public static String GetUtf8String(byte[] bytes, boolean replace) {
    if (bytes == null) {
      throw new NullPointerException("bytes");
    }
    StringBuilder b = new StringBuilder();
    if (ReadUtf8FromBytes(bytes, 0, bytes.length, b, replace) != 0) {
      throw new IllegalArgumentException("Invalid UTF-8");
    }
    return b.toString();
  }

  /**
   * Finds the number of Unicode code points in the given text string.
   * Unpaired surrogate code points increase this number by 1. This is not
   * necessarily the length of the string in {@code char}s.
   * @param str The parameter {@code str} is a text string.
   * @return The number of Unicode code points in the given string.
   * @throws NullPointerException The parameter {@code str} is null.
   */
  public static int CodePointLength(String str) {
    if (str == null) {
      throw new NullPointerException("str");
    }
    int i = 0;
    int count = 0;
    while (i < str.length()) {
      int c = CodePointAt(str, i);
      ++count;
      // Supplementary code points occupy two UTF-16 code units
      i += (c >= 0x10000) ? 2 : 1;
    }
    return count;
  }

  /**
   * Generates a text string from a portion of a UTF-8 byte array.
   * @param bytes A byte array containing text encoded in UTF-8.
   * @param offset Offset into the byte array to start reading.
   * @param bytesCount Length, in bytes, of the UTF-8 text string.
   * @param replace If true, replaces invalid encoding with the replacement
   * character (U+FFFD). If false, stops processing when invalid UTF-8 is
   * seen.
   * @return A string represented by the UTF-8 byte array.
   * @throws NullPointerException The parameter {@code bytes} is null.
   * @throws IllegalArgumentException The portion of the byte array is not
   * valid UTF-8 and {@code replace} is false; or the parameter {@code
   * offset} is less than 0, {@code bytesCount} is less than 0, or offset
   * plus bytesCount is greater than the length of {@code bytes}.
   */
  public static String GetUtf8String(
    byte[] bytes,
    int offset,
    int bytesCount,
    boolean replace) {
    if (bytes == null) {
      throw new NullPointerException("bytes");
    }
    if (offset < 0) {
      throw new IllegalArgumentException("offset (" + offset + ") is less than " +
        "0");
    }
    if (offset > bytes.length) {
      throw new IllegalArgumentException("offset (" + offset + ") is more than " +
        bytes.length);
    }
    if (bytesCount < 0) {
      throw new IllegalArgumentException("bytesCount (" + bytesCount +
        ") is less than 0");
    }
    if (bytesCount > bytes.length) {
      throw new IllegalArgumentException("bytesCount (" + bytesCount +
        ") is more than " + bytes.length);
    }
    if (bytes.length - offset < bytesCount) {
      throw new IllegalArgumentException("bytes's length minus " + offset + " (" +
        (bytes.length - offset) + ") is less than " + bytesCount);
    }
    StringBuilder b = new StringBuilder();
    if (ReadUtf8FromBytes(bytes, offset, bytesCount, b, replace) != 0) {
      throw new IllegalArgumentException("Invalid UTF-8");
    }
    return b.toString();
  }

  /**
   * Encodes a string in UTF-8 as a byte array. This method does not insert a
   * byte-order mark (U+FEFF) at the beginning of the encoded byte array.
   *
   * <p>REMARK: It is not recommended to use {@code Encoding.UTF8.GetBytes}
   * in .NET, or the no-argument {@code getBytes()} method in Java to do
   * this. For instance, {@code getBytes()} encodes text strings in a default
   * (so not fixed) character encoding, which can be undesirable.</p>
   * @param str The parameter {@code str} is a text string.
   * @param replace If true, replaces unpaired surrogate code points with the
   * replacement character (U+FFFD). If false, stops processing when an
   * unpaired surrogate code point is seen.
   * @return The string encoded in UTF-8.
   * @throws NullPointerException The parameter {@code str} is null.
   * @throws IllegalArgumentException The string contains an unpaired
   * surrogate code point and {@code replace} is false, or an internal error
   * occurred.
   */
  public static byte[] GetUtf8Bytes(String str, boolean replace) {
    return GetUtf8Bytes(str, replace, false);
  }

  /**
   * Encodes a string in UTF-8 as a byte array. This method does not insert a
   * byte-order mark (U+FEFF) at the beginning of the encoded byte array.
   *
   * <p>REMARK: It is not recommended to use {@code Encoding.UTF8.GetBytes}
   * in .NET, or the no-argument {@code getBytes()} method in Java to do
   * this. For instance, {@code getBytes()} encodes text strings in a default
   * (so not fixed) character encoding, which can be undesirable.</p>
   * @param str The parameter {@code str} is a text string.
   * @param replace If true, replaces unpaired surrogate code points with the
   * replacement character (U+FFFD). If false, stops processing when an
   * unpaired surrogate code point is seen.
   * @param lenientLineBreaks If true, replaces carriage return (CR) not
   * followed by line feed (LF) and LF not preceded by CR with CR-LF pairs.
   * @return The string encoded in UTF-8.
   * @throws NullPointerException The parameter {@code str} is null.
   * @throws IllegalArgumentException The string contains an unpaired
   * surrogate code point and {@code replace} is false, or an internal error
   * occurred.
   */
  public static byte[] GetUtf8Bytes(
    String str,
    boolean replace,
    boolean lenientLineBreaks) {
    if (str == null) {
      throw new NullPointerException("str");
    }
    // Fast paths for one- and two-code-unit strings avoid allocating a
    // stream and the 4096-byte scratch buffer.
    if (!lenientLineBreaks && str.length() == 1) {
      int c = str.charAt(0);
      if ((c & 0xf800) == 0xd800) {
        // A lone code unit that is a surrogate is necessarily unpaired
        if (replace) {
          c = 0xfffd;
        } else {
          throw new IllegalArgumentException("Unpaired surrogate code point");
        }
      }
      // Only U+0000 to U+007F are single-byte in UTF-8; U+0080 needs two
      // bytes (C2 80).
      if (c <= 0x7f) {
        return new byte[] { (byte)c };
      } else if (c <= 0x7ff) {
        return new byte[] {
          (byte)(0xc0 | ((c >> 6) & 0x1f)),
          (byte)(0x80 | (c & 0x3f)),
        };
      } else {
        return new byte[] {
          (byte)(0xe0 | ((c >> 12) & 0x0f)),
          (byte)(0x80 | ((c >> 6) & 0x3f)),
          (byte)(0x80 | (c & 0x3f)),
        };
      }
    } else if (str.length() == 2) {
      int c = str.charAt(0);
      int c2 = str.charAt(1);
      if ((c & 0xfc00) == 0xd800 && (c2 & 0xfc00) == 0xdc00) {
        // A single surrogate pair: one four-byte sequence
        c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
        return new byte[] {
          (byte)(0xf0 | ((c >> 18) & 0x07)),
          (byte)(0x80 | ((c >> 12) & 0x3f)),
          (byte)(0x80 | ((c >> 6) & 0x3f)),
          (byte)(0x80 | (c & 0x3f)),
        };
      } else if (!lenientLineBreaks && c <= 0x7f && c2 <= 0x7f) {
        // Two ASCII characters map directly to two bytes
        return new byte[] { (byte)c, (byte)c2 };
      }
    }
    // General case. Closing a ByteArrayOutputStream has no effect, so no
    // explicit close is needed here.
    java.io.ByteArrayOutputStream ms = new java.io.ByteArrayOutputStream();
    try {
      if (WriteUtf8(str, 0, str.length(), ms, replace, lenientLineBreaks) !=
        0) {
        throw new IllegalArgumentException("Unpaired surrogate code point");
      }
      return ms.toByteArray();
    } catch (IOException ex) {
      throw new IllegalArgumentException("I/O error occurred", ex);
    }
  }

  /**
   * Calculates the number of bytes needed to encode a string in UTF-8.
   * @param str The parameter {@code str} is a text string.
   * @param replace If true, treats unpaired surrogate code points as having
   * 3 UTF-8 bytes (the UTF-8 length of the replacement character U+FFFD).
   * @return The number of bytes needed to encode the given string in UTF-8,
   * or -1 if the string contains an unpaired surrogate code point and
   * {@code replace} is false.
   * @throws NullPointerException The parameter {@code str} is null.
   */
  public static long GetUtf8Length(String str, boolean replace) {
    if (str == null) {
      throw new NullPointerException("str");
    }
    long size = 0;
    for (int i = 0; i < str.length(); ++i) {
      int c = str.charAt(i);
      if (c <= 0x7f) {
        ++size;
      } else if (c <= 0x7ff) {
        size += 2;
      } else if (c <= 0xd7ff || c >= 0xe000) {
        size += 3;
      } else if (c <= 0xdbff) {
        // UTF-16 leading surrogate; look at the next code unit
        ++i;
        if (i >= str.length() || str.charAt(i) < 0xdc00 ||
          str.charAt(i) > 0xdfff) {
          if (replace) {
            size += 3;
            // Step back so the code unit just examined is counted itself
            --i;
          } else {
            return -1;
          }
        } else {
          size += 4;
        }
      } else {
        // Trailing surrogate without a preceding leading surrogate
        if (replace) {
          size += 3;
        } else {
          return -1;
        }
      }
    }
    return size;
  }

  /**
   * Gets the Unicode code point just before the given index of the string.
   * @param str The parameter {@code str} is a text string.
   * @param index Index of the current position into the string.
   * @return The Unicode code point at the previous position. Returns -1 if
   * {@code index} is 0 or less, or is greater than the string's length.
   * Returns the replacement character (U+FFFD) if the code point at the
   * previous position is an unpaired surrogate code point. If the return
   * value is 65536 (0x10000) or greater, the code point takes up two UTF-16
   * code units.
   * @throws NullPointerException The parameter {@code str} is null.
   */
  public static int CodePointBefore(String str, int index) {
    return CodePointBefore(str, index, 0);
  }

  /**
   * Gets the Unicode code point just before the given index of the string.
   * @param str The parameter {@code str} is a text string.
   * @param index Index of the current position into the string.
   * @param surrogateBehavior Specifies what kind of value to return if the
   * previous code point is an unpaired surrogate code point: if 0, return
   * the replacement character (U+FFFD); if 1, return the value of the
   * surrogate code point; if neither 0 nor 1, return -1.
   * @return The Unicode code point at the previous position. Returns -1 if
   * {@code index} is 0 or less, or is greater than the string's length.
   * Returns a value as specified under {@code surrogateBehavior} if the code
   * point at the previous position is an unpaired surrogate code point. If
   * the return value is 65536 (0x10000) or greater, the code point takes up
   * two UTF-16 code units.
   * @throws NullPointerException The parameter {@code str} is null.
   */
  public static int CodePointBefore(
    String str,
    int index,
    int surrogateBehavior) {
    if (str == null) {
      throw new NullPointerException("str");
    }
    if (index <= 0) {
      return -1;
    }
    if (index > str.length()) {
      return -1;
    }
    int c = str.charAt(index - 1);
    if ((c & 0xfc00) == 0xdc00 && index - 2 >= 0 &&
      (str.charAt(index - 2) & 0xfc00) == 0xd800) {
      // Get the Unicode code point for the surrogate pair
      return 0x10000 + ((str.charAt(index - 2) & 0x3ff) << 10) + (c & 0x3ff);
    }
    if ((c & 0xf800) == 0xd800) {
      // unpaired surrogate
      return (surrogateBehavior == 0) ? 0xfffd : ((surrogateBehavior == 1) ?
          c : -1);
    }
    return c;
  }

  /**
   * Gets the Unicode code point at the given index of the string.
   * @param str The parameter {@code str} is a text string.
   * @param index Index of the current position into the string.
   * @return The Unicode code point at the given position. Returns -1 if
   * {@code index} is less than 0, or is the string's length or greater.
   * Returns the replacement character (U+FFFD) if the code point at that
   * position is an unpaired surrogate code point. If the return value is
   * 65536 (0x10000) or greater, the code point takes up two UTF-16 code
   * units.
   * @throws NullPointerException The parameter {@code str} is null.
   */
  public static int CodePointAt(String str, int index) {
    return CodePointAt(str, index, 0);
  }

  /**
   * Gets the Unicode code point at the given index of the string.
   *
   * <p>The following example shows how to iterate a text string code point
   * by code point, terminating the loop when an unpaired surrogate is
   * found.</p>
   *
   * <pre>{@code
   * for (int i = 0; i < str.length(); ++i) {
   *   int codePoint = DataUtilities.CodePointAt(str, i, 2);
   *   if (codePoint < 0) {
   *     break; // Unpaired surrogate
   *   }
   *   System.out.println("codePoint:" + codePoint);
   *   if (codePoint >= 0x10000) {
   *     i++; // Supplementary code point
   *   }
   * }
   * }</pre>
   * @param str The parameter {@code str} is a text string.
   * @param index Index of the current position into the string.
   * @param surrogateBehavior Specifies what kind of value to return if the
   * code point at the given index is an unpaired surrogate code point: if 0,
   * return the replacement character (U+FFFD); if 1, return the value of the
   * surrogate code point; if neither 0 nor 1, return -1.
   * @return The Unicode code point at the given position. Returns -1 if
   * {@code index} is less than 0, or is the string's length or greater.
   * Returns a value as specified under {@code surrogateBehavior} if the code
   * point at that position is an unpaired surrogate code point. If the
   * return value is 65536 (0x10000) or greater, the code point takes up two
   * UTF-16 code units.
   * @throws NullPointerException The parameter {@code str} is null.
   */
  public static int CodePointAt(
    String str,
    int index,
    int surrogateBehavior) {
    if (str == null) {
      throw new NullPointerException("str");
    }
    if (index >= str.length()) {
      return -1;
    }
    if (index < 0) {
      return -1;
    }
    int c = str.charAt(index);
    if ((c & 0xfc00) == 0xd800 && index + 1 < str.length() &&
      (str.charAt(index + 1) & 0xfc00) == 0xdc00) {
      // Get the Unicode code point for the surrogate pair
      c = 0x10000 + ((c & 0x3ff) << 10) + (str.charAt(index + 1) & 0x3ff);
    } else if ((c & 0xf800) == 0xd800) {
      // unpaired surrogate
      return (surrogateBehavior == 0) ? 0xfffd : ((surrogateBehavior == 1) ?
          c : (-1));
    }
    return c;
  }

  /**
   * Returns a string with the basic upper-case letters A to Z (U+0041 to
   * U+005A) converted to the corresponding basic lower-case letters. Other
   * characters remain unchanged.
   * @param str The parameter {@code str} is a text string.
   * @return The converted string, or null if {@code str} is null. The same
   * string object is returned if it contains no basic upper-case letters.
   */
  public static String ToLowerCaseAscii(String str) {
    if (str == null) {
      return null;
    }
    int len = str.length();
    char c = (char)0;
    boolean hasUpperCase = false;
    for (int i = 0; i < len; ++i) {
      c = str.charAt(i);
      if (c >= 'A' && c <= 'Z') {
        hasUpperCase = true;
        break;
      }
    }
    if (!hasUpperCase) {
      // Nothing to convert; avoid building a copy
      return str;
    }
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < len; ++i) {
      c = str.charAt(i);
      if (c >= 'A' && c <= 'Z') {
        builder.append((char)(c + 0x20));
      } else {
        builder.append(c);
      }
    }
    return builder.toString();
  }

  /**
   * Returns a string with the basic lower-case letters a to z (U+0061 to
   * U+007A) converted to the corresponding basic upper-case letters. Other
   * characters remain unchanged.
   * @param str The parameter {@code str} is a text string.
   * @return The converted string, or null if {@code str} is null. The same
   * string object is returned if it contains no basic lower-case letters.
   */
  public static String ToUpperCaseAscii(String str) {
    if (str == null) {
      return null;
    }
    int len = str.length();
    char c = (char)0;
    boolean hasLowerCase = false;
    for (int i = 0; i < len; ++i) {
      c = str.charAt(i);
      if (c >= 'a' && c <= 'z') {
        hasLowerCase = true;
        break;
      }
    }
    if (!hasLowerCase) {
      // Nothing to convert; avoid building a copy
      return str;
    }
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < len; ++i) {
      c = str.charAt(i);
      if (c >= 'a' && c <= 'z') {
        builder.append((char)(c - 0x20));
      } else {
        builder.append(c);
      }
    }
    return builder.toString();
  }

  /**
   * Compares two strings in Unicode code point order. Unpaired surrogate
   * code points are treated as individual code points.
   * @param strA The first string. Can be null.
   * @param strB The second string. Can be null.
   * @return A value indicating which string is "less" or "greater". 0: Both
   * strings are equal or null. Less than 0: a is null and b isn't; or the
   * first code point that's different is less in A than in B; or b starts
   * with a and is longer than a. Greater than 0: b is null and a isn't; or
   * the first code point that's different is greater in A than in B; or a
   * starts with b and is longer than b.
   */
  public static int CodePointCompare(String strA, String strB) {
    if (strA == null) {
      return (strB == null) ? 0 : -1;
    }
    if (strB == null) {
      return 1;
    }
    int len, ca, cb;
    len = Math.min(strA.length(), strB.length());
    for (int i = 0; i < len; ++i) {
      ca = strA.charAt(i);
      cb = strB.charAt(i);
      if (ca == cb) {
        // Only a high surrogate can begin a surrogate pair. Every other
        // code unit (including a low surrogate reached here, which is
        // necessarily unpaired) is a single code point equal in both
        // strings.
        if ((ca & 0xfc00) != 0xd800) {
          continue;
        }
        boolean incindex = false;
        if (i + 1 < strA.length() && (strA.charAt(i + 1) & 0xfc00) ==
          0xdc00) {
          ca = 0x10000 + ((ca & 0x3ff) << 10) + (strA.charAt(i + 1) & 0x3ff);
          incindex = true;
        }
        if (i + 1 < strB.length() && (strB.charAt(i + 1) & 0xfc00) ==
          0xdc00) {
          cb = 0x10000 + ((cb & 0x3ff) << 10) + (strB.charAt(i + 1) & 0x3ff);
          incindex = true;
        }
        if (ca != cb) {
          // Both values are at most 0x10ffff, so subtraction cannot overflow
          return ca - cb;
        }
        if (incindex) {
          // Both strings had the same surrogate pair; skip its second half
          ++i;
        }
      } else {
        if ((ca & 0xf800) != 0xd800 && (cb & 0xf800) != 0xd800) {
          return ca - cb;
        }
        // At least one side is a surrogate: compare by code point rather
        // than by code unit so supplementary characters sort after U+FFFF.
        if ((ca & 0xfc00) == 0xd800 && i + 1 < strA.length() &&
          (strA.charAt(i + 1) & 0xfc00) == 0xdc00) {
          ca = 0x10000 + ((ca & 0x3ff) << 10) + (strA.charAt(i + 1) & 0x3ff);
        }
        if ((cb & 0xfc00) == 0xd800 && i + 1 < strB.length() &&
          (strB.charAt(i + 1) & 0xfc00) == 0xdc00) {
          cb = 0x10000 + ((cb & 0x3ff) << 10) + (strB.charAt(i + 1) & 0x3ff);
        }
        return ca - cb;
      }
    }
    return (strA.length() == strB.length()) ? 0 : ((strA.length() <
          strB.length()) ? -1 : 1);
  }

  /**
   * Writes a portion of a string in UTF-8 encoding to a data stream.
   * @param str A string to write.
   * @param offset The index starting at 0 where the string portion to write
   * begins.
   * @param length The length of the string portion to write.
   * @param stream A writable data stream.
   * @param replace If true, replaces unpaired surrogate code points with the
   * replacement character (U+FFFD). If false, stops processing when an
   * unpaired surrogate code point is seen.
   * @return 0 if the entire string portion was written; or -1 if the string
   * portion contains an unpaired surrogate code point and {@code replace}
   * is false.
   * @throws NullPointerException The parameter {@code str} is null or
   * {@code stream} is null.
   * @throws java.io.IOException An I/O error occurred.
   * @throws IllegalArgumentException Either {@code offset} or {@code
   * length} is less than 0 or greater than {@code str}'s length, or {@code
   * str}'s length minus {@code offset} is less than {@code length}.
   */
  public static int WriteUtf8(
    String str,
    int offset,
    int length,
    OutputStream stream,
    boolean replace) throws java.io.IOException {
    return WriteUtf8(str, offset, length, stream, replace, false);
  }

  /**
   * Writes a portion of a string in UTF-8 encoding to a data stream.
   * @param str A string to write.
   * @param offset The index starting at 0 where the string portion to write
   * begins.
   * @param length The length of the string portion to write.
   * @param stream A writable data stream.
   * @param replace If true, replaces unpaired surrogate code points with the
   * replacement character (U+FFFD). If false, stops processing when an
   * unpaired surrogate code point is seen.
   * @param lenientLineBreaks If true, replaces carriage return (CR) not
   * followed by line feed (LF) and LF not preceded by CR with CR-LF pairs.
   * @return 0 if the entire string portion was written; or -1 if the string
   * portion contains an unpaired surrogate code point and {@code replace}
   * is false. In the latter case the bytes encoded before the unpaired
   * surrogate are still written to the stream.
   * @throws NullPointerException The parameter {@code str} is null or
   * {@code stream} is null.
   * @throws IllegalArgumentException The parameter {@code offset} is less
   * than 0, {@code length} is less than 0, or {@code offset} plus {@code
   * length} is greater than the string's length.
   * @throws java.io.IOException An I/O error occurred.
   */
  public static int WriteUtf8(
    String str,
    int offset,
    int length,
    OutputStream stream,
    boolean replace,
    boolean lenientLineBreaks) throws java.io.IOException {
    if (stream == null) {
      throw new NullPointerException("stream");
    }
    if (str == null) {
      throw new NullPointerException("str");
    }
    if (offset < 0) {
      throw new IllegalArgumentException("offset (" + offset + ") is less than " +
        "0");
    }
    if (offset > str.length()) {
      throw new IllegalArgumentException("offset (" + offset + ") is more than " +
        str.length());
    }
    if (length < 0) {
      throw new IllegalArgumentException("length (" + length + ") is less than " +
        "0");
    }
    if (length > str.length()) {
      throw new IllegalArgumentException("length (" + length + ") is more than " +
        str.length());
    }
    if (str.length() - offset < length) {
      throw new IllegalArgumentException("str.length() minus offset (" +
        (str.length() - offset) + ") is less than " + length);
    }
    int retval = 0;
    // Bytes are staged in a fixed-size buffer and flushed to the stream
    // whenever the next encoded sequence would not fit.
    byte[] bytes = new byte[StreamedStringBufferLength];
    int byteIndex = 0;
    int endIndex = offset + length;
    for (int index = offset; index < endIndex; ++index) {
      int c = str.charAt(index);
      if (c <= 0x7f) {
        if (lenientLineBreaks && (c == 0x0d || c == 0x0a)) {
          // Bare CR, bare LF and CR-LF all become a single CR-LF pair; for
          // an existing CR-LF pair the LF is consumed here as well.
          if (c == 0x0d && index + 1 < endIndex &&
            str.charAt(index + 1) == 0x0a) {
            ++index;
          }
          if (byteIndex + 2 > StreamedStringBufferLength) {
            // Write bytes retrieved so far
            stream.write(bytes, 0, byteIndex);
            byteIndex = 0;
          }
          bytes[byteIndex++] = 0x0d;
          bytes[byteIndex++] = 0x0a;
          continue;
        }
        if (byteIndex >= StreamedStringBufferLength) {
          // Write bytes retrieved so far
          stream.write(bytes, 0, byteIndex);
          byteIndex = 0;
        }
        bytes[byteIndex++] = (byte)c;
      } else if (c <= 0x7ff) {
        if (byteIndex + 2 > StreamedStringBufferLength) {
          // Write bytes retrieved so far
          stream.write(bytes, 0, byteIndex);
          byteIndex = 0;
        }
        bytes[byteIndex++] = (byte)(0xc0 | ((c >> 6) & 0x1f));
        bytes[byteIndex++] = (byte)(0x80 | (c & 0x3f));
      } else {
        if ((c & 0xfc00) == 0xd800 && index + 1 < endIndex &&
          (str.charAt(index + 1) & 0xfc00) == 0xdc00) {
          // Get the Unicode code point for the surrogate pair
          c = 0x10000 + ((c & 0x3ff) << 10) + (str.charAt(index + 1) & 0x3ff);
          ++index;
        } else if ((c & 0xf800) == 0xd800) {
          // unpaired surrogate
          if (!replace) {
            retval = -1;
            break; // write bytes read so far
          }
          c = 0xfffd;
        }
        if (c <= 0xffff) {
          if (byteIndex + 3 > StreamedStringBufferLength) {
            // Write bytes retrieved so far
            stream.write(bytes, 0, byteIndex);
            byteIndex = 0;
          }
          bytes[byteIndex++] = (byte)(0xe0 | ((c >> 12) & 0x0f));
          bytes[byteIndex++] = (byte)(0x80 | ((c >> 6) & 0x3f));
          bytes[byteIndex++] = (byte)(0x80 | (c & 0x3f));
        } else {
          if (byteIndex + 4 > StreamedStringBufferLength) {
            // Write bytes retrieved so far
            stream.write(bytes, 0, byteIndex);
            byteIndex = 0;
          }
          bytes[byteIndex++] = (byte)(0xf0 | ((c >> 18) & 0x07));
          bytes[byteIndex++] = (byte)(0x80 | ((c >> 12) & 0x3f));
          bytes[byteIndex++] = (byte)(0x80 | ((c >> 6) & 0x3f));
          bytes[byteIndex++] = (byte)(0x80 | (c & 0x3f));
        }
      }
    }
    // Flush whatever remains in the staging buffer
    stream.write(bytes, 0, byteIndex);
    return retval;
  }

  /**
   * Writes a string in UTF-8 encoding to a data stream.
   * @param str A string to write.
   * @param stream A writable data stream.
   * @param replace If true, replaces unpaired surrogate code points with the
   * replacement character (U+FFFD). If false, stops processing when an
   * unpaired surrogate code point is seen.
   * @return 0 if the entire string was written; or -1 if the string
   * contains an unpaired surrogate code point and {@code replace} is false.
   * @throws NullPointerException The parameter {@code str} is null or
   * {@code stream} is null.
   * @throws java.io.IOException An I/O error occurred.
   */
  public static int WriteUtf8(String str, OutputStream stream, boolean replace)
    throws java.io.IOException {
    if (str == null) {
      throw new NullPointerException("str");
    }
    return WriteUtf8(str, 0, str.length(), stream, replace);
  }

  /**
   * Reads a string in UTF-8 encoding from a byte array.
   * @param data A byte array containing a UTF-8 text string.
   * @param offset Offset into the byte array to start reading.
   * @param bytesCount Length, in bytes, of the UTF-8 text string.
   * @param builder A string builder object where the resulting string will
   * be stored.
   * @param replace If true, replaces invalid encoding with the replacement
   * character (U+FFFD). If false, stops processing when invalid UTF-8 is
   * seen.
   * @return 0 if the entire string was read without errors, or -1 if the
   * string is not valid UTF-8 and {@code replace} is false.
   * @throws NullPointerException The parameter {@code data} is null or
   * {@code builder} is null.
   * @throws IllegalArgumentException The parameter {@code offset} is less
   * than 0, {@code bytesCount} is less than 0, or offset plus bytesCount is
   * greater than the length of {@code data}.
   */
  public static int ReadUtf8FromBytes(
    byte[] data,
    int offset,
    int bytesCount,
    StringBuilder builder,
    boolean replace) {
    if (data == null) {
      throw new NullPointerException("data");
    }
    if (offset < 0) {
      throw new IllegalArgumentException("offset (" + offset + ") is less than " +
        "0");
    }
    if (offset > data.length) {
      throw new IllegalArgumentException("offset (" + offset + ") is more than " +
        data.length);
    }
    if (bytesCount < 0) {
      throw new IllegalArgumentException("bytesCount (" + bytesCount +
        ") is less than 0");
    }
    if (bytesCount > data.length) {
      throw new IllegalArgumentException("bytesCount (" + bytesCount +
        ") is more than " + data.length);
    }
    if (data.length - offset < bytesCount) {
      throw new IllegalArgumentException("data.length minus offset (" +
        (data.length - offset) + ") is less than " + bytesCount);
    }
    if (builder == null) {
      throw new NullPointerException("builder");
    }
    // Decoder state: the code point being assembled, how many continuation
    // bytes have been seen and are needed, and the range allowed for the
    // next continuation byte.
    int cp = 0;
    int bytesSeen = 0;
    int bytesNeeded = 0;
    int lower = 0x80;
    int upper = 0xbf;
    int pointer, endpointer, b;
    pointer = offset;
    endpointer = offset + bytesCount;
    while (pointer < endpointer) {
      b = data[pointer] & (int)0xff;
      ++pointer;
      if (bytesNeeded == 0) {
        if ((b & 0x7f) == b) {
          builder.append((char)b);
        } else if (b >= 0xc2 && b <= 0xdf) {
          bytesNeeded = 1;
          cp = (b - 0xc0) << 6;
        } else if (b >= 0xe0 && b <= 0xef) {
          // E0 narrows the range to exclude overlong forms; ED narrows it
          // to exclude surrogate code points
          lower = (b == 0xe0) ? 0xa0 : 0x80;
          upper = (b == 0xed) ? 0x9f : 0xbf;
          bytesNeeded = 2;
          cp = (b - 0xe0) << 12;
        } else if (b >= 0xf0 && b <= 0xf4) {
          // F0 narrows the range to exclude overlong forms; F4 narrows it
          // to exclude values above U+10FFFF
          lower = (b == 0xf0) ? 0x90 : 0x80;
          upper = (b == 0xf4) ? 0x8f : 0xbf;
          bytesNeeded = 3;
          cp = (b - 0xf0) << 18;
        } else {
          if (replace) {
            builder.append((char)0xfffd);
          } else {
            return -1;
          }
        }
        continue;
      }
      if (b < lower || b > upper) {
        // Invalid continuation byte: reset the decoder state
        cp = bytesNeeded = bytesSeen = 0;
        lower = 0x80;
        upper = 0xbf;
        if (replace) {
          // Step back so the offending byte is decoded again as the
          // possible start of a new sequence
          --pointer;
          builder.append((char)0xfffd);
          continue;
        }
        return -1;
      } else {
        lower = 0x80;
        upper = 0xbf;
        ++bytesSeen;
        cp += (b - 0x80) << (6 * (bytesNeeded - bytesSeen));
        if (bytesSeen != bytesNeeded) {
          continue;
        }
        int ret, ch, lead, trail;
        ret = cp;
        cp = 0;
        bytesSeen = 0;
        bytesNeeded = 0;
        if (ret <= 0xffff) {
          builder.append((char)ret);
        } else {
          // Supplementary code point: emit as a surrogate pair
          ch = ret - 0x10000;
          lead = (ch >> 10) + 0xd800;
          trail = (ch & 0x3ff) + 0xdc00;
          builder.append((char)lead);
          builder.append((char)trail);
        }
      }
    }
    if (bytesNeeded != 0) {
      // Input ended in the middle of a multi-byte sequence
      if (replace) {
        builder.append((char)0xfffd);
      } else {
        return -1;
      }
    }
    return 0;
  }

  /**
   * Reads a string in UTF-8 encoding from a data stream in full and returns
   * that string. Replaces invalid encoding with the replacement character
   * (U+FFFD).
   * @param stream A readable data stream.
   * @return The string read.
   * @throws java.io.IOException An I/O error occurred.
   * @throws NullPointerException The parameter {@code stream} is null.
   */
  public static String ReadUtf8ToString(InputStream stream)
    throws java.io.IOException {
    return ReadUtf8ToString(stream, -1, true);
  }

  /**
   * Reads a string in UTF-8 encoding from a data stream and returns that
   * string.
   * @param stream A readable data stream.
   * @param bytesCount The length, in bytes, of the string. If this is less
   * than 0, this function will read until the end of the stream.
   * @param replace If true, replaces invalid encoding with the replacement
   * character (U+FFFD). If false, throws an error if invalid UTF-8 is seen.
   * @return The string read.
   * @throws java.io.IOException An I/O error occurred; or, the string is not
   * valid UTF-8 and {@code replace} is false.
   * @throws NullPointerException The parameter {@code stream} is null.
   */
  public static String ReadUtf8ToString(
    InputStream stream,
    int bytesCount,
    boolean replace) throws java.io.IOException {
    StringBuilder builder = new StringBuilder();
    // Only the "invalid UTF-8" result (-1) is treated as an error here
    if (DataUtilities.ReadUtf8(stream, bytesCount, builder, replace) == -1) {
      throw new IOException(
        "Unpaired surrogate code point found.",
        new IllegalArgumentException("Unpaired surrogate code point found."));
    }
    return builder.toString();
  }

  /**
   * Reads a string in UTF-8 encoding from a data stream.
   * @param stream A readable data stream.
   * @param bytesCount The length, in bytes, of the string. If this is less
   * than 0, this function will read until the end of the stream.
   * @param builder A string builder object where the resulting string will
   * be stored.
   * @param replace If true, replaces invalid encoding with the replacement
   * character (U+FFFD). If false, stops processing when invalid UTF-8 is
   * seen.
   * @return 0 if the entire string was read without errors, -1 if the
   * string is not valid UTF-8 and {@code replace} is false, or -2 if the end
   * of the stream was reached before the last character was read completely
   * (which is only the case if {@code bytesCount} is 0 or greater).
   * @throws java.io.IOException An I/O error occurred.
   * @throws NullPointerException The parameter {@code stream} is null or
   * {@code builder} is null.
   */
  public static int ReadUtf8(
    InputStream stream,
    int bytesCount,
    StringBuilder builder,
    boolean replace) throws java.io.IOException {
    if (stream == null) {
      throw new NullPointerException("stream");
    }
    if (builder == null) {
      throw new NullPointerException("builder");
    }
    int b;
    // Decoder state; see ReadUtf8FromBytes for the same state machine
    int cp = 0;
    int bytesSeen = 0;
    int bytesNeeded = 0;
    int lower = 0x80;
    int upper = 0xbf;
    int pointer = 0;
    while (pointer < bytesCount || bytesCount < 0) {
      b = stream.read();
      if (b < 0) {
        if (bytesNeeded != 0) {
          // Stream ended in the middle of a multi-byte sequence
          bytesNeeded = 0;
          if (replace) {
            builder.append((char)0xfffd);
            if (bytesCount >= 0) {
              return -2;
            }
            break; // end of stream
          }
          return -1;
        }
        if (bytesCount >= 0) {
          return -2;
        }
        break; // end of stream
      }
      if (bytesCount > 0) {
        ++pointer;
      }
      if (bytesNeeded == 0) {
        if ((b & 0x7f) == b) {
          builder.append((char)b);
        } else if (b >= 0xc2 && b <= 0xdf) {
          bytesNeeded = 1;
          cp = (b - 0xc0) << 6;
        } else if (b >= 0xe0 && b <= 0xef) {
          lower = (b == 0xe0) ? 0xa0 : 0x80;
          upper = (b == 0xed) ? 0x9f : 0xbf;
          bytesNeeded = 2;
          cp = (b - 0xe0) << 12;
        } else if (b >= 0xf0 && b <= 0xf4) {
          lower = (b == 0xf0) ? 0x90 : 0x80;
          upper = (b == 0xf4) ? 0x8f : 0xbf;
          bytesNeeded = 3;
          cp = (b - 0xf0) << 18;
        } else {
          if (replace) {
            builder.append((char)0xfffd);
          } else {
            return -1;
          }
        }
        continue;
      }
      if (b < lower || b > upper) {
        // Invalid continuation byte: reset the decoder state
        cp = bytesNeeded = bytesSeen = 0;
        lower = 0x80;
        upper = 0xbf;
        if (replace) {
          builder.append((char)0xfffd);
          // "Read" the last byte again; a stream cannot be stepped back,
          // so the lead-byte handling is repeated here
          if (b < 0x80) {
            builder.append((char)b);
          } else if (b >= 0xc2 && b <= 0xdf) {
            bytesNeeded = 1;
            cp = (b - 0xc0) << 6;
          } else if (b >= 0xe0 && b <= 0xef) {
            lower = (b == 0xe0) ? 0xa0 : 0x80;
            upper = (b == 0xed) ? 0x9f : 0xbf;
            bytesNeeded = 2;
            cp = (b - 0xe0) << 12;
          } else if (b >= 0xf0 && b <= 0xf4) {
            lower = (b == 0xf0) ? 0x90 : 0x80;
            upper = (b == 0xf4) ? 0x8f : 0xbf;
            bytesNeeded = 3;
            cp = (b - 0xf0) << 18;
          } else {
            builder.append((char)0xfffd);
          }
          continue;
        }
        return -1;
      } else {
        lower = 0x80;
        upper = 0xbf;
        ++bytesSeen;
        cp += (b - 0x80) << (6 * (bytesNeeded - bytesSeen));
        if (bytesSeen != bytesNeeded) {
          continue;
        }
        int ret, ch, lead, trail;
        ret = cp;
        cp = 0;
        bytesSeen = 0;
        bytesNeeded = 0;
        if (ret <= 0xffff) {
          builder.append((char)ret);
        } else {
          // Supplementary code point: emit as a surrogate pair
          ch = ret - 0x10000;
          lead = (ch >> 10) + 0xd800;
          trail = (ch & 0x3ff) + 0xdc00;
          builder.append((char)lead);
          builder.append((char)trail);
        }
      }
    }
    if (bytesNeeded != 0) {
      // The byte count ran out in the middle of a multi-byte sequence
      if (replace) {
        builder.append((char)0xfffd);
      } else {
        return -1;
      }
    }
    return 0;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy