com.upokecenter.util.DataUtilities Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of cbor Show documentation
A Java implementation of Concise Binary Object Representation (CBOR), a general-purpose binary data format defined in RFC 7049.
There is a newer version: 5.0.0-alpha1
Show newest version
package com.upokecenter.util;
/*
Written by Peter O. in 2013.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://peteroupc.github.io/
 */

import java.io.*;

    /**
     * Contains methods useful for reading and writing text strings. It is designed
     * to have no dependencies other than the basic runtime class library.
     * Many of these methods work with text encoded in UTF-8, an
     * encoding form of the Unicode Standard which uses one byte to encode
     * the most basic characters and two to four bytes to encode other
     * characters. For example, the GetUtf8Bytes method converts a text
     * string to an array of bytes in UTF-8.
     * <p>In C# and Java, text
     * strings are represented as sequences of 16-bit values called
     * {@code char}s. These sequences are well-formed under UTF-16, a
     * 16-bit encoding form of Unicode, except if they contain unpaired
     * surrogate code points. (A surrogate code point is used to encode
     * supplementary characters, those with code points U+10000 or higher,
     * in UTF-16. A surrogate pair is a high surrogate, U+D800 to U+DBFF,
     * followed by a low surrogate, U+DC00 to U+DFFF. An unpaired surrogate
     * code point is a surrogate not appearing in a surrogate pair.) Many
     * of the methods in this class allow setting the behavior to follow
     * when unpaired surrogate code points are found in text strings, such
     * as throwing an error or treating the unpaired surrogate as a
     * replacement character (U+FFFD).
     */
  public final class DataUtilities {
// Private, empty constructor: code outside this class cannot create
// instances of it (the methods visible in this file are all static).
private DataUtilities() {
}
    // Size, in bytes, of the temporary byte buffer that WriteUtf8 fills
    // before handing the accumulated bytes to the output stream.
    private static final int StreamedStringBufferLength = 4096;

    /**
     * Decodes an entire byte array holding UTF-8 text into a text string.
     * @param bytes The UTF-8 encoded bytes to decode.
     * @param replace If true, each invalid encoding is decoded as the
     * replacement character (U+FFFD). If false, invalid UTF-8 causes an
     * exception.
     * @return The decoded text string.
     * @throws NullPointerException The parameter {@code bytes} is null.
     * @throws IllegalArgumentException The bytes are not valid UTF-8 and
     * {@code replace} is false.
     */
    public static String GetUtf8String(byte[] bytes, boolean replace) {
      if (bytes == null) {
        throw new NullPointerException("bytes");
      }
      final StringBuilder decoded = new StringBuilder();
      final int status =
        ReadUtf8FromBytes(bytes, 0, bytes.length, decoded, replace);
      if (status == 0) {
        return decoded.toString();
      }
      // Any nonzero status means the decoder reported invalid UTF-8.
      throw new IllegalArgumentException("Invalid UTF-8");
    }

    /**
     * Counts the Unicode code points in a text string. A surrogate pair
     * counts as one code point; an unpaired surrogate also counts as one.
     * The result can therefore be smaller than the string's length in
     * {@code char}s.
     * @param str The text string to measure.
     * @return The number of Unicode code points in {@code str}.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointLength(String str) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      final int unitCount = str.length();
      int codePoints = 0;
      int pos = 0;
      while (pos < unitCount) {
        if (CodePointAt(str, pos) >= 0x10000) {
          // Supplementary code point: occupies two UTF-16 code units.
          pos += 2;
        } else {
          pos += 1;
        }
        ++codePoints;
      }
      return codePoints;
    }

    /**
     * Decodes a slice of a byte array holding UTF-8 text into a text string.
     * @param bytes The byte array containing the UTF-8 encoded text.
     * @param offset Index of the first byte to decode.
     * @param bytesCount Number of bytes to decode.
     * @param replace If true, each invalid encoding is decoded as the
     * replacement character (U+FFFD). If false, invalid UTF-8 causes an
     * exception.
     * @return The text string decoded from the given slice.
     * @throws NullPointerException The parameter {@code bytes} is null.
     * @throws IllegalArgumentException The slice is not valid UTF-8 and
     * {@code replace} is false; or {@code offset} or {@code bytesCount} is
     * less than 0 or greater than the length of {@code bytes}; or
     * {@code offset} plus {@code bytesCount} is greater than the length of
     * {@code bytes}.
     */
    public static String GetUtf8String(
      byte[] bytes,
      int offset,
      int bytesCount,
      boolean replace) {
      if (bytes == null) {
        throw new NullPointerException("bytes");
      }
      final int total = bytes.length;
      if (offset < 0) {
        throw new IllegalArgumentException(
          "offset (" + offset + ") is less than 0");
      }
      if (offset > total) {
        throw new IllegalArgumentException(
          "offset (" + offset + ") is more than " + total);
      }
      if (bytesCount < 0) {
        throw new IllegalArgumentException(
          "bytesCount (" + bytesCount + ") is less than 0");
      }
      if (bytesCount > total) {
        throw new IllegalArgumentException(
          "bytesCount (" + bytesCount + ") is more than " + total);
      }
      final int remaining = total - offset;
      if (remaining < bytesCount) {
        throw new IllegalArgumentException(
          "bytes's length minus " + offset + " (" + remaining +
          ") is less than " + bytesCount);
      }
      final StringBuilder decoded = new StringBuilder();
      final int status =
        ReadUtf8FromBytes(bytes, offset, bytesCount, decoded, replace);
      if (status == 0) {
        return decoded.toString();
      }
      // Any nonzero status means the decoder reported invalid UTF-8.
      throw new IllegalArgumentException("Invalid UTF-8");
    }

    /**
     * Encodes a text string as UTF-8 and returns the resulting bytes. No
     * byte-order mark (U+FEFF) is added at the start of the output, and
     * line breaks are written unchanged.
     * <p>REMARK: Using Encoding.UTF8.GetBytes in .NET, or the getBytes()
     * method in Java, for this purpose is not recommended. For instance,
     * getBytes() encodes text strings in a default (so not fixed)
     * character encoding, which can be undesirable.
     * @param str The text string to encode.
     * @param replace If true, each unpaired surrogate code point is encoded
     * as the replacement character (U+FFFD). If false, an unpaired
     * surrogate code point causes an exception.
     * @return The UTF-8 encoding of {@code str}.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired
     * surrogate code point and {@code replace} is false, or an internal
     * error occurred.
     */
    public static byte[] GetUtf8Bytes(String str, boolean replace) {
      // This overload never rewrites line breaks.
      final boolean lenientLineBreaks = false;
      return GetUtf8Bytes(str, replace, lenientLineBreaks);
    }

    /**
     * Encodes a text string as UTF-8 and returns the resulting bytes. No
     * byte-order mark (U+FEFF) is added at the start of the output.
     * <p>REMARK: Using Encoding.UTF8.GetBytes in .NET, or the getBytes()
     * method in Java, for this purpose is not recommended. For instance,
     * getBytes() encodes text strings in a default (so not fixed)
     * character encoding, which can be undesirable.
     * @param str The text string to encode.
     * @param replace If true, each unpaired surrogate code point is encoded
     * as the replacement character (U+FFFD). If false, an unpaired
     * surrogate code point causes an exception.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not
     * followed by line feed (LF) and LF not preceded by CR with CR-LF
     * pairs.
     * @return The UTF-8 encoding of {@code str}.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired
     * surrogate code point and {@code replace} is false, or an internal
     * error occurred.
     */
    public static byte[] GetUtf8Bytes(
      String str,
      boolean replace,
      boolean lenientLineBreaks) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      // Fast paths for one- and two-unit strings avoid allocating a stream.
      if (!lenientLineBreaks && str.length() == 1) {
        int c = str.charAt(0);
        if ((c & 0xf800) == 0xd800) {
          // A lone code unit that is a surrogate is necessarily unpaired.
          if (replace) {
            c = 0xfffd;
          } else {
            throw new IllegalArgumentException("Unpaired surrogate code point");
          }
        }
        // Only U+0000 to U+007F fit in a single UTF-8 byte; U+0080 already
        // needs the two-byte form (a bare 0x80 byte is not valid UTF-8).
        if (c <= 0x7f) {
          return new byte[] { (byte)c };
        } else if (c <= 0x7ff) {
          return new byte[] {
            (byte)(0xc0 | ((c >> 6) & 0x1f)),
            (byte)(0x80 | (c & 0x3f)),
          };
        } else {
          return new byte[] {
            (byte)(0xe0 | ((c >> 12) & 0x0f)),
            (byte)(0x80 | ((c >> 6) & 0x3f)),
            (byte)(0x80 | (c & 0x3f)),
          };
        }
      } else if (str.length() == 2) {
        int c = str.charAt(0);
        int c2 = str.charAt(1);
        if ((c & 0xfc00) == 0xd800 && (c2 & 0xfc00) == 0xdc00) {
          // A single surrogate pair: one supplementary code point, 4 bytes.
          c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
          return new byte[] {
            (byte)(0xf0 | ((c >> 18) & 0x07)),
            (byte)(0x80 | ((c >> 12) & 0x3f)),
            (byte)(0x80 | ((c >> 6) & 0x3f)),
            (byte)(0x80 | (c & 0x3f)),
          };
        } else if (!lenientLineBreaks && c <= 0x7f && c2 <= 0x7f) {
          // Two ASCII characters map directly to two bytes.
          return new byte[] { (byte)c, (byte)c2 };
        }
      }
      // General case: encode through an in-memory stream. Closing a
      // ByteArrayOutputStream has no effect, so no close call is needed.
      java.io.ByteArrayOutputStream ms = new java.io.ByteArrayOutputStream();
      try {
        if (WriteUtf8(str, 0, str.length(), ms, replace, lenientLineBreaks) !=
            0) {
          throw new IllegalArgumentException("Unpaired surrogate code point");
        }
        return ms.toByteArray();
      } catch (IOException ex) {
        throw new IllegalArgumentException("I/O error occurred", ex);
      }
    }

    /**
     * Computes how many bytes the UTF-8 encoding of a text string occupies,
     * without actually encoding it.
     * @param str The text string to measure.
     * @param replace If true, each unpaired surrogate code point is counted
     * as 3 bytes (the UTF-8 length of the replacement character U+FFFD).
     * @return The UTF-8 length of {@code str} in bytes, or -1 if the string
     * contains an unpaired surrogate code point and {@code replace} is
     * false.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static long GetUtf8Length(String str, boolean replace) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      final int unitCount = str.length();
      long total = 0;
      int pos = 0;
      while (pos < unitCount) {
        final int unit = str.charAt(pos);
        // From here on, pos refers to the code unit after the current one.
        ++pos;
        if (unit < 0x80) {
          total += 1;
        } else if (unit < 0x800) {
          total += 2;
        } else if (unit < 0xd800 || unit > 0xdfff) {
          total += 3;
        } else {
          // Surrogate: it is part of a pair only when it is a high
          // surrogate immediately followed by a low surrogate.
          final boolean paired = unit < 0xdc00 && pos < unitCount &&
            (str.charAt(pos) & 0xfc00) == 0xdc00;
          if (paired) {
            total += 4;
            ++pos;
          } else if (replace) {
            total += 3;
          } else {
            return -1;
          }
        }
      }
      return total;
    }

    /**
     * Returns the Unicode code point that ends just before the given index
     * of a text string, substituting the replacement character for an
     * unpaired surrogate.
     * @param str The text string to examine.
     * @param index Index of the current position in the string.
     * @return The Unicode code point at the previous position. Returns -1 if
     * {@code index} is 0 or less, or is greater than the string's length.
     * Returns the replacement character (U+FFFD) if the code point at the
     * previous position is an unpaired surrogate code point. A return
     * value of 65536 (0x10000) or greater means the code point takes up
     * two UTF-16 code units.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointBefore(String str, int index) {
      // Surrogate behavior 0: report unpaired surrogates as U+FFFD.
      final int replaceUnpaired = 0;
      return CodePointBefore(str, index, replaceUnpaired);
    }

    /**
     * Returns the Unicode code point that ends just before the given index
     * of a text string.
     * @param str The text string to examine.
     * @param index Index of the current position in the string.
     * @param surrogateBehavior Selects the value returned when the previous
     * code point is an unpaired surrogate code point: 0 returns the
     * replacement character (U+FFFD); 1 returns the value of the surrogate
     * code point itself; any other value returns -1.
     * @return The Unicode code point at the previous position. Returns -1 if
     * {@code index} is 0 or less, or is greater than the string's length.
     * Returns a value as specified under {@code surrogateBehavior} if the
     * code point at the previous position is an unpaired surrogate code
     * point. A return value of 65536 (0x10000) or greater means the code
     * point takes up two UTF-16 code units.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointBefore(
      String str,
      int index,
      int surrogateBehavior) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      if (index <= 0 || index > str.length()) {
        return -1;
      }
      final char last = str.charAt(index - 1);
      if ((last & 0xf800) != 0xd800) {
        // Not a surrogate at all: the code unit is the code point.
        return last;
      }
      if (Character.isLowSurrogate(last) && index >= 2) {
        final char previous = str.charAt(index - 2);
        if (Character.isHighSurrogate(previous)) {
          // Well-formed surrogate pair: combine into one code point.
          return Character.toCodePoint(previous, last);
        }
      }
      // Unpaired surrogate: the caller chooses what to report.
      switch (surrogateBehavior) {
        case 0:
          return 0xfffd;
        case 1:
          return last;
        default:
          return -1;
      }
    }

    /**
     * Returns the Unicode code point that starts at the given index of a
     * text string, substituting the replacement character for an unpaired
     * surrogate.
     * @param str The text string to examine.
     * @param index Index of the current position in the string.
     * @return The Unicode code point at the given position. Returns -1 if
     * {@code index} is less than 0, or is the string's length or greater.
     * Returns the replacement character (U+FFFD) if the code point at that
     * position is an unpaired surrogate code point. A return value of
     * 65536 (0x10000) or greater means the code point takes up two UTF-16
     * code units.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointAt(String str, int index) {
      // Surrogate behavior 0: report unpaired surrogates as U+FFFD.
      final int replaceUnpaired = 0;
      return CodePointAt(str, index, replaceUnpaired);
    }

    /**
     * Gets the Unicode code point at the given index of the string. 
The
     * following example shows how to iterate a text string code point by
     * code point, terminating the loop when an unpaired surrogate is
     * found.
 for (int i = 0;i<str.length(); ++i) { int
     * codePoint = DataUtilities.CodePointAt(str, i, 2); if (codePoint <
     * 0) { break; /* Unpaired surrogate */ }
     *  System.out.println("codePoint:"+codePoint); if (codePoint >=
     * 0x10000) { i++; /* Supplementary code point */ } }
 . 
     * @param str The parameter {@code str} is a text string.
     * @param index Index of the current position into the string.
     * @param surrogateBehavior Specifies what kind of value to return if the code
     * point at the given index is an unpaired surrogate code point: if 0,
     * return the replacement character (U+FFFD); if 1, return the value of
     * the surrogate code point; if neither 0 nor 1, return -1.
     * @return The Unicode code point at the given position. Returns -1 if {@code
     * index} is 0 or less, or is greater than the string's length. Returns
     * a value as specified under {@code surrogateBehavior} if the code
     * point at that position is an unpaired surrogate code point. If the
     * return value is 65536 (0x10000) or greater, the code point takes up
     * two UTF-16 code units.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointAt(
      String str,
      int index,
      int surrogateBehavior) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      if (index >= str.length()) {
        return -1;
      }
      if (index < 0) {
        return -1;
      }
      int c = str.charAt(index);
      if ((c & 0xfc00) == 0xd800 && index + 1 < str.length() &&
          (str.charAt(index + 1) & 0xfc00) == 0xdc00) {
        // Get the Unicode code point for the surrogate pair
        c = 0x10000 + ((c & 0x3ff) << 10) + (str.charAt(index + 1) & 0x3ff);
      } else if ((c & 0xf800) == 0xd800) {
        // unpaired surrogate
        return (surrogateBehavior == 0) ? 0xfffd : ((surrogateBehavior == 1) ?
                    c : (-1));
      }
      return c;
    }

    /**
     * Converts the basic upper-case letters A to Z (U+0041 to U+005A) in a
     * string to the corresponding basic lower-case letters, leaving every
     * other character as it is.
     * @param str The text string to convert.
     * @return The converted string, or null if {@code str} is null. If the
     * string contains no basic upper-case letter, {@code str} itself is
     * returned.
     */
    public static String ToLowerCaseAscii(String str) {
      if (str == null) {
        return null;
      }
      final int length = str.length();
      // Locate the first character that needs converting, if any.
      int firstUpper = 0;
      while (firstUpper < length) {
        final char ch = str.charAt(firstUpper);
        if ('A' <= ch && ch <= 'Z') {
          break;
        }
        ++firstUpper;
      }
      if (firstUpper == length) {
        return str;
      }
      final char[] chars = str.toCharArray();
      for (int i = firstUpper; i < length; ++i) {
        final char ch = chars[i];
        if ('A' <= ch && ch <= 'Z') {
          chars[i] = (char)(ch + 0x20);
        }
      }
      return new String(chars);
    }

    /**
     * Converts the basic lower-case letters a to z (U+0061 to U+007A) in a
     * string to the corresponding basic upper-case letters, leaving every
     * other character as it is.
     * @param str The text string to convert.
     * @return The converted string, or null if {@code str} is null. If the
     * string contains no basic lower-case letter, {@code str} itself is
     * returned.
     */
    public static String ToUpperCaseAscii(String str) {
      if (str == null) {
        return null;
      }
      final int length = str.length();
      // Locate the first character that needs converting, if any.
      int firstLower = 0;
      while (firstLower < length) {
        final char ch = str.charAt(firstLower);
        if ('a' <= ch && ch <= 'z') {
          break;
        }
        ++firstLower;
      }
      if (firstLower == length) {
        return str;
      }
      final char[] chars = str.toCharArray();
      for (int i = firstLower; i < length; ++i) {
        final char ch = chars[i];
        if ('a' <= ch && ch <= 'z') {
          chars[i] = (char)(ch - 0x20);
        }
      }
      return new String(chars);
    }

    /**
     * Compares two strings in Unicode code point order. Unpaired surrogate
     * code points are treated as individual code points; only a high
     * surrogate (U+D800 to U+DBFF) immediately followed by a low surrogate
     * (U+DC00 to U+DFFF) is combined into a supplementary code point.
     * @param strA The first string. Can be null.
     * @param strB The second string. Can be null.
     * @return A value indicating which string is "less" or "greater". 0: Both
     * strings are equal or null. Less than 0: a is null and b isn't; or
     * the first code point that's different is less in A than in B; or b
     * starts with a and is longer than a. Greater than 0: b is null and a
     * isn't; or the first code point that's different is greater in A than
     * in B; or a starts with b and is longer than b.
     */
    public static int CodePointCompare(String strA, String strB) {
      if (strA == null) {
        return (strB == null) ? 0 : -1;
      }
      if (strB == null) {
        return 1;
      }
      int len = Math.min(strA.length(), strB.length());
      for (int i = 0; i < len; ++i) {
        int ca = strA.charAt(i);
        int cb = strB.charAt(i);
        if (ca == cb) {
          // Ordinary code units and lone low surrogates are single code
          // points; only a high surrogate can begin a surrogate pair.
          if ((ca & 0xfc00) != 0xd800) {
            continue;
          }
          boolean pairedA = i + 1 < strA.length() &&
            (strA.charAt(i + 1) & 0xfc00) == 0xdc00;
          boolean pairedB = i + 1 < strB.length() &&
            (strB.charAt(i + 1) & 0xfc00) == 0xdc00;
          if (pairedA) {
            ca = 0x10000 + ((ca & 0x3ff) << 10) + (strA.charAt(i + 1) & 0x3ff);
          }
          if (pairedB) {
            cb = 0x10000 + ((cb & 0x3ff) << 10) + (strB.charAt(i + 1) & 0x3ff);
          }
          if (ca != cb) {
            return ca - cb;
          }
          // Equal here means both sides were paired or both were not
          // (a combined pair is always 0x10000 or greater).
          if (pairedA) {
            // Skip the low surrogate consumed on both sides.
            ++i;
          }
        } else {
          if ((ca & 0xf800) != 0xd800 && (cb & 0xf800) != 0xd800) {
            return ca - cb;
          }
          // At least one side is a surrogate: combine any well-formed pair
          // before comparing, since a pair sorts after all 16-bit values.
          if ((ca & 0xfc00) == 0xd800 && i + 1 < strA.length() &&
              (strA.charAt(i + 1) & 0xfc00) == 0xdc00) {
            ca = 0x10000 + ((ca & 0x3ff) << 10) + (strA.charAt(i + 1) & 0x3ff);
          }
          if ((cb & 0xfc00) == 0xd800 && i + 1 < strB.length() &&
              (strB.charAt(i + 1) & 0xfc00) == 0xdc00) {
            cb = 0x10000 + ((cb & 0x3ff) << 10) + (strB.charAt(i + 1) & 0x3ff);
          }
          return ca - cb;
        }
      }
      return (strA.length() == strB.length()) ? 0 : ((strA.length() < strB.length()) ?
                    -1 : 1);
    }

    /**
     * Encodes part of a string as UTF-8 and writes it to a data stream,
     * leaving line breaks unchanged.
     * @param str The string containing the portion to write.
     * @param offset Zero-based index where the portion to write begins.
     * @param length Number of code units in the portion to write.
     * @param stream A writable data stream.
     * @param replace If true, each unpaired surrogate code point is written
     * as the replacement character (U+FFFD). If false, processing stops
     * when an unpaired surrogate code point is seen.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code
     * replace} is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws java.io.IOException An I/O error occurred.
     * @throws IllegalArgumentException Either {@code offset} or {@code length}
     * is less than 0 or greater than {@code str}'s length, or {@code str}'s
     * length minus {@code offset} is less than {@code length}.
     */
    public static int WriteUtf8(
      String str,
      int offset,
      int length,
      OutputStream stream,
      boolean replace) throws java.io.IOException {
      // This overload never rewrites line breaks.
      final boolean lenientLineBreaks = false;
      return WriteUtf8(str, offset, length, stream, replace, lenientLineBreaks);
    }

    /**
     * Encodes part of a string as UTF-8 and writes it to a data stream. The
     * encoded bytes are collected in a fixed-size buffer that is written to
     * the stream whenever the next encoded character would not fit, and once
     * more when processing ends.
     * @param str The string containing the portion to write.
     * @param offset Zero-based index where the portion to write begins.
     * @param length Number of code units in the portion to write.
     * @param stream A writable data stream.
     * @param replace If true, each unpaired surrogate code point is written
     * as the replacement character (U+FFFD). If false, processing stops
     * when an unpaired surrogate code point is seen; the bytes encoded up
     * to that point are still written.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not
     * followed by line feed (LF) and LF not preceded by CR with CR-LF
     * pairs.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code
     * replace} is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws IllegalArgumentException Either {@code offset} or {@code length}
     * is less than 0 or greater than the string's length, or {@code offset}
     * plus {@code length} is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(
      String str,
      int offset,
      int length,
      OutputStream stream,
      boolean replace,
      boolean lenientLineBreaks) throws java.io.IOException {
      if (stream == null) {
        throw new NullPointerException("stream");
      }
      if (str == null) {
        throw new NullPointerException("str");
      }
      final int strLength = str.length();
      if (offset < 0) {
        throw new IllegalArgumentException(
          "offset (" + offset + ") is less than 0");
      }
      if (offset > strLength) {
        throw new IllegalArgumentException(
          "offset (" + offset + ") is more than " + strLength);
      }
      if (length < 0) {
        throw new IllegalArgumentException(
          "length (" + length + ") is less than 0");
      }
      if (length > strLength) {
        throw new IllegalArgumentException(
          "length (" + length + ") is more than " + strLength);
      }
      final int available = strLength - offset;
      if (available < length) {
        throw new IllegalArgumentException(
          "str.length() minus offset (" + available + ") is less than " +
          length);
      }
      final byte[] buffer = new byte[StreamedStringBufferLength];
      final int limit = offset + length;
      int used = 0;
      int status = 0;
      int pos = offset;
      while (pos < limit) {
        int cp = str.charAt(pos);
        // Number of UTF-16 code units this iteration consumes.
        int consumed = 1;
        if (cp < 0x80) {
          if (lenientLineBreaks && (cp == 0x0d || cp == 0x0a)) {
            // Bare CR, bare LF and CR-LF are all written as one CR-LF.
            if (cp == 0x0d && pos + 1 < limit &&
                str.charAt(pos + 1) == 0x0a) {
              // CR-LF pair: the LF is consumed together with the CR.
              consumed = 2;
            }
            used = flushUtf8BufferIfLacking(stream, buffer, used, 2);
            buffer[used++] = 0x0d;
            buffer[used++] = 0x0a;
          } else {
            used = flushUtf8BufferIfLacking(stream, buffer, used, 1);
            buffer[used++] = (byte)cp;
          }
        } else if (cp < 0x800) {
          used = flushUtf8BufferIfLacking(stream, buffer, used, 2);
          buffer[used++] = (byte)(0xc0 | ((cp >> 6) & 0x1f));
          buffer[used++] = (byte)(0x80 | (cp & 0x3f));
        } else {
          if ((cp & 0xfc00) == 0xd800 && pos + 1 < limit &&
              (str.charAt(pos + 1) & 0xfc00) == 0xdc00) {
            // Surrogate pair inside the portion: combine it into the
            // supplementary code point it encodes.
            cp = 0x10000 + ((cp & 0x3ff) << 10) +
              (str.charAt(pos + 1) & 0x3ff);
            consumed = 2;
          } else if ((cp & 0xf800) == 0xd800) {
            // Unpaired surrogate.
            if (!replace) {
              status = -1;
              // Leave the loop; the bytes buffered so far are still written.
              break;
            }
            cp = 0xfffd;
          }
          if (cp <= 0xffff) {
            used = flushUtf8BufferIfLacking(stream, buffer, used, 3);
            buffer[used++] = (byte)(0xe0 | ((cp >> 12) & 0x0f));
            buffer[used++] = (byte)(0x80 | ((cp >> 6) & 0x3f));
            buffer[used++] = (byte)(0x80 | (cp & 0x3f));
          } else {
            used = flushUtf8BufferIfLacking(stream, buffer, used, 4);
            buffer[used++] = (byte)(0xf0 | ((cp >> 18) & 0x07));
            buffer[used++] = (byte)(0x80 | ((cp >> 12) & 0x3f));
            buffer[used++] = (byte)(0x80 | ((cp >> 6) & 0x3f));
            buffer[used++] = (byte)(0x80 | (cp & 0x3f));
          }
        }
        pos += consumed;
      }
      stream.write(buffer, 0, used);
      return status;
    }

    // Writes the first "used" bytes of the buffer to the stream when fewer
    // than "needed" bytes of room remain; returns the new fill level.
    private static int flushUtf8BufferIfLacking(
      OutputStream stream,
      byte[] buffer,
      int used,
      int needed) throws java.io.IOException {
      if (used + needed > buffer.length) {
        stream.write(buffer, 0, used);
        return 0;
      }
      return used;
    }

    /**
     * Encodes an entire string as UTF-8 and writes it to a data stream,
     * leaving line breaks unchanged.
     * @param str The string to write.
     * @param stream A writable data stream.
     * @param replace If true, each unpaired surrogate code point is written
     * as the replacement character (U+FFFD). If false, processing stops
     * when an unpaired surrogate code point is seen.
     * @return 0 if the entire string was written; or -1 if the string contains
     * an unpaired surrogate code point and {@code replace} is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, OutputStream stream, boolean replace) throws java.io.IOException {
      // str is checked here because its length is needed before delegating.
      if (null == str) {
        throw new NullPointerException("str");
      }
      final int wholeLength = str.length();
      return WriteUtf8(str, 0, wholeLength, stream, replace);
    }

    /**
     * Reads a string in UTF-8 encoding from a byte array.
     * @param data A byte array containing a UTF-8 text string.
     * @param offset Offset into the byte array to start reading.
     * @param bytesCount Length, in bytes, of the UTF-8 text string.
     * @param builder A string builder object where the resulting string will be
     * stored.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U+FFFD). If false, stops processing when invalid UTF-8 is
     * seen.
     * @return 0 if the entire string was read without errors, or -1 if the string
     * is not valid UTF-8 and {@code replace} is false.
     * @throws NullPointerException The parameter {@code data} is null or {@code
     * builder} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code bytesCount} is less than 0, or offset plus bytesCount is
     * greater than the length of {@code data}.
     */
    public static int ReadUtf8FromBytes(
      byte[] data,
      int offset,
      int bytesCount,
      StringBuilder builder,
      boolean replace) {
      // Argument validation. The order of these checks decides which
      // exception is reported when more than one argument is bad.
      if (data == null) {
        throw new NullPointerException("data");
      }
      final int dataLength = data.length;
      if (offset < 0) {
        throw new IllegalArgumentException(
          "offset (" + offset + ") is less than 0");
      }
      if (offset > dataLength) {
        throw new IllegalArgumentException(
          "offset (" + offset + ") is more than " + dataLength);
      }
      if (bytesCount < 0) {
        throw new IllegalArgumentException(
          "bytesCount (" + bytesCount + ") is less than 0");
      }
      if (bytesCount > dataLength) {
        throw new IllegalArgumentException(
          "bytesCount (" + bytesCount + ") is more than " + dataLength);
      }
      final int available = dataLength - offset;
      if (available < bytesCount) {
        throw new IllegalArgumentException(
          "data.length minus offset (" + available + ") is less than " +
          bytesCount);
      }
      if (builder == null) {
        throw new NullPointerException("builder");
      }

      final char replacement = (char)0xfffd;
      // Decoder state for the multi-byte sequence currently being read.
      int codePoint = 0;   // bits accumulated so far
      int needed = 0;      // continuation bytes the lead byte announced
      int seen = 0;        // continuation bytes consumed so far
      // Accepted range for the next continuation byte. Some lead bytes
      // narrow it for the byte that immediately follows them.
      int minNext = 0x80;
      int maxNext = 0xbf;

      final int end = offset + bytesCount;
      for (int index = offset; index < end; ++index) {
        final int current = data[index] & 0xff;
        if (needed != 0) {
          if (current >= minNext && current <= maxNext) {
            // Valid continuation byte: only the first one is range-limited,
            // so widen the bounds again before the next byte.
            minNext = 0x80;
            maxNext = 0xbf;
            ++seen;
            codePoint += (current - 0x80) << (6 * (needed - seen));
            if (seen == needed) {
              // Sequence complete; values above U+FFFF are appended as a
              // surrogate pair.
              builder.appendCodePoint(codePoint);
              codePoint = 0;
              seen = 0;
              needed = 0;
            }
            continue;
          }
          // The pending sequence is broken by this byte.
          if (!replace) {
            return -1;
          }
          codePoint = 0;
          seen = 0;
          needed = 0;
          minNext = 0x80;
          maxNext = 0xbf;
          builder.append(replacement);
          // Fall through: the offending byte is examined again below as
          // if it were the first byte of a new sequence.
        }
        if (current < 0x80) {
          builder.append((char)current);
        } else if (current >= 0xc2 && current <= 0xdf) {
          needed = 1;
          codePoint = (current - 0xc0) << 6;
        } else if (current >= 0xe0 && current <= 0xef) {
          needed = 2;
          codePoint = (current - 0xe0) << 12;
          minNext = (current == 0xe0) ? 0xa0 : 0x80;
          maxNext = (current == 0xed) ? 0x9f : 0xbf;
        } else if (current >= 0xf0 && current <= 0xf4) {
          needed = 3;
          codePoint = (current - 0xf0) << 18;
          minNext = (current == 0xf0) ? 0x90 : 0x80;
          maxNext = (current == 0xf4) ? 0x8f : 0xbf;
        } else if (replace) {
          // Byte that can never start a sequence.
          builder.append(replacement);
        } else {
          return -1;
        }
      }
      // Input ended in the middle of a multi-byte sequence.
      if (needed != 0) {
        if (!replace) {
          return -1;
        }
        builder.append(replacement);
      }
      return 0;
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream in full and returns that
     * string. Replaces invalid encoding with the replacement character
     * (U+FFFD).
     * @param stream A readable data stream.
     * @return The string read.
     * @throws java.io.IOException An I/O error occurred.
     * @throws NullPointerException The parameter {@code stream} is null.
     */
    public static String ReadUtf8ToString(InputStream stream) throws java.io.IOException {
      // Delegates to the three-argument overload: a negative byte count
      // means "read until the end of the stream", and replacement of
      // invalid encoding with U+FFFD is switched on.
      final int untilEndOfStream = -1;
      final boolean replaceInvalid = true;
      return ReadUtf8ToString(stream, untilEndOfStream, replaceInvalid);
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream and returns that string.
     * @param stream A readable data stream.
     * @param bytesCount The length, in bytes, of the string. If this is less than
     * 0, this function will read until the end of the stream.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U+FFFD). If false, throws an error if invalid UTF-8 is
     * seen.
     * @return The string read.
     * @throws java.io.IOException An I/O error occurred; or, the string is not
     * valid UTF-8 and {@code replace} is false.
     * @throws NullPointerException The parameter {@code stream} is null.
     */
    public static String ReadUtf8ToString(
      InputStream stream,
      int bytesCount,
      boolean replace) throws java.io.IOException {
      StringBuilder builder = new StringBuilder();
      int status = DataUtilities.ReadUtf8(stream, bytesCount, builder, replace);
      // Only the -1 result (invalid UTF-8 while replace is false) is
      // reported as an error. Any other result, including -2 (the stream
      // ended early), still yields the text decoded so far.
      if (status == -1) {
        // The failure is malformed UTF-8 in the input, so say so. The
        // exception type and its IllegalArgumentException cause are kept
        // as before.
        final String message = "Invalid UTF-8 found.";
        throw new IOException(message, new IllegalArgumentException(message));
      }
      return builder.toString();
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream.
     * @param stream A readable data stream.
     * @param bytesCount The length, in bytes, of the string. If this is less than
     * 0, this function will read until the end of the stream.
     * @param builder A string builder object where the resulting string will be
     * stored.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U+FFFD). If false, stops processing when invalid UTF-8 is
     * seen.
     * @return 0 if the entire string was read without errors, -1 if the string is
     * not valid UTF-8 and {@code replace} is false, or -2 if the end of
     * the stream was reached before the last character was read completely
     * (which is only the case if {@code bytesCount} is 0 or greater).
     * @throws java.io.IOException An I/O error occurred.
     * @throws NullPointerException The parameter {@code stream} is null or {@code
     * builder} is null.
     */
    public static int ReadUtf8(
      InputStream stream,
      int bytesCount,
      StringBuilder builder,
      boolean replace) throws java.io.IOException {
      if (stream == null) {
        throw new NullPointerException("stream");
      }
      if (builder == null) {
        throw new NullPointerException("builder");
      }

      final char replacement = (char)0xfffd;
      // A negative bytesCount means "read until the stream is exhausted".
      final boolean bounded = bytesCount >= 0;
      // Decoder state for the multi-byte sequence currently being read.
      int codePoint = 0;   // bits accumulated so far
      int needed = 0;      // continuation bytes the lead byte announced
      int seen = 0;        // continuation bytes consumed so far
      // Accepted range for the next continuation byte. Some lead bytes
      // narrow it for the byte that immediately follows them.
      int minNext = 0x80;
      int maxNext = 0xbf;
      int consumed = 0;    // bytes read so far; tracked only when bounded

      while (!bounded || consumed < bytesCount) {
        final int current = stream.read();
        if (current < 0) {
          // End of stream.
          if (needed != 0) {
            // A multi-byte sequence was cut short.
            if (!replace) {
              return -1;
            }
            needed = 0;
            builder.append(replacement);
          }
          if (bounded) {
            // Fewer bytes were available than the caller asked for.
            return -2;
          }
          break;
        }
        if (bounded) {
          // Not counted in unbounded mode, so the counter cannot overflow.
          ++consumed;
        }
        if (needed != 0) {
          if (current >= minNext && current <= maxNext) {
            // Valid continuation byte: only the first one is range-limited,
            // so widen the bounds again before the next byte.
            minNext = 0x80;
            maxNext = 0xbf;
            ++seen;
            codePoint += (current - 0x80) << (6 * (needed - seen));
            if (seen == needed) {
              // Sequence complete; values above U+FFFF are appended as a
              // surrogate pair.
              builder.appendCodePoint(codePoint);
              codePoint = 0;
              seen = 0;
              needed = 0;
            }
            continue;
          }
          // The pending sequence is broken by this byte.
          if (!replace) {
            return -1;
          }
          codePoint = 0;
          seen = 0;
          needed = 0;
          minNext = 0x80;
          maxNext = 0xbf;
          builder.append(replacement);
          // Fall through: the byte cannot be pushed back into the stream,
          // so it is examined again below as the first byte of a new
          // sequence.
        }
        if (current < 0x80) {
          builder.append((char)current);
        } else if (current >= 0xc2 && current <= 0xdf) {
          needed = 1;
          codePoint = (current - 0xc0) << 6;
        } else if (current >= 0xe0 && current <= 0xef) {
          needed = 2;
          codePoint = (current - 0xe0) << 12;
          minNext = (current == 0xe0) ? 0xa0 : 0x80;
          maxNext = (current == 0xed) ? 0x9f : 0xbf;
        } else if (current >= 0xf0 && current <= 0xf4) {
          needed = 3;
          codePoint = (current - 0xf0) << 18;
          minNext = (current == 0xf0) ? 0x90 : 0x80;
          maxNext = (current == 0xf4) ? 0x8f : 0xbf;
        } else if (replace) {
          // Byte that can never start a sequence.
          builder.append(replacement);
        } else {
          return -1;
        }
      }
      // The byte budget ran out in the middle of a multi-byte sequence.
      if (needed != 0) {
        if (!replace) {
          return -1;
        }
        builder.append(replacement);
      }
      return 0;
    }
  }