All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.upokecenter.util.DataUtilities Maven / Gradle / Ivy

package com.upokecenter.util;
/*
Written in 2013 by Peter O.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.dreamhosters.com/articles/donate-now-2/
 */

import java.io.*;

    /**
     * Contains methods useful for reading and writing strings. It is designed to
     * have no dependencies other than the basic runtime class library.
     */
  public final class DataUtilities {
private DataUtilities() {
}
    private static final int StreamedStringBufferLength = 4096;

    /**
     * Generates a text string from a UTF-8 byte array.
     * @param bytes A byte array containing text encoded in UTF-8.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U + FFFD). If false, stops processing when invalid UTF-8
     * is seen.
     * @return A string represented by the UTF-8 byte array.
     * @throws NullPointerException The parameter {@code bytes} is null.
     * @throws IllegalArgumentException The string is not valid UTF-8 and {@code replace}
     * is false.
     */
    public static String GetUtf8String(byte[] bytes, boolean replace) {
      if (bytes == null) {
        throw new NullPointerException("bytes");
      }
      StringBuilder b = new StringBuilder();
      if (ReadUtf8FromBytes(bytes, 0, bytes.length, b, replace) != 0) {
        throw new IllegalArgumentException("Invalid UTF-8");
      }
      return b.toString();
    }

    /**
     * Generates a text string from a portion of a UTF-8 byte array.
     * @param bytes A byte array containing text encoded in UTF-8.
     * @param offset Offset into the byte array to start reading.
     * @param bytesCount Length, in bytes, of the UTF-8 string.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U + FFFD). If false, stops processing when invalid UTF-8
     * is seen.
     * @return A string represented by the UTF-8 byte array.
     * @throws NullPointerException The parameter {@code bytes} is null.
     * @throws IllegalArgumentException The portion of the byte array is not valid UTF-8
     * and {@code replace} is false.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code bytesCount} is less than 0, or offset plus bytesCount is
     * greater than the length of "data" .
     */
    public static String GetUtf8String(
byte[] bytes,
int offset,
int bytesCount,
boolean replace) {
      if (bytes == null) {
        throw new NullPointerException("bytes");
      }
      if (offset < 0) {
        throw new IllegalArgumentException("offset (" + offset + ") is less than " +
                    "0");
      }
      if (offset > bytes.length) {
        throw new IllegalArgumentException("offset (" + offset + ") is more than " +
                    bytes.length);
      }
      if (bytesCount < 0) {
        throw new IllegalArgumentException("bytesCount (" + bytesCount +
                    ") is less than 0");
      }
      if (bytesCount > bytes.length) {
        throw new IllegalArgumentException("bytesCount (" + bytesCount +
                    ") is more than " + bytes.length);
      }
      if (bytes.length - offset < bytesCount) {
        throw new IllegalArgumentException("bytes's length minus " + offset + " (" +
                (bytes.length - offset) + ") is less than " + bytesCount);
      }
      StringBuilder b = new StringBuilder();
      if (ReadUtf8FromBytes(bytes, offset, bytesCount, b, replace) != 0) {
        throw new IllegalArgumentException("Invalid UTF-8");
      }
      return b.toString();
    }

    /**
     * Encodes a string in UTF-8 as a byte array.
     * @param str A text string.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return The string encoded in UTF-8.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired surrogate code
     * point and {@code replace} is false, or an internal error occurred.
     */
        public static byte[] GetUtf8Bytes(String str, boolean replace) {
          return GetUtf8Bytes(str, replace, false);
        }

    /**
     * Encodes a string in UTF-8 as a byte array.
     * @param str A text string.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not followed
     * by line feed (LF) and LF not preceded by CR with CR-LF pairs.
     * @return The string encoded in UTF-8.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired surrogate code
     * point and {@code replace} is false, or an internal error occurred.
     */
    public static byte[] GetUtf8Bytes(
String str,
boolean replace,
boolean lenientLineBreaks) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      try {
        java.io.ByteArrayOutputStream ms = null;
try {
ms = new java.io.ByteArrayOutputStream();

          if (WriteUtf8(str, ms, replace) != 0) {
            throw new IllegalArgumentException("Unpaired surrogate code point");
          }
          return ms.toByteArray();
}
finally {
try { if (ms != null)ms.close(); } catch (java.io.IOException ex) {}
}
      } catch (IOException ex) {
        throw new IllegalArgumentException("I/O error occurred", ex);
      }
    }

    /**
     * Calculates the number of bytes needed to encode a string in UTF-8.
     * @param str A string object.
     * @param replace If true, treats unpaired surrogate code points as having 3
     * UTF-8 bytes (the UTF-8 length of the replacement character U + FFFD).
     * @return The number of bytes needed to encode the given string in UTF-8, or
     * -1 if the string contains an unpaired surrogate code point and {@code
     * replace} is false.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static long GetUtf8Length(String str, boolean replace) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      long size = 0;
      for (int i = 0; i < str.length(); ++i) {
        int c = str.charAt(i);
        if (c <= 0x7f) {
          ++size;
        } else if (c <= 0x7ff) {
          size += 2;
        } else if (c <= 0xd7ff || c >= 0xe000) {
          size += 3;
        } else if (c <= 0xdbff) {  // UTF-16 leading surrogate
          ++i;
          if (i >= str.length() || str.charAt(i) < 0xdc00 || str.charAt(i) > 0xdfff) {
            if (replace) {
              size += 3;
              --i;
            } else {
              return -1;
            }
          } else {
            size += 4;
          }
        } else {
          if (replace) {
            size += 3;
          } else {
            return -1;
          }
        }
      }
      return size;
    }

    /**
     * Gets the Unicode code point just before the given index of the string.
     * @param str A string.
     * @param index Index of the current position into the string.
     * @return The Unicode code point at the previous position. Returns -1 if
     * {@code index} is 0 or less, or is greater than the string's length.
     * Returns the replacement character (U + FFFD) if the previous
     * character is an unpaired surrogate code point.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointBefore(String str, int index) {
      return CodePointBefore(str, index, 0);
    }

    /**
     * Gets the Unicode code point just before the given index of the string.
     * @param str A string.
     * @param index Index of the current position into the string.
     * @param surrogateBehavior Specifies what kind of value to return if the
     * previous character is an unpaired surrogate code point: if 0, return
     * the replacement character (U + FFFD); if 1, return the value of the
     * surrogate code point; if neither 0 nor 1, return -1.
     * @return The Unicode code point at the previous position. Returns -1 if
     * {@code index} is 0 or less, or is greater than the string's length.
     * Returns a value as specified under {@code surrogateBehavior} if the
     * previous character is an unpaired surrogate code point.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointBefore(
String str,
int index,
int surrogateBehavior) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      if (index <= 0) {
        return -1;
      }
      if (index > str.length()) {
        return -1;
      }
      int c = str.charAt(index - 1);
      if ((c & 0xfc00) == 0xdc00 && index - 2 >= 0 &&
          str.charAt(index - 2) >= 0xd800 && str.charAt(index - 2) <= 0xdbff) {
        // Get the Unicode code point for the surrogate pair
        return 0x10000 + ((str.charAt(index - 2) - 0xd800) << 10) + (c - 0xdc00);
      }
      if ((c & 0xf800) == 0xd800) {
        // unpaired surrogate
        return (surrogateBehavior == 0) ? 0xfffd : ((surrogateBehavior == 1) ?
                    c : (-1));
      }
      return c;
    }

    /**
     * Gets the Unicode code point at the given index of the string.
     * @param str A string.
     * @param index Index of the current position into the string.
     * @return The Unicode code point at the given position. Returns -1 if {@code
     * index} is less than 0, or is the string's length or greater. Returns
     * the replacement character (U + FFFD) if the current character is an
     * unpaired surrogate code point.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointAt(String str, int index) {
      return CodePointAt(str, index, 0);
    }

    /**
     * Gets the Unicode code point at the given index of the string.
     * @param str A string.
     * @param index Index of the current position into the string.
     * @param surrogateBehavior Specifies what kind of value to return if the
     * previous character is an unpaired surrogate code point: if 0, return
     * the replacement character (U + FFFD); if 1, return the value of the
     * surrogate code point; if neither 0 nor 1, return -1.
     * @return The Unicode code point at the current position. Returns -1 if {@code
     * index} is less than 0, or is the string's length or greater. Returns
     * a value as specified under {@code surrogateBehavior} if the previous
     * character is an unpaired surrogate code point.
     * @throws NullPointerException The parameter {@code str} is null.
     */
    public static int CodePointAt(
String str,
int index,
int surrogateBehavior) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      if (index >= str.length()) {
        return -1;
      }
      if (index < 0) {
        return -1;
      }
      int c = str.charAt(index);
      if ((c & 0xfc00) == 0xd800 && index + 1 < str.length() &&
          str.charAt(index + 1) >= 0xdc00 && str.charAt(index + 1) <= 0xdfff) {
        // Get the Unicode code point for the surrogate pair
        c = 0x10000 + ((c - 0xd800) << 10) + (str.charAt(index + 1) - 0xdc00);
        ++index;
      } else if ((c & 0xf800) == 0xd800) {
        // unpaired surrogate
        return (surrogateBehavior == 0) ? 0xfffd : ((surrogateBehavior == 1) ?
                    c : (-1));
      }
      return c;
    }

    /**
     * Returns a string with upper-case ASCII letters (A to Z) converted to
     * lower-case. Other characters remain unchanged.
     * @param str A string.
     * @return The converted string, or null if {@code str} is null.
     */
    public static String ToLowerCaseAscii(String str) {
      if (str == null) {
        return null;
      }
      int len = str.length();
      char c = (char)0;
      boolean hasUpperCase = false;
      for (int i = 0; i < len; ++i) {
        c = str.charAt(i);
        if (c >= 'A' && c <= 'Z') {
          hasUpperCase = true;
          break;
        }
      }
      if (!hasUpperCase) {
        return str;
      }
      StringBuilder builder = new StringBuilder();
      for (int i = 0; i < len; ++i) {
        c = str.charAt(i);
        if (c >= 'A' && c <= 'Z') {
          builder.append((char)(c + 0x20));
        } else {
          builder.append(c);
        }
      }
      return builder.toString();
    }

    /**
     * Compares two strings in Unicode code point order. Unpaired surrogates are
     * treated as individual code points.
     * @param strA The first string. Can be null.
     * @param strB The second string. Can be null.
     * @return A value indicating which string is " less" or " greater" . 0: Both
     * strings are equal or null. Less than 0: a is null and b isn't; or the
     * first code point that's different is less in A than in B; or b starts
     * with a and is longer than a. Greater than 0: b is null and a isn't;
     * or the first code point that's different is greater in A than in B;
     * or a starts with b and is longer than b.
     */
    public static int CodePointCompare(String strA, String strB) {
      if (strA == null) {
        return (strB == null) ? 0 : -1;
      }
      if (strB == null) {
        return 1;
      }
      int len = Math.min(strA.length(), strB.length());
      for (int i = 0; i < len; ++i) {
        int ca = strA.charAt(i);
        int cb = strB.charAt(i);
        if (ca == cb) {
          // normal code units and illegal surrogates
          // are treated as single code points
          if ((ca & 0xf800) != 0xd800) {
            continue;
          }
          boolean incindex = false;
          if (i + 1 < strA.length() && strA.charAt(i + 1) >= 0xdc00 && strA.charAt(i + 1) <=
              0xdfff) {
            ca = 0x10000 + ((ca - 0xd800) << 10) + (strA.charAt(i + 1) - 0xdc00);
            incindex = true;
          }
          if (i + 1 < strB.length() && strB.charAt(i + 1) >= 0xdc00 && strB.charAt(i + 1) <=
              0xdfff) {
            cb = 0x10000 + ((cb - 0xd800) << 10) + (strB.charAt(i + 1) - 0xdc00);
            incindex = true;
          }
          if (ca != cb) {
            return ca - cb;
          }
          if (incindex) {
            ++i;
          }
        } else {
          if ((ca & 0xf800) != 0xd800 && (cb & 0xf800) != 0xd800) {
            return ca - cb;
          }
          if ((ca & 0xfc00) == 0xd800 && i + 1 < strA.length() &&
              strA.charAt(i + 1) >= 0xdc00 && strA.charAt(i + 1) <= 0xdfff) {
            ca = 0x10000 + ((ca - 0xd800) << 10) + (strA.charAt(i + 1) - 0xdc00);
          }
          if ((cb & 0xfc00) == 0xd800 && i + 1 < strB.length() &&
              strB.charAt(i + 1) >= 0xdc00 && strB.charAt(i + 1) <= 0xdfff) {
            cb = 0x10000 + ((cb - 0xd800) << 10) + (strB.charAt(i + 1) - 0xdc00);
          }
          return ca - cb;
        }
      }
      return (strA.length() == strB.length()) ? 0 : ((strA.length() < strB.length()) ?
                    -1 : 1);
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream.
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code length} is less than 0, or {@code offset} plus {@code length}
     * is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(
String str,
int offset,
int length,
OutputStream stream,
boolean replace) throws java.io.IOException {
      return WriteUtf8(str, offset, length, stream, replace, false);
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream.
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not followed
     * by line feed (LF) and LF not preceded by CR with CR-LF pairs.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code length} is less than 0, or {@code offset} plus {@code length}
     * is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(
String str,
int offset,
int length,
OutputStream stream,
boolean replace,
boolean lenientLineBreaks) throws java.io.IOException {
      if (stream == null) {
        throw new NullPointerException("stream");
      }
      if (str == null) {
        throw new NullPointerException("str");
      }
      if (offset < 0) {
        throw new IllegalArgumentException("offset (" + offset + ") is less than " +
                    "0");
      }
      if (offset > str.length()) {
        throw new IllegalArgumentException("offset (" + offset + ") is more than " +
                    str.length());
      }
      if (length < 0) {
        throw new IllegalArgumentException("length (" + length + ") is less than " +
                    "0");
      }
      if (length > str.length()) {
        throw new IllegalArgumentException("length (" + length + ") is more than " +
                    str.length());
      }
      if (str.length() - offset < length) {
        throw new IllegalArgumentException("str.length() minus offset (" +
                (str.length() - offset) + ") is less than " + length);
      }
      byte[] bytes;
      int retval = 0;
      bytes = new byte[StreamedStringBufferLength];
      int byteIndex = 0;
      int endIndex = offset + length;
      for (int index = offset; index < endIndex; ++index) {
        int c = str.charAt(index);
        if (c <= 0x7f) {
          if (lenientLineBreaks) {
            if (c == 0x0d && (index + 1 >= endIndex || str.charAt(index + 1) !=
                    0x0a)) {
              // bare CR, convert to CRLF
              if (byteIndex + 2 > StreamedStringBufferLength) {
                // Write bytes retrieved so far
                stream.write(bytes, 0, byteIndex);
                byteIndex = 0;
              }
              bytes[byteIndex++] = 0x0d;
              bytes[byteIndex++] = 0x0a;
              continue;
            } else if (c == 0x0d) {
              // CR-LF pair
              if (byteIndex + 2 > StreamedStringBufferLength) {
                // Write bytes retrieved so far
                stream.write(bytes, 0, byteIndex);
                byteIndex = 0;
              }
              bytes[byteIndex++] = 0x0d;
              bytes[byteIndex++] = 0x0a;
              ++index;
              continue;
            }
            if (c == 0x0a) {
              // bare LF, convert to CRLF
              if (byteIndex + 2 > StreamedStringBufferLength) {
                // Write bytes retrieved so far
                stream.write(bytes, 0, byteIndex);
                byteIndex = 0;
              }
              bytes[byteIndex++] = 0x0d;
              bytes[byteIndex++] = 0x0a;
              continue;
            }
          }
          if (byteIndex >= StreamedStringBufferLength) {
            // Write bytes retrieved so far
            stream.write(bytes, 0, byteIndex);
            byteIndex = 0;
          }
          bytes[byteIndex++] = (byte)c;
        } else if (c <= 0x7ff) {
          if (byteIndex + 2 > StreamedStringBufferLength) {
            // Write bytes retrieved so far
            stream.write(bytes, 0, byteIndex);
            byteIndex = 0;
          }
          bytes[byteIndex++] = (byte)(0xc0 | ((c >> 6) & 0x1f));
          bytes[byteIndex++] = (byte)(0x80 | (c & 0x3f));
        } else {
          if ((c & 0xfc00) == 0xd800 && index + 1 < endIndex &&
              str.charAt(index + 1) >= 0xdc00 && str.charAt(index + 1) <= 0xdfff) {
            // Get the Unicode code point for the surrogate pair
            c = 0x10000 + ((c - 0xd800) << 10) + (str.charAt(index + 1) - 0xdc00);
            ++index;
          } else if ((c & 0xf800) == 0xd800) {
            // unpaired surrogate
            if (!replace) {
              retval = -1;
              break;  // write bytes read so far
            }
            c = 0xfffd;
          }
          if (c <= 0xffff) {
            if (byteIndex + 3 > StreamedStringBufferLength) {
              // Write bytes retrieved so far
              stream.write(bytes, 0, byteIndex);
              byteIndex = 0;
            }
            bytes[byteIndex++] = (byte)(0xe0 | ((c >> 12) & 0x0f));
            bytes[byteIndex++] = (byte)(0x80 | ((c >> 6) & 0x3f));
            bytes[byteIndex++] = (byte)(0x80 | (c & 0x3f));
          } else {
            if (byteIndex + 4 > StreamedStringBufferLength) {
              // Write bytes retrieved so far
              stream.write(bytes, 0, byteIndex);
              byteIndex = 0;
            }
            bytes[byteIndex++] = (byte)(0xf0 | ((c >> 18) & 0x07));
            bytes[byteIndex++] = (byte)(0x80 | ((c >> 12) & 0x3f));
            bytes[byteIndex++] = (byte)(0x80 | ((c >> 6) & 0x3f));
            bytes[byteIndex++] = (byte)(0x80 | (c & 0x3f));
          }
        }
      }
      stream.write(bytes, 0, byteIndex);
      return retval;
    }

    /**
     * Writes a string in UTF-8 encoding to a data stream.
     * @param str A string to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string was written; or -1 if the string contains an
     * unpaired surrogate code point and {@code replace} is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, OutputStream stream, boolean replace) throws java.io.IOException {
      if (str == null) {
        throw new NullPointerException("str");
      }
      return WriteUtf8(str, 0, str.length(), stream, replace);
    }

    /**
     * Reads a string in UTF-8 encoding from a byte array.
     * @param data A byte array containing a UTF-8 string.
     * @param offset Offset into the byte array to start reading.
     * @param bytesCount Length, in bytes, of the UTF-8 string.
     * @param builder A string builder object where the resulting string will be
     * stored.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U + FFFD). If false, stops processing when invalid UTF-8
     * is seen.
     * @return 0 if the entire string was read without errors, or -1 if the string
     * is not valid UTF-8 and {@code replace} is false.
     * @throws NullPointerException The parameter {@code data} is null or {@code
     * builder} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code bytesCount} is less than 0, or offset plus bytesCount is
     * greater than the length of {@code data}.
     */
    public static int ReadUtf8FromBytes(
byte[] data,
int offset,
int bytesCount,
StringBuilder builder,
boolean replace) {
      if (data == null) {
        throw new NullPointerException("data");
      }
      if (offset < 0) {
        throw new IllegalArgumentException("offset (" + offset + ") is less than " +
                    "0");
      }
      if (offset > data.length) {
        throw new IllegalArgumentException("offset (" + offset + ") is more than " +
                    data.length);
      }
      if (bytesCount < 0) {
        throw new IllegalArgumentException("bytesCount (" + bytesCount +
                    ") is less than 0");
      }
      if (bytesCount > data.length) {
        throw new IllegalArgumentException("bytesCount (" + bytesCount +
                    ") is more than " + data.length);
      }
      if (data.length - offset < bytesCount) {
        throw new IllegalArgumentException("data.length minus offset (" +
                (data.length - offset) + ") is less than " + bytesCount);
      }
      if (builder == null) {
        throw new NullPointerException("builder");
      }
      int cp = 0;
      int bytesSeen = 0;
      int bytesNeeded = 0;
      int lower = 0x80;
      int upper = 0xbf;
      int pointer = offset;
      int endpointer = offset + bytesCount;
      while (pointer < endpointer) {
        int b = data[pointer] & (int)0xff;
        ++pointer;
        if (bytesNeeded == 0) {
          if ((b & 0x7f) == b) {
            builder.append((char)b);
          } else if (b >= 0xc2 && b <= 0xdf) {
            bytesNeeded = 1;
            cp = (b - 0xc0) << 6;
          } else if (b >= 0xe0 && b <= 0xef) {
            lower = (b == 0xe0) ? 0xa0 : 0x80;
            upper = (b == 0xed) ? 0x9f : 0xbf;
            bytesNeeded = 2;
            cp = (b - 0xe0) << 12;
          } else if (b >= 0xf0 && b <= 0xf4) {
            lower = (b == 0xf0) ? 0x90 : 0x80;
            upper = (b == 0xf4) ? 0x8f : 0xbf;
            bytesNeeded = 3;
            cp = (b - 0xf0) << 18;
          } else {
            if (replace) {
              builder.append((char)0xfffd);
            } else {
              return -1;
            }
          }
          continue;
        }
        if (b < lower || b > upper) {
          cp = bytesNeeded = bytesSeen = 0;
          lower = 0x80;
          upper = 0xbf;
          if (replace) {
            --pointer;
            builder.append((char)0xfffd);
            continue;
          }
          return -1;
        } else {
          lower = 0x80;
          upper = 0xbf;
          ++bytesSeen;
          cp += (b - 0x80) << (6 * (bytesNeeded - bytesSeen));
          if (bytesSeen != bytesNeeded) {
            continue;
          }
          int ret = cp;
          cp = 0;
          bytesSeen = 0;
          bytesNeeded = 0;
          if (ret <= 0xffff) {
            builder.append((char)ret);
          } else {
            int ch = ret - 0x10000;
            int lead = (ch / 0x400) + 0xd800;
            int trail = (ch & 0x3ff) + 0xdc00;
            builder.append((char)lead);
            builder.append((char)trail);
          }
        }
      }
      if (bytesNeeded != 0) {
        if (replace) {
          builder.append((char)0xfffd);
        } else {
          return -1;
        }
      }
      return 0;
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream in full and returns that
     * string. Replaces invalid encoding with the replacement character (U +
     * FFFD).
     * @param stream A readable data stream.
     * @return The string read.
     * @throws java.io.IOException An I/O error occurred.
     * @throws NullPointerException The parameter {@code stream} is null.
     */
    public static String ReadUtf8ToString(InputStream stream) throws java.io.IOException {
      return ReadUtf8ToString(stream, -1, true);
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream and returns that string.
     * @param stream A readable data stream.
     * @param bytesCount The length, in bytes, of the string. If this is less than
     * 0, this function will read until the end of the stream.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U + FFFD). If false, throws an error if an unpaired
     * surrogate code point is seen.
     * @return The string read.
     * @throws java.io.IOException An I/O error occurred; or, the string is not
     * valid UTF-8 and {@code replace} is false.
     * @throws NullPointerException The parameter {@code stream} is null.
     */
    public static String ReadUtf8ToString(
InputStream stream,
int bytesCount,
boolean replace) throws java.io.IOException {
      StringBuilder builder = new StringBuilder();
      int retval = DataUtilities.ReadUtf8(stream, bytesCount, builder, replace);
      if (retval == -1) {
        throw new IOException(
       "Unpaired surrogate code point found.",
       new java.nio.charset.MalformedInputException(1));
      }
      return builder.toString();
    }

    /**
     * Reads a string in UTF-8 encoding from a data stream.
     * @param stream A readable data stream.
     * @param bytesCount The length, in bytes, of the string. If this is less than
     * 0, this function will read until the end of the stream.
     * @param builder A string builder object where the resulting string will be
     * stored.
     * @param replace If true, replaces invalid encoding with the replacement
     * character (U + FFFD). If false, stops processing when an unpaired
     * surrogate code point is seen.
     * @return 0 if the entire string was read without errors, -1 if the string is
     * not valid UTF-8 and {@code replace} is false, or -2 if the end of the
     * stream was reached before the last character was read completely
     * (which is only the case if {@code bytesCount} is 0 or greater).
     * @throws java.io.IOException An I/O error occurred.
     * @throws NullPointerException The parameter {@code stream} is null or {@code
     * builder} is null.
     */
    public static int ReadUtf8(
InputStream stream,
int bytesCount,
StringBuilder builder,
boolean replace) throws java.io.IOException {
      if (stream == null) {
        throw new NullPointerException("stream");
      }
      if (builder == null) {
        throw new NullPointerException("builder");
      }
      int cp = 0;
      int bytesSeen = 0;
      int bytesNeeded = 0;
      int lower = 0x80;
      int upper = 0xbf;
      int pointer = 0;
      while (pointer < bytesCount || bytesCount < 0) {
        int b = stream.read();
        if (b < 0) {
          if (bytesNeeded != 0) {
            bytesNeeded = 0;
            if (replace) {
              builder.append((char)0xfffd);
              if (bytesCount >= 0) {
                return -2;
              }
              break;  // end of stream
            }
            return -1;
          }
          if (bytesCount >= 0) {
            return -2;
          }
          break;  // end of stream
        }
        if (bytesCount > 0) {
          ++pointer;
        }
        if (bytesNeeded == 0) {
          if ((b & 0x7f) == b) {
            builder.append((char)b);
          } else if (b >= 0xc2 && b <= 0xdf) {
            bytesNeeded = 1;
            cp = (b - 0xc0) << 6;
          } else if (b >= 0xe0 && b <= 0xef) {
            lower = (b == 0xe0) ? 0xa0 : 0x80;
            upper = (b == 0xed) ? 0x9f : 0xbf;
            bytesNeeded = 2;
            cp = (b - 0xe0) << 12;
          } else if (b >= 0xf0 && b <= 0xf4) {
            lower = (b == 0xf0) ? 0x90 : 0x80;
            upper = (b == 0xf4) ? 0x8f : 0xbf;
            bytesNeeded = 3;
            cp = (b - 0xf0) << 18;
          } else {
            if (replace) {
              builder.append((char)0xfffd);
            } else {
              return -1;
            }
          }
          continue;
        }
        if (b < lower || b > upper) {
          cp = bytesNeeded = bytesSeen = 0;
          lower = 0x80;
          upper = 0xbf;
          if (replace) {
            builder.append((char)0xfffd);
            // "Read" the last byte again
            if (b < 0x80) {
              builder.append((char)b);
            } else if (b >= 0xc2 && b <= 0xdf) {
              bytesNeeded = 1;
              cp = (b - 0xc0) << 6;
            } else if (b >= 0xe0 && b <= 0xef) {
              lower = (b == 0xe0) ? 0xa0 : 0x80;
              upper = (b == 0xed) ? 0x9f : 0xbf;
              bytesNeeded = 2;
              cp = (b - 0xe0) << 12;
            } else if (b >= 0xf0 && b <= 0xf4) {
              lower = (b == 0xf0) ? 0x90 : 0x80;
              upper = (b == 0xf4) ? 0x8f : 0xbf;
              bytesNeeded = 3;
              cp = (b - 0xf0) << 18;
            } else {
              builder.append((char)0xfffd);
            }
            continue;
          }
          return -1;
        } else {
          lower = 0x80;
          upper = 0xbf;
          ++bytesSeen;
          cp += (b - 0x80) << (6 * (bytesNeeded - bytesSeen));
          if (bytesSeen != bytesNeeded) {
            continue;
          }
          int ret = cp;
          cp = 0;
          bytesSeen = 0;
          bytesNeeded = 0;
          if (ret <= 0xffff) {
            builder.append((char)ret);
          } else {
            int ch = ret - 0x10000;
            int lead = (ch / 0x400) + 0xd800;
            int trail = (ch & 0x3ff) + 0xdc00;
            builder.append((char)lead);
            builder.append((char)trail);
          }
        }
      }
      if (bytesNeeded != 0) {
        if (replace) {
          builder.append((char)0xfffd);
        } else {
          return -1;
        }
      }
      return 0;
    }
  }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy