All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.upokecenter.text.CharacterReader Maven / Gradle / Ivy

Go to download

A Java library that implements character encodings used in Web pages and email.

There is a newer version: 0.6.0
Show newest version
package com.upokecenter.text;
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://peteroupc.github.io/
 */

import java.io.*;

    /**
     * A general-purpose character input for reading text from byte streams and
     * text strings. When reading byte streams, this class supports the
     * UTF-8 character encoding by default, but can be configured to support
     * UTF-16 and UTF-32 as well.
     */
  public final class CharacterReader implements ICharacterInput {
    private final int mode;
    private final boolean errorThrow;
    private final boolean dontSkipUtf8Bom;
    private final String str;
    private final int strLength;
    private final IByteReader stream;

    private int offset;
    private ICharacterInput reader;

    /**
     * Initializes a new instance of the {@link
     * com.upokecenter.text.CharacterReader} class using a Unicode 16-bit
     * string; if the string begins with a byte-order mark (U + FEFF), it
     * won't be skipped, and any unpaired surrogate code points (U+D800 to
     * U + DFFF) in the string are replaced with replacement characters
     * (U + FFFD).
     * @param str The string to read.
     * @throws java.lang.NullPointerException The parameter {@code str} is null.
     */
    public CharacterReader(String str) {
 this(str, false, false);
    }

    /**
     * Initializes a new instance of the {@link
     * com.upokecenter.text.CharacterReader} class using a Unicode 16-bit
     * string; any unpaired surrogate code points (U + D800 to U + DFFF) in the
     * string are replaced with replacement characters (U + FFFD).
     * @param str The string to read.
     * @param skipByteOrderMark If true and the string begins with a byte-order
     * mark (U + FEFF), will skip that code point as it reads the string.
     * @throws java.lang.NullPointerException The parameter {@code str} is null.
     */
    public CharacterReader(String str, boolean skipByteOrderMark) {
 this(str, skipByteOrderMark, false);
    }

    /**
     * Initializes a new instance of the {@link
     * com.upokecenter.text.CharacterReader} class using a Unicode 16-bit
     * string.
     * @param str The string to read.
     * @param skipByteOrderMark If true and the string begins with a byte-order
     * mark (U + FEFF), will skip that code point as it reads the string.
     * @param errorThrow If true, will throw an exception if unpaired surrogate
     * code points (U + D800 to U + DFFF) are found in the string. If false,
     * replaces those byte sequences with replacement characters (U + FFFD) as
     * the stream is read.
     * @throws java.lang.NullPointerException The parameter {@code str} is null.
     */
  public CharacterReader(
  String str,
  boolean skipByteOrderMark,
  boolean errorThrow) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      this.strLength = str.length();
      this.offset = (skipByteOrderMark && this.strLength > 0 && str.charAt(0) ==
        0xfeff) ? 1 : 0;
      this.str = str;
      this.errorThrow = errorThrow;
      this.mode = -1;
      this.dontSkipUtf8Bom = false;
      this.stream = null;
    }

    /**
     * Initializes a new instance of the {@link
     * com.upokecenter.text.CharacterReader} class.
     * @param str The parameter {@code str} is a text string.
     * @param offset A zero-based index showing where the desired portion of {@code
     * str} begins.
     * @param length The number of elements in the desired portion of {@code str}
     * (but not more than {@code str} 's length).
     */
    public CharacterReader(String str, int offset, int length) {
 this(str, offset, length, false, false);
    }

    /**
     * Initializes a new instance of the {@link
     * com.upokecenter.text.CharacterReader} class.
     * @param str The parameter {@code str} is a text string.
     * @param offset A zero-based index showing where the desired portion of {@code
     * str} begins.
     * @param length The number of elements in the desired portion of {@code str}
     * (but not more than {@code str} 's length).
     * @param skipByteOrderMark If true and the string begins with a byte-order
     * mark (U + FEFF), will skip that code point as it reads the string.
     * @param errorThrow If true, will throw an exception if unpaired surrogate
     * code points (U + D800 to U + DFFF) are found in the string. If false,
     * replaces those byte sequences with replacement characters (U + FFFD) as
     * the stream is read.
     * @throws java.lang.NullPointerException The parameter {@code str} is null.
     */
    public CharacterReader(
  String str,
  int offset,
  int length,
  boolean skipByteOrderMark,
  boolean errorThrow) {
      if (str == null) {
  throw new NullPointerException("str");
}
if (offset < 0) {
  throw new IllegalArgumentException("offset (" + offset +
    ") is less than 0");
}
if (offset > str.length()) {
  throw new IllegalArgumentException("offset (" + offset +
    ") is more than " + str.length());
}
if (length < 0) {
  throw new IllegalArgumentException("length (" + length +
    ") is less than 0");
}
if (length > str.length()) {
  throw new IllegalArgumentException("length (" + length +
    ") is more than " + str.length());
}
if (str.length() - offset < length) {
  throw new IllegalArgumentException("str's length minus " + offset + " (" +
    (str.length() - offset) + ") is less than " + length);
}
      this.strLength = length;
      this.offset = (skipByteOrderMark && length > 0 && str.charAt(offset) ==
        0xfeff) ? offset + 1 : 0;
      this.str = str;
      this.errorThrow = errorThrow;
      this.mode = -1;
      this.dontSkipUtf8Bom = false;
      this.stream = null;
    }

    /**
     * Initializes a new instance of the {@link
     * com.upokecenter.text.CharacterReader} class; will read the stream as
     * UTF-8, skip the byte-order mark (U + FEFF) if it appears first in the
     * stream, and replace invalid byte sequences with replacement
     * characters (U + FFFD).
     * @param stream A readable data stream.
     * @throws java.lang.NullPointerException The parameter {@code stream} is null.
     */
    public CharacterReader(InputStream stream) {
 this(stream, 0, false);
    }

    /**
     * Initializes a new instance of the {@link
     * com.upokecenter.text.CharacterReader} class; will skip the byte-order
     * mark (U + FEFF) if it appears first in the stream.
     * @param stream A readable byte stream.
     * @param mode The method to use when detecting encodings other than UTF-8 in
     * the byte stream. This usually involves checking whether the stream
     * begins with a byte-order mark (BOM, U + FEFF) or a non-zero basic code
     * point (NZB, U + 0001 to U + 007F) before reading the rest of the stream.
     * This value can be one of the following: 
  • 0: UTF-8 only.
  • *
  • 1: Detect UTF-16 using BOM or NZB, otherwise UTF-8.
  • 2: * Detect UTF-16/UTF-32 using BOM or NZB, otherwise UTF-8. (Tries to * detect UTF-32 first.)
  • 3: Detect UTF-16 using BOM, otherwise * UTF-8.
  • 4: Detect UTF-16/UTF-32 using BOM, otherwise UTF-8. * (Tries to detect UTF-32 first.)
. * @param errorThrow If true, will throw an exception if invalid byte sequences * (in the detected encoding) are found in the byte stream. If false, * replaces those byte sequences with replacement characters (U + FFFD) as * the stream is read. * @throws java.lang.NullPointerException The parameter {@code stream} is null. */ public CharacterReader(InputStream stream, int mode, boolean errorThrow) { this(stream, mode, errorThrow, false); } /** * Initializes a new instance of the {@link * com.upokecenter.text.CharacterReader} class; will skip the byte-order * mark (U + FEFF) if it appears first in the stream and replace invalid * byte sequences with replacement characters (U + FFFD). * @param stream A readable byte stream. * @param mode The method to use when detecting encodings other than UTF-8 in * the byte stream. This usually involves checking whether the stream * begins with a byte-order mark (BOM, U + FEFF) or a non-zero basic code * point (NZB, U + 0001 to U + 007F) before reading the rest of the stream. * This value can be one of the following:
  • 0: UTF-8 only.
  • *
  • 1: Detect UTF-16 using BOM or NZB, otherwise UTF-8.
  • 2: * Detect UTF-16/UTF-32 using BOM or NZB, otherwise UTF-8. (Tries to * detect UTF-32 first.)
  • 3: Detect UTF-16 using BOM, otherwise * UTF-8.
  • 4: Detect UTF-16/UTF-32 using BOM, otherwise UTF-8. * (Tries to detect UTF-32 first.)
. * @throws java.lang.NullPointerException The parameter {@code stream} is null. */ public CharacterReader(InputStream stream, int mode) { this(stream, mode, false, false); } /** * Initializes a new instance of the {@link * com.upokecenter.text.CharacterReader} class. * @param stream A readable byte stream. * @param mode The method to use when detecting encodings other than UTF-8 in * the byte stream. This usually involves checking whether the stream * begins with a byte-order mark (BOM, U + FEFF) or a non-zero basic code * point (NZB, U + 0001 to U + 007F) before reading the rest of the stream. * This value can be one of the following:
  • 0: UTF-8 only.
  • *
  • 1: Detect UTF-16 using BOM or NZB, otherwise UTF-8.
  • 2: * Detect UTF-16/UTF-32 using BOM or NZB, otherwise UTF-8. (Tries to * detect UTF-32 first.)
  • 3: Detect UTF-16 using BOM, otherwise * UTF-8.
  • 4: Detect UTF-16/UTF-32 using BOM, otherwise UTF-8. * (Tries to detect UTF-32 first.)
. * @param errorThrow If true, will throw an exception if invalid byte sequences * (in the detected encoding) are found in the byte stream. If false, * replaces those byte sequences with replacement characters (U + FFFD) as * the stream is read. * @param dontSkipUtf8Bom If the stream is detected as UTF-8 and this parameter * is {@code true}, won't skip the BOM character if it occurs at the * start of the stream. * @throws java.lang.NullPointerException The parameter {@code stream} is null. */ public CharacterReader( InputStream stream, int mode, boolean errorThrow, boolean dontSkipUtf8Bom) { if (stream == null) { throw new NullPointerException("stream"); } this.stream = new WrappedStream(stream); this.mode = mode; this.errorThrow = errorThrow; this.dontSkipUtf8Bom = dontSkipUtf8Bom; this.str = ""; this.strLength = -1; } private interface IByteReader { int read(); } /** * Reads a series of code points from a Unicode stream or a string. * @param chars An array where the code points that were read will be stored. * @param index A zero-based index showing where the desired portion of {@code * chars} begins. * @param length The number of elements in the desired portion of {@code chars} * (but not more than {@code chars} 's length). * @return The number of code points read from the stream. This can be less * than the {@code length} parameter if the end of the stream is * reached. * @throws java.lang.NullPointerException The parameter {@code chars} is null. * @throws IllegalArgumentException Either {@code index} or {@code length} is * less than 0 or greater than {@code chars} 's length, or {@code chars} * 's length minus {@code index} is less than {@code length}. */ public int Read(int[] chars, int index, int length) { if (chars == null) { throw new NullPointerException("chars"); } if (index < 0) { throw new IllegalArgumentException("index (" + index + ") is less than 0"); } if (index > chars.length) { throw new IllegalArgumentException("index (" + index + ") is more than " + chars.length); } if (length < 0) { throw new IllegalArgumentException("length (" + length + ") is less than 0"); } if (length > chars.length) { throw new IllegalArgumentException("length (" + length + ") is more than " + chars.length); } if (chars.length - index < length) { throw new IllegalArgumentException("chars's length minus " + index + " (" + (chars.length - index) + ") is less than " + length); } int count = 0; for (int i = 0; i < length; ++i) { int c = this.ReadChar(); if (c < 0) { return count; } chars[index + i] = c; ++count; } return count; } /** * Reads the next character from a Unicode stream or a string. * @return The next character, or -1 if the end of the string or stream was * reached. */ public int ReadChar() { if (this.reader != null) { return this.reader.ReadChar(); } if (this.stream != null) { return this.DetectUnicodeEncoding(); } else { int c = (this.offset < this.strLength) ? this.str.charAt(this.offset) : -1; if ((c & 0xfc00) == 0xd800 && this.offset + 1 < this.strLength && this.str.charAt(this.offset + 1) >= 0xdc00 && this.str.charAt(this.offset + 1) <= 0xdfff) { // Get the Unicode code point for the surrogate pair c = 0x10000 + ((c - 0xd800) << 10) + (this.str.charAt(this.offset + 1) - 0xdc00); ++this.offset; } else if ((c & 0xf800) == 0xd800) { // unpaired surrogate if (this.errorThrow) { throw new IllegalStateException("Unpaired surrogate code point"); } else { c = 0xfffd; } } ++this.offset; return c; } } private int DetectUtf8Or16Or32(int c1) { int c2, c3, c4; if (c1 == 0xff || c1 == 0xfe) { // Start of a possible byte-order mark // FF FE 0 0 --> UTF-32LE // FF FE ... --> UTF-16LE // FE FF --> UTF-16BE c2 = this.stream.read(); boolean bigEndian = c1 == 0xfe; int otherbyte = bigEndian ? 0xff : 0xfe; if (c2 == otherbyte) { c3 = this.stream.read(); c4 = this.stream.read(); if (!bigEndian && c3 == 0 && c4 == 0) { this.reader = new Utf32Reader(this.stream, false, this.errorThrow); return this.reader.ReadChar(); } else { Utf16Reader newReader = new Utf16Reader( this.stream, bigEndian, this.errorThrow); newReader.Unget(c3, c4); this.reader = newReader; return newReader.ReadChar(); } } // Assume UTF-8 here, so the 0xff or 0xfe is invalid if (this.errorThrow) { throw new IllegalStateException("Invalid Unicode stream"); } else { Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow); utf8reader.Unget(c2); this.reader = utf8reader; return 0xfffd; } } else if (c1 == 0 && this.mode == 4) { // Here, the relevant cases are: // 0 0 0 NZA --> UTF-32BE (if mode is 4) // 0 0 FE FF --> UTF-32BE // Anything else is treated as UTF-8 c2 = this.stream.read(); c3 = this.stream.read(); c4 = this.stream.read(); if (c2 == 0 && ((c3 == 0xfe && c4 == 0xff) || (c3 == 0 && c4 >= 0x01 && c4 <= 0x7f))) { this.reader = new Utf32Reader(this.stream, true, this.errorThrow); return c3 == 0 ? c4 : this.reader.ReadChar(); } else { Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow); utf8reader.UngetThree(c2, c3, c4); this.reader = utf8reader; return c1; } } else if (this.mode == 2) { if (c1 >= 0x01 && c1 <= 0x7f) { // Nonzero ASCII character c2 = this.stream.read(); if (c2 == 0) { // NZA 0, so UTF-16LE or UTF-32LE c3 = this.stream.read(); c4 = this.stream.read(); if (c3 == 0 && c4 == 0) { this.reader = new Utf32Reader( this.stream, false, this.errorThrow); return c1; } else { Utf16Reader newReader = new Utf16Reader( this.stream, false, this.errorThrow); newReader.Unget(c3, c4); this.reader = newReader; return c1; } } else { // NZA NZ, so UTF-8 Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow); utf8reader.Unget(c2); this.reader = utf8reader; return c1; } } else if (c1 == 0) { // Zero c2 = this.stream.read(); if (c2 >= 0x01 && c2 <= 0x7f) { // 0 NZA, so UTF-16BE Utf16Reader newReader = new Utf16Reader(this.stream, true, this.errorThrow); this.reader = newReader; return c2; } else if (c2 == 0) { // 0 0, so maybe UTF-32BE c3 = this.stream.read(); c4 = this.stream.read(); if (c3 == 0 && c4 >= 0x01 && c4 <= 0x7f) { // 0 0 0 NZA this.reader = new Utf32Reader(this.stream, true, this.errorThrow); return c4; } else if (c3 == 0xfe && c4 == 0xff) { // 0 0 FE FF this.reader = new Utf32Reader(this.stream, true, this.errorThrow); return this.reader.ReadChar(); } else { // 0 0 ... Utf8Reader newReader = new Utf8Reader(this.stream, this.errorThrow); newReader.UngetThree(c2, c3, c4); this.reader = newReader; return c1; } } else { // 0 NonAscii, so UTF-8 Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow); utf8reader.Unget(c2); this.reader = utf8reader; return c1; } } } // Use default of UTF-8 return -2; } private int DetectUtf8OrUtf16(int c1) { int mode = this.mode; int c2; if (c1 == 0xff || c1 == 0xfe) { c2 = this.stream.read(); boolean bigEndian = c1 == 0xfe; int otherbyte = bigEndian ? 0xff : 0xfe; if (c2 == otherbyte) { Utf16Reader newReader = new Utf16Reader( this.stream, bigEndian, this.errorThrow); this.reader = newReader; return newReader.ReadChar(); } // Assume UTF-8 here, so the 0xff or 0xfe is invalid if (this.errorThrow) { throw new IllegalStateException("Invalid Unicode stream"); } else { Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow); utf8reader.Unget(c2); this.reader = utf8reader; return 0xfffd; } } else if (mode == 1) { if (c1 >= 0x01 && c1 <= 0x7f) { // Nonzero ASCII character c2 = this.stream.read(); if (c2 == 0) { // NZA 0, so UTF-16LE Utf16Reader newReader = new Utf16Reader( this.stream, false, this.errorThrow); this.reader = newReader; } else { // NZA NZ Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow); utf8reader.Unget(c2); this.reader = utf8reader; } return c1; } else if (c1 == 0) { // Zero c2 = this.stream.read(); if (c2 >= 0x01 && c2 <= 0x7f) { // 0 NZA, so UTF-16BE Utf16Reader newReader = new Utf16Reader(this.stream, true, this.errorThrow); this.reader = newReader; return c2; } else { Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow); utf8reader.Unget(c2); this.reader = utf8reader; return c1; } } } // Use default of UTF-8 return -2; } // Detects a Unicode encoding private int DetectUnicodeEncoding() { int mode = this.mode; int c1 = this.stream.read(); int c2; if (c1 < 0) { return -1; } Utf8Reader utf8reader; if (mode == 0) { // UTF-8 only utf8reader = new Utf8Reader(this.stream, this.errorThrow); this.reader = utf8reader; c1 = utf8reader.ReadChar(); if (c1 == 0xfeff) { // Skip BOM c1 = utf8reader.ReadChar(); } return c1; } else if (mode == 1 || mode == 3) { c2 = this.DetectUtf8OrUtf16(c1); if (c2 >= -1) { return c2; } } else if (mode == 2 || mode == 4) { // UTF-8, UTF-16, or UTF-32 c2 = this.DetectUtf8Or16Or32(c1); if (c2 >= -1) { return c2; } } // Default case: assume UTF-8 utf8reader = new Utf8Reader(this.stream, this.errorThrow); this.reader = utf8reader; utf8reader.Unget(c1); c1 = utf8reader.ReadChar(); if (!this.dontSkipUtf8Bom && c1 == 0xfeff) { // Skip BOM c1 = utf8reader.ReadChar(); } return c1; } private static final class SavedState { private int[] saved; private int savedLength; private void Ensure(int size) { this.saved = (this.saved == null) ? ((new int[this.savedLength + size])) : this.saved; if (this.savedLength + size < this.saved.length) { int[] newsaved = new int[this.savedLength + size + 4]; System.arraycopy(this.saved, 0, newsaved, 0, this.savedLength); this.saved = newsaved; } } public void AddOne(int a) { this.Ensure(1); this.saved[this.savedLength++] = a; } public void AddTwo(int a, int b) { this.Ensure(2); this.saved[this.savedLength + 1] = a; this.saved[this.savedLength] = b; this.savedLength += 2; } public void AddThree(int a, int b, int c) { this.Ensure(3); this.saved[this.savedLength + 2] = a; this.saved[this.savedLength + 1] = b; this.saved[this.savedLength] = c; this.savedLength += 3; } public int Read(IByteReader input) { if (this.savedLength > 0) { int ret = this.saved[--this.savedLength]; return ret; } return input.read(); } } private static final class Utf16Reader implements ICharacterInput { private final boolean bigEndian; private final IByteReader stream; private final SavedState state; private final boolean errorThrow; public Utf16Reader(IByteReader stream, boolean bigEndian, boolean errorThrow) { this.stream = stream; this.bigEndian = bigEndian; this.state = new SavedState(); this.errorThrow = errorThrow; } public void Unget(int c1, int c2) { this.state.AddTwo(c1, c2); } public int ReadChar() { int c1 = this.state.Read(this.stream); if (c1 < 0) { return -1; } int c2 = this.state.Read(this.stream); if (c2 < 0) { this.state.AddOne(-1); if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-16"); } else { return 0xfffd; } } c1 = this.bigEndian ? ((c1 << 8) | c2) : ((c2 << 8) | c1); int surr = c1 & 0xfc00; if (surr == 0xd800) { surr = c1; c1 = this.state.Read(this.stream); c2 = this.state.Read(this.stream); if (c1 < 0 || c2 < 0) { this.state.AddOne(-1); if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-16"); } else { return 0xfffd; } } int unit2 = this.bigEndian ? ((c1 << 8) | c2) : ((c2 << 8) | c1); if ((unit2 & 0xfc00) == 0xdc00) { return 0x10000 + ((surr - 0xd800) << 10) + (unit2 - 0xdc00); } this.Unget(c1, c2); if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-16"); } else { return 0xfffd; } } if (surr == 0xdc00) { if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-16"); } else { return 0xfffd; } } return c1; } public int Read(int[] chars, int index, int length) { int count = 0; for (int i = 0; i < length; ++i) { int c = this.ReadChar(); if (c < 0) { return count; } chars[index + i] = c; ++count; } return count; } } private static final class Utf32Reader implements ICharacterInput { private final boolean bigEndian; private final IByteReader stream; private final boolean errorThrow; private final SavedState state; public Utf32Reader(IByteReader stream, boolean bigEndian, boolean errorThrow) { this.stream = stream; this.bigEndian = bigEndian; this.state = new SavedState(); this.errorThrow = errorThrow; } public int ReadChar() { int c1 = this.state.Read(this.stream); if (c1 < 0) { return -1; } int c2 = this.state.Read(this.stream); int c3 = this.state.Read(this.stream); int c4 = this.state.Read(this.stream); if (c2 < 0 || c3 < 0 || c4 < 0) { this.state.AddOne(-1); if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-32"); } else { return 0xfffd; } } c1 = this.bigEndian ? ((c1 << 24) | (c2 << 16) | (c3 << 8) | c4) : ((c4 << 24) | (c3 << 16) | (c2 << 8) | c1); if (c1 < 0 || c1 >= 0x110000 || (c1 & 0xfff800) == 0xd800) { if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-32"); } else { return 0xfffd; } } return c1; } public int Read(int[] chars, int index, int length) { int count = 0; for (int i = 0; i < length; ++i) { int c = this.ReadChar(); if (c < 0) { return count; } chars[index + i] = c; ++count; } return count; } } private static final class Utf8Reader implements ICharacterInput { private final IByteReader stream; private final SavedState state; private final boolean errorThrow; private int lastChar; public Utf8Reader(IByteReader stream, boolean errorThrow) { this.stream = stream; this.lastChar = -1; this.state = new SavedState(); this.errorThrow = errorThrow; } public void Unget(int ch) { this.state.AddOne(ch); } public void UngetThree(int a, int b, int c) { this.state.AddThree(a, b, c); } public int ReadChar() { int cp = 0; int bytesSeen = 0; int bytesNeeded = 0; int lower = 0; int upper = 0; while (true) { int b; if (this.lastChar != -1) { b = this.lastChar; this.lastChar = -1; } else { b = this.state.Read(this.stream); } if (b < 0) { if (bytesNeeded != 0) { bytesNeeded = 0; if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-8"); } else { return 0xfffd; } } return -1; } if (bytesNeeded == 0) { if ((b & 0x7f) == b) { return b; } if (b >= 0xc2 && b <= 0xdf) { bytesNeeded = 1; lower = 0x80; upper = 0xbf; cp = (b - 0xc0) << 6; } else if (b >= 0xe0 && b <= 0xef) { lower = (b == 0xe0) ? 0xa0 : 0x80; upper = (b == 0xed) ? 0x9f : 0xbf; bytesNeeded = 2; cp = (b - 0xe0) << 12; } else if (b >= 0xf0 && b <= 0xf4) { lower = (b == 0xf0) ? 0x90 : 0x80; upper = (b == 0xf4) ? 0x8f : 0xbf; bytesNeeded = 3; cp = (b - 0xf0) << 18; } else { if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-8"); } else { return 0xfffd; } } continue; } if (b < lower || b > upper) { cp = bytesNeeded = bytesSeen = 0; this.state.AddOne(b); if (this.errorThrow) { throw new IllegalStateException("Invalid UTF-8"); } else { return 0xfffd; } } lower = 0x80; upper = 0xbf; ++bytesSeen; cp += (b - 0x80) << (6 * (bytesNeeded - bytesSeen)); if (bytesSeen != bytesNeeded) { continue; } int ret = cp; cp = 0; bytesSeen = 0; bytesNeeded = 0; return ret; } } public int Read(int[] chars, int index, int length) { int count = 0; for (int i = 0; i < length; ++i) { int c = this.ReadChar(); if (c < 0) { return count; } chars[index + i] = c; ++count; } return count; } } private static final class WrappedStream implements IByteReader { private final InputStream stream; public WrappedStream(InputStream stream) { this.stream = stream; } public int read() { try { return this.stream.read(); } catch (IOException ex) { throw new IllegalStateException(ex.getMessage(), ex); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy