All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.upokecenter.text.CharacterReader Maven / Gradle / Ivy

Go to download

A Java library that implements character encodings used in Web pages and email.

There is a newer version: 0.6.0
Show newest version
package com.upokecenter.text;
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://peteroupc.github.io/
 */

import java.io.*;

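    /**
     * Reads Unicode code points one at a time, either from a string (decoding
     * UTF-16 surrogate pairs) or from a byte stream whose encoding is chosen
     * on the first read. For byte streams the {@code mode} value selects the
     * detection strategy: 0 is UTF-8 only; 1 and 3 choose between UTF-8 and
     * UTF-16; 2 and 4 choose between UTF-8, UTF-16 and UTF-32. Modes 1 and 2
     * additionally guess from the first bytes when no byte order mark is
     * present. Any other outcome falls back to UTF-8.
     * Invalid input yields U+FFFD, or an IllegalStateException when
     * {@code errorThrow} is set.
     */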
  public final class CharacterReader implements ICharacterInput {
    private final int mode;
    private final boolean errorThrow;
    private final boolean dontSkipUtf8Bom;
    private final String str;
    private final int strLength;
    private final IByteReader stream;

    private int offset;
    private ICharacterInput reader;

    /**
     * Creates a reader over an entire string. A leading U+FEFF is not skipped,
     * and unpaired surrogates are reported as U+FFFD rather than by exception.
     *
     * @param str the text to read
     * @throws NullPointerException if {@code str} is null
     */
    public CharacterReader(String str) {
      this(str, false, false);
    }

    /**
     * Creates a reader over an entire string, optionally skipping a leading
     * U+FEFF. Unpaired surrogates are reported as U+FFFD rather than by
     * exception.
     *
     * @param str the text to read
     * @param skipByteOrderMark true to start after a leading U+FEFF, if any
     * @throws NullPointerException if {@code str} is null
     */
    public CharacterReader(String str, boolean skipByteOrderMark) {
      this(str, skipByteOrderMark, false);
    }

    /**
     *
     */
  public CharacterReader(
  String str,
  boolean skipByteOrderMark,
  boolean errorThrow) {
      if (str == null) {
        throw new NullPointerException("str");
      }
      this.strLength = str.length();
      this.offset = (skipByteOrderMark && this.strLength > 0 && str.charAt(0) ==
        0xfeff) ? 1 : 0;
      this.str = str;
      this.errorThrow = errorThrow;
      this.mode = -1;
      this.dontSkipUtf8Bom = false;
      this.stream = null;
    }

    /**
     * Creates a reader over part of a string. A leading U+FEFF is not skipped,
     * and unpaired surrogates are reported as U+FFFD rather than by exception.
     *
     * @param str the text to read from
     * @param offset index in {@code str} of the first character of the portion
     * @param length number of characters in the portion
     * @throws NullPointerException if {@code str} is null
     * @throws IllegalArgumentException if {@code offset} or {@code length} is
     *     negative or does not fit within {@code str}
     */
    public CharacterReader(String str, int offset, int length) {
      this(str, offset, length, false, false);
    }

    /**
     *
     */
    public CharacterReader(
  String str,
  int offset,
  int length,
  boolean skipByteOrderMark,
  boolean errorThrow) {
      if (str == null) {
  throw new NullPointerException("str");
}
if (offset < 0) {
  throw new IllegalArgumentException("offset (" + offset +
    ") is less than 0");
}
if (offset > str.length()) {
  throw new IllegalArgumentException("offset (" + offset +
    ") is more than " + str.length());
}
if (length < 0) {
  throw new IllegalArgumentException("length (" + length +
    ") is less than 0");
}
if (length > str.length()) {
  throw new IllegalArgumentException("length (" + length +
    ") is more than " + str.length());
}
if (str.length() - offset < length) {
  throw new IllegalArgumentException("str's length minus " + offset + " (" +
    (str.length() - offset) + ") is less than " + length);
}
      this.strLength = length;
      this.offset = (skipByteOrderMark && length > 0 && str.charAt(offset) ==
        0xfeff) ? offset + 1 : 0;
      this.str = str;
      this.errorThrow = errorThrow;
      this.mode = -1;
      this.dontSkipUtf8Bom = false;
      this.stream = null;
    }

    /**
     * Creates a reader over a byte stream using mode 0 (UTF-8 only). Invalid
     * byte sequences are reported as U+FFFD rather than by exception.
     *
     * @param stream the bytes to decode
     * @throws NullPointerException if {@code stream} is null
     */
    public CharacterReader(InputStream stream) {
      this(stream, 0, false);
    }

    /**
     * Creates a reader over a byte stream with the given detection mode and
     * error policy. The "don't skip UTF-8 BOM" option is left off.
     *
     * @param stream the bytes to decode
     * @param mode encoding detection mode (0 through 4; other values fall back
     *     to UTF-8)
     * @param errorThrow true to throw IllegalStateException on invalid input
     *     instead of returning U+FFFD
     * @throws NullPointerException if {@code stream} is null
     */
    public CharacterReader(InputStream stream, int mode, boolean errorThrow) {
      this(stream, mode, errorThrow, false);
    }

    /**
     * Creates a reader over a byte stream with the given detection mode.
     * Invalid input is reported as U+FFFD rather than by exception, and the
     * "don't skip UTF-8 BOM" option is left off.
     *
     * @param stream the bytes to decode
     * @param mode encoding detection mode (0 through 4; other values fall back
     *     to UTF-8)
     * @throws NullPointerException if {@code stream} is null
     */
    public CharacterReader(InputStream stream, int mode) {
      this(stream, mode, false, false);
    }

    /**
     * Creates a reader over a byte stream. No bytes are read here; the
     * encoding is determined on the first call that reads a character.
     * An IOException raised by the stream while reading is rethrown as an
     * IllegalStateException.
     *
     * @param stream the bytes to decode
     * @param mode encoding detection mode (0 through 4; other values fall back
     *     to UTF-8)
     * @param errorThrow true to throw IllegalStateException on invalid input
     *     instead of returning U+FFFD
     * @param dontSkipUtf8Bom consulted by the UTF-8 fallback path when it
     *     decides whether to skip a leading U+FEFF
     * @throws NullPointerException if {@code stream} is null
     */
    public CharacterReader(
        InputStream stream,
        int mode,
        boolean errorThrow,
        boolean dontSkipUtf8Bom) {
      if (stream == null) {
        throw new NullPointerException("stream");
      }
      // String-related fields get placeholder values; a non-null stream is
      // what makes ReadChar take the byte-decoding path.
      this.str = "";
      this.strLength = -1;
      this.dontSkipUtf8Bom = dontSkipUtf8Bom;
      this.errorThrow = errorThrow;
      this.mode = mode;
      this.stream = new WrappedStream(stream);
    }

    /**
     * Minimal byte source used by the decoders in this class.
     */
    private interface IByteReader {
      /**
       * Reads the next byte.
       *
       * @return the next byte value, or a negative number when no more bytes
       *     are available (every caller in this file treats any negative
       *     result as end of input)
       */
      int read();
    }

    /**
     * Reads up to {@code length} code points into {@code chars}, starting at
     * {@code index}, stopping early when the end of the input is reached.
     *
     * @param chars destination array
     * @param index position in {@code chars} of the first code point stored
     * @param length maximum number of code points to read
     * @return the number of code points stored, which is less than
     *     {@code length} only if the input ended first
     * @throws NullPointerException if {@code chars} is null
     * @throws IllegalArgumentException if {@code index} or {@code length} is
     *     negative or does not fit within {@code chars}
     */
    public int Read(int[] chars, int index, int length) {
      if (chars == null) {
        throw new NullPointerException("chars");
      }
      final int capacity = chars.length;
      if (index < 0) {
        throw new IllegalArgumentException("index (" + index +
          ") is less than 0");
      }
      if (index > capacity) {
        throw new IllegalArgumentException("index (" + index +
          ") is more than " + capacity);
      }
      if (length < 0) {
        throw new IllegalArgumentException("length (" + length +
          ") is less than 0");
      }
      if (length > capacity) {
        throw new IllegalArgumentException("length (" + length +
          ") is more than " + capacity);
      }
      if (capacity - index < length) {
        throw new IllegalArgumentException("chars's length minus " + index + " (" +
          (capacity - index) + ") is less than " + length);
      }
      int stored = 0;
      while (stored < length) {
        int codePoint = this.ReadChar();
        if (codePoint < 0) {
          // End of input: report how many were stored so far.
          break;
        }
        chars[index + stored] = codePoint;
        ++stored;
      }
      return stored;
    }

    /**
     * Reads the next Unicode code point.
     *
     * <p>For stream input, the first call determines the encoding and every
     * later call is forwarded to the decoder chosen then. For string input, a
     * high surrogate followed by a low surrogate is combined into one code
     * point; any other surrogate is unpaired.
     *
     * @return the code point read, or -1 at the end of the input; an unpaired
     *     surrogate in string input yields U+FFFD when {@code errorThrow} is
     *     false
     * @throws IllegalStateException if string input contains an unpaired
     *     surrogate and {@code errorThrow} is true (the position is not
     *     advanced in that case)
     */
    public int ReadChar() {
      if (this.reader != null) {
        return this.reader.ReadChar();
      }
      if (this.stream != null) {
        return this.DetectUnicodeEncoding();
      }
      if (this.offset >= this.strLength) {
        // End of input. Return without advancing, so repeated reads at the end
        // cannot push the offset towards integer overflow.
        return -1;
      }
      int c = this.str.charAt(this.offset);
      if ((c & 0xfc00) == 0xd800 && this.offset + 1 < this.strLength) {
        int low = this.str.charAt(this.offset + 1);
        if ((low & 0xfc00) == 0xdc00) {
          // Get the Unicode code point for the surrogate pair
          this.offset += 2;
          return 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
        }
      }
      if ((c & 0xf800) == 0xd800) {
        // unpaired surrogate
        if (this.errorThrow) {
          throw new IllegalStateException("Unpaired surrogate code point");
        }
        c = 0xfffd;
      }
      ++this.offset;
      return c;
    }

    /**
     * Tries to decide between UTF-8, UTF-16 and UTF-32 from the first bytes of
     * the stream (used for modes 2 and 4). When a decision is made, the chosen
     * decoder is stored in {@code this.reader} and the first character is
     * returned; bytes read ahead that belong to later characters are handed to
     * the decoder via its Unget methods.
     *
     * @param c1 the first byte of the stream, already read by the caller
     * @return the first character read (or -1 if the chosen decoder hits the
     *     end of the stream), or -2 if no decision was made; the -2 paths do
     *     not read any byte beyond {@code c1}
     * @throws IllegalStateException if {@code c1} is FF or FE without the
     *     matching second byte and {@code errorThrow} is true
     */
    private int DetectUtf8Or16Or32(int c1) {
      int c2, c3, c4;
      if (c1 == 0xff || c1 == 0xfe) {
        // Start of a possible byte-order mark
        // FF FE 0 0 --> UTF-32LE
        // FF FE ... --> UTF-16LE
        // FE FF --> UTF-16BE
        c2 = this.stream.read();
        boolean bigEndian = c1 == 0xfe;
        int otherbyte = bigEndian ? 0xff : 0xfe;
        if (c2 == otherbyte) {
          c3 = this.stream.read();
          c4 = this.stream.read();
          if (!bigEndian && c3 == 0 && c4 == 0) {
            // The four BOM bytes are consumed; decoding starts after them
            this.reader = new Utf32Reader(this.stream, false, this.errorThrow);
            return this.reader.ReadChar();
          } else {
            // c3 and c4 are not part of the BOM, so give them back to the
            // UTF-16 decoder before reading the first character
      Utf16Reader newReader = new Utf16Reader(
  this.stream,
  bigEndian,
  this.errorThrow);
            newReader.Unget(c3, c4);
            this.reader = newReader;
            return newReader.ReadChar();
          }
        }
        // Assume UTF-8 here, so the 0xff or 0xfe is invalid
        if (this.errorThrow) {
          throw new IllegalStateException("Invalid Unicode stream");
        } else {
          // Report c1 as a replacement character and continue as UTF-8 from c2
          Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow);
          utf8reader.Unget(c2);
          this.reader = utf8reader;
          return 0xfffd;
        }
      } else if (c1 == 0 && this.mode == 4) {
        // Here, the relevant cases are:
        // 0 0 0 NZA --> UTF-32BE (if mode is 4)
        // 0 0 FE FF --> UTF-32BE
        // Anything else is treated as UTF-8
        c2 = this.stream.read();
        c3 = this.stream.read();
        c4 = this.stream.read();
        if (c2 == 0 &&
           ((c3 == 0xfe && c4 == 0xff) ||
            (c3 == 0 && c4 >= 0x01 && c4 <= 0x7f))) {
          this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
          // With 0 0 0 NZA the four bytes are the first character itself (c4);
          // with 0 0 FE FF they are a BOM, so read the character after it
          return c3 == 0 ? c4 : this.reader.ReadChar();
        } else {
          // The leading zero byte is returned as U+0000; the three bytes read
          // ahead are given back to the UTF-8 decoder
          Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow);
          utf8reader.UngetThree(c2, c3, c4);
          this.reader = utf8reader;
          return c1;
        }
      } else if (this.mode == 2) {
        // Mode 2 guesses the encoding from where zero bytes fall around the
        // first nonzero ASCII (NZA) byte
        if (c1 >= 0x01 && c1 <= 0x7f) {
          // Nonzero ASCII character
          c2 = this.stream.read();
          if (c2 == 0) {
            // NZA 0, so UTF-16LE or UTF-32LE
            c3 = this.stream.read();
            c4 = this.stream.read();
            if (c3 == 0 && c4 == 0) {
              // NZA 0 0 0: all four bytes form the first UTF-32LE character
            this.reader = new Utf32Reader(
  this.stream,
  false,
  this.errorThrow);
              return c1;
            } else {
              // NZA 0 is the first UTF-16LE character; c3 and c4 belong to
              // the next one
          Utf16Reader newReader = new Utf16Reader(
  this.stream,
  false,
  this.errorThrow);
              newReader.Unget(c3, c4);
              this.reader = newReader;
              return c1;
            }
          } else {
            // NZA NZ, so UTF-8
            Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow);
            utf8reader.Unget(c2);
            this.reader = utf8reader;
            return c1;
          }
        } else if (c1 == 0) {
          // Zero
          c2 = this.stream.read();
          if (c2 >= 0x01 && c2 <= 0x7f) {
            // 0 NZA, so UTF-16BE
            Utf16Reader newReader = new Utf16Reader(this.stream, true, this.errorThrow);
            this.reader = newReader;
            return c2;
          } else if (c2 == 0) {
            // 0 0, so maybe UTF-32BE
            c3 = this.stream.read();
            c4 = this.stream.read();
            if (c3 == 0 && c4 >= 0x01 && c4 <= 0x7f) {
              // 0 0 0 NZA
              this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
              return c4;
            } else if (c3 == 0xfe && c4 == 0xff) {
              // 0 0 FE FF
              this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
              return this.reader.ReadChar();
            } else {
              // 0 0 ...
              Utf8Reader newReader = new Utf8Reader(this.stream, this.errorThrow);
              newReader.UngetThree(c2, c3, c4);
              this.reader = newReader;
              return c1;
            }
          } else {
            // 0 NonAscii, so UTF-8
            Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow);
            utf8reader.Unget(c2);
            this.reader = utf8reader;
            return c1;
          }
        }
      }
      // Use default of UTF-8
      return -2;
    }

    /**
     * Tries to decide between UTF-8 and UTF-16 from the first bytes of the
     * stream. A byte order mark (FF FE or FE FF) selects UTF-16 in any mode.
     * In mode 1 only, the first two bytes are also used as a hint: a nonzero
     * ASCII byte followed by zero selects UTF-16LE, and zero followed by a
     * nonzero ASCII byte selects UTF-16BE. When a decision is made the
     * decoder is stored in {@code this.reader}.
     *
     * @param c1 the first byte of the stream, already read by the caller
     * @return the first character read (or -1 if the chosen decoder hits the
     *     end of the stream), or -2 if no decision was made, in which case no
     *     byte beyond {@code c1} has been read
     * @throws IllegalStateException if {@code c1} is FF or FE without the
     *     matching second byte and {@code errorThrow} is true
     */
    private int DetectUtf8OrUtf16(int c1) {
      if (c1 == 0xfe || c1 == 0xff) {
        final boolean bigEndian = c1 == 0xfe;
        final int expectedSecond = bigEndian ? 0xff : 0xfe;
        final int second = this.stream.read();
        if (second == expectedSecond) {
          // Complete byte order mark: decode what follows as UTF-16
          Utf16Reader utf16 = new Utf16Reader(
            this.stream,
            bigEndian,
            this.errorThrow);
          this.reader = utf16;
          return utf16.ReadChar();
        }
        // Assume UTF-8 here, so the 0xff or 0xfe is invalid
        if (this.errorThrow) {
          throw new IllegalStateException("Invalid Unicode stream");
        }
        Utf8Reader fallback = new Utf8Reader(this.stream, this.errorThrow);
        fallback.Unget(second);
        this.reader = fallback;
        return 0xfffd;
      }
      if (this.mode != 1 || c1 < 0 || c1 > 0x7f) {
        // No heuristics apply; use default of UTF-8
        return -2;
      }
      final int next = this.stream.read();
      final boolean leadingZero = c1 == 0;
      // NZA 0 means UTF-16LE; 0 NZA means UTF-16BE
      final boolean looksLikeUtf16 = leadingZero
        ? (next >= 0x01 && next <= 0x7f)
        : (next == 0);
      if (looksLikeUtf16) {
        this.reader = new Utf16Reader(this.stream, leadingZero, this.errorThrow);
        // Both bytes form the first character, whose value is the nonzero one
        return leadingZero ? next : c1;
      }
      // Otherwise c1 is a complete UTF-8 character and next starts another
      Utf8Reader utf8 = new Utf8Reader(this.stream, this.errorThrow);
      utf8.Unget(next);
      this.reader = utf8;
      return c1;
    }

    /**
     * Detects the stream's Unicode encoding according to {@code this.mode},
     * installs the matching decoder in {@code this.reader}, and returns the
     * first character. Called from ReadChar while no decoder has been chosen.
     *
     * <p>Modes 1 and 3 delegate to DetectUtf8OrUtf16; modes 2 and 4 delegate to
     * DetectUtf8Or16Or32. Mode 0, any other mode value, and a -2 result from
     * either helper all decode the stream as UTF-8 starting with the byte
     * already read here.
     *
     * @return the first character of the stream, or -1 if the stream is empty
     */
    private int DetectUnicodeEncoding() {
      int c1 = this.stream.read();
      if (c1 < 0) {
        // Empty stream; no decoder is installed
        return -1;
      }
      int mode = this.mode;
      if (mode == 1 || mode == 3) {
        // UTF-8 or UTF-16
        int detected = this.DetectUtf8OrUtf16(c1);
        if (detected >= -1) {
          return detected;
        }
      } else if (mode == 2 || mode == 4) {
        // UTF-8, UTF-16, or UTF-32
        int detected = this.DetectUtf8Or16Or32(c1);
        if (detected >= -1) {
          return detected;
        }
      }
      // UTF-8 (mode 0 or default case). The byte read above must be given
      // back to the decoder; otherwise the first byte of the stream is lost.
      Utf8Reader utf8reader = new Utf8Reader(this.stream, this.errorThrow);
      utf8reader.Unget(c1);
      this.reader = utf8reader;
      int first = utf8reader.ReadChar();
      if (!this.dontSkipUtf8Bom && first == 0xfeff) {
        // Skip BOM
        first = utf8reader.ReadChar();
      }
      return first;
    }

    /**
     * A small last-in, first-out buffer of values that have been read ahead
     * and must be returned again before the underlying byte source is
     * consulted. The Add methods store their arguments so that they are read
     * back in argument order (first argument first).
     */
    private static final class SavedState {
      private int[] saved;
      private int savedLength;

      /**
       * Makes sure the buffer has room for {@code size} more values,
       * allocating or growing the array when it is too small.
       */
      private void Ensure(int size) {
        int needed = this.savedLength + size;
        if (this.saved == null) {
          this.saved = new int[needed];
        } else if (needed > this.saved.length) {
          // Grow only when the current array cannot hold the new values;
          // the extra slots avoid reallocating on the next small addition.
          int[] grown = new int[needed + 4];
          System.arraycopy(this.saved, 0, grown, 0, this.savedLength);
          this.saved = grown;
        }
      }

      /** Saves one value; it is the next value Read returns. */
      public void AddOne(int a) {
        this.Ensure(1);
        this.saved[this.savedLength++] = a;
      }

      /** Saves two values; Read returns {@code a}, then {@code b}. */
      public void AddTwo(int a, int b) {
        this.Ensure(2);
        // Stored in reverse because Read takes values from the end
        this.saved[this.savedLength + 1] = a;
        this.saved[this.savedLength] = b;
        this.savedLength += 2;
      }

      /** Saves three values; Read returns {@code a}, {@code b}, {@code c}. */
      public void AddThree(int a, int b, int c) {
        this.Ensure(3);
        // Stored in reverse because Read takes values from the end
        this.saved[this.savedLength + 2] = a;
        this.saved[this.savedLength + 1] = b;
        this.saved[this.savedLength] = c;
        this.savedLength += 3;
      }

      /**
       * Returns the most recently saved value if there is one, and otherwise
       * reads from {@code input}.
       */
      public int Read(IByteReader input) {
        if (this.savedLength > 0) {
          return this.saved[--this.savedLength];
        }
        return input.read();
      }
    }

    /**
     * Decodes UTF-16 code units, in either byte order, from a byte source
     * into Unicode code points. Bytes handed back through Unget are consumed
     * before the byte source is read again.
     */
    private static final class Utf16Reader implements ICharacterInput {
      private final IByteReader stream;
      private final SavedState state;
      private final boolean bigEndian;
      private final boolean errorThrow;

      public Utf16Reader(IByteReader stream, boolean bigEndian, boolean errorThrow) {
        this.errorThrow = errorThrow;
        this.state = new SavedState();
        this.bigEndian = bigEndian;
        this.stream = stream;
      }

      /** Hands back two bytes; {@code c1} will be read first, then {@code c2}. */
      public void Unget(int c1, int c2) {
        this.state.AddTwo(c1, c2);
      }

      /** Returns the next byte, preferring bytes that were handed back. */
      private int NextByte() {
        return this.state.Read(this.stream);
      }

      /** Joins two bytes, in stream order, into one 16-bit code unit. */
      private int CodeUnit(int firstByte, int secondByte) {
        return this.bigEndian
          ? ((firstByte << 8) | secondByte)
          : ((secondByte << 8) | firstByte);
      }

      /** Reports malformed input according to the error policy. */
      private int Malformed() {
        if (this.errorThrow) {
          throw new IllegalStateException("Invalid UTF-16");
        }
        return 0xfffd;
      }

      /**
       * Reads the next code point.
       *
       * @return the code point, -1 at the end of the input, or U+FFFD for
       *     malformed input when {@code errorThrow} is false
       * @throws IllegalStateException on malformed input when
       *     {@code errorThrow} is true
       */
      public int ReadChar() {
        int firstByte = this.NextByte();
        if (firstByte < 0) {
          return -1;
        }
        int secondByte = this.NextByte();
        if (secondByte < 0) {
          // Odd trailing byte: remember the end of input for the next call
          this.state.AddOne(-1);
          return this.Malformed();
        }
        int unit = this.CodeUnit(firstByte, secondByte);
        int kind = unit & 0xfc00;
        if (kind == 0xdc00) {
          // Low surrogate with no high surrogate before it
          return this.Malformed();
        }
        if (kind != 0xd800) {
          return unit;
        }
        // High surrogate: a low surrogate must follow
        int thirdByte = this.NextByte();
        int fourthByte = this.NextByte();
        if (thirdByte < 0 || fourthByte < 0) {
          this.state.AddOne(-1);
          return this.Malformed();
        }
        int following = this.CodeUnit(thirdByte, fourthByte);
        if ((following & 0xfc00) != 0xdc00) {
          // Not a low surrogate: give the unit back so it is decoded next
          this.Unget(thirdByte, fourthByte);
          return this.Malformed();
        }
        return 0x10000 + ((unit - 0xd800) << 10) + (following - 0xdc00);
      }

      /**
       * Reads up to {@code length} code points into {@code chars} starting
       * at {@code index}; returns how many were stored before the input ended.
       */
      public int Read(int[] chars, int index, int length) {
        int stored = 0;
        while (stored < length) {
          int codePoint = this.ReadChar();
          if (codePoint < 0) {
            break;
          }
          chars[index + stored] = codePoint;
          ++stored;
        }
        return stored;
      }
    }

    /**
     * Decodes UTF-32, in either byte order, from a byte source into Unicode
     * code points.
     */
    private static final class Utf32Reader implements ICharacterInput {
      private final IByteReader stream;
      private final SavedState state;
      private final boolean bigEndian;
      private final boolean errorThrow;

      public Utf32Reader(IByteReader stream, boolean bigEndian, boolean errorThrow) {
        this.errorThrow = errorThrow;
        this.state = new SavedState();
        this.bigEndian = bigEndian;
        this.stream = stream;
      }

      /** Reports malformed input according to the error policy. */
      private int Malformed() {
        if (this.errorThrow) {
          throw new IllegalStateException("Invalid UTF-32");
        }
        return 0xfffd;
      }

      /**
       * Reads the next code point.
       *
       * @return the code point, -1 at the end of the input, or U+FFFD when
       *     {@code errorThrow} is false and the input is truncated, out of
       *     range, or a surrogate
       * @throws IllegalStateException in those same cases when
       *     {@code errorThrow} is true
       */
      public int ReadChar() {
        int b0 = this.state.Read(this.stream);
        if (b0 < 0) {
          return -1;
        }
        int b1 = this.state.Read(this.stream);
        int b2 = this.state.Read(this.stream);
        int b3 = this.state.Read(this.stream);
        if (b1 < 0 || b2 < 0 || b3 < 0) {
          // Truncated character: remember the end of input for the next call
          this.state.AddOne(-1);
          return this.Malformed();
        }
        int codePoint;
        if (this.bigEndian) {
          codePoint = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
        } else {
          codePoint = (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
        }
        boolean outOfRange = codePoint < 0 || codePoint >= 0x110000;
        boolean surrogate = (codePoint & 0xfff800) == 0xd800;
        if (outOfRange || surrogate) {
          return this.Malformed();
        }
        return codePoint;
      }

      /**
       * Reads up to {@code length} code points into {@code chars} starting
       * at {@code index}; returns how many were stored before the input ended.
       */
      public int Read(int[] chars, int index, int length) {
        int stored = 0;
        while (stored < length) {
          int codePoint = this.ReadChar();
          if (codePoint < 0) {
            break;
          }
          chars[index + stored] = codePoint;
          ++stored;
        }
        return stored;
      }
    }

    /**
     * Decodes UTF-8 from a byte source into Unicode code points. Bytes handed
     * back through Unget or UngetThree are consumed before the byte source is
     * read again.
     */
    private static final class Utf8Reader implements ICharacterInput {
      private final IByteReader stream;
      private final SavedState state;
      private final boolean errorThrow;
      // Single pending byte; nothing in this class stores a value other than -1
      private int lastChar;

      public Utf8Reader(IByteReader stream, boolean errorThrow) {
        this.errorThrow = errorThrow;
        this.state = new SavedState();
        this.lastChar = -1;
        this.stream = stream;
      }

      /** Hands back one byte; it is the next byte decoded. */
      public void Unget(int ch) {
        this.state.AddOne(ch);
      }

      /** Hands back three bytes; they are decoded in argument order. */
      public void UngetThree(int a, int b, int c) {
        this.state.AddThree(a, b, c);
      }

      /** Returns the next byte: the pending one if set, else from the state. */
      private int NextByte() {
        if (this.lastChar != -1) {
          int pending = this.lastChar;
          this.lastChar = -1;
          return pending;
        }
        return this.state.Read(this.stream);
      }

      /** Reports malformed input according to the error policy. */
      private int Malformed() {
        if (this.errorThrow) {
          throw new IllegalStateException("Invalid UTF-8");
        }
        return 0xfffd;
      }

      /**
       * Reads the next code point.
       *
       * @return the code point, -1 at the end of the input, or U+FFFD for a
       *     malformed sequence when {@code errorThrow} is false
       * @throws IllegalStateException on a malformed sequence when
       *     {@code errorThrow} is true
       */
      public int ReadChar() {
        int lead = this.NextByte();
        if (lead < 0) {
          return -1;
        }
        if (lead <= 0x7f) {
          // Single-byte (ASCII) character
          return lead;
        }
        // Work out how many continuation bytes follow and the range allowed
        // for the first of them; the narrowed ranges for E0, ED, F0 and F4
        // reject the sequences this decoder treats as invalid up front.
        int trailing;
        int codePoint;
        int lowest = 0x80;
        int highest = 0xbf;
        if (lead >= 0xc2 && lead <= 0xdf) {
          trailing = 1;
          codePoint = (lead - 0xc0) << 6;
        } else if (lead >= 0xe0 && lead <= 0xef) {
          trailing = 2;
          if (lead == 0xe0) {
            lowest = 0xa0;
          }
          if (lead == 0xed) {
            highest = 0x9f;
          }
          codePoint = (lead - 0xe0) << 12;
        } else if (lead >= 0xf0 && lead <= 0xf4) {
          trailing = 3;
          if (lead == 0xf0) {
            lowest = 0x90;
          }
          if (lead == 0xf4) {
            highest = 0x8f;
          }
          codePoint = (lead - 0xf0) << 18;
        } else {
          // Not a valid lead byte
          return this.Malformed();
        }
        for (int seen = 1; seen <= trailing; ++seen) {
          int next = this.NextByte();
          if (next < 0) {
            // Input ended in the middle of a character
            return this.Malformed();
          }
          if (next < lowest || next > highest) {
            // Not a valid continuation: give it back so it is decoded as the
            // start of the next character
            this.state.AddOne(next);
            return this.Malformed();
          }
          // Only the first continuation byte has a narrowed range
          lowest = 0x80;
          highest = 0xbf;
          codePoint += (next - 0x80) << (6 * (trailing - seen));
        }
        return codePoint;
      }

      /**
       * Reads up to {@code length} code points into {@code chars} starting
       * at {@code index}; returns how many were stored before the input ended.
       */
      public int Read(int[] chars, int index, int length) {
        int stored = 0;
        while (stored < length) {
          int codePoint = this.ReadChar();
          if (codePoint < 0) {
            break;
          }
          chars[index + stored] = codePoint;
          ++stored;
        }
        return stored;
      }
    }

    /**
     * Adapts an InputStream to IByteReader, converting the checked
     * IOException into an unchecked IllegalStateException.
     */
    private static final class WrappedStream implements IByteReader {
      private final InputStream stream;

      public WrappedStream(InputStream stream) {
        this.stream = stream;
      }

      /**
       * Reads one byte from the wrapped stream.
       *
       * @return whatever the wrapped stream's {@code read()} returns
       * @throws IllegalStateException if the wrapped stream throws an
       *     IOException; the original exception is kept as the cause
       */
      public int read() {
        int value;
        try {
          value = this.stream.read();
        } catch (IOException ex) {
          throw new IllegalStateException(ex.getMessage(), ex);
        }
        return value;
      }
    }
  }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy