// javolution.io.UTF8StreamReader (Maven / Gradle / Ivy)
/*
* Javolution - Java(TM) Solution for Real-Time and Embedded Systems
* Copyright (C) 2012 - Javolution (http://javolution.org/)
* All rights reserved.
*
* Permission to use, copy, modify, and distribute this software is
* freely granted, provided that this notice is preserved.
*/
package javolution.io;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
 * A UTF-8 stream reader.
 *
 * This reader supports surrogate {@code char} pairs (representing
 * characters in the range [U+10000 .. U+10FFFF]). It can also be used
 * to read Unicode code points (up to 31 bits) directly
 * (ref. {@link #read()}).
 *
 * Each invocation of one of the {@code read()} methods may cause one
 * or more bytes to be read from the underlying byte-input stream.
 * To enable the efficient conversion of bytes to characters, more bytes may
 * be read ahead from the underlying stream than are necessary to satisfy
 * the current read operation.
 *
 * Instances of this class can be reused for different input streams
 * and can be part of a higher level component (e.g. parser) in order
 * to avoid dynamic buffer allocation when the input source changes.
 * Also wrapping using a {@code java.io.BufferedReader} is unnecessary
 * as instances of this class embed their own data buffers.
 *
 * Note: This reader is unsynchronized and does not test if the UTF-8
 * encoding is well-formed (e.g. UTF-8 sequences longer than
 * necessary to encode a character).
 *
 * @author Jean-Marie Dautelle
 * @version 2.0, December 9, 2004
 * @see UTF8StreamWriter
 */
public final class UTF8StreamReader extends Reader {

    /**
     * Holds the capacity of the byte buffer used by the constructors that
     * do not take an explicit capacity.
     */
    private static final int DEFAULT_CAPACITY = 2048;

    /**
     * Holds the current input stream or {@code null} if closed.
     */
    private InputStream _inputStream;

    /**
     * Holds the index of the next unread byte in the buffer.
     */
    private int _start;

    /**
     * Holds the index one past the last valid byte in the buffer; it is
     * the raw result of the last buffer fill, hence non-positive at the
     * end of the stream.
     */
    private int _end;

    /**
     * Holds the bytes buffer.
     */
    private final byte[] _bytes;

    /**
     * Holds the bits accumulated so far for the multi-byte sequence being
     * decoded.
     */
    private int _code;

    /**
     * Holds the number of continuation bytes still expected for the
     * multi-byte sequence being decoded (zero when between characters).
     */
    private int _moreBytes;

    /**
     * Holds the low surrogate which could not be stored by the previous
     * {@link #read(char[], int, int)} call because the destination was
     * full, or zero if there is none (zero is not a valid low surrogate).
     */
    private char _pendingLowSurrogate;

    /**
     * Creates a UTF-8 reader having a byte buffer of moderate capacity (2048).
     */
    public UTF8StreamReader() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Creates a UTF-8 reader having a byte buffer of moderate capacity (2048),
     * initialized to read from the specified input stream.
     *
     * @param inputStream the input stream to read from.
     */
    public UTF8StreamReader(InputStream inputStream) {
        this(inputStream, DEFAULT_CAPACITY);
    }

    /**
     * Creates a UTF-8 reader having a byte buffer of specified capacity.
     *
     * @param capacity the capacity of the byte buffer.
     */
    public UTF8StreamReader(int capacity) {
        _bytes = new byte[capacity];
    }

    /**
     * Creates a UTF-8 reader having a byte buffer of specified capacity,
     * initialized to read from the specified input stream.
     *
     * @param inputStream the input stream to read from.
     * @param capacity the capacity of the byte buffer.
     */
    public UTF8StreamReader(InputStream inputStream, int capacity) {
        _bytes = new byte[capacity];
        _inputStream = inputStream;
    }

    /**
     * Sets the input stream to use for reading until this reader is closed.
     * For example
     * {@code Reader reader = new UTF8StreamReader().setInput(inStream);}
     * is functionally equivalent to
     * {@code Reader reader = new java.io.InputStreamReader(inStream, "UTF-8");}
     *
     * @param inStream the input stream.
     * @return this UTF-8 reader.
     * @throws IllegalStateException if this reader is being reused and
     *         it has not been {@link #close closed} or {@link #reset reset}.
     */
    public UTF8StreamReader setInput(InputStream inStream) {
        if (_inputStream != null) {
            throw new IllegalStateException("Reader not closed or reset");
        }
        _inputStream = inStream;
        return this;
    }

    /**
     * Indicates if this stream is ready to be read.
     *
     * @return {@code true} if the next read() is guaranteed not to block
     *         for input; {@code false} otherwise.
     * @throws IOException if an I/O error occurs or this reader is closed.
     */
    @Override
    public boolean ready() throws IOException {
        if (_inputStream == null) {
            throw new IOException("Stream closed");
        }
        return (_pendingLowSurrogate != 0) || ((_end - _start) > 0)
                || (_inputStream.available() != 0);
    }

    /**
     * Closes and {@link #reset resets} this reader for reuse. The reader
     * is reset even when closing the underlying stream fails, so that it
     * can always be reused afterwards.
     *
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public void close() throws IOException {
        if (_inputStream != null) {
            try {
                _inputStream.close();
            } finally {
                reset();
            }
        }
    }

    /**
     * Reads a single character. This method will block until a character is
     * available, an I/O error occurs or the end of the stream is reached.
     *
     * If a previous {@link #read(char[], int, int)} call had to split a
     * surrogate pair, the outstanding low surrogate is returned first.
     *
     * @return the 31-bits Unicode of the character read, or -1 if the end of
     *         the stream has been reached.
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read() throws IOException {
        if (_pendingLowSurrogate != 0) {
            int low = _pendingLowSurrogate;
            _pendingLowSurrogate = 0;
            return low;
        }
        // The bounds test must come BEFORE the array access: when the buffer
        // has been completely filled and consumed, _start == _bytes.length.
        if (_start < _end) {
            byte b = _bytes[_start];
            if (b >= 0) { // ASCII fast path.
                _start++;
                return b;
            }
        }
        return read2();
    }

    // Reads one full character (refilling the buffer as needed), blocks if
    // necessary. Returns -1 only when the stream ends between characters.
    private int read2() throws IOException {
        while (true) {
            if (_start >= _end) { // No more bytes in buffer.
                if (_inputStream == null) {
                    throw new IOException("No input stream or stream closed");
                }
                _start = 0;
                _end = _inputStream.read(_bytes, 0, _bytes.length);
                if (_end <= 0) { // Done.
                    if (_moreBytes == 0) {
                        return -1;
                    }
                    // Incomplete sequence.
                    throw new CharConversionException(
                            "Unexpected end of stream");
                }
                continue;
            }
            byte b = _bytes[_start++];
            if (_moreBytes == 0) { // Expecting the first byte of a character.
                if (b >= 0) { // 0xxxxxxx
                    return b;
                } else if ((b & 0xe0) == 0xc0) { // 110xxxxx
                    _code = b & 0x1f;
                    _moreBytes = 1;
                } else if ((b & 0xf0) == 0xe0) { // 1110xxxx
                    _code = b & 0x0f;
                    _moreBytes = 2;
                } else if ((b & 0xf8) == 0xf0) { // 11110xxx
                    _code = b & 0x07;
                    _moreBytes = 3;
                } else if ((b & 0xfc) == 0xf8) { // 111110xx
                    _code = b & 0x03;
                    _moreBytes = 4;
                } else if ((b & 0xfe) == 0xfc) { // 1111110x
                    _code = b & 0x01;
                    _moreBytes = 5;
                } else { // Stray continuation byte or 0xFE/0xFF.
                    throw new CharConversionException("Invalid UTF-8 Encoding");
                }
            } else { // Expecting a continuation byte (10xxxxxx).
                if ((b & 0xc0) != 0x80) {
                    throw new CharConversionException("Invalid UTF-8 Encoding");
                }
                _code = (_code << 6) | (b & 0x3f); // Adds 6 bits to code.
                if (--_moreBytes == 0) {
                    return _code;
                }
            }
        }
    }

    /**
     * Reads characters into a portion of an array. This method will block
     * until some input is available, an I/O error occurs or the end of
     * the stream is reached. Once at least one character has been stored,
     * it returns as soon as the internal byte buffer is exhausted rather
     * than blocking for more input.
     *
     * Note: Characters between U+10000 and U+10FFFF are represented
     * by surrogate pairs (two {@code char}). If only one position is left
     * in the destination, the high surrogate is stored and the low surrogate
     * is delivered by the next read operation.
     *
     * @param cbuf the destination buffer.
     * @param off the offset at which to start storing characters.
     * @param len the maximum number of characters to read.
     * @return the number of characters read (zero only if {@code len} is
     *         zero), or -1 if the end of the stream has been reached.
     * @throws IOException if an I/O error occurs or this reader is closed.
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is
     *         negative, or {@code len} exceeds {@code cbuf.length - off}.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (_inputStream == null) {
            throw new IOException("No input stream or stream closed");
        }
        if ((off < 0) || (len < 0) || (len > cbuf.length - off)) {
            throw new IndexOutOfBoundsException("off: " + off + ", len: " + len
                    + ", cbuf.length: " + cbuf.length);
        }
        if (len == 0) {
            return 0; // Nothing requested: do not touch the stream.
        }
        final int limit = off + len;
        int i = off;
        if (_pendingLowSurrogate != 0) { // Completes a split surrogate pair.
            cbuf[i++] = _pendingLowSurrogate;
            _pendingLowSurrogate = 0;
            if ((i == limit) || (_start >= _end)) {
                return i - off; // Never blocks once a char is delivered.
            }
        }
        if (_start >= _end) { // Fills buffer (nothing stored yet here).
            _start = 0;
            _end = _inputStream.read(_bytes, 0, _bytes.length);
            if (_end <= 0) { // Done.
                return -1;
            }
        }
        while (i < limit) {
            // Invariant: _start < _end
            byte b = _bytes[_start];
            if (b >= 0) { // Most common case.
                cbuf[i++] = (char) b;
                _start++;
            } else { // Multi-byte sequence, may refill the buffer.
                int code = read2();
                if (code < 0x10000) {
                    cbuf[i++] = (char) code;
                } else if (code <= 0x10ffff) { // Surrogates.
                    char low = (char) (((code - 0x10000) & 0x3ff) + 0xdc00);
                    cbuf[i++] = (char) (((code - 0x10000) >> 10) + 0xd800);
                    if (i < limit) {
                        cbuf[i++] = low;
                    } else { // Destination full: keep it for the next read.
                        _pendingLowSurrogate = low;
                    }
                } else {
                    throw new CharConversionException("Cannot convert U+"
                            + Integer.toHexString(code)
                            + " to char (code greater than U+10FFFF)");
                }
            }
            if (_start >= _end) { // End of buffer, returns what we have.
                return i - off;
            }
        }
        return len;
    }

    /**
     * Reads characters into the specified appendable. This method will block
     * until the end of the stream is reached.
     *
     * @param dest the destination buffer.
     * @throws IOException if an I/O error occurs or this reader is closed.
     */
    public void read(Appendable dest) throws IOException {
        if (_inputStream == null) {
            throw new IOException("No input stream or stream closed");
        }
        if (_pendingLowSurrogate != 0) { // Completes a split surrogate pair.
            dest.append(_pendingLowSurrogate);
            _pendingLowSurrogate = 0;
        }
        while (true) {
            if (_start >= _end) { // Fills buffer.
                _start = 0;
                _end = _inputStream.read(_bytes, 0, _bytes.length);
                if (_end <= 0) { // Done.
                    break;
                }
            }
            byte b = _bytes[_start];
            if (b >= 0) {
                dest.append((char) b); // Most common case.
                _start++;
            } else {
                int code = read2();
                if (code < 0x10000) {
                    dest.append((char) code);
                } else if (code <= 0x10ffff) { // Surrogates.
                    dest.append((char) (((code - 0x10000) >> 10) + 0xd800));
                    dest.append((char) (((code - 0x10000) & 0x3ff) + 0xdc00));
                } else {
                    throw new CharConversionException("Cannot convert U+"
                            + Integer.toHexString(code)
                            + " to char (code greater than U+10FFFF)");
                }
            }
        }
    }

    /**
     * Resets this reader for reuse: discards any buffered bytes and any
     * partially decoded character, and detaches the current input stream
     * WITHOUT closing it. A new stream can then be attached with
     * {@link #setInput(InputStream)}.
     *
     * Note: Unlike {@link Reader#reset()} this method has nothing to do
     * with marks and never throws.
     */
    @Override
    public void reset() {
        _code = 0;
        _end = 0;
        _inputStream = null;
        _moreBytes = 0;
        _pendingLowSurrogate = 0;
        _start = 0;
    }

    /**
     * Sets the input stream to use for reading until this reader is closed.
     *
     * @param inStream the input stream.
     * @return this UTF-8 reader.
     * @throws IllegalStateException if this reader is being reused and
     *         it has not been {@link #close closed} or {@link #reset reset}.
     * @deprecated Replaced by {@link #setInput(InputStream)}
     */
    @Deprecated
    public UTF8StreamReader setInputStream(InputStream inStream) {
        return this.setInput(inStream);
    }
}