// javolution.io.UTF8ByteBufferReader (Maven / Gradle / Ivy)
/*
* Javolution - Java(TM) Solution for Real-Time and Embedded Systems
* Copyright (C) 2012 - Javolution (http://javolution.org/)
* All rights reserved.
*
* Permission to use, copy, modify, and distribute this software is
* freely granted, provided that this notice is preserved.
*/
package javolution.io;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.Reader;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
/**
 * A UTF-8 {@code java.nio.ByteBuffer} reader.
 *
 * <p>This reader can be used for efficient decoding of native byte
 * buffers (e.g. {@code MappedByteBuffer}), high-performance
 * messaging (no intermediate buffer), etc.</p>
 *
 * <p>This reader supports surrogate {@code char} pairs (representing
 * characters in the range [U+10000 .. U+10FFFF]). It can also be used
 * to read Unicode character codes (31 bits) directly
 * (ref. {@link #read()}).</p>
 *
 * <p>Each invocation of one of the {@code read()} methods may cause one
 * or more bytes to be read from the underlying byte buffer.
 * The end of stream is reached when the byte buffer position and limit
 * coincide.</p>
 *
 * @author Jean-Marie Dautelle
 * @version 2.0, December 9, 2004
 * @see UTF8ByteBufferWriter
 */
public final class UTF8ByteBufferReader extends Reader {

    /**
     * Holds the byte buffer source ({@code null} when closed or reset).
     */
    private ByteBuffer _byteBuffer;

    /**
     * Holds the bits accumulated so far for the multi-byte character
     * being decoded.
     */
    private int _code;

    /**
     * Holds the number of continuation bytes still expected for the
     * multi-byte character being decoded (zero between characters).
     */
    private int _moreBytes;

    /**
     * Default constructor. The source must be provided through
     * {@link #setInput(ByteBuffer)} before reading.
     */
    public UTF8ByteBufferReader() {}

    /**
     * Creates a reader which reads from the specified byte buffer.
     *
     * @param byteBuffer the byte buffer to read from.
     */
    public UTF8ByteBufferReader(ByteBuffer byteBuffer) {
        _byteBuffer = byteBuffer;
    }

    /**
     * Sets the {@code ByteBuffer} to use for reading available bytes
     * from current buffer position.
     *
     * @param byteBuffer the {@code ByteBuffer} source.
     * @return this UTF-8 reader.
     * @throws IllegalStateException if this reader is being reused and
     *         it has not been {@link #close closed} or {@link #reset reset}.
     */
    public UTF8ByteBufferReader setInput(ByteBuffer byteBuffer) {
        if (_byteBuffer != null)
            throw new IllegalStateException("Reader not closed or reset");
        _byteBuffer = byteBuffer;
        return this;
    }

    /**
     * Indicates if this stream is ready to be read.
     *
     * @return {@code true} if the byte buffer has remaining bytes to
     *         read; {@code false} otherwise.
     * @throws IOException if this reader has no byte buffer set
     *         (closed or reset).
     */
    @Override
    public boolean ready() throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        return _byteBuffer.hasRemaining();
    }

    /**
     * Closes and {@link #reset resets} this reader for reuse.
     * Closing a reader which has no byte buffer set has no effect.
     *
     * @throws IOException never thrown by this implementation (declared
     *         by {@link Reader#close()}).
     */
    @Override
    public void close() throws IOException {
        if (_byteBuffer != null) {
            reset();
        }
    }

    /**
     * Reads a single character. This method does not block, -1
     * is returned if the buffer's limit has been reached.
     *
     * @return the 31-bits Unicode of the character read, or -1 if there is
     *         no more remaining bytes to be read.
     * @throws IOException if this reader is closed or the bytes read do
     *         not form a complete, well-formed character sequence.
     */
    @Override
    public int read() throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        if (!_byteBuffer.hasRemaining())
            return -1;
        byte b = _byteBuffer.get();
        return (b >= 0) ? b : read2(b);
    }

    /**
     * Decodes one full character starting with the specified byte, pulling
     * the following bytes from the byte buffer as needed.
     *
     * <p>When the encoding is invalid the decoder state is cleared before
     * the exception is thrown, so that the next character can be decoded
     * normally. When the buffer limit is reached in the middle of a
     * sequence the partial state is retained; a continuation byte read
     * later (e.g. after the buffer limit has been raised) completes the
     * pending character.</p>
     *
     * @param b the first byte to decode.
     * @return the character code decoded (up to 31 bits).
     * @throws CharConversionException if the encoding is invalid or if the
     *         buffer limit is reached before the sequence is complete.
     */
    private int read2(byte b) throws IOException {
        try {
            while (true) {
                if (_moreBytes != 0) {
                    // Within a sequence only 10xxxxxx (continuation) is legal.
                    if ((b & 0xc0) != 0x80)
                        throw invalidEncoding();
                    _code = (_code << 6) | (b & 0x3f); // Adds 6 bits to code.
                    if (--_moreBytes == 0)
                        return _code;
                } else if (b >= 0) {
                    // 0xxxxxxx
                    return b;
                } else if ((b & 0xe0) == 0xc0) {
                    // 110xxxxx
                    _code = b & 0x1f;
                    _moreBytes = 1;
                } else if ((b & 0xf0) == 0xe0) {
                    // 1110xxxx
                    _code = b & 0x0f;
                    _moreBytes = 2;
                } else if ((b & 0xf8) == 0xf0) {
                    // 11110xxx
                    _code = b & 0x07;
                    _moreBytes = 3;
                } else if ((b & 0xfc) == 0xf8) {
                    // 111110xx
                    _code = b & 0x03;
                    _moreBytes = 4;
                } else if ((b & 0xfe) == 0xfc) {
                    // 1111110x
                    _code = b & 0x01;
                    _moreBytes = 5;
                } else {
                    // Stray continuation byte, 0xFE or 0xFF.
                    throw invalidEncoding();
                }
                b = _byteBuffer.get();
            }
        } catch (BufferUnderflowException e) {
            CharConversionException incomplete =
                    new CharConversionException("Incomplete Sequence");
            incomplete.initCause(e); // Keeps the underlying cause for diagnosis.
            throw incomplete;
        }
    }

    /**
     * Clears the decoder state and creates the exception reporting an
     * invalid encoding. Clearing the state prevents one malformed sequence
     * from causing every subsequent multi-byte character to be rejected.
     *
     * @return the exception to be thrown by the caller.
     */
    private CharConversionException invalidEncoding() {
        _code = 0;
        _moreBytes = 0;
        return new CharConversionException("Invalid UTF-8 Encoding");
    }

    /**
     * Reads characters into a portion of an array. This method does not
     * block.
     *
     * <p>Note: Characters between U+10000 and U+10FFFF are represented
     * by surrogate pairs (two {@code char}). If such a character is
     * the next one to read and only one position is left in the
     * destination, the character is left unread in the byte buffer and
     * the number of characters stored so far (possibly zero) is
     * returned.</p>
     *
     * @param cbuf the destination buffer.
     * @param off the offset at which to start storing characters.
     * @param len the maximum number of characters to read
     * @return the number of characters read, or -1 if there is no more
     *         byte remaining.
     * @throws IOException if this reader is closed, the byte sequence is
     *         invalid or incomplete, or a character code is greater
     *         than U+10FFFF.
     */
    @Override
    public int read(char cbuf[], int off, int len) throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        if (!_byteBuffer.hasRemaining())
            return -1;
        final int end = off + len;
        int i = off;
        while ((i < end) && _byteBuffer.hasRemaining()) {
            // Snapshot taken so the character can be "unread" if it does
            // not fit in the destination.
            final int startPosition = _byteBuffer.position();
            final int savedCode = _code;
            final int savedMoreBytes = _moreBytes;
            byte b = _byteBuffer.get();
            if (b >= 0) {
                cbuf[i++] = (char) b; // Most common case.
                continue;
            }
            int code = read2(b);
            if (code < 0x10000) {
                cbuf[i++] = (char) code;
            } else if (code > 0x10ffff) {
                throw new CharConversionException(
                        "Cannot convert U+"
                                + Integer.toHexString(code)
                                + " to char (code greater than U+10FFFF)");
            } else if (i < end - 1) { // Room for both surrogates.
                cbuf[i++] = (char) (((code - 0x10000) >> 10) + 0xd800);
                cbuf[i++] = (char) (((code - 0x10000) & 0x3ff) + 0xdc00);
            } else { // Not enough space in destination (go back).
                _byteBuffer.position(startPosition);
                _code = savedCode;
                _moreBytes = savedMoreBytes;
                break;
            }
        }
        return i - off;
    }

    /**
     * Reads all the remaining characters into the specified appendable.
     * This method does not block.
     *
     * <p>Note: Characters between U+10000 and U+10FFFF are represented
     * by surrogate pairs (two {@code char}).</p>
     *
     * @param dest the destination buffer.
     * @throws IOException if this reader is closed, the byte sequence is
     *         invalid or incomplete, a character code is greater than
     *         U+10FFFF, or the destination fails to append.
     */
    public void read(Appendable dest) throws IOException {
        if (_byteBuffer == null)
            throw new IOException("Reader closed");
        while (_byteBuffer.hasRemaining()) {
            byte b = _byteBuffer.get();
            if (b >= 0) {
                dest.append((char) b); // Most common case.
                continue;
            }
            int code = read2(b);
            if (code < 0x10000) {
                dest.append((char) code);
            } else if (code <= 0x10ffff) { // Surrogates.
                dest.append((char) (((code - 0x10000) >> 10) + 0xd800));
                dest.append((char) (((code - 0x10000) & 0x3ff) + 0xdc00));
            } else {
                throw new CharConversionException("Cannot convert U+"
                        + Integer.toHexString(code)
                        + " to char (code greater than U+10FFFF)");
            }
        }
    }

    /**
     * Detaches this reader from its byte buffer and clears the decoder
     * state so that a new source can be set with
     * {@link #setInput(ByteBuffer)}. The position of the byte buffer
     * previously set is not modified.
     */
    @Override
    public void reset() {
        _byteBuffer = null;
        _code = 0;
        _moreBytes = 0;
    }

    /**
     * Sets the byte buffer source.
     *
     * @param byteBuffer the {@code ByteBuffer} source.
     * @return this UTF-8 reader.
     * @throws IllegalStateException if this reader is being reused and
     *         it has not been {@link #close closed} or {@link #reset reset}.
     * @deprecated Replaced by {@link #setInput(ByteBuffer)}
     */
    @Deprecated
    public UTF8ByteBufferReader setByteBuffer(ByteBuffer byteBuffer) {
        return this.setInput(byteBuffer);
    }
}