com.fasterxml.aalto.in.Utf32Reader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
/* Woodstox Lite ("wool") XML processor
*
* Copyright (c) 2006- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.io.*;
import com.fasterxml.aalto.in.ReaderConfig;
import com.fasterxml.aalto.util.XmlConsts;
/**
 * Since JDK does not come with UTF-32/UCS-4, let's implement a simple
 * decoder to use.
 *<p>
 * The reader decodes fixed-width 4-byte units (big- or little-endian, as
 * chosen at construction) from an {@link InputStream} into Java chars.
 * Code points above 0xFFFF are emitted as surrogate pairs; code points
 * that are above {@code XmlConsts.MAX_UNICODE_CHAR}, that fall in the
 * surrogate range, or that are 0xFFFE/0xFFFF are rejected with a
 * {@link CharConversionException}.
 */
public final class Utf32Reader
    extends Reader
{
    final static char NULL_CHAR = (char) 0;

    /** Number of bytes in one encoded UTF-32 unit. */
    private final static int BYTES_PER_CHAR = 4;

    /** Configuration the byte buffer is handed back to when released; may be null. */
    final ReaderConfig mConfig;

    /** Underlying byte source; set to null once {@link #close} has been called. */
    protected InputStream mIn;

    /** Input byte buffer; null once EOF has been reached or buffers were freed. */
    protected byte[] mBuffer;

    /** Offset of the next undecoded byte in {@link #mBuffer}. */
    protected int mPtr;

    /** Offset one past the last valid byte in {@link #mBuffer}. */
    protected int mLength;

    final boolean mBigEndian;

    /**
     * Although input is fine with full Unicode set, Java still uses
     * 16-bit chars, so we may have to split high-order chars into
     * surrogate pairs. This holds a low surrogate that did not fit in
     * the caller's buffer on the previous read; it is NULL_CHAR if none
     * is pending.
     */
    char mSurrogate = NULL_CHAR;

    /**
     * Total read character count; used for error reporting purposes
     */
    int mCharCount = 0;

    /**
     * Total read byte count; used for error reporting purposes
     */
    int mByteCount = 0;

    /** Lazily allocated one-char buffer used by {@link #read()}. */
    char[] mTmpBuf = null;

    /*
    ////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////
    */

    /**
     * @param cfg Configuration that the buffer is returned to when it is
     *   released (may be null, in which case the buffer is just dropped)
     * @param in Stream to read further bytes from
     * @param buf Buffer to decode from; may already contain input
     * @param ptr Offset of the first undecoded byte in <code>buf</code>
     * @param len Offset one past the last valid byte in <code>buf</code>
     *   (the decoder computes remaining input as this value minus the
     *   current read offset)
     * @param isBigEndian True if units are stored most significant byte
     *   first; false for least significant byte first
     */
    public Utf32Reader(ReaderConfig cfg, InputStream in,
                       byte[] buf, int ptr, int len,
                       boolean isBigEndian)
    {
        mConfig = cfg;
        // All input state must be retained: without the buffer and the
        // stream the reader would report EOF on the very first read.
        mIn = in;
        mBuffer = buf;
        mPtr = ptr;
        mLength = len;
        mBigEndian = isBigEndian;
    }

    /*
    ////////////////////////////////////////
    // Reader API
    ////////////////////////////////////////
    */

    /**
     * Releases the input buffer and closes the underlying stream.
     * Calling this more than once is harmless.
     */
    @Override
    public void close() throws IOException
    {
        InputStream in = mIn;

        if (in != null) {
            mIn = null;
            freeBuffers();
            in.close();
        }
    }

    /**
     * Although this method is implemented by the base class, AND it should
     * never be called by Woodstox code, let's still implement it bit more
     * efficiently just in case
     *
     * @return Next char, or -1 if end of input has been reached
     */
    @Override
    public int read() throws IOException
    {
        if (mTmpBuf == null) {
            mTmpBuf = new char[1];
        }
        if (read(mTmpBuf, 0, 1) < 1) {
            return -1;
        }
        return mTmpBuf[0];
    }

    /*
    ////////////////////////////////////////
    // Public API
    ////////////////////////////////////////
    */

    /**
     * Decodes as many complete 4-byte units as are buffered (loading more
     * only if not even one is available) and stores the resulting chars
     * in <code>cbuf</code>.
     *
     * @param cbuf Destination buffer
     * @param start Offset of the first char to write
     * @param len Maximum number of chars to write
     *
     * @return Number of chars stored; -1 if the end of input has been
     *   reached; or <code>len</code> itself if it is less than 1
     *
     * @throws CharConversionException if an invalid code point is found,
     *   or input ends in the middle of a 4-byte unit
     * @throws ArrayIndexOutOfBoundsException if the requested range does
     *   not fit in <code>cbuf</code>
     */
    @Override
    public int read(char[] cbuf, int start, int len) throws IOException
    {
        // Already EOF?
        if (mBuffer == null) {
            return -1;
        }
        if (len < 1) {
            return len;
        }
        // Let's then ensure there's enough room... (written as a
        // subtraction so that a huge start+len can not overflow)
        if (start < 0 || len > (cbuf.length - start)) {
            reportBounds(cbuf, start, len);
        }

        final int outEnd = start + len;
        int outPtr = start;

        // Ok, first; do we have a surrogate from last round?
        if (mSurrogate != NULL_CHAR) {
            cbuf[outPtr++] = mSurrogate;
            mSurrogate = NULL_CHAR;
            // No need to load more, already got one char
        } else {
            /* Note: we'll try to avoid blocking as much as possible. As a
             * result, we only need to get 4 bytes for a full char.
             */
            int left = (mLength - mPtr);
            if (left < BYTES_PER_CHAR) {
                if (!loadMore(left)) { // (legal) EOF?
                    return -1;
                }
            }
        }

        final byte[] buf = mBuffer;
        /* Must never decode past the end of valid input: the buffer may
         * hold a trailing partial unit, or (after flushing a pending
         * surrogate) nothing at all.
         */
        final int lastUnitStart = mLength - BYTES_PER_CHAR;

        main_loop:
        while (outPtr < outEnd && mPtr <= lastUnitStart) {
            int ptr = mPtr;
            int ch;

            if (mBigEndian) {
                ch = (buf[ptr] << 24) | ((buf[ptr+1] & 0xFF) << 16)
                    | ((buf[ptr+2] & 0xFF) << 8) | (buf[ptr+3] & 0xFF);
            } else {
                ch = (buf[ptr] & 0xFF) | ((buf[ptr+1] & 0xFF) << 8)
                    | ((buf[ptr+2] & 0xFF) << 16) | (buf[ptr+3] << 24);
            }
            mPtr += BYTES_PER_CHAR;

            /* A unit with its top bit set comes out as a negative int;
             * as an unsigned value it is far above the legal maximum, so
             * it has to be rejected instead of being truncated to a char.
             */
            if (ch < 0 || ch >= 0xD800) {
                // Illegal?
                if (ch < 0 || ch > XmlConsts.MAX_UNICODE_CHAR) {
                    reportInvalid(ch, outPtr-start,
                                  "(above "+Integer.toHexString(XmlConsts.MAX_UNICODE_CHAR)+") ");
                }
                if (ch > 0xFFFF) { // need to split into surrogates?
                    ch -= 0x10000; // to normalize it starting with 0x0
                    cbuf[outPtr++] = (char) (0xD800 + (ch >> 10));
                    // Low surrogate always has the 0xDC00 marker bits set
                    ch = (0xDC00 | (ch & 0x03FF));
                    // Room for second part?
                    if (outPtr >= outEnd) { // nope; hand it out on next call
                        mSurrogate = (char) ch;
                        break main_loop;
                    }
                } else { // in 16-bit range... just need validity checks
                    if (ch < 0xE000) {
                        reportInvalid(ch, outPtr-start, "(a surrogate char) ");
                    } else if (ch >= 0xFFFE) {
                        reportInvalid(ch, outPtr-start, "");
                    }
                }
            }
            cbuf[outPtr++] = (char) ch;
        }

        int actualLen = outPtr - start;
        mCharCount += actualLen;
        return actualLen;
    }

    /*
    ////////////////////////////////////////
    // Internal methods
    ////////////////////////////////////////
    */

    /**
     * @param available Number of "unused" bytes in the input buffer
     *
     * @return True, if enough bytes were read to allow decoding of at least
     *    one full character; false if EOF was encountered instead.
     *
     * @throws CharConversionException if input ends after some, but not
     *    all, bytes of a 4-byte unit have been read
     * @throws IOException if the stream returns 0 bytes from a read
     */
    private boolean loadMore(int available)
        throws IOException
    {
        // Everything before the unused tail has now been consumed
        mByteCount += (mLength - available);

        // Bytes that need to be moved to the beginning of buffer?
        if (available > 0) {
            if (mPtr > 0) {
                System.arraycopy(mBuffer, mPtr, mBuffer, 0, available);
                mPtr = 0;
            }
            mLength = available;
        } else {
            /* Ok; here we can actually reasonably expect an EOF,
             * so let's do a separate read right away:
             */
            mPtr = 0;
            int count = mIn.read(mBuffer);
            if (count < 1) {
                mLength = 0;
                if (count < 0) { // -1
                    freeBuffers(); // to help GC?
                    return false;
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
            mLength = count;
        }

        /* Need at least 4 bytes; if we don't get that many, it's an
         * error.
         */
        while (mLength < BYTES_PER_CHAR) {
            int count = mIn.read(mBuffer, mLength, mBuffer.length - mLength);
            if (count < 1) {
                if (count < 0) { // -1, EOF... no good!
                    freeBuffers(); // to help GC?
                    reportUnexpectedEOF(mLength, BYTES_PER_CHAR);
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
            mLength += count;
        }
        return true;
    }

    /**
     * Drops the reference to the input buffer, handing the buffer back to
     * the configuration object if there is one. After this, reads report
     * end of input. Safe to call repeatedly.
     */
    public final void freeBuffers()
    {
        byte[] buf = mBuffer;
        if (buf != null) {
            mBuffer = null;
            if (mConfig != null) {
                mConfig.freeFullBBuffer(buf);
            }
        }
    }

    /*
    //////////////////////////////////////////
    // Error reporting
    //////////////////////////////////////////
    */

    /**
     * Always throws; used when input ends inside a 4-byte unit.
     *
     * @param gotBytes Number of bytes of the incomplete unit that were read
     * @param needed Number of bytes a complete unit requires
     */
    private void reportUnexpectedEOF(int gotBytes, int needed)
        throws IOException
    {
        int bytePos = mByteCount + gotBytes;
        int charPos = mCharCount;

        throw new CharConversionException("Unexpected EOF in the middle of a 4-byte UTF-32 char: got "
                                          +gotBytes+", needed "+needed
                                          +", at char #"+charPos+", byte #"+bytePos+")");
    }

    /**
     * Always throws; used when a decoded value is not an acceptable
     * character.
     *
     * @param value Offending decoded value
     * @param offset Number of chars already produced by the current read
     * @param msg Extra description inserted after the hex value
     */
    private void reportInvalid(int value, int offset, String msg)
        throws IOException
    {
        int bytePos = mByteCount + mPtr - 1;
        int charPos = mCharCount + offset;

        throw new CharConversionException("Invalid UTF-32 character 0x"
                                          +Integer.toHexString(value)
                                          +msg+" at char #"+charPos+", byte #"+bytePos+")");
    }

    /**
     * Always throws; used when the requested output range does not fit
     * in the destination array.
     */
    protected void reportBounds(char[] cbuf, int start, int len)
        throws IOException
    {
        throw new ArrayIndexOutOfBoundsException("read(buf,"+start+","+len+"), cbuf["+cbuf.length+"]");
    }

    /**
     * Always throws; used when the underlying stream returns 0 bytes
     * from a read.
     */
    protected void reportStrangeStream()
        throws IOException
    {
        throw new IOException("Strange I/O stream, returned 0 bytes on read");
    }
}