com.ctc.wstx.io.UTF8Reader Maven / Gradle / Ivy
/* Woodstox XML processor
*
* Copyright (c) 2004 Tatu Saloranta, [email protected]
*
* Licensed under the License specified in file LICENSE, included with
* the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ctc.wstx.io;
import java.io.*;
import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.XmlConsts;
/**
* Optimized Reader that reads UTF-8 encoded content from an input stream.
* In addition to doing (hopefully) optimal conversion, it can also take
* array of "pre-read" (leftover) bytes; this is necessary when preliminary
* stream/reader is trying to figure out XML encoding.
*/
public final class UTF8Reader
extends BaseReader
{
boolean mXml11 = false;
char mSurrogate = NULL_CHAR;
/**
* Total read character count; used for error reporting purposes
*/
int mCharCount = 0;
/**
* Total read byte count; used for error reporting purposes
*/
int mByteCount = 0;
/*
////////////////////////////////////////
// Life-cycle
////////////////////////////////////////
*/
public UTF8Reader(ReaderConfig cfg, InputStream in, byte[] buf, int ptr, int len,
boolean recycleBuffer)
{
super(cfg, in, buf, ptr, len, recycleBuffer);
}
@Override
public void setXmlCompliancy(int xmlVersion) {
mXml11 = (xmlVersion == XmlConsts.XML_V_11);
}
/*
////////////////////////////////////////
// Public API
////////////////////////////////////////
*/
@SuppressWarnings("cast")
@Override
public int read(char[] cbuf, int start, int len) throws IOException
{
// Let's first ensure there's enough room...
if (start < 0 || (start+len) > cbuf.length) {
reportBounds(cbuf, start, len);
}
// Already EOF?
if (mByteBuffer == null) {
return -1;
}
if (len < 1) { // dummy call?
return 0;
}
len += start;
int outPtr = start;
// Ok, first; do we have a surrogate from last round?
if (mSurrogate != NULL_CHAR) {
cbuf[outPtr++] = mSurrogate;
mSurrogate = NULL_CHAR;
// No need to load more, already got one char
// 05-Apr-2021, tatu: but if at the end must return:
if (mBytePtr >= mByteBufferEnd) {
mCharCount += 1;
return 1;
}
} else {
// To prevent unnecessary blocking (esp. with network streams),
// we'll only require decoding of a single char
int left = (mByteBufferEnd - mBytePtr);
/* So; only need to load more if we can't provide at least
* one more character. We need not do thorough check here,
* but let's check the common cases here: either completely
* empty buffer (left == 0), or one with less than max. byte
* count for a single char, and starting of a multi-byte
* encoding (this leaves possibility of a 2/3-byte char
* that is still fully accessible... but that can be checked
* by the load method)
*/
if (left < 4) {
// Need to load more?
if (left < 1 || mByteBuffer[mBytePtr] < 0) {
if (!loadMore(left)) { // (legal) EOF?
return -1;
}
}
}
}
/* This may look silly, but using a local var is indeed faster
* (if and when HotSpot properly gets things running) than
* member variable...
*/
byte[] buf = mByteBuffer;
int inPtr = mBytePtr;
int inBufLen = mByteBufferEnd;
main_loop:
while (outPtr < len) {
// At this point we have at least one byte available
int c = (int) buf[inPtr++];
// Let's first do the quickie loop for common case; 7-bit ascii:
if (c >= 0) { // ascii? can probably loop, then
if (c == 0x7F && mXml11) { // DEL illegal in xml1.1
int bytePos = mByteCount + inPtr - 1;
int charPos = mCharCount + (outPtr-start);
reportInvalidXml11(c, bytePos, charPos);
}
cbuf[outPtr++] = (char) c; // ok since MSB is never on
/* Ok, how many such chars could we safely process
* without overruns? (will combine 2 in-loop comparisons
* into just one)
*/
int outMax = (len - outPtr); // max output
int inMax = (inBufLen - inPtr); // max input
int inEnd = inPtr + ((inMax < outMax) ? inMax : outMax);
ascii_loop:
while (true) {
if (inPtr >= inEnd) {
break main_loop;
}
c = ((int) buf[inPtr++]) & 0xFF;
if (c >= 0x7F) { // DEL, or multi-byte
break ascii_loop;
}
cbuf[outPtr++] = (char) c;
}
if (c == 0x7F) {
if (mXml11) { // DEL illegal in xml1.1
int bytePos = mByteCount + inPtr - 1;
int charPos = mCharCount + (outPtr-start);
reportInvalidXml11(c, bytePos, charPos);
} // but not in xml 1.0
cbuf[outPtr++] = (char) c;
if(inPtr >= inEnd){
break main_loop;
}
continue main_loop;
}
}
int needed;
// Ok; if we end here, we got multi-byte combination
if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
c = (c & 0x1F);
needed = 1;
} else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
c = (c & 0x0F);
needed = 2;
} else if ((c & 0xF8) == 0xF0) {
// 4 bytes; double-char BS, with surrogates and all...
c = (c & 0x0F);
needed = 3;
} else {
reportInvalidInitial(c & 0xFF, outPtr-start);
// never gets here...
needed = 1;
}
/* Do we have enough bytes? If not, let's just push back the
* byte and leave, since we have already gotten at least one
* char decoded. This way we will only block (with read from
* input stream) when absolutely necessary.
*/
if ((inBufLen - inPtr) < needed) {
--inPtr;
break main_loop;
}
int d = (int) buf[inPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, outPtr-start);
}
c = (c << 6) | (d & 0x3F);
if (needed > 1) { // needed == 1 means 2 bytes total
d = buf[inPtr++]; // 3rd byte
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, outPtr-start);
}
c = (c << 6) | (d & 0x3F);
if (needed > 2) { // 4 bytes? (need surrogates)
d = buf[inPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, outPtr-start);
}
c = (c << 6) | (d & 0x3F);
if (c > XmlConsts.MAX_UNICODE_CHAR) {
reportInvalid(c, outPtr-start,
"(above "+Integer.toHexString(XmlConsts.MAX_UNICODE_CHAR)+") ");
}
/* Ugh. Need to mess with surrogates. Ok; let's inline them
* there, then, if there's room: if only room for one,
* need to save the surrogate for the rainy day...
*/
c -= 0x10000; // to normalize it starting with 0x0
cbuf[outPtr++] = (char) (0xD800 + (c >> 10));
// hmmh. can this ever be 0? (not legal, at least?)
c = (0xDC00 | (c & 0x03FF));
// Room for second part?
if (outPtr >= len) { // nope
mSurrogate = (char) c;
break main_loop;
}
// sure, let's fall back to normal processing:
} else {
/* Otherwise, need to check that 3-byte chars are
* legal ones (should not expand to surrogates;
* 0xFFFE and 0xFFFF are illegal)
*/
if (c >= 0xD800) {
// But first, let's check max chars:
if (c < 0xE000) {
reportInvalid(c, outPtr-start, "(a surrogate character) ");
} else if (c >= 0xFFFE) {
reportInvalid(c, outPtr-start, "");
}
} else if (mXml11 && c == 0x2028) { // LSEP?
/* 10-May-2006, TSa: Since LSEP is "non-associative",
* it needs additional handling. One way to do
* this is to convert preceding \r to \n. This
* should be implemented better when integrating
* decoder and tokenizer.
*/
if (outPtr > start && cbuf[outPtr-1] == '\r') {
cbuf[outPtr-1] = '\n';
}
c = CONVERT_LSEP_TO;
}
}
} else { // (needed == 1)
if (mXml11) { // high-order ctrl char detection...
if (c <= 0x9F) {
if (c == 0x85) { // NEL, let's convert?
c = CONVERT_NEL_TO;
} else if (c >= 0x7F) { // DEL, ctrl chars
int bytePos = mByteCount + inPtr - 1;
int charPos = mCharCount + (outPtr-start);
reportInvalidXml11(c, bytePos, charPos);
}
}
}
}
cbuf[outPtr++] = (char) c;
if (inPtr >= inBufLen) {
break main_loop;
}
}
mBytePtr = inPtr;
len = outPtr - start;
mCharCount += len;
return len;
}
/*
////////////////////////////////////////
// Internal methods
////////////////////////////////////////
*/
private void reportInvalidInitial(int mask, int offset)
throws IOException
{
// input (byte) ptr has been advanced by one, by now:
int bytePos = mByteCount + mBytePtr - 1;
int charPos = mCharCount + offset + 1;
throw new CharConversionException("Invalid UTF-8 start byte 0x"
+Integer.toHexString(mask)
+" (at char #"+charPos+", byte #"+bytePos+")");
}
private void reportInvalidOther(int mask, int offset)
throws IOException
{
int bytePos = mByteCount + mBytePtr - 1;
int charPos = mCharCount + offset;
throw new CharConversionException("Invalid UTF-8 middle byte 0x"
+Integer.toHexString(mask)
+" (at char #"+charPos+", byte #"+bytePos+")");
}
private void reportUnexpectedEOF(int gotBytes, int needed)
throws IOException
{
int bytePos = mByteCount + gotBytes;
int charPos = mCharCount;
throw new CharConversionException("Unexpected EOF in the middle of a multi-byte char: got "
+gotBytes+", needed "+needed
+", at char #"+charPos+", byte #"+bytePos+")");
}
private void reportInvalid(int value, int offset, String msg)
throws IOException
{
int bytePos = mByteCount + mBytePtr - 1;
int charPos = mCharCount + offset;
throw new CharConversionException("Invalid UTF-8 character 0x"
+Integer.toHexString(value)+msg
+" at char #"+charPos+", byte #"+bytePos+")");
}
/**
* @param available Number of "unused" bytes in the input buffer
*
* @return True, if enough bytes were read to allow decoding of at least
* one full character; false if EOF was encountered instead.
*/
private boolean loadMore(int available)
throws IOException
{
mByteCount += (mByteBufferEnd - available);
// Bytes that need to be moved to the beginning of buffer?
if (available > 0) {
// 11-Nov-2008, TSa: can only move if we own the buffer; otherwise
// we are stuck with the data.
if (mBytePtr > 0 && canModifyBuffer()) {
for (int i = 0; i < available; ++i) {
mByteBuffer[i] = mByteBuffer[mBytePtr+i];
}
mBytePtr = 0;
mByteBufferEnd = available;
}
} else {
// Ok; here we can actually reasonably expect an EOF,
// so let's do a separate read right away:
int count = readBytes();
if (count < 1) {
if (count < 0) { // -1
freeBuffers(); // to help GC?
return false;
}
// 0 count is no good; let's err out
reportStrangeStream();
}
}
// We now have at least one byte... and that allows us to
// calculate exactly how many bytes we need!
@SuppressWarnings("cast")
int c = (int) mByteBuffer[mBytePtr];
if (c >= 0) { // single byte (ascii) char... cool, can return
return true;
}
// Ok, a multi-byte char, let's check how many bytes we'll need:
int needed;
if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
needed = 2;
} else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
needed = 3;
} else if ((c & 0xF8) == 0xF0) {
// 4 bytes; double-char BS, with surrogates and all...
needed = 4;
} else {
reportInvalidInitial(c & 0xFF, 0);
// never gets here... but compiler whines without this:
needed = 1;
}
/* And then we'll just need to load up to that many bytes;
* if an EOF is hit, that'll be an error. But we need not do
* actual decoding here, just load enough bytes.
*/
while ((mBytePtr + needed) > mByteBufferEnd) {
int count = readBytesAt(mByteBufferEnd);
if (count < 1) {
if (count < 0) { // -1, EOF... no good!
freeBuffers();
reportUnexpectedEOF(mByteBufferEnd, needed);
}
// 0 count is no good; let's err out
reportStrangeStream();
}
}
return true;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy