com.fasterxml.aalto.in.Utf8Scanner Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
/* Aalto XML processor
*
* Copyright (c) 2006- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.io.*;
import javax.xml.stream.XMLStreamException;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;
/**
* Scanner for tokenizing XML content from a byte stream encoding using
* UTF-8 encoding, or something suitably close it for decoding purposes
* (including ISO-Latin1 and US-ASCII).
*/
public final class Utf8Scanner
extends StreamScanner
{
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
public Utf8Scanner(ReaderConfig cfg, InputStream in,
byte[] buffer, int ptr, int last)
{
super(cfg, in, buffer, ptr, last);
}
/*
/**********************************************************************
/* Internal methods, secondary parsing
/**********************************************************************
*/
@Override
protected final void finishToken() throws XMLStreamException
{
_tokenIncomplete = false;
switch (_currToken) {
case PROCESSING_INSTRUCTION:
finishPI();
break;
case CHARACTERS:
finishCharacters();
break;
case COMMENT:
finishComment();
break;
case SPACE:
finishSpace();
break;
case DTD:
finishDTD(true); // true -> get text
break;
case CDATA:
finishCData();
break;
default:
ErrorConsts.throwInternalError();
}
}
@Override
protected int handleStartElement(byte b)
throws XMLStreamException
{
_currToken = START_ELEMENT;
_currNsCount = 0;
PName elemName = parsePName(b);
/* Ok. Need to create a qualified name. Simplest for element
* in default ns (no extra work -- expressed as null binding);
* otherwise need to find binding
*/
String prefix = elemName.getPrefix();
boolean allBound; // flag to check 'late' bindings
if (prefix == null) { // element in default ns
allBound = true; // which need not be bound
} else {
elemName = bindName(elemName, prefix);
allBound = elemName.isBound();
}
_tokenName = elemName;
_currElem = new ElementScope(elemName, _currElem);
// And then attribute parsing loop:
int attrPtr = 0;
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
int c = (int) b & 0xFF;
// Intervening space to skip?
if (c <= INT_SPACE) {
do {
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_SPACE && c != INT_TAB) {
throwInvalidSpace(c);
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
c = (int) b & 0xFF;
} while (c <= INT_SPACE);
} else if (c != INT_SLASH && c != INT_GT) {
c = decodeCharForError(b);
throwUnexpectedChar(c, " expected space, or '>' or \"/>\"");
}
// Ok; either need to get an attribute name, or end marker:
if (c == INT_SLASH) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
if (b != BYTE_GT) {
c = decodeCharForError(b);
throwUnexpectedChar(c, " expected '>'");
}
_isEmptyTag = true;
break;
} else if (c == INT_GT) {
_isEmptyTag = false;
break;
} else if (c == INT_LT) {
reportInputProblem("Unexpected '<' character in element (missing closing '>'?)");
}
// Ok, an attr name:
PName attrName = parsePName(b);
prefix = attrName.getPrefix();
boolean isNsDecl;
if (prefix == null) { // can be default ns decl:
isNsDecl = (attrName.getLocalName() == "xmlns");
} else {
// May be a namespace decl though?
if (prefix == "xmlns") {
isNsDecl = true;
} else {
attrName = bindName(attrName, prefix);
if (allBound) {
allBound = attrName.isBound();
}
isNsDecl = false;
}
}
// Optional space to skip again
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
c = (int) b & 0xFF;
if (c > INT_SPACE) {
break;
}
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_SPACE && c != INT_TAB) {
throwInvalidSpace(c);
}
}
if (c != INT_EQ) {
c = decodeCharForError(b);
throwUnexpectedChar(c, " expected '='");
}
// Optional space to skip again
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
c = (int) b & 0xFF;
if (c > INT_SPACE) {
break;
}
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_SPACE && c != INT_TAB) {
throwInvalidSpace(c);
}
}
if (c != INT_QUOTE && c != INT_APOS) {
c = decodeCharForError(b);
throwUnexpectedChar(c, " Expected a quote");
}
/* Ok, finally: value parsing. However, ns URIs are to be handled
* different from attribute values... let's offline URIs, since
* they should be less common than attribute values.
*/
if (isNsDecl) { // default ns, or explicit?
handleNsDeclaration(attrName, b);
++_currNsCount;
} else { // nope, a 'real' attribute:
attrPtr = collectValue(attrPtr, b, attrName);
}
}
{
// Note: this call also checks attribute uniqueness
int act = _attrCollector.finishLastValue(attrPtr);
if (act < 0) { // error, dup attr indicated by -1
act = _attrCollector.getCount(); // let's get correct count
reportInputProblem(_attrCollector.getErrorMsg());
}
_attrCount = act;
}
++_depth;
/* Was there any prefix that wasn't bound prior to use?
* That's legal, assuming declaration was found later on...
* let's check
*/
if (!allBound) {
if (!elemName.isBound()) { // element itself unbound
reportUnboundPrefix(_tokenName, false);
}
for (int i = 0, len = _attrCount; i < len; ++i) {
PName attrName = _attrCollector.getName(i);
if (!attrName.isBound()) {
reportUnboundPrefix(attrName, true);
}
}
}
return START_ELEMENT;
}
/**
* This method implements the tight loop for parsing attribute
* values. It's off-lined from the main start element method to
* simplify main method, which makes code more maintainable
* and possibly easier for JIT/HotSpot to optimize.
*/
private final int collectValue(int attrPtr, byte quoteByte, PName attrName)
throws XMLStreamException
{
char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr);
final int[] TYPES = _charTypes.ATTR_CHARS;
final int quoteChar = (int) quoteByte;
value_loop:
while (true) {
int c;
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
int max = _inputEnd;
{
int max2 = ptr + (attrBuffer.length - attrPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) _inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
attrBuffer[attrPtr++] = (char) c;
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
// fall through
case XmlCharTypes.CT_WS_LF:
markLF();
// fall through
case XmlCharTypes.CT_WS_TAB:
// Plus, need to convert these all to simple space
c = INT_SPACE;
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// Let's add first part right away:
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_LT:
throwUnexpectedChar(c, "'<' not allowed in attribute value");
case XmlCharTypes.CT_AMP:
c = handleEntityInText(false);
if (c == 0) { // unexpanded general entity... not good
reportUnexpandedEntityInAttr(attrName, false);
}
// Ok; does it need a surrogate though? (over 16 bits)
if ((c >> 16) != 0) {
c -= 0x10000;
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
}
break;
case XmlCharTypes.CT_ATTR_QUOTE:
if (c == quoteChar) {
break value_loop;
}
// default:
// Other chars are not important here...
}
// We know there's room for at least one char without checking
attrBuffer[attrPtr++] = (char) c;
}
return attrPtr;
}
/**
* Method called from the main START_ELEMENT handling loop, to
* parse namespace URI values.
*/
private void handleNsDeclaration(PName name, byte quoteByte)
throws XMLStreamException
{
int attrPtr = 0;
char[] attrBuffer = _nameBuffer;
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
if (b == quoteByte) {
break;
}
int c;
if (b == BYTE_AMP) { // entity
c = handleEntityInText(false);
if (c == 0) { // general entity; should never happen
reportUnexpandedEntityInAttr(name, true);
}
// Ok; does it need a surrogate though? (over 16 bits)
if ((c >> 16) != 0) {
if (attrPtr >= attrBuffer.length) {
_nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
}
c -= 0x10000;
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
}
} else if (b == BYTE_LT) { // error
c = (int) b;
throwUnexpectedChar(c, "'<' not allowed in attribute value");
} else {
c = (int) b & 0xFF;
if (c < INT_SPACE) {
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else {
if (c < 0) {
c = decodeMultiByteChar(c, _inputPtr);
if (c < 0) { // surrogate pair
c = -c;
// Let's add first part right away:
if (attrPtr >= attrBuffer.length) {
_nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
}
c -= 0x10000;
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
}
} else if (c != INT_TAB) {
throwInvalidSpace(c);
}
}
}
}
if (attrPtr >= attrBuffer.length) {
_nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
}
attrBuffer[attrPtr++] = (char) c;
}
/* Simple optimization: for default ns removal (or, with
* ns 1.1, any other as well), will use empty value... no
* need to try to intern:
*/
if (attrPtr == 0) {
bindNs(name, "");
} else {
String uri = _config.canonicalizeURI(attrBuffer, attrPtr);
bindNs(name, uri);
}
}
/**
* Method called when an ampersand is encounter in text segment.
* Method needs to determine whether it is a pre-defined or character
* entity (in which case it will be expanded into a single char or
* surrogate pair), or a general
* entity (in which case it will most likely be returned as
* ENTITY_REFERENCE event)
*
* @param inAttr True, if reference is from attribute value; false
* if from normal text content
*
* @return 0 if a general parsed entity encountered; integer
* value of a (valid) XML content character otherwise
*/
@Override
protected final int handleEntityInText(boolean inAttr)
throws XMLStreamException
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
if (b == BYTE_HASH) {
return handleCharEntity();
}
String start;
if (b == BYTE_a) { // amp or apos?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_m) { // amp?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_p) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_AMP;
}
start = "amp";
} else {
start = "am";
}
} else if (b == BYTE_p) { // apos?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_o) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_s) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_APOS;
}
start = "apos";
} else {
start = "apo";
}
} else {
start = "ap";
}
} else {
start = "a";
}
} else if (b == BYTE_l) { // lt?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_t) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_LT;
}
start = "lt";
} else {
start = "l";
}
} else if (b == BYTE_g) { // gt?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_t) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_GT;
}
start = "gt";
} else {
start = "g";
}
} else if (b == BYTE_q) { // quot?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_u) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_o) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_t) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_QUOTE;
}
start = "quot";
} else {
start = "quo";
}
} else {
start = "qu";
}
} else {
start = "q";
}
} else {
start = "";
}
final int[] TYPES = _charTypes.NAME_CHARS;
/* All righty: we have the beginning of the name, plus the first
* byte too. So let's see what we can do with it.
*/
char[] cbuf = _nameBuffer;
int cix = 0;
for (int len = start.length(); cix < len; ++cix) {
cbuf[cix] = start.charAt(cix);
}
//int colon = -1;
while (b != BYTE_SEMICOLON) {
boolean ok;
int c = (int) b & 0xFF;
// Has to be a valid name start char though:
switch (TYPES[c]) {
case XmlCharTypes.CT_NAME_NONE:
case XmlCharTypes.CT_NAME_COLON: // not ok for entities?
case XmlCharTypes.CT_NAME_NONFIRST:
ok = (cix > 0);
break;
case XmlCharTypes.CT_NAME_ANY:
ok = true;
break;
case InputCharTypes.CT_INPUT_NAME_MB_2:
c = decodeUtf8_2(c);
ok = XmlChars.is10NameStartChar(c);
break;
case InputCharTypes.CT_INPUT_NAME_MB_3:
c = decodeUtf8_3(c);
ok = XmlChars.is10NameStartChar(c);
break;
case InputCharTypes.CT_INPUT_NAME_MB_4:
c = decodeUtf8_4(c);
ok = XmlChars.is10NameStartChar(c);
if (ok) {
if (cix >= cbuf.length) {
_nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
}
// Let's add first part right away:
c -= 0x10000;
cbuf[cix++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
}
break;
case InputCharTypes.CT_INPUT_NAME_MB_N:
default:
ok = false;
break;
}
if (!ok) {
reportInvalidNameChar(c, cix);
}
if (cix >= cbuf.length) {
_nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
}
cbuf[cix++] = (char) c;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
}
// Ok, let's construct a (temporary) entity name, then:
String pname = new String(cbuf, 0, cix);
// (note: hash is dummy... not to be compared to anything etc)
_tokenName = new PNameC(pname, null, pname, 0);
/* One more thing: do we actually allow entities in this mode
* and with this event?
*/
if (_config.willExpandEntities()) {
reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented");
}
if (inAttr) {
reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it");
}
return 0;
}
/*
/**********************************************************************
/* Internal methods, name parsing:
/**********************************************************************
*/
/**
* Parsing of public ids is bit more complicated than that of system
* ids, since white space is to be coalesced.
*/
@Override
protected String parsePublicId(byte quoteChar) throws XMLStreamException
{
char[] outputBuffer = _nameBuffer;
int outPtr = 0;
final int[] TYPES = XmlCharTypes.PUBID_CHARS;
boolean addSpace = false;
main_loop:
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
// Easier to check without char type table, first:
byte b = _inputBuffer[_inputPtr++];
if (b == quoteChar) {
break main_loop;
}
int c = (int) b & 0xFF;
if (TYPES[c] != XmlCharTypes.PUBID_OK) {
throwUnexpectedChar(c, " in public identifier");
}
// White space? Needs to be coalecsed
if (c <= INT_SPACE) {
addSpace = true;
continue;
}
if (addSpace) {
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
outputBuffer[outPtr++] = ' ';
addSpace = false;
}
if (outPtr >= outputBuffer.length) {
_nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
outPtr = 0;
}
outputBuffer[outPtr++] = (char) c;
}
return new String(outputBuffer, 0, outPtr);
}
@Override
protected String parseSystemId(byte quoteChar) throws XMLStreamException
{
// caller has init'ed the buffer...
char[] outputBuffer = _nameBuffer;
int outPtr = 0;
// attribute types are closest matches, so let's use them
final int[] TYPES = _charTypes.ATTR_CHARS;
//boolean spaceToAdd = false;
main_loop:
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
int c = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (TYPES[c] != 0) {
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_ATTR_QUOTE:
if (c == (int) quoteChar) {
break main_loop;
}
}
}
if (outPtr >= outputBuffer.length) {
_nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
outPtr = 0;
}
outputBuffer[outPtr++] = (char) c;
}
return new String(outputBuffer, 0, outPtr);
}
/*
/**********************************************************************
/* Content skipping
/**********************************************************************
*/
@Override
protected final boolean skipCharacters() throws XMLStreamException
{
final int[] TYPES = _charTypes.TEXT_CHARS;
final byte[] inputBuffer = _inputBuffer;
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
int max = _inputEnd;
if (ptr >= max) {
loadMoreGuaranteed();
ptr = _inputPtr;
max = _inputEnd;
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
skipUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
skipUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
skipUtf8_4(c);
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_LT:
--_inputPtr;
return false;
case XmlCharTypes.CT_AMP:
c = handleEntityInText(false);
if (c == 0) { // unexpandable general parsed entity
return true;
}
break;
case XmlCharTypes.CT_RBRACKET: // ']]>'?
{
// Let's then just count number of brackets --
// in case they are not followed by '>'
int count = 1;
byte b;
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = inputBuffer[_inputPtr];
if (b != BYTE_RBRACKET) {
break;
}
++_inputPtr; // to skip past bracket
++count;
}
if (b == BYTE_GT && count > 1) {
reportIllegalCDataEnd();
}
}
break;
// default:
// Other types are not important here...
}
}
}
@Override
protected final void skipComment() throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
int max = _inputEnd;
if (ptr >= max) {
loadMoreGuaranteed();
ptr = _inputPtr;
max = _inputEnd;
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
skipUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
skipUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
skipUtf8_4(c);
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_HYPHEN: // '-->'?
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then
++_inputPtr;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr++] != BYTE_GT) {
reportDoubleHyphenInComments();
}
return;
}
break;
// default:
// Other types are not important here...
}
}
}
@Override
protected final void skipCData() throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
int max = _inputEnd;
if (ptr >= max) {
loadMoreGuaranteed();
ptr = _inputPtr;
max = _inputEnd;
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
skipUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
skipUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
skipUtf8_4(c);
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_RBRACKET: // ']]>'?
{
// end is nigh?
int count = 0;
byte b;
do {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
++count;
b = _inputBuffer[_inputPtr++];
} while (b == BYTE_RBRACKET);
if (b == BYTE_GT) {
if (count > 1) { // gotcha
return;
}
// can still skip plain ']>'...
} else {
--_inputPtr; // need to push back last char
}
}
break;
// default:
// Other types are not important here...
}
}
}
@Override
protected final void skipPI() throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
int max = _inputEnd;
if (ptr >= max) {
loadMoreGuaranteed();
ptr = _inputPtr;
max = _inputEnd;
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
skipUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
skipUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
skipUtf8_4(c);
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_QMARK: // '?>'?
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_GT) {
++_inputPtr;
return;
}
break;
// default:
// Other types are not important here...
}
}
}
@Override
protected final void skipSpace() throws XMLStreamException
{
// mTmpChar has a space, but it's been checked, can ignore
int ptr = _inputPtr;
while (true) {
if (ptr >= _inputEnd) {
if (!loadMore()) {
break;
}
ptr = _inputPtr;
}
int c = (int) _inputBuffer[ptr] & 0xFF;
if (c > INT_SPACE) { // !!! TODO: xml 1.1 ws
break;
}
++ptr;
if (c == INT_LF) {
markLF(ptr);
} else if (c == INT_CR) {
if (ptr >= _inputEnd) {
if (!loadMore()) {
break;
}
ptr = _inputPtr;
}
if (_inputBuffer[ptr] == BYTE_LF) {
++ptr;
}
markLF(ptr);
} else if (c != INT_SPACE && c != INT_TAB) {
_inputPtr = ptr;
throwInvalidSpace(c);
}
}
_inputPtr = ptr;
}
/*
private final int skipMultiByteChar(int c, int ptr)
throws XMLStreamException
{
int needed;
// Ok; if we end here, we got multi-byte combination
if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
needed = 1;
} else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
needed = 2;
} else if ((c & 0xF8) == 0xF0) {
// 4 bytes; double-char with surrogates and all...
needed = 3;
} else {
reportInvalidInitial(c & 0xFF);
needed = 1; // never gets here
}
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (needed > 1) { // needed == 1 means 2 bytes total
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (needed > 2) { // 4 bytes? (need surrogates)
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
}
}
return ptr;
}
private final int skipMultiByteChar(int c, int type, int ptr)
throws XMLStreamException
{
type -= XmlCharTypes.CT_MULTIBYTE_N; // number of more bytes needed
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (type > 1) { // needed == 1 means 2 bytes total
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (type > 2) { // 4 bytes? (need surrogates)
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
}
}
return ptr;
}
*/
private final void skipUtf8_2(int c)
throws XMLStreamException
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
c = (int) _inputBuffer[_inputPtr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, _inputPtr);
}
}
/* Alas, can't heavily optimize skipping, since we still have to
* do validity checks...
*/
private final void skipUtf8_3(int c)
throws XMLStreamException
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
c &= 0x0F;
if (c >= 0xD) { // have to check
c <<= 6;
int d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
c |= (d & 0x3F);
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
c = (c << 6) | (d & 0x3F);
// 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal
if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF
if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) {
c = handleInvalidXmlChar(c);
}
}
} else { // no checks, can discard
c = (int) _inputBuffer[_inputPtr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, _inputPtr);
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
c = (int) _inputBuffer[_inputPtr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, _inputPtr);
}
}
}
private final void skipUtf8_4(int c) throws XMLStreamException
{
if ((_inputPtr + 4) > _inputEnd) {
skipUtf8_4Slow(c);
return;
}
int d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
}
private final void skipUtf8_4Slow(int c) throws XMLStreamException
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
int d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
}
/*
/**********************************************************************
/* Content parsing
/**********************************************************************
*/
@Override
protected final void finishCData()
throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
char[] outputBuffer = _textBuilder.resetWithEmpty();
int outPtr = 0;
/* At this point, space (if any) has been skipped, and we are
* to parse and store the contents
*/
main_loop:
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
}
// And then exceptions:
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
/* Ok: let's just parse all consequtive right brackets,
* and see if followed by greater-than char. This because
* we can only push back at most one char at a time, and
* thus can't easily just check a subset
*/
int count = 0; // ignoring first one
byte b;
do {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr];
if (b != BYTE_RBRACKET) {
break;
}
++_inputPtr;
++count;
} while (true);
// Was the marker found?
boolean ok = (b == BYTE_GT && count >= 1);
if (ok) {
--count;
}
// Brackets to copy to output?
for (; count > 0; --count) {
outputBuffer[outPtr++] = ']';
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
}
if (ok) {
++_inputPtr; // to consume '>'
break main_loop;
}
break;
}
// Ok, can output the char; there's room for one char at least
outputBuffer[outPtr++] = (char) c;
}
_textBuilder.setCurrentLength(outPtr);
/* 03-Feb-2009, tatu: To support coalescing mode, may need to
* do some extra work
*/
if (_cfgCoalescing && !_entityPending) {
finishCoalescedText();
}
}
@Override
protected final void finishCharacters() throws XMLStreamException
{
int outPtr;
int c;
char[] outputBuffer;
// Ok, so what was the first char / entity?
c = _tmpChar;
if (c < 0) { // from entity; can just copy as is
c = -c;
outputBuffer = _textBuilder.resetWithEmpty();
outPtr = 0;
if ((c >> 16) != 0) { // surrogate pair?
c -= 0x10000;
/* Note: after resetting the buffer, it's known to have
* space for more than 2 chars we need to add
*/
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
}
outputBuffer[outPtr++] = (char) c;
} else { // white space that we are interested in?
if (c == INT_CR || c == INT_LF) {
++_inputPtr; // wasn't advanced yet, in this case
outPtr = checkInTreeIndentation(c);
if (outPtr < 0) {
return;
}
// Above call also initializes the text builder appropriately
outputBuffer = _textBuilder.getBufferWithoutReset();
} else {
outputBuffer = _textBuilder.resetWithEmpty();
outPtr = 0;
}
}
final int[] TYPES = _charTypes.TEXT_CHARS;
final byte[] inputBuffer = _inputBuffer;
main_loop:
while (true) {
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
}
// And then fallback for funny chars / UTF-8 multibytes:
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
if ((_inputEnd - _inputPtr) >= 2) {
c = decodeUtf8_3fast(c);
} else {
c = decodeUtf8_3(c);
}
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_LT:
--_inputPtr;
break main_loop;
case XmlCharTypes.CT_AMP:
c = handleEntityInText(false);
if (c == 0) { // unexpandable general parsed entity
// _inputPtr set by entity expansion method
_entityPending = true;
break main_loop;
}
// Ok; does it need a surrogate though? (over 16 bits)
if ((c >> 16) != 0) {
c -= 0x10000;
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
// Need to ensure room for one more char
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
}
break;
case XmlCharTypes.CT_RBRACKET: // ']]>'?
{
// Let's then just count number of brackets --
// in case they are not followed by '>'
int count = 1;
byte b;
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = inputBuffer[_inputPtr];
if (b != BYTE_RBRACKET) {
break;
}
++_inputPtr; // to skip past bracket
++count;
}
if (b == BYTE_GT && count > 1) {
reportIllegalCDataEnd();
}
// Nope. Need to output all brackets, then; except
// for one that can be left for normal output
while (count > 1) {
outputBuffer[outPtr++] = ']';
// Need to ensure room for one more char
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
--count;
}
}
// Can just output the first ']' along normal output
break;
// default:
// Other types are not important here...
}
// We know there's room for one more:
outputBuffer[outPtr++] = (char) c;
}
_textBuilder.setCurrentLength(outPtr);
/* 03-Feb-2009, tatu: To support coalescing mode, may need to
* do some extra work
*/
if (_cfgCoalescing && !_entityPending) {
finishCoalescedText();
}
}
@Override
protected final void finishComment() throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
char[] outputBuffer = _textBuilder.resetWithEmpty();
int outPtr = 0;
main_loop:
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_HYPHEN: // '-->'?
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then
++_inputPtr;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr++] != BYTE_GT) {
reportDoubleHyphenInComments();
}
break main_loop;
}
break;
// default:
// Other types are not important here...
}
// Ok, can output the char (we know there's room for one more)
outputBuffer[outPtr++] = (char) c;
}
_textBuilder.setCurrentLength(outPtr);
}
/**
* When this method gets called we know that we have an internal subset,
* and that the opening '[' has already been read.
*/
@Override
protected final void finishDTD(boolean copyContents) throws XMLStreamException
{
char[] outputBuffer = copyContents ? _textBuilder.resetWithEmpty() : null;
int outPtr = 0;
final int[] TYPES = _charTypes.DTD_CHARS;
boolean inDecl = false; // in declaration/directive?
int quoteChar = 0; // inside quoted string?
main_loop:
while (true) {
int c;
/* First we'll have a quickie loop for speeding through
* uneventful chars...
*/
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
int max = _inputEnd;
if (outputBuffer != null) {
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
}
while (ptr < max) {
c = (int) _inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
if (outputBuffer != null) {
outputBuffer[outPtr++] = (char) c;
}
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
// First, common types
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
if (outputBuffer != null) {
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
// And let the other char output down below
}
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
// Then DTD-specific types:
case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
if (quoteChar == 0) {
quoteChar = c;
} else {
if (quoteChar == c) {
quoteChar = 0;
}
}
break;
case XmlCharTypes.CT_DTD_LT:
if (!inDecl) {
inDecl = true;
}
break;
case XmlCharTypes.CT_DTD_GT:
if (quoteChar == 0) {
inDecl = false;
}
break;
case XmlCharTypes.CT_DTD_RBRACKET:
if (!inDecl && quoteChar == 0) {
break main_loop;
}
break;
// default:
// Other types are not important here...
}
if (outputBuffer != null) { // will have room for one more
outputBuffer[outPtr++] = (char) c;
}
}
if (outputBuffer != null) {
_textBuilder.setCurrentLength(outPtr);
}
// but still need to match the '>'...
byte b = skipInternalWs(false, null);
if (b != BYTE_GT) {
throwUnexpectedChar(decodeCharForError(b), " expected '>' after the internal subset");
}
}
@Override
protected final void finishPI() throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
char[] outputBuffer = _textBuilder.resetWithEmpty();
int outPtr = 0;
/* At this point, space (if any) has been skipped, and we are
* to parse and store the contents
*/
main_loop:
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
}
// And then exceptions:
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_QMARK:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_GT) { // ok, the end!
++_inputPtr;
break main_loop;
}
// Not end mark, just need to reprocess the second char
// default:
// Other types are not important here...
}
// Ok, can output the char (we know there's room for one more)
outputBuffer[outPtr++] = (char) c;
}
_textBuilder.setCurrentLength(outPtr);
}
/**
* Note: this method is only called in cases where it is known
* that only space chars are legal. Thus, encountering a non-space
* is an error (WFC or VC). However, an end-of-input is ok.
*/
@Override
protected final void finishSpace() throws XMLStreamException
{
/* Ok: so, mTmpChar contains first space char. If it looks
* like indentation, we can probably optimize a bit...
*/
int tmp = _tmpChar;
char[] outputBuffer;
int outPtr;
if (tmp == BYTE_CR || tmp == BYTE_LF) {
outPtr = checkPrologIndentation(tmp);
if (outPtr < 0) {
return;
}
// Above call also initializes the text builder appropriately
outputBuffer = _textBuilder.getBufferWithoutReset();
} else {
outputBuffer = _textBuilder.resetWithEmpty();
outputBuffer[0] = (char) tmp;
outPtr = 1;
}
int ptr = _inputPtr;
while (true) {
if (ptr >= _inputEnd) {
if (!loadMore()) {
break;
}
ptr = _inputPtr;
}
int c = (int) _inputBuffer[ptr] & 0xFF;
// !!! TODO: check for xml 1.1 whitespace?
if (c > INT_SPACE) {
break;
}
++ptr;
if (c == INT_LF) {
markLF(ptr);
} else if (c == INT_CR) {
if (ptr >= _inputEnd) {
if (!loadMore()) { // still need to output the lf
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
outputBuffer[outPtr++] = '\n';
break;
}
ptr = _inputPtr;
}
if (_inputBuffer[ptr] == BYTE_LF) {
++ptr;
}
markLF(ptr);
c = INT_LF; // need to convert to canonical lf
} else if (c != INT_SPACE && c != INT_TAB) {
_inputPtr = ptr;
throwInvalidSpace(c);
}
// Ok, can output the char
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
_textBuilder.setCurrentLength(outPtr);
}
/*
/**********************************************************************
/* 2nd level parsing/skipping for coalesced text
/**********************************************************************
*/
/**
* Method that gets called after a primary text segment (of type
* CHARACTERS or CDATA, not applicable to SPACE) has been read in
* text buffer. Method has to see if the following event would
* be textual as well, and if so, read it (and any other following
* textual segments).
*/
protected final void finishCoalescedText()
throws XMLStreamException
{
while (true) {
// no matter what, will need (and can get) one char
if (_inputPtr >= _inputEnd) {
if (!loadMore()) { // most likely an error, will be handled later on
return;
}
}
if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind
/* In worst case, need 3 chars ("= _inputEnd) {
if (!loadAndRetain(3)) {
// probably an error, but will be handled later
return;
}
}
if (_inputBuffer[_inputPtr+1] != BYTE_EXCL
|| _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) {
// can't be CDATA, we are done here
return;
}
// but let's verify it still:
_inputPtr += 3;
for (int i = 0; i < 6; ++i) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
if (b != (byte) CDATA_STR.charAt(i)) {
int ch = decodeCharForError(b);
reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)");
}
}
finishCoalescedCData();
} else { // textual (or entity, error etc)
finishCoalescedCharacters();
if (_entityPending) {
break;
}
}
}
}
// note: code mostly copied from 'finishCharacters', just simplified
// in some places
protected final void finishCoalescedCharacters()
throws XMLStreamException
{
// first char can't be from (char) entity (wrt finishCharacters)
final int[] TYPES = _charTypes.TEXT_CHARS;
final byte[] inputBuffer = _inputBuffer;
char[] outputBuffer = _textBuilder.getBufferWithoutReset();
int outPtr = _textBuilder.getCurrentLength();
int c;
main_loop:
while (true) {
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
}
// And then fallback for funny chars / UTF-8 multibytes:
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
if ((_inputEnd - _inputPtr) >= 2) {
c = decodeUtf8_3fast(c);
} else {
c = decodeUtf8_3(c);
}
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_LT:
--_inputPtr;
break main_loop;
case XmlCharTypes.CT_AMP:
c = handleEntityInText(false);
if (c == 0) { // unexpandable general parsed entity
// _inputPtr set by entity expansion method
_entityPending = true;
break main_loop;
}
// Ok; does it need a surrogate though? (over 16 bits)
if ((c >> 16) != 0) {
c -= 0x10000;
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
// Need to ensure room for one more char
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
}
break;
case XmlCharTypes.CT_RBRACKET: // ']]>'?
{
// Let's then just count number of brackets --
// in case they are not followed by '>'
int count = 1;
byte b;
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = inputBuffer[_inputPtr];
if (b != BYTE_RBRACKET) {
break;
}
++_inputPtr; // to skip past bracket
++count;
}
if (b == BYTE_GT && count > 1) {
reportIllegalCDataEnd();
}
// Nope. Need to output all brackets, then; except
// for one that can be left for normal output
while (count > 1) {
outputBuffer[outPtr++] = ']';
// Need to ensure room for one more char
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
--count;
}
}
// Can just output the first ']' along normal output
break;
// default:
// Other types are not important here...
}
// We know there's room for one more:
outputBuffer[outPtr++] = (char) c;
}
_textBuilder.setCurrentLength(outPtr);
}
// note: code mostly copied from 'finishCharacters', just simplified
// in some places
protected final void finishCoalescedCData()
throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
char[] outputBuffer = _textBuilder.getBufferWithoutReset();
int outPtr = _textBuilder.getCurrentLength();
/* At this point, space (if any) has been skipped, and we are
* to parse and store the contents
*/
main_loop:
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
}
// And then exceptions:
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
/* Ok: let's just parse all consequtive right brackets,
* and see if followed by greater-than char. This because
* we can only push back at most one char at a time, and
* thus can't easily just check a subset
*/
int count = 0; // ignoring first one
byte b;
do {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr];
if (b != BYTE_RBRACKET) {
break;
}
++_inputPtr;
++count;
} while (true);
// Was the marker found?
boolean ok = (b == BYTE_GT && count >= 1);
if (ok) {
--count;
}
// Brackets to copy to output?
for (; count > 0; --count) {
outputBuffer[outPtr++] = ']';
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
}
if (ok) {
++_inputPtr; // to consume '>'
break main_loop;
}
break;
}
// Ok, can output the char; there's room for one char at least
outputBuffer[outPtr++] = (char) c;
}
_textBuilder.setCurrentLength(outPtr);
}
/**
* Method that gets called after a primary text segment (of type
* CHARACTERS or CDATA, not applicable to SPACE) has been skipped.
* Method has to see if the following event would
* be textual as well, and if so, skip it (and any other following
* textual segments).
*
* @return True if we encountered an unexpandable entity
*/
@Override
protected final boolean skipCoalescedText()
throws XMLStreamException
{
while (true) {
// no matter what, will need (and can get) one char
if (_inputPtr >= _inputEnd) {
if (!loadMore()) { // most likely an error, will be handled later on
return false;
}
}
if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind
/* In worst case, need 3 chars ("= _inputEnd) {
if (!loadAndRetain(3)) { // probably an error, but will be handled later
return false;
}
}
if (_inputBuffer[_inputPtr+1] != BYTE_EXCL
|| _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) {
// can't be CDATA, we are done here
return false;
}
// but let's verify it still:
_inputPtr += 3;
for (int i = 0; i < 6; ++i) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
if (b != (byte) CDATA_STR.charAt(i)) {
int ch = decodeCharForError(b);
reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)");
}
}
skipCData();
} else { // textual (or entity, error etc)
if (skipCharacters()) {
return true;
}
}
}
}
/*
/**********************************************************************
/* Other methods, utf-decoding
/**********************************************************************
*/
/**
* @return Either decoded character (if positive int); or negated
* value of a high-order char (one that needs surrogate pair)
*/
private final int decodeMultiByteChar(int c, int ptr)
throws XMLStreamException
{
int needed;
if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
c &= 0x1F;
needed = 1;
} else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
c &= 0x0F;
needed = 2;
} else if ((c & 0xF8) == 0xF0) {
// 4 bytes; double-char with surrogates and all...
c &= 0x07;
needed = 3;
} else {
reportInvalidInitial(c & 0xFF);
needed = 1; // never gets here
}
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
int d = (int) _inputBuffer[ptr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, ptr);
}
c = (c << 6) | (d & 0x3F);
if (needed > 1) { // needed == 1 means 2 bytes total
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
d = (int) _inputBuffer[ptr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, ptr);
}
c = (c << 6) | (d & 0x3F);
if (needed > 2) { // 4 bytes? (need surrogates)
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
d = (int) _inputBuffer[ptr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, ptr);
}
c = (c << 6) | (d & 0x3F);
/* Need to signal such pair differently (to make comparison
* easier)
*/
c = -c;
}
}
_inputPtr = ptr;
return c;
}
private final int decodeUtf8_2(int c)
throws XMLStreamException
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
int d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
return ((c & 0x1F) << 6) | (d & 0x3F);
}
private final int decodeUtf8_3(int c1)
throws XMLStreamException
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
c1 &= 0x0F;
int d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
int c = (c1 << 6) | (d & 0x3F);
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
c = (c << 6) | (d & 0x3F);
if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal
if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF
if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) {
c = handleInvalidXmlChar(c);
}
}
}
return c;
}
private final int decodeUtf8_3fast(int c1)
throws XMLStreamException
{
c1 &= 0x0F;
int d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
int c = (c1 << 6) | (d & 0x3F);
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
c = (c << 6) | (d & 0x3F);
if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal
if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF
if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) {
c = handleInvalidXmlChar(c);
}
}
}
return c;
}
/**
* @return Character value minus 0x10000; this so that caller
* can readily expand it to actual surrogates
*/
private final int decodeUtf8_4(int c)
throws XMLStreamException
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
int d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
c = ((c & 0x07) << 6) | (d & 0x3F);
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
c = (c << 6) | (d & 0x3F);
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
d = (int) _inputBuffer[_inputPtr++];
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF, _inputPtr);
}
/* note: won't change it to negative here, since caller
* already knows it'll need a surrogate
*/
return ((c << 6) | (d & 0x3F)) - 0x10000;
}
/*
/**********************************************************************
/* Internal methods, error reporting
/**********************************************************************
*/
/**
* Method called called to decode a full UTF-8 characters, given
* its first byte. Note: does not do any validity checks, since this
* is only to be used for informational purposes (often when an error
* has already been encountered)
*/
@Override
public int decodeCharForError(byte b) throws XMLStreamException
{
int c = (int) b;
if (c >= 0) { // ascii? fine as is...
return c;
}
int needed;
// Ok; if we end here, we got multi-byte combination
if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
c &= 0x1F;
needed = 1;
} else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
c &= 0x0F;
needed = 2;
} else if ((c & 0xF8) == 0xF0) {
// 4 bytes; double-char with surrogates and all...
c &= 0x07;
needed = 3;
} else {
reportInvalidInitial(c & 0xFF);
needed = 1; // never gets here
}
int d = nextByte();
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF);
}
c = (c << 6) | (d & 0x3F);
if (needed > 1) { // needed == 1 means 2 bytes total
d = nextByte(); // 3rd byte
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF);
}
c = (c << 6) | (d & 0x3F);
if (needed > 2) { // 4 bytes? (need surrogates)
d = nextByte();
if ((d & 0xC0) != 0x080) {
reportInvalidOther(d & 0xFF);
}
c = (c << 6) | (d & 0x3F);
}
}
return c;
}
protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException
{
_inputPtr = ptr;
reportInvalidOther(mask);
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy