// com.fasterxml.aalto.in.StreamScanner (Maven / Gradle / Ivy artifact: aalto-xml)
// Page residue from the artifact listing: "Show all versions of aalto-xml", "Show documentation"
/* Aalto XML processor
*
* Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.io.*;
import javax.xml.stream.XMLStreamException;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.impl.IoStreamException;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.TextBuilder;
/**
* Base class for various byte stream based scanners (generally one
* for each type of encoding supported).
*/
public abstract class StreamScanner
extends ByteBasedScanner
{
/*
/**********************************************************************
/* Configuration
/**********************************************************************
*/
/**
* Underlying InputStream to use for reading content.
*/
protected InputStream _in;
/*
/**********************************************************************
/* Input buffering
/**********************************************************************
*/
protected byte[] _inputBuffer;
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
/**
 * Creates a scanner that reads from the given stream, using the given
 * buffer and buffer offsets as its initial input state.
 *
 * @param cfg Reader configuration; handed to the super class
 * @param in Stream to read content from
 * @param buffer Byte buffer to use as the input buffer
 * @param ptr Initial value for the input pointer
 * @param last Initial value for the input end marker
 */
public StreamScanner(ReaderConfig cfg, InputStream in,
        byte[] buffer, int ptr, int last)
{
    super(cfg);
    // Buffer state first, then the stream itself
    _inputBuffer = buffer;
    _inputEnd = last;
    _inputPtr = ptr;
    _in = in;
}
/**
 * Releases buffers held by the super class, and hands the input buffer
 * back to the configuration object if this scanner reads from a stream.
 */
@Override
protected void _releaseBuffers()
{
    super._releaseBuffers();
    /* With block input (_in == null) the buffer in use is not owned by
     * the scanner, and can not be recycled.
     * This method always gets called before _closeSource(), so _in has
     * not been cleared by the time it is checked here.
     */
    if (_in == null) {
        return;
    }
    final byte[] buf = _inputBuffer;
    if (buf == null) {
        return;
    }
    _config.freeFullBBuffer(buf);
    _inputBuffer = null;
}
/**
 * Closes the underlying input stream, if there is one, and then drops
 * the reference to it.
 *
 * @throws IOException if closing the stream fails
 */
@Override
protected void _closeSource() throws IOException
{
    final InputStream src = _in;
    if (src == null) { // nothing to close
        return;
    }
    src.close();
    // only cleared once close() has completed without an exception
    _in = null;
}
/*
/**********************************************************************
/* Abstract methods for sub-classes to implement
/**********************************************************************
*/
/**
 * Called to process an entity reference, after the leading ampersand
 * has been consumed.
 *
 * @param inAttr NOTE(review): presumably true when the reference occurs
 *   in an attribute value; the caller in this class passes false for
 *   text content -- confirm against sub-classes
 *
 * @return 0 if the reference is a general entity reference; otherwise
 *   the character that a character entity expands to (this is how
 *   {@link #nextFromTree} interprets the value)
 */
protected abstract int handleEntityInText(boolean inAttr)
throws XMLStreamException;
/**
 * Parses the public identifier of a DOCTYPE declaration.
 *
 * @param quoteChar First byte following the white space after the
 *   PUBLIC keyword; presumably expected to be the opening quote
 *   character -- confirm against implementations
 *
 * @return Value to use as the public id of the DTD
 */
protected abstract String parsePublicId(byte quoteChar)
throws XMLStreamException;
/**
 * Parses the system identifier of a DOCTYPE declaration.
 *
 * @param quoteChar First byte following the white space that precedes
 *   the system id; presumably expected to be the opening quote
 *   character -- confirm against implementations
 *
 * @return Value to use as the system id of the DTD
 */
protected abstract String parseSystemId(byte quoteChar)
throws XMLStreamException;
/*
/**********************************************************************
/* Implementation of parsing API
/**********************************************************************
*/
/**
 * Finds the next token when outside of the element tree. Any token left
 * unfinished is skipped first; white space (space, tab, CR, LF) is then
 * skipped without being reported, and the byte following the first
 * {@code '<'} decides what gets handled: a declaration ({@code '!'}),
 * a processing instruction ({@code '?'}) or a start element.
 *
 * @param isProlog Passed on to error reporting and to DOCTYPE/comment
 *   handling; when false (or when the byte after {@code '<'} is a slash),
 *   {@code reportPrologUnexpElement} is called before start element
 *   handling
 *
 * @return Type of the token found; TOKEN_EOI if input ends before a
 *   {@code '<'} is seen
 */
@Override
public final int nextFromProlog(boolean isProlog) throws XMLStreamException
{
if (_tokenIncomplete) { // left-overs from last thingy?
skipToken();
}
// First: keep track of where event started
setStartLocation();
// Ok: we should get a WS or '<'. So, let's skip through WS
while (true) {
if (_inputPtr >= _inputEnd) {
if (!loadMore()) {
// end of input: update location once more before reporting it
setStartLocation();
return TOKEN_EOI;
}
}
int c = _inputBuffer[_inputPtr++] & 0xFF;
// Really should get white space or '<'...
if (c == INT_LT) {
break;
}
/* 26-Mar-2008, tatus: White space in prolog/epilog is
* not to be reported at all (by default at least), as
* it is not part of XML Infoset content. So let's
* just actively skip it here
*/
if (c != INT_SPACE) {
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
// CR, optionally followed by LF, is marked as a single line feed
if (_inputPtr >= _inputEnd) {
if (!loadMore()) {
// lone CR as the very last byte of input
markLF();
setStartLocation();
return TOKEN_EOI;
}
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_TAB) {
reportPrologUnexpChar(isProlog, decodeCharForError((byte)c), null);
}
}
}
// Ok, got LT:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed(COMMENT); // not necessarily a comment of course
}
byte b = _inputBuffer[_inputPtr++];
if (b == BYTE_EXCL) { // comment/DOCTYPE? (CDATA not legal)
return handlePrologDeclStart(isProlog);
}
if (b == BYTE_QMARK) {
return handlePIStart();
}
/* End tag not allowed if no open tree; and only one root
* element (one root-level start tag)
*/
if (b == BYTE_SLASH || !isProlog) {
reportPrologUnexpElement(isProlog, b);
}
return handleStartElement(b);
}
/**
 * Finds the next token when inside the element tree.
 *<p>
 * State left over from the previous event is settled first: an
 * unfinished token is skipped, an empty start tag produces its
 * END_ELEMENT, a previous END_ELEMENT pops the current element and
 * unbinds namespace declarations, and a pending entity is reported.
 * After that the next input byte decides: {@code '<'} starts markup,
 * {@code '&'} an entity reference, and anything else is text.
 *
 * @return Type of the token found; TOKEN_EOI if there is no more input
 */
@Override
public final int nextFromTree() throws XMLStreamException
{
if (_tokenIncomplete) { // left-overs?
if (skipToken()) { // Figured out next event (ENTITY_REFERENCE)?
// !!! We don't yet parse DTD, don't know real contents
return _nextEntity();
}
} else { // note: START_ELEMENT/END_ELEMENT never incomplete
if (_currToken == START_ELEMENT) {
if (_isEmptyTag) {
// empty tag: report the matching END_ELEMENT without reading any input
--_depth;
return (_currToken = END_ELEMENT);
}
} else if (_currToken == END_ELEMENT) {
_currElem = _currElem.getParent();
// Any namespace declarations that need to be unbound?
while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) {
_lastNsDecl =_lastNsDecl.unbind();
}
} else {
// It's possible CHARACTERS entity with an entity ref:
if (_entityPending) {
_entityPending = false;
return _nextEntity();
}
}
}
// and except for special cases, mark down actual start location of the event
setStartLocation();
/* Any more data? Although it'd be an error not to get any,
* let's leave error reporting up to caller
*/
if (_inputPtr >= _inputEnd) {
if (!loadMore()) {
setStartLocation();
return TOKEN_EOI;
}
}
// Note: only peeked at here; pointer is advanced in the branches below
byte b = _inputBuffer[_inputPtr];
/* Can get pretty much any type; start/end element, comment/PI,
* CDATA, text, entity reference...
*/
if (b == BYTE_LT) { // root element, comment, proc instr?
++_inputPtr;
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(COMMENT);
if (b == BYTE_EXCL) { // comment or CDATA
return handleCommentOrCdataStart();
}
if (b == BYTE_QMARK) {
return handlePIStart();
}
if (b == BYTE_SLASH) {
return handleEndElement();
}
return handleStartElement(b);
}
if (b == BYTE_AMP) { // entity reference
++_inputPtr;
/* Need to expand; should indicate either text, or an unexpanded
* entity reference
*/
int i = handleEntityInText(false);
if (i == 0) { // general entity
return (_currToken = ENTITY_REFERENCE);
}
/* Nope, a char entity; need to indicate it came from an entity.
* Since we may want to store the char as is, too, let's negate
* entity-based char
*/
_tmpChar = -i;
} else {
/* Let's store it for future reference. May or may not be used --
* so let's not advance input ptr quite yet.
*/
_tmpChar = (int) b & 0xFF; // need to ensure it won't be negative
}
// With lazy parsing the text is left unread and only flagged as incomplete
if (_cfgLazyParsing) {
_tokenIncomplete = true;
} else {
finishCharacters();
}
return (_currToken = CHARACTERS);
}
/**
 * Helper method used to isolate things that need to be (re)set in
 * cases where an ENTITY_REFERENCE event is reported: the text builder
 * is reset to empty and the current token is updated.
 *
 * @return ENTITY_REFERENCE
 */
protected int _nextEntity() {
    // !!! Also, have to assume start location has been set or such
    // !!! TODO: handle start location?
    _textBuilder.resetWithEmpty();
    _currToken = ENTITY_REFERENCE;
    return _currToken;
}
/*
/**********************************************************************
/* Internal methods, secondary parsing
/**********************************************************************
*/
/**
 * Handles a declaration in prolog or epilog, once the exclamation mark
 * that follows the opening angle bracket has been consumed: either a
 * comment, or (in prolog only) a DOCTYPE declaration.
 *
 * @param isProlog True if DOCTYPE is acceptable; also passed to error
 *   reporting
 *
 * @return COMMENT or DTD; anything else gets reported as an unexpected
 *   character
 */
private final int handlePrologDeclStart(boolean isProlog)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    byte next = _inputBuffer[_inputPtr++];

    if (next == BYTE_HYPHEN) { // Comment?
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        next = _inputBuffer[_inputPtr++];
        if (next == BYTE_HYPHEN) {
            if (!_cfgLazyParsing) {
                finishComment();
            } else {
                _tokenIncomplete = true;
            }
            _currToken = COMMENT;
            return _currToken;
        }
        // single hyphen only: falls through to error handling below
    } else if (next == BYTE_D && isProlog) { // DOCTYPE? (not allowed in epilog)
        handleDtdStart();
        // incomplete flag is set by handleDtdStart; must copy contents,
        // as they may be needed
        if (_tokenIncomplete && !_cfgLazyParsing) {
            finishDTD(true);
            _tokenIncomplete = false;
        }
        return DTD;
    }

    /* Error: for error recovery purposes, let's just pretend
     * it was unfinished CHARACTERS.
     */
    _currToken = CHARACTERS;
    _tokenIncomplete = true;
    reportPrologUnexpChar(isProlog, decodeCharForError(next), " (expected '-' for COMMENT)");
    return _currToken; // never gets here
}
/**
 * Parses the beginning of a DOCTYPE declaration: rest of the keyword,
 * root name, and optional PUBLIC/SYSTEM identifiers. Stops either at
 * the closing angle bracket (token complete), or at the opening bracket
 * of the internal subset (token left incomplete).
 *
 * @return DTD
 */
private final int handleDtdStart()
    throws XMLStreamException
{
    matchAsciiKeyword("DOCTYPE");
    // White space is mandatory before the root name
    byte next = skipInternalWs(true, "after DOCTYPE keyword, before root name");
    _tokenName = parsePName(next);
    next = skipInternalWs(false, null);

    final boolean gotPublic = (next == BYTE_P);
    if (gotPublic || next == BYTE_S) {
        matchAsciiKeyword(gotPublic ? "PUBLIC" : "SYSTEM");
        next = skipInternalWs(true, null);
        if (gotPublic) {
            _publicId = parsePublicId(next);
            next = skipInternalWs(true, null);
        } else {
            _publicId = null;
        }
        // system id follows in both cases
        _systemId = parseSystemId(next);
        next = skipInternalWs(false, null);
    } else {
        _systemId = null;
        _publicId = null;
    }

    // Now we need either the end of declaration, or an internal subset
    if (next != BYTE_GT) {
        if (next != BYTE_LBRACKET) {
            String msg;
            if (_systemId != null) {
                msg = " (expected '[' for the internal subset, or '>' to end DOCTYPE declaration)";
            } else {
                msg = " (expected a 'PUBLIC' or 'SYSTEM' keyword, '[' for the internal subset, or '>' to end DOCTYPE declaration)";
            }
            reportTreeUnexpChar(decodeCharForError(next), msg);
        }
        /* Internal subset need not be parsed yet; it can be skipped
         * or parsed later on
         */
        _tokenIncomplete = true;
    } else { // fine, we are done
        _tokenIncomplete = false;
    }
    _currToken = DTD;
    return _currToken;
}
/**
 * Handles markup within the element tree, once the exclamation mark
 * that follows the opening angle bracket has been consumed: must be
 * either a comment or a CDATA section.
 *
 * @return COMMENT or CDATA
 */
private final int handleCommentOrCdataStart()
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    byte next = _inputBuffer[_inputPtr++];

    // Comment is the simpler case, check it first
    if (next == BYTE_HYPHEN) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        next = _inputBuffer[_inputPtr++];
        if (next != BYTE_HYPHEN) {
            reportTreeUnexpChar(decodeCharForError(next), " (expected '-' for COMMENT)");
        }
        if (!_cfgLazyParsing) {
            finishComment();
        } else {
            _tokenIncomplete = true;
        }
        _currToken = COMMENT;
        return _currToken;
    }

    if (next != BYTE_LBRACKET) { // neither comment nor CDATA
        reportTreeUnexpChar(decodeCharForError(next), " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)");
        return TOKEN_EOI; // never gets here
    }

    // CDATA: verify the 6 bytes that follow the opening bracket
    _currToken = CDATA;
    int ix = 0;
    while (ix < 6) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        next = _inputBuffer[_inputPtr++];
        final char expected = CDATA_STR.charAt(ix);
        if (next != (byte) expected) {
            int ch = decodeCharForError(next);
            reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(ix)+"' for CDATA section)");
        }
        ++ix;
    }
    if (!_cfgLazyParsing) {
        finishCData();
    } else {
        _tokenIncomplete = true;
    }
    return CDATA;
}
/**
 * Method called after leading {@code '<?'} has been parsed; needs to
 * parse the target name, and verify that it is followed either by
 * white space or by the closing {@code '?>'}.
 *<p>
 * If white space follows, it is skipped and the content is either
 * left unread (lazy parsing) or read via {@code finishPI()}; if the
 * instruction closes right after the target, the text builder is reset
 * to empty.
 *
 * @return PROCESSING_INSTRUCTION
 */
private final int handlePIStart() throws XMLStreamException
{
_currToken = PROCESSING_INSTRUCTION;
// Ok, first, need a name
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
_tokenName = parsePName(b);
{ // but is it "xml" (case insensitive)?
String ln = _tokenName.getLocalName();
if (ln.length() == 3 && ln.equalsIgnoreCase("xml") &&
_tokenName.getPrefix() == null) {
reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET);
}
}
/* Let's then verify that we either get a space, or closing
* '?>': this way we'll catch some problems right away, and also
* simplify actual processing of contents.
*/
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
int c = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (c <= INT_SPACE) {
// Ok, let's skip the white space...
while (true) {
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
// CR, optionally followed by LF, is marked as a single line feed
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_SPACE && c != INT_TAB) {
throwInvalidSpace(c);
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
// Only peek at the next byte: it is consumed only if it is white space
c = (int) _inputBuffer[_inputPtr] & 0xFF;
if (c > INT_SPACE) {
break;
}
++_inputPtr;
}
// Ok, got non-space; it was not consumed above, so the input
// pointer is at the first byte of the content:
if (_cfgLazyParsing) {
_tokenIncomplete = true;
} else {
finishPI();
}
} else {
// No white space after the target: only '?>' may follow
if (c != INT_QMARK) {
reportMissingPISpace(decodeCharForError((byte)c));
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
if (b != BYTE_GT) {
reportMissingPISpace(decodeCharForError(b));
}
_textBuilder.resetWithEmpty();
_tokenIncomplete = false;
}
return PROCESSING_INSTRUCTION;
}
/**
 * Parses the numeric part of a character entity, up to and including
 * the terminating semicolon. The value is read as hexadecimal if the
 * first byte is {@code BYTE_x}, as decimal otherwise.
 * NOTE(review): presumably called once the leading ampersand and hash
 * have been consumed -- confirm against callers in sub-classes.
 *
 * @return Code point for the entity that expands to a valid XML
 * content character.
 */
protected final int handleCharEntity()
throws XMLStreamException
{
// Hex or decimal?
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
int value = 0;
if (b == BYTE_x) { // hex
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
if (b == BYTE_SEMICOLON) {
break;
}
value = value << 4;
// bytes with high bit set are negative here, and end up in the error branch
int c = (int) b;
if (c <= '9' && c >= '0') {
value += (c - '0');
} else if (c >= 'a' && c <= 'f') {
value += 10 + (c - 'a');
} else if (c >= 'A' && c <= 'F') {
value += 10 + (c - 'A');
} else {
throwUnexpectedChar(decodeCharForError(b), "; expected a hex digit (0-9a-fA-F)");
}
// Checked after every digit, so value stays small enough not to wrap around
if (value > MAX_UNICODE_CHAR) { // Overflow?
reportEntityOverflow();
}
}
} else { // numeric (decimal)
// First byte has already been read, so the loop tests before reading more
while (b != BYTE_SEMICOLON) {
int c = (int) b;
if (c <= '9' && c >= '0') {
value = (value * 10) + (c - '0');
if (value > MAX_UNICODE_CHAR) { // Overflow?
reportEntityOverflow();
}
} else {
throwUnexpectedChar(decodeCharForError(b), "; expected a decimal number");
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
}
}
verifyXmlChar(value);
return value;
}
/**
 * Parsing of start element requires parsing of the element name
 * (and attribute names), and is thus encoding-specific.
 *
 * @param b Byte that followed the opening angle bracket, as read by
 *   the caller (see {@link #nextFromProlog} and {@link #nextFromTree})
 *
 * @return Type of the token; returned as is by the callers in this class
 */
protected abstract int handleStartElement(byte b)
throws XMLStreamException;
/**
 * Handles an end tag, after the slash has been consumed: the bytes of
 * the tag name are compared against the quads of the name of the
 * current element, after which optional white space and the closing
 * {@code '>'} are consumed. If the name (plus one trailing byte) may not
 * fit in what remains of the input buffer, handling is delegated to
 * {@code handleEndElementSlow}.
 *<p>
 * Note that this method is currently also shareable for all Ascii-based
 * encodings, and at least between UTF-8 and ISO-Latin1. The reason is
 * that since we already know exact bytes that need to be matched,
 * there's no danger of getting invalid encodings or such.
 * So, for now, let's leave this method here in the base class.
 *
 * @return END_ELEMENT
 */
protected final int handleEndElement()
throws XMLStreamException
{
--_depth;
_currToken = END_ELEMENT;
// Ok, at this point we have seen '/', need the name
_tokenName = _currElem.getName();
int size = _tokenName.sizeInQuads();
/* Do we need to take the slow route? Let's separate that out
* to another method.
* Note: we'll require max bytes for name PLUS one (for trailing
* '>', most likely).
*/
if ((_inputEnd - _inputPtr) < ((size << 2) + 1)) { // may need to load more
return handleEndElementSlow(size);
}
// Enough bytes buffered: no bounds checks needed while matching the name
int ptr = _inputPtr;
byte[] buf = _inputBuffer;
// First all full chunks of 4 bytes (if any)
--size; // from here on, index of the last quad
for (int qix = 0; qix < size; ++qix) {
int q = (buf[ptr] << 24)
| ((buf[ptr+1] & 0xFF) << 16)
| ((buf[ptr+2] & 0xFF) << 8)
| ((buf[ptr+3] & 0xFF))
;
ptr += 4;
// match?
if (q != _tokenName.getQuad(qix)) {
_inputPtr = ptr;
reportUnexpectedEndTag(_tokenName.getPrefixedName());
}
}
/* After which we can deal with the last entry: it's bit
* tricky as we don't actually fully know byte length...
*/
// ... so bytes are added one at a time until the accumulated value
// equals the expected last quad, or 4 bytes have been tried
int lastQ = _tokenName.getQuad(size);
int q = buf[ptr++] & 0xFF;
if (q != lastQ) { // need second byte?
q = (q << 8) | (buf[ptr++] & 0xFF);
if (q != lastQ) { // need third byte?
q = (q << 8) | (buf[ptr++] & 0xFF);
if (q != lastQ) { // need full 4 bytes?
q = (q << 8) | (buf[ptr++] & 0xFF);
if (q != lastQ) { // still no match? failure!
_inputPtr = ptr;
reportUnexpectedEndTag(_tokenName.getPrefixedName());
}
}
}
}
// Trailing space?
int i2 = _inputBuffer[ptr] & 0xFF;
_inputPtr = ptr + 1;
while (i2 <= INT_SPACE) {
if (i2 == INT_LF) {
markLF();
} else if (i2 == INT_CR) {
byte b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b != BYTE_LF) {
// lone CR: mark the line feed, then process the byte just read
markLF(_inputPtr-1);
i2 = (int) b & 0xFF;
continue;
}
markLF();
} else if (i2 != INT_SPACE && i2 != INT_TAB) {
throwInvalidSpace(i2);
}
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
}
if (i2 != INT_GT) {
throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'");
}
return END_ELEMENT;
}
/**
 * Slow-path variant of {@link #handleEndElement}: matches the end tag
 * name against the quads of the expected name ({@code _tokenName}) one
 * byte at a time, loading more input whenever the buffer runs out, and
 * then consumes optional trailing white space and the closing '>'.
 *
 * @param size Number of quads in the expected name
 *
 * @return END_ELEMENT
 */
private final int handleEndElementSlow(int size)
    throws XMLStreamException
{
    /* Name will likely cross the input boundary; need
     * to do proper checks
     */
    --size; // from here on, index of the last (possibly partial) quad
    for (int qix = 0; qix < size; ++qix) { // first, full chunks
        int q = 0;
        for (int i = 0; i < 4; ++i) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            q = (q << 8) | (_inputBuffer[_inputPtr++] & 0xFF);
        }
        // match?
        if (q != _tokenName.getQuad(qix)) {
            reportUnexpectedEndTag(_tokenName.getPrefixedName());
        }
    }
    // And then the last 1-4 bytes: byte length is not known, so bytes are
    // added until the accumulated value equals the expected quad
    int lastQ = _tokenName.getQuad(size);
    int q = 0;
    int i = 0;
    while (true) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        q = (q << 8) | (_inputBuffer[_inputPtr++] & 0xFF);
        if (q == lastQ) { // match
            break;
        }
        if (++i > 3) { // no match, error
            reportUnexpectedEndTag(_tokenName.getPrefixedName());
            break; // never gets here
        }
    }
    // Trailing space?
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    /* Must mask to get an unsigned value (as handleEndElement does):
     * without it a byte with the high bit set is sign-extended to a
     * negative int, passes the white space check below, and gets
     * reported as invalid white space with a negative character code.
     */
    int i2 = _inputBuffer[_inputPtr++] & 0xFF;
    while (i2 <= INT_SPACE) {
        if (i2 == INT_LF) {
            markLF();
        } else if (i2 == INT_CR) {
            byte b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
            if (b != BYTE_LF) {
                // lone CR: mark the line feed, then process the byte just read
                markLF(_inputPtr-1);
                i2 = (int) b & 0xFF;
                continue;
            }
            markLF();
        } else if (i2 != INT_SPACE && i2 != INT_TAB) {
            throwInvalidSpace(i2);
        }
        i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
    }
    if (i2 != INT_GT) {
        throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'");
    }
    return END_ELEMENT;
}
/* 28-Oct-2006, tatus: This is the old (slow) implementation. I'll
* leave it here, since it's known to work, so in case new impl
* has problems, one can refer to the old impl
*/
/*
protected final int handleEndElement2()
throws XMLStreamException
{
--_depth;
_currToken = END_ELEMENT;
// Ok, at this point we have seen '/', need the name
_tokenName = _currElem.getName();
int i2;
int qix = 0;
while (true) {
int q;
int expQuad = _tokenName.getQuad(qix);
// First byte of a quad:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
i2 = _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
// Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
if (i2 < 45 || i2 > 58 || i2 == 47) {
if (0 != expQuad || _tokenName.sizeInQuads() != qix) {
reportUnexpectedEndTag(_tokenName.getPrefixedName());
}
break;
}
}
q = i2;
++qix; // since this started a new quad
// second byte
//i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
i2 = _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) {
if (q != expQuad || _tokenName.sizeInQuads() != qix) {
reportUnexpectedEndTag(_tokenName.getPrefixedName());
}
break;
}
}
q = (q << 8) | i2;
// third byte
//i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
i2 = _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name?
if (q != expQuad || _tokenName.sizeInQuads() != qix) {
reportUnexpectedEndTag(_tokenName.getPrefixedName());
}
break;
}
}
q = (q << 8) | i2;
// fourth byte
//i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
i2 = _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name?
if (q != expQuad || _tokenName.sizeInQuads() != qix) {
reportUnexpectedEndTag(_tokenName.getPrefixedName());
}
break;
}
}
q = (q << 8) | i2;
// Full quad, ok; need to compare now:
if (q != expQuad) {
// Let's just fall through, then; will throw exception
reportUnexpectedEndTag(_tokenName.getPrefixedName());
}
}
// Note: i2 still holds the last byte read (except if we detected
// a mismatch; but that caused an exception above)
// Trailing space?
while (i2 <= INT_SPACE) {
if (i2 == INT_LF) {
markLF();
} else if (i2 == INT_CR) {
byte b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b != BYTE_LF) {
markLF(_inputPtr-1);
i2 = (int) b & 0xFF;
continue;
}
markLF();
} else if (i2 != INT_SPACE && i2 != INT_TAB) {
throwInvalidSpace(i2);
}
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
}
if (i2 != INT_GT) {
throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'");
}
return END_ELEMENT;
}
*/
/*
/**********************************************************************
/* Common name/entity parsing
/**********************************************************************
*/
/**
 * Parses a name, given its first byte. If at least 8 more bytes are
 * available in the input buffer, up to 4 bytes of the name are read
 * here without bounds checks (longer names continue in
 * {@code parsePNameMedium}); otherwise parsing is delegated to
 * {@code parsePNameSlow}.
 *<p>
 * This method can (for now?) be shared between all Ascii-based
 * encodings, since it only does coarse validity checking -- real
 * checks are done in different method.
 *<p>
 * Assumption the implementation makes: well-formed xml content can not
 * end with a name; as such, end-of-input is an error and we can throw
 * an exception.
 *
 * @param b First byte of the name; already read by the caller
 *
 * @return Name object for the byte sequence that was read
 */
protected final PName parsePName(byte b)
throws XMLStreamException
{
// First: can we optimize out bounds checks?
if ((_inputEnd - _inputPtr) < 8) { // got 1 byte, but need 7, plus one trailing
return parsePNameSlow(b);
}
// If so, can also unroll loops nicely
int q = b & 0xFF;
// Let's do just quick sanity check first; a thorough check will be
// done later on if necessary, now we'll just do the very cheap
// check to catch extra spaces etc.
if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
throwUnexpectedChar(q, "; expected a name start character");
}
int i2 = _inputBuffer[_inputPtr++] & 0xFF;
// For other bytes beyond first we have to do bit more complicated
// check, to reliably find out where name ends. Still can do quite
// simple checks though
if (i2 < 65) {
// Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
if (i2 < 45 || i2 > 58 || i2 == 47) {
// end marker found: name has just 1 byte
return findPName(q, 1);
}
}
q = (q << 8) | i2;
i2 = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name?
return findPName(q, 2);
}
}
q = (q << 8) | i2;
i2 = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) char name?
return findPName(q, 3);
}
}
q = (q << 8) | i2;
i2 = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) char name?
return findPName(q, 4);
}
}
// Longer, let's offline:
return parsePNameMedium(i2, q);
}
/**
 * Continues name parsing for names longer than 4 bytes: reads the rest
 * of the second quad, and hands off to {@code parsePNameLong} if the name
 * does not end within it.
 *<p>
 * Note: input buffer is read without bounds checks here. When called
 * from {@link #parsePName}, that is covered by the check for 8
 * available bytes done there, of which 4 have been consumed.
 *
 * @param i2 First byte of the second quad; already read by the caller
 * @param q1 First quad (4 bytes) of the name
 *
 * @return Name object for the byte sequence that was read
 */
protected PName parsePNameMedium(int i2, int q1)
throws XMLStreamException
{
// Ok, so far so good; one quad, one byte. Then the second
int q2 = i2;
i2 = _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
// Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
if (i2 < 45 || i2 > 58 || i2 == 47) {
// end marker found: second quad has just 1 byte
return findPName(q1, q2, 1);
}
}
q2 = (q2 << 8) | i2;
i2 = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 bytes in second quad?
return findPName(q1, q2, 2);
}
}
q2 = (q2 << 8) | i2;
i2 = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 bytes in second quad?
return findPName(q1, q2, 3);
}
}
q2 = (q2 << 8) | i2;
i2 = (int) _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 bytes in second quad?
return findPName(q1, q2, 4);
}
}
// Ok, no, longer loop. Let's offline
int[] quads = _quadBuffer;
quads[0] = q1;
quads[1] = q2;
return parsePNameLong(i2, quads);
}
/**
 * Continues name parsing for names longer than 8 bytes. Unlike the
 * shorter variants, this method checks for end of buffer before every
 * read, and grows the quad buffer as needed.
 *
 * @param q First byte of the third quad; already read by the caller
 * @param quads Buffer in which the first two quads of the name have
 *   been stored
 *
 * @return Name object for the byte sequence that was read
 */
protected final PName parsePNameLong(int q, int[] quads)
throws XMLStreamException
{
// Index at which the quad being built will be stored; 0 and 1 are taken
int qix = 2;
while (true) {
// Second byte of a new quad
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
int i2 = _inputBuffer[_inputPtr++] & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) {
// End of name; last quad has a single byte
return findPName(q, quads, qix, 1);
}
}
// 3rd byte:
q = (q << 8) | i2;
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // last quad has 2 bytes?
return findPName(q, quads, qix, 2);
}
}
// 4th byte:
q = (q << 8) | i2;
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // last quad has 3 bytes?
return findPName(q, quads, qix, 3);
}
}
q = (q << 8) | i2;
// Quad is now full; the next byte tells whether the name ends here
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // last quad has all 4 bytes?
return findPName(q, quads, qix, 4);
}
}
if (qix >= quads.length) { // let's just double?
_quadBuffer = quads = DataUtil.growArrayBy(quads, quads.length);
}
quads[qix] = q;
++qix;
// Byte just read becomes the first byte of the next quad
q = i2;
}
}
/**
 * Variant of name parsing that checks for end of buffer before every
 * read; {@link #parsePName} uses it when fewer than 8 bytes are
 * available in the input buffer.
 *
 * @param b First byte of the name; already read by the caller
 *
 * @return Name object for the byte sequence that was read
 */
protected final PName parsePNameSlow(byte b)
throws XMLStreamException
{
int q = b & 0xFF;
// Let's do just quick sanity check first; a thorough check will be
// done later on if necessary, now we'll just do the very cheap
// check to catch extra spaces etc.
if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
throwUnexpectedChar(q, "; expected a name start character");
}
int[] quads = _quadBuffer;
// Number of full quads completed so far
int qix = 0;
// Let's optimize a bit for shorter PNames...
int firstQuad = 0;
while (true) {
// Second byte
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
int i2 = _inputBuffer[_inputPtr++] & 0xFF;
// For other bytes beyond first we have to do bit more complicated
// check, to reliably find out where name ends. Still can do quite
// simple checks though
if (i2 < 65) {
// Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
if (i2 < 45 || i2 > 58 || i2 == 47) {
// End of name; last quad has a single byte
return findPName(q, 1, firstQuad, qix, quads);
}
}
// 3rd byte:
q = (q << 8) | i2;
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // last quad has 2 bytes?
return findPName(q, 2, firstQuad, qix, quads);
}
}
// 4th byte:
q = (q << 8) | i2;
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // last quad has 3 bytes?
return findPName(q, 3, firstQuad, qix, quads);
}
}
q = (q << 8) | i2;
// Ok; one more full quad gotten... but just to squeeze bit
// more mileage out of it, was this the end?
i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF;
if (i2 < 65) {
if (i2 < 45 || i2 > 58 || i2 == 47) { // last quad has all 4 bytes?
return findPName(q, 4, firstQuad, qix, quads);
}
}
// Nope; didn't end. May need to store the quad in temporary
// buffer and continue
if (qix == 0) { // not yet, was the first quad
firstQuad = q;
} else if (qix == 1) { // second quad, need to init buffer
quads[0] = firstQuad;
quads[1] = q;
} else { // 3rd or after... need to make sure there's room
if (qix >= quads.length) { // let's just double?
_quadBuffer = quads = DataUtil.growArrayBy(quads, quads.length);
}
quads[qix] = q;
}
++qix;
// Byte just read becomes the first byte of the next quad
q = i2;
}
}
/**
 * Resolves a name of at most 4 bytes into a name object, once the byte
 * that ends the name has been read: that byte is pushed back, the symbol
 * table is consulted, and a new entry gets added if the name has not
 * been seen before.
 *
 * @param onlyQuad Word with the 1 to 4 bytes that make up the name
 * @param lastByteCount Number of actual bytes contained in onlyQuad
 *   (callers in this class pass 1 to 4)
 *
 * @return Name object, either found from or added to the symbol table
 */
private final PName findPName(int onlyQuad, int lastByteCount)
    throws XMLStreamException
{
    // The end marker was read but is not part of the name: push it back
    --_inputPtr;
    final int hash = ByteBasedPNameTable.calcHash(onlyQuad);
    final PName known = _symbols.findSymbol(hash, onlyQuad, 0);
    if (known != null) {
        return known;
    }
    // Not seen before; adding is array-based, so go through the quad buffer
    _quadBuffer[0] = onlyQuad;
    return addPName(hash, _quadBuffer, 1, lastByteCount);
}
/**
 * Resolves a name of 5 to 8 bytes into a name object, once the byte
 * that ends the name has been read: that byte is pushed back, the symbol
 * table is consulted, and a new entry gets added if the name has not
 * been seen before.
 *
 * @param firstQuad First 4 bytes of the name
 * @param secondQuad Word with the remaining 1 to 4 bytes of the name
 * @param lastByteCount Number of bytes contained in secondQuad
 *   (callers in this class pass 1 to 4)
 *
 * @return Name object, either found from or added to the symbol table
 */
private final PName findPName(int firstQuad, int secondQuad,
        int lastByteCount)
    throws XMLStreamException
{
    // The end marker was read but is not part of the name: push it back
    --_inputPtr;
    final int hash = ByteBasedPNameTable.calcHash(firstQuad, secondQuad);
    final PName known = _symbols.findSymbol(hash, firstQuad, secondQuad);
    if (known != null) {
        return known;
    }
    // Not seen before; adding is array-based, so go through the quad buffer
    _quadBuffer[0] = firstQuad;
    _quadBuffer[1] = secondQuad;
    return addPName(hash, _quadBuffer, 2, lastByteCount);
}
/**
 * Resolves a name that spans multiple quads into a name object, once the
 * byte that ends the name has been read: that byte is pushed back, the
 * last quad is appended to the array (growing it if necessary), the
 * symbol table is consulted, and a new entry gets added if the name has
 * not been seen before.
 *
 * @param lastQuad Word with the last bytes of the name; not yet included
 *   in the quad array
 * @param quads Array that contains all the quads of the name except for
 *   the last one
 * @param qlen Number of quads currently in the array
 * @param lastByteCount Number of bytes contained in lastQuad
 *   (callers in this class pass 1 to 4)
 *
 * @return Name object, either found from or added to the symbol table
 */
private final PName findPName(int lastQuad, int[] quads, int qlen, int lastByteCount)
    throws XMLStreamException
{
    // The end marker was read but is not part of the name: push it back
    --_inputPtr;
    // Last quad is not yet in the array; make room (by doubling) if needed
    int[] all = quads;
    if (qlen >= all.length) {
        all = DataUtil.growArrayBy(all, all.length);
        _quadBuffer = all;
    }
    all[qlen] = lastQuad;
    final int total = qlen + 1;

    final int hash = ByteBasedPNameTable.calcHash(all, total);
    final PName known = _symbols.findSymbol(hash, all, total);
    if (known != null) {
        return known;
    }
    return addPName(hash, all, total, lastByteCount);
}
/**
 * Dispatcher used when the number of full quads read is only known at
 * runtime: picks the one-quad, two-quad or array-based variant of
 * {@code findPName} based on {@code qlen}.
 *
 * @param lastQuad Word with the last bytes of the name; not included
 *   in the quad array
 * @param lastByteCount Number of bytes contained in lastQuad
 *   (callers in this class pass 1 to 4)
 * @param firstQuad First 4 bytes of the name; only used when exactly
 *   one full quad precedes lastQuad
 * @param qlen Number of full quads that precede lastQuad
 * @param quads Array that contains the full quads; only used when
 *   there are 2 or more of them
 *
 * @return Name object, either found from or added to the symbol table
 */
private final PName findPName(int lastQuad, int lastByteCount, int firstQuad,
        int qlen, int[] quads)
    throws XMLStreamException
{
    if (qlen == 0) { // 4 bytes or less; only 'lastQuad' is defined
        return findPName(lastQuad, lastByteCount);
    }
    if (qlen < 2) { // one full quad, plus the last one
        return findPName(firstQuad, lastQuad, lastByteCount);
    }
    // Longer names are array-based
    return findPName(lastQuad, quads, qlen, lastByteCount);
}
/*
////////////////////////////////////////////////
// Other parsing helper methods
////////////////////////////////////////////////
*/
/**
 * Skips a run of white space bytes, keeping line tracking up to date
 * (via <code>markLF()</code>) for each linefeed, and treating a CR
 * immediately followed by LF as a single linefeed.
 *
 * @param reqd Whether at least one white space byte has to be present;
 *   if true and the first byte is above <code>INT_SPACE</code>, the
 *   problem is passed to <code>reportTreeUnexpChar</code>
 * @param msg Description appended to the "expected white space" error
 *   message
 *
 * @return First byte following skipped white space
 */
protected byte skipInternalWs(boolean reqd, String msg)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    byte curr = _inputBuffer[_inputPtr++];
    if ((curr & 0xFF) > INT_SPACE) {
        if (reqd) {
            reportTreeUnexpChar(decodeCharForError(curr), " (expected white space "+msg+")");
        } else {
            return curr;
        }
    }
    for (;;) {
        // 'curr' is at most INT_SPACE here: classify it before reading on
        if (curr == BYTE_LF) {
            markLF();
        } else if (curr == BYTE_CR) {
            // Need to peek at the next byte, to fold CR+LF into one linefeed
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_LF) {
                _inputPtr++;
            }
            markLF();
        } else if (!(curr == BYTE_SPACE || curr == BYTE_TAB)) {
            // Some other byte at or below space: not allowed as white space
            throwInvalidSpace(curr);
        }
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        curr = _inputBuffer[_inputPtr++];
        if ((curr & 0xFF) > INT_SPACE) {
            return curr;
        }
    }
}
/**
 * Verifies that the upcoming input bytes match the given keyword,
 * starting from the keyword's second character (index 1); the first
 * character is not compared here. Each compared byte is consumed, and
 * a mismatch is passed to <code>reportTreeUnexpChar</code>.
 *
 * @param keyw Keyword to match; characters are compared after a cast
 *   to byte
 */
private final void matchAsciiKeyword(String keyw)
    throws XMLStreamException
{
    final int keywordLength = keyw.length();
    int idx = 1;
    while (idx < keywordLength) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        final byte actual = _inputBuffer[_inputPtr++];
        final char expected = keyw.charAt(idx);
        if (actual != (byte) expected) {
            reportTreeUnexpChar(decodeCharForError(actual), " (expected '"+expected+"' for "+keyw+" keyword)");
        }
        ++idx;
    }
}
/**
 * Checks whether a linefeed encountered within the element tree starts
 * "indentation" white space: the linefeed followed by a run of spaces
 * or tabs. If so, the text builder is reset to the matching
 * indentation; if not, the linefeed and any indentation characters
 * consumed so far are written into the text builder's buffer as
 * regular text.
 *<p>
 * Note: consecutive white space is only considered indentation
 * if the following token seems like a tag (start/end), that is,
 * a '&lt;' whose next byte is already in the buffer and is not '!'.
 * This is so that if a CDATA section follows, it can be coalesced in
 * coalescing mode. Although we could check if coalescing mode is
 * enabled, this should seldom have significant effect either way,
 * so it removes one possible source of problems in coalescing mode.
 *
 * @param c Character that triggered the check; if it is
 *   <code>INT_CR</code>, an immediately following LF is consumed as
 *   part of the same linefeed
 *
 * @return -1, if indentation was handled; offset in the output
 *   buffer, if not
 */
protected final int checkInTreeIndentation(int c)
    throws XMLStreamException
{
    if (c == INT_CR) {
        // Degenerate case first: a lone \r with nothing after it
        if (_inputPtr >= _inputEnd) {
            if (!loadMore()) {
                _textBuilder.resetWithIndentation(0, CHAR_SPACE);
                return -1;
            }
        }
        // \r\n is a single linefeed: swallow the \n as well
        if (_inputBuffer[_inputPtr] == BYTE_LF) {
            _inputPtr++;
        }
    }
    markLF();

    // Next: either an indentation char, or a start/end tag
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    final byte first = _inputBuffer[_inputPtr];
    final boolean indentChar = (first == BYTE_SPACE) || (first == BYTE_TAB);
    if (!indentChar) {
        // Still (empty) indentation if it is '<' followed by something other than '!'
        if (first == BYTE_LT
                && (_inputPtr+1) < _inputEnd
                && _inputBuffer[_inputPtr+1] != BYTE_EXCL) {
            _textBuilder.resetWithIndentation(0, CHAR_SPACE);
            return -1;
        }
        // Otherwise just the linefeed, as regular text
        final char[] lfOnly = _textBuilder.resetWithEmpty();
        lfOnly[0] = CHAR_LF;
        _textBuilder.setCurrentLength(1);
        return 1;
    }

    // Count how long the run of the same indentation char is
    ++_inputPtr;
    final int limit = (first == BYTE_SPACE) ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS;
    int run = 1;
    for (; run <= limit; ++run) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        final byte next = _inputBuffer[_inputPtr];
        if (next == first) {
            ++_inputPtr;
            continue;
        }
        // Run ended: only indentation if a start/end tag follows
        if (next == BYTE_LT
                && (_inputPtr+1) < _inputEnd
                && _inputBuffer[_inputPtr+1] != BYTE_EXCL) {
            _textBuilder.resetWithIndentation(run, (char) first);
            return -1;
        }
        break;
    }

    // Followed by something else, or run too long: copy the linefeed plus
    // the consumed indentation chars out as regular text
    final char[] textBuf = _textBuilder.resetWithEmpty();
    textBuf[0] = CHAR_LF;
    final char fill = (char) first;
    int pos = 0;
    while (pos < run) {
        textBuf[++pos] = fill;
    }
    final int total = run + 1; // leading linefeed included
    _textBuilder.setCurrentLength(total);
    return total;
}
/**
 * Checks whether a linefeed encountered outside of the element tree
 * starts "indentation" white space: the linefeed followed by a run of
 * spaces or tabs. Unlike with
 * in-tree handling, running out of input is not an error here, and a
 * '&lt;' directly after the linefeed always counts as (empty)
 * indentation.
 *
 * @param c Character that triggered the check; if it is
 *   <code>INT_CR</code>, an immediately following LF is consumed as
 *   part of the same linefeed
 *
 * @return -1, if indentation was handled; offset in the output
 *   buffer, if not
 */
protected final int checkPrologIndentation(int c)
    throws XMLStreamException
{
    if (c == INT_CR) {
        // Degenerate case first: a lone \r with nothing after it
        if (_inputPtr >= _inputEnd) {
            if (!loadMore()) {
                _textBuilder.resetWithIndentation(0, CHAR_SPACE);
                return -1;
            }
        }
        // \r\n is a single linefeed: swallow the \n as well
        if (_inputBuffer[_inputPtr] == BYTE_LF) {
            _inputPtr++;
        }
    }
    markLF();

    // Anything at all after the linefeed?
    if (_inputPtr >= _inputEnd) {
        if (!loadMore()) {
            _textBuilder.resetWithIndentation(0, CHAR_SPACE);
            return -1;
        }
    }
    final byte first = _inputBuffer[_inputPtr]; // peek only, not consumed yet
    final boolean indentChar = (first == BYTE_SPACE) || (first == BYTE_TAB);
    if (!indentChar) {
        // A following '<' still makes this (empty) indentation
        if (first == BYTE_LT) {
            _textBuilder.resetWithIndentation(0, CHAR_SPACE);
            return -1;
        }
        // Something else: just the linefeed, as regular text
        final char[] lfOnly = _textBuilder.resetWithEmpty();
        lfOnly[0] = CHAR_LF;
        _textBuilder.setCurrentLength(1);
        return 1;
    }

    // Count how long the run of the same indentation char is
    ++_inputPtr;
    final int limit = (first == BYTE_SPACE) ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS;
    int run = 1;
    // Continue while there is input (loading more only when the buffer is
    // exhausted) and the next byte repeats the indentation char
    while ((_inputPtr < _inputEnd || loadMore())
            && _inputBuffer[_inputPtr] == first) {
        ++_inputPtr;
        ++run;
        if (run >= limit) {
            // Too long to use shared indentation; build it as regular text
            final char[] textBuf = _textBuilder.resetWithEmpty();
            textBuf[0] = CHAR_LF;
            final char fill = (char) first;
            int pos = 0;
            while (pos < run) {
                textBuf[++pos] = fill;
            }
            final int total = run + 1; // leading linefeed included
            _textBuilder.setCurrentLength(total);
            return total;
        }
    }
    // Run ended (or input did) within limits: shared indentation it is
    _textBuilder.resetWithIndentation(run, (char) first);
    return -1;
}
/*
/**********************************************************************
/* Methods for sub-classes, reading data
/**********************************************************************
*/
/**
 * Replaces the contents of the input buffer with the next batch of
 * bytes from the underlying stream, after advancing the offset
 * bookkeeping past the whole current buffer.
 *
 * @return True if at least one byte was read; false if there is no
 *   input stream (block source), or the stream had no more data
 *
 * @throws XMLStreamException For read failures (IOException is wrapped
 *   in an {@link IoStreamException}), or via
 *   <code>reportInputProblem</code> if the stream returns 0 bytes
 */
@Override
protected final boolean loadMore() throws XMLStreamException
{
    // Offsets first: the whole current buffer counts as consumed
    final int consumed = _inputEnd;
    _pastBytesOrChars += consumed;
    _rowStartOffset -= consumed;
    _inputPtr = 0;

    // Block source: no stream to read from, so no more data
    if (_in == null) {
        _inputEnd = 0;
        return false;
    }
    try {
        final int bytesRead = _in.read(_inputBuffer, 0, _inputBuffer.length);
        if (bytesRead > 0) {
            _inputEnd = bytesRead;
            return true;
        }
        _inputEnd = 0;
        if (bytesRead == 0) {
            // Sanity check; should never happen with correctly written InputStreams
            reportInputProblem("InputStream returned 0 bytes, even when asked to read up to "+_inputBuffer.length);
        }
        return false;
    } catch (IOException ioe) {
        throw new IoStreamException(ioe);
    }
}
/**
 * Returns the next input byte and advances past it, loading more
 * input first if the buffer has been exhausted.
 *
 * @param tt Token type passed to <code>ErrorConsts.tokenTypeDesc</code>
 *   for the end-of-input problem description
 *
 * @return Byte at the current input position
 */
protected final byte nextByte(int tt)
    throws XMLStreamException
{
    final boolean exhausted = (_inputPtr >= _inputEnd);
    if (exhausted && !loadMore()) {
        reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(tt));
    }
    return _inputBuffer[_inputPtr++];
}
/**
 * Returns the next input byte and advances past it, loading more
 * input first if the buffer has been exhausted. The end-of-input
 * problem description is based on the current token
 * (<code>_currToken</code>).
 *
 * @return Byte at the current input position
 */
protected final byte nextByte()
    throws XMLStreamException
{
    final boolean exhausted = (_inputPtr >= _inputEnd);
    if (exhausted && !loadMore()) {
        reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(_currToken));
    }
    return _inputBuffer[_inputPtr++];
}
/**
 * Unconditionally loads more input (without first checking whether
 * the buffer is exhausted) and returns the first byte of it, advancing
 * past that byte. The end-of-input problem description is based on the
 * current token (<code>_currToken</code>).
 *
 * @return First byte of the newly loaded input
 */
protected final byte loadOne()
    throws XMLStreamException
{
    final boolean gotMore = loadMore();
    if (!gotMore) {
        reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(_currToken));
    }
    return _inputBuffer[_inputPtr++];
}
/**
 * Unconditionally loads more input (without first checking whether
 * the buffer is exhausted) and returns the first byte of it, advancing
 * past that byte.
 *
 * @param type Token type passed to <code>ErrorConsts.tokenTypeDesc</code>
 *   for the end-of-input problem description
 *
 * @return First byte of the newly loaded input
 */
protected final byte loadOne(int type)
    throws XMLStreamException
{
    final boolean gotMore = loadMore();
    if (!gotMore) {
        reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(type));
    }
    return _inputBuffer[_inputPtr++];
}
/**
 * Loads more input while keeping the not-yet-consumed bytes: those
 * bytes are moved to the start of the buffer, and the stream is then
 * read until the buffer holds at least <code>nrOfChars</code> bytes.
 *
 * @param nrOfChars Minimum number of bytes the buffer should contain
 *   (retained bytes included) for the call to succeed
 *
 * @return True if the buffer now holds at least <code>nrOfChars</code>
 *   bytes; false if there is no input stream (block source, in which
 *   case nothing is moved), or if the stream ran out of data first
 *
 * @throws XMLStreamException For read failures (IOException is wrapped
 *   in an {@link IoStreamException}), or via
 *   <code>reportInputProblem</code> if the stream returns 0 bytes
 */
protected final boolean loadAndRetain(int nrOfChars)
    throws XMLStreamException
{
    // Can't shuffle contents if we were handed an immutable block
    // (the alternative to being handed an InputStream as _in)
    if (_in == null) {
        return false;
    }

    // Same bookkeeping as loadMore(), except only bytes before the
    // current position count as consumed
    final int consumed = _inputPtr;
    final int retained = _inputEnd - consumed; // must be > 0
    _pastBytesOrChars += consumed;
    _rowStartOffset -= consumed;

    System.arraycopy(_inputBuffer, consumed, _inputBuffer, 0, retained);
    _inputPtr = 0;
    _inputEnd = retained; // for now covers just the moved bytes

    try {
        while (true) {
            final int room = _inputBuffer.length - _inputEnd;
            final int got = _in.read(_inputBuffer, _inputEnd, room);
            if (got < 1) {
                if (got == 0) {
                    // Sanity check, should never happen with non-buggy readers/stream
                    reportInputProblem("InputStream returned 0 bytes, even when asked to read up to "+room);
                }
                return false;
            }
            _inputEnd += got;
            if (_inputEnd >= nrOfChars) {
                return true;
            }
        }
    } catch (IOException ioe) {
        throw new IoStreamException(ioe);
    }
}
}