All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fasterxml.aalto.async.AsyncByteScanner Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
package com.fasterxml.aalto.async;

import java.io.IOException;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.AsyncInputFeeder;
import com.fasterxml.aalto.AsyncXMLStreamReader;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.in.ByteBasedPNameTable;
import com.fasterxml.aalto.in.ByteBasedScanner;
import com.fasterxml.aalto.in.PName;
import com.fasterxml.aalto.in.ReaderConfig;
import com.fasterxml.aalto.util.CharsetNames;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;

public abstract class AsyncByteScanner
    extends ByteBasedScanner
    implements AsyncInputFeeder
{
    /**
     * Local alias for {@link AsyncXMLStreamReader#EVENT_INCOMPLETE}; used in this
     * class as the value of the current/next event when no complete event has
     * been determined yet.
     */
    protected final static int EVENT_INCOMPLETE = AsyncXMLStreamReader.EVENT_INCOMPLETE;

    /*
    /**********************************************************************
    /* State consts
    /**********************************************************************
     */

    /**
     * Default starting state for many events/contexts -- nothing has been
     * seen so far, no event incomplete. Not used for all event types.
     */
    protected final static int STATE_DEFAULT = 0;

    // // // States for prolog/epilog major state:

    /**
     * State before the document, when an xml declaration may still be seen
     */
    protected final static int STATE_PROLOG_INITIAL = 1; // State before document when we may get xml declaration
    /**
     * State in which a less-than sign has been seen
     */
    protected final static int STATE_PROLOG_SEEN_LT = 2; // "<" seen after xml declaration
    /**
     * State in which a less-than sign followed by an exclamation mark has been
     * seen; what follows is either a comment or a DOCTYPE declaration
     */
    protected final static int STATE_PROLOG_DECL = 3; // "<!" seen
    // States used while parsing the identifiers and internal subset of a
    // DOCTYPE declaration (DTD event)

    protected final static int STATE_DTD_BEFORE_IDS = 6; // before "PUBLIC" or "SYSTEM" token
    protected final static int STATE_DTD_PUBLIC_OR_SYSTEM = 7; // parsing "PUBLIC" or "SYSTEM"
    protected final static int STATE_DTD_AFTER_PUBLIC = 8; // "PUBLIC" found, need space
    protected final static int STATE_DTD_AFTER_SYSTEM = 9; // "SYSTEM" found, need space
    protected final static int STATE_DTD_BEFORE_PUBLIC_ID = 10; // after "PUBLIC", space, need quoted public id
    protected final static int STATE_DTD_PUBLIC_ID = 11; // parsing public ID
    protected final static int STATE_DTD_AFTER_PUBLIC_ID = 12; // public ID parsed, need space
    protected final static int STATE_DTD_BEFORE_SYSTEM_ID = 13; // about to parse quoted system id
    protected final static int STATE_DTD_SYSTEM_ID = 14; // parsing system ID
    protected final static int STATE_DTD_AFTER_SYSTEM_ID = 15; // after system ID, optional space, '>' or int subset
    protected final static int STATE_DTD_INT_SUBSET = 16; // parsing internal subset

    // NOTE: value is deliberately far from the others above -- reason not visible here
    protected final static int STATE_DTD_EXPECT_CLOSING_GT = 50; // ']' gotten that should be followed by '>'

    // For CHARACTERS, STATE_DEFAULT is the basic state; the states below
    // track an entity reference whose start ('&') has been seen

    // just seen "&"
    protected final static int STATE_TEXT_AMP = 4;
    // just seen "&#" (currently disabled)
//    protected final static int STATE_TEXT_AMP_AND_HASH = 5;
    // seen '&' and partial name:
    protected final static int STATE_TEXT_AMP_NAME = 6;

    // For comments, STATE_DEFAULT means "
    protected final static int STATE_PI_AFTER_TARGET_WS = 2; // "
     * NOTE: non-final due to xml declaration handling occurring later.
     */
    protected XmlCharTypes _charTypes;
    
    /**
     * For now, symbol table contains prefixed names. In future it is
     * possible that they may be split into prefixes and local names?
     *<p>
     * NOTE: non-final for async scanners
     */
    protected ByteBasedPNameTable _symbols;

    /**
     * This buffer is used for name parsing. Will be expanded if/as
     * needed; 32 ints can hold names 128 ascii chars long.
     */
    protected int[] _quadBuffer = new int[32];

    /*
    /**********************************************************************
    /* General state tracking
    /**********************************************************************
     */

    /**
     * Due to asynchronous nature of parsing, we may know what
     * event we are trying to parse, even if it's not yet
     * complete. Type of that event is stored here.
     */
    protected int _nextEvent = EVENT_INCOMPLETE;

    /**
     * In addition to the event type, there is need for additional
     * state information
     */
    protected int _state;

    /**
     * For token/state combinations that are 'shared' between
     * events (or embedded in them), this is where the surrounding
     * event state is retained.
     */
    protected int _surroundingEvent = EVENT_INCOMPLETE;

    /**
     * There are some multi-byte combinations that must be handled
     * as a unit: CR+LF linefeeds, multi-byte UTF-8 characters, and
     * multi-character end markers for comments and PIs.
     * Since they can be split across input buffer
     * boundaries, first byte(s) may need to be temporarily stored.
     *<p>
     * If so, this int will store byte(s), in little-endian format
     * (that is, first pending byte is at 0x000000FF, second [if any]
     * at 0x0000FF00, and third at 0x00FF0000). This can be
     * (and is) used to figure out actual number of bytes pending,
     * for multi-byte (UTF-8) character decoding.
     *<p>
     * Note: it is assumed that if value is 0, there is no data.
     * Thus, if 0 needed to be added pending, it has to be masked.
     */
    protected int _pendingInput = 0;

    /**
     * Flag that is set when calling application indicates that there will
     * be no more input to parse.
     */
    protected boolean _endOfInput = false;

    /*
    /**********************************************************************
    /* Name/entity parsing state
    /**********************************************************************
     */

    /**
     * Number of complete quads parsed for current name (quads
     * themselves are stored in {@link #_quadBuffer}).
     */
    protected int _quadCount;

    /**
     * Bytes parsed for the current, incomplete, quad
     */
    protected int _currQuad;

    /**
     * Number of bytes pending/buffered, stored in {@link #_currQuad}
     */
    protected int _currQuadBytes = 0;

    /**
     * Entity value accumulated so far
     */
    protected int _entityValue = 0;

    /*
    /**********************************************************************
    /* (Start) element parsing state
    /**********************************************************************
     */

    protected boolean _elemAllNsBound;

    // NOTE(review): declared as boolean although the name suggests a count -- confirm intent
    protected boolean _elemAttrCount;

    // Quote byte that opened the value currently being parsed; also reused for
    // xml declaration pseudo-attributes and DOCTYPE public/system ids
    protected byte _elemAttrQuote;

    protected PName _elemAttrName;

    /**
     * Pointer for the next character of currently being parsed value
     * within attribute value buffer
     */
    protected int _elemAttrPtr;

    /**
     * Pointer for the next character of currently being parsed namespace
     * URI for the current namespace declaration
     */
    protected int _elemNsPtr;

    /*
    /**********************************************************************
    /* Other state
    /**********************************************************************
     */

    /**
     * Flag that indicates whether we are inside a declaration during parsing
     * of internal DTD subset.
*/ protected boolean _inDtdDeclaration; /* /********************************************************************** /* Life-cycle /********************************************************************** */ protected AsyncByteScanner(ReaderConfig cfg) { super(cfg); // 03-Apr-2018, tatu: Can not yet fetch `_charTypes` or `_symbols` since we // do not necessarily know actual encoding from XML declaration // _charTypes = cfg.getCharTypes(); // _symbols = cfg.getBBSymbols(); } /** * Initialization method to call when encoding has been definitely figured out, * from XML declarations, or, from lack of one (using defaults). * * @since 1.1.1 */ protected void _activateEncoding() { // 04-Apr-2018, tatu: Not sure if we should try to enforce; gets tricky so for now // simply make first call stick if (_symbols == null) { _charTypes = _config.getCharTypes(); _symbols = _config.getBBSymbols(); } } @Override public void endOfInput() { _endOfInput = true; } @Override protected void _releaseBuffers() { super._releaseBuffers(); if (_symbols.maybeDirty()) { _config.updateBBSymbols(_symbols); } } /** * Since the async scanner has no access to whatever passes content, * there is no input source in same sense as with blocking scanner; * and there is nothing to close. But we can at least mark input * as having ended. */ @Override protected void _closeSource() throws IOException { // nothing to do, we are done. 
_endOfInput = true; } /* /********************************************************************** /* Shared helper methods /********************************************************************** */ protected void verifyAndSetXmlVersion() throws XMLStreamException { if (_textBuilder.equalsString("1.0")) { _config.setXmlVersion("1.0"); } else if (_textBuilder.equalsString("1.1")) { _config.setXmlVersion("1.1"); } else { reportInputProblem("Unrecognized XML version '"+_textBuilder.contentsAsString()+"' (expected '1.0' or '1.1')"); } } protected void verifyAndSetXmlEncoding() throws XMLStreamException { String enc = CharsetNames.normalize(_textBuilder.contentsAsString()); if ((CharsetNames.CS_UTF8 != enc) && (CharsetNames.CS_US_ASCII != enc) && (CharsetNames.CS_ISO_LATIN1 != enc)) { reportInputProblem("Unsupported encoding '"+enc+"': only UTF-8 and US-ASCII support by async parser"); } // 03-Apr-2018, tatu: Need to overwrite default (UTF-8) if declared otherwise. // And besides changing configs need to force use of new symbol tables, too... 
_config.setXmlEncoding(enc); if (enc != null) { _config.setActualEncoding(enc); } _charTypes = _config.getCharTypes(); } protected void verifyAndSetXmlStandalone() throws XMLStreamException { if (_textBuilder.equalsString("yes")) { _config.setXmlStandalone(Boolean.TRUE); } else if (_textBuilder.equalsString("no")) { _config.setXmlStandalone(Boolean.FALSE); } else { reportInputProblem("Invalid standalone value '"+_textBuilder.contentsAsString()+"': can only use 'yes' and 'no'"); } } protected void verifyAndSetPublicId() throws XMLStreamException { _publicId = _textBuilder.contentsAsString(); } protected void verifyAndSetSystemId() throws XMLStreamException { _systemId = _textBuilder.contentsAsString(); } /* /********************************************************************** /* Content accessors for less performance-critical sections /********************************************************************** */ protected abstract byte _currentByte() throws XMLStreamException; protected abstract byte _nextByte() throws XMLStreamException; protected abstract byte _prevByte() throws XMLStreamException; /* /********************************************************************** /* Abstract methods for subclasses to implement wrt prolog/epilog /********************************************************************** */ protected abstract int handlePI() throws XMLStreamException; protected abstract boolean handleDTDInternalSubset(boolean init) throws XMLStreamException; protected abstract int handleComment() throws XMLStreamException; protected abstract int handleStartElementStart(byte b) throws XMLStreamException; protected abstract int handleStartElement() throws XMLStreamException; protected abstract PName parsePName() throws XMLStreamException; protected abstract PName parseNewName(byte b) throws XMLStreamException; protected abstract boolean asyncSkipSpace() throws XMLStreamException; protected abstract boolean handlePartialCR() throws XMLStreamException; /* 
/********************************************************************** /* Second-level parsing; character content (in tree) /********************************************************************** */ @Override protected final void finishToken() throws XMLStreamException { _tokenIncomplete = false; switch (_currToken) { case PROCESSING_INSTRUCTION: finishPI(); break; case CHARACTERS: finishCharacters(); break; case COMMENT: finishComment(); break; case SPACE: finishSpace(); break; case DTD: finishDTD(true); // true -> get text break; case CDATA: finishCData(); break; default: ErrorConsts.throwInternalError(); } } /** * Method called to initialize state for CHARACTERS event, after * just a single byte has been seen. What needs to be done next * depends on whether coalescing mode is set or not: if it is not * set, just a single character needs to be decoded, after which * current event will be incomplete, but defined as CHARACTERS. * In coalescing mode, the whole content must be read before * current event can be defined. The reason for difference is * that when XMLStreamReader.next() returns, no * blocking can occur when calling other methods. 
* * @return Event type detected; either CHARACTERS, if at least * one full character was decoded (and can be returned), * EVENT_INCOMPLETE if not (part of a multi-byte character * split across input buffer boundary) */ protected abstract int startCharacters(byte b) throws XMLStreamException; protected abstract boolean handleAttrValue() throws XMLStreamException; protected abstract boolean handleNsDecl() throws XMLStreamException; /* /********************************************************************** /* Abstract methods from base class, parsing /********************************************************************** */ @Override protected void finishCData() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishComment() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishDTD(boolean copyContents) throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishPI() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishSpace() throws XMLStreamException { // N/A throwInternal(); } // // token-skip methods /** * @return True if the whole characters segment was succesfully * skipped; false if not */ @Override protected abstract boolean skipCharacters() throws XMLStreamException; @Override protected void skipCData() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipComment() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipPI() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipSpace() throws XMLStreamException { // should never be called throwInternal(); } @Override protected boolean loadMore() throws XMLStreamException { // should never get called throwInternal(); return false; // never gets here } @Override protected abstract void finishCharacters() throws XMLStreamException; /* 
/********************************************************************** /* Internal methods, name decoding /********************************************************************** */ /** * Method called to process a sequence of bytes that is likely to * be a PName. At this point we encountered an end marker, and * may either hit a formerly seen well-formed PName; an as-of-yet * unseen well-formed PName; or a non-well-formed sequence (containing * one or more non-name chars without any valid end markers). * * @param lastQuad Word with last 0 to 3 bytes of the PName; not included * in the quad array * @param lastByteCount Number of bytes contained in lastQuad; 0 to 3. */ protected final PName findPName(int lastQuad, int lastByteCount) throws XMLStreamException { // First, need to push back the byte read but not used: --_inputPtr; int qlen = _quadCount; // Also: if last quad is empty, will need take last from qbuf. if (lastByteCount == 0) { lastQuad = _quadBuffer[--qlen]; lastByteCount = 4; } // Separate handling for short names: if (qlen <= 1) { // short name? if (qlen == 0) { // 4-bytes or less; only has 'lastQuad' defined int hash = ByteBasedPNameTable.calcHash(lastQuad); PName name = _symbols.findSymbol(hash, lastQuad, 0); if (name == null) { // Let's simplify things a bit, and just use array based one then: _quadBuffer[0] = lastQuad; name = addPName(_symbols, hash, _quadBuffer, 1, lastByteCount); } return name; } int firstQuad = _quadBuffer[0]; int hash = ByteBasedPNameTable.calcHash(firstQuad, lastQuad); PName name = _symbols.findSymbol(hash, firstQuad, lastQuad); if (name == null) { // As above, let's just use array, then _quadBuffer[1] = lastQuad; name = addPName(_symbols, hash, _quadBuffer, 2, lastByteCount); } return name; } // Nope, long (3 quads or more). At this point, the last quad is // not yet in the array, let's add: if (qlen >= _quadBuffer.length) { // let's just double? 
_quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[qlen++] = lastQuad; int hash = ByteBasedPNameTable.calcHash(_quadBuffer, qlen); PName name = _symbols.findSymbol(hash, _quadBuffer, qlen); if (name == null) { name = addPName(_symbols, hash, _quadBuffer, qlen, lastByteCount); } return name; } protected final PName addPName(ByteBasedPNameTable symbols, int hash, int[] quads, int qlen, int lastQuadBytes) throws XMLStreamException { return addUTFPName(symbols, _charTypes, hash, quads, qlen, lastQuadBytes); } /* /********************************************************************** /* Internal methods, input validation /********************************************************************** */ /** * Method called to verify validity of given character (from entity) and * append it to the text buffer */ protected void verifyAndAppendEntityCharacter(int charFromEntity) throws XMLStreamException { verifyXmlChar(charFromEntity); // Ok; does it need a surrogate though? (over 16 bits) if ((charFromEntity >> 16) != 0) { charFromEntity -= 0x10000; _textBuilder.append((char) (0xD800 | (charFromEntity >> 10))); charFromEntity = 0xDC00 | (charFromEntity & 0x3FF); } _textBuilder.append((char) charFromEntity); } /** * Checks that a character for a PublicId * * @param c A character * @return true if the character is valid for use in the Public ID * of an XML doctype declaration * * @see "http://www.w3.org/TR/xml/#NT-PubidLiteral" */ protected boolean validPublicIdChar(int c) { return c == 0xA || // c == 0xD || // c == 0x20 || // (c >= '0' && c <= '9') || //[0-9] (c >= '@' && c <= 'Z') || //@[A-Z] (c >= 'a' && c <= 'z') || c == '!' || (c >= 0x23 && c <= 0x25) || //#$% (c >= 0x27 && c <= 0x2F) || //'()*+,-./ (c >= ':' && c <= ';') || c == '=' || c == '?' 
|| c == '_'; } /* /********************************************************************** /* Internal methods, error handling /********************************************************************** */ @Override protected int decodeCharForError(byte b) throws XMLStreamException { // !!! TBI return (int) b; } protected void checkPITargetName(PName targetName) throws XMLStreamException { String ln = targetName.getLocalName(); if (ln.length() == 3 && ln.equalsIgnoreCase("xml") && !targetName.hasPrefix()) { reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET); } } protected int throwInternal() { throw new IllegalStateException("Internal error: should never execute this code path"); } protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException { _inputPtr = ptr; reportInvalidOther(mask); } /* /********************************************************************** /* Shared implementation for handling XML prolog; less performance /* sensitive so need not inline access /********************************************************************** */ @Override public final int nextFromProlog(boolean isProlog) throws XMLStreamException { // Had fully complete event? Need to reset state etc: if (_currToken != EVENT_INCOMPLETE) { // First: keep track of where event started setStartLocation(); // yet one more special case: after START_DOCUMENT need to check things... if (_currToken == START_DOCUMENT) { _currToken = EVENT_INCOMPLETE; if (_tokenName != null) { _nextEvent = PROCESSING_INSTRUCTION; _state = STATE_PI_AFTER_TARGET; checkPITargetName(_tokenName); return handlePI(); } } else { _currToken = _nextEvent = EVENT_INCOMPLETE; _state = STATE_DEFAULT; } } // Ok, do we know which event it will be? 
if (_nextEvent == EVENT_INCOMPLETE) { // nope // The very first thing: XML declaration handling if (_state == STATE_PROLOG_INITIAL) { if (_inputPtr >= _inputEnd) { return _currToken; } // Ok: see if we have what looks like XML declaration; process: if (_pendingInput != 0) { // already parsing (potential) XML declaration Boolean b = startXmlDeclaration(); // is or may be XML declaration, so: if (b == null) { // not yet known; bail out return EVENT_INCOMPLETE; } if (b == Boolean.FALSE) { // no real XML declaration; synthesize one return _startDocumentNoXmlDecl(); } return handleXmlDeclaration(); } if (_currentByte() == BYTE_LT) { // first byte, see if it could be XML declaration ++_inputPtr; _pendingInput = PENDING_STATE_XMLDECL_LT; Boolean b = startXmlDeclaration(); // is or may be XML declaration, so: if (b == null) { return EVENT_INCOMPLETE; } if (b == Boolean.FALSE) { // no real XML declaration; synthesize one return _startDocumentNoXmlDecl(); } return handleXmlDeclaration(); } // can't be XML declaration _state = STATE_DEFAULT; return _startDocumentNoXmlDecl(); } // First: did we have a lone CR at the end of the buffer? if (_pendingInput != 0) { // yup if (!handlePartialCR()) { return _currToken; } } while (_state == STATE_DEFAULT) { if (_inputPtr >= _inputEnd) { // no more input available if (_endOfInput) { // for good? That may be fine setStartLocation(); return TOKEN_EOI; } return _currToken; } byte b = _nextByte(); // Really should get white space or '<'... anything else is // pretty much an error. if (b == BYTE_LT) { // root element, comment, proc instr? _state = STATE_PROLOG_SEEN_LT; break; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { // Prolog/epilog ws is to be skipped, not part of Infoset if (!asyncSkipSpace()) { // ran out of input? if (_endOfInput) { // for good? 
That may be fine setStartLocation(); return TOKEN_EOI; } return _currToken; } } else { reportPrologUnexpChar(isProlog, decodeCharForError(b), null); } } if (_state == STATE_PROLOG_SEEN_LT) { if (_inputPtr >= _inputEnd) { return _currToken; } byte b = _nextByte(); if (b == BYTE_EXCL) { // comment or DOCTYPE declaration? _state = STATE_PROLOG_DECL; return handlePrologDeclStart(isProlog); } if (b == BYTE_QMARK) { // PI _nextEvent = PROCESSING_INSTRUCTION; _state = STATE_DEFAULT; return handlePI(); } if (b == BYTE_SLASH || !isProlog) { reportPrologUnexpElement(isProlog, b); } return handleStartElementStart(b); } if (_state == STATE_PROLOG_DECL) { return handlePrologDeclStart(isProlog); } // should never have anything else... return throwInternal(); } // At this point, we do know the event type switch (_nextEvent) { case START_ELEMENT: return handleStartElement(); case START_DOCUMENT: return handleXmlDeclaration(); case PROCESSING_INSTRUCTION: return handlePI(); case COMMENT: return handleComment(); case DTD: return handleDTD(); } return throwInternal(); // should never get here } /** * Helper method called when it is determined that the document does NOT start with * an xml declaration. Needs to return START_DOCUMENT, and initialize other state * appropriately. */ protected int _startDocumentNoXmlDecl() throws XMLStreamException { // 03-Apr-2018, tatu: We can finalize encoding at this point _activateEncoding(); _currToken = START_DOCUMENT; return START_DOCUMENT; } private final int handlePrologDeclStart(boolean isProlog) throws XMLStreamException { if (_inputPtr >= _inputEnd) { // nothing we can do? 
return EVENT_INCOMPLETE; } byte b = _nextByte(); // So far, we have seen "= _inputEnd) { return null; } if (_pendingInput == PENDING_STATE_XMLDECL_LT) { // "<" at start of doc if (_currentByte() != BYTE_QMARK) { // some other _pendingInput = 0; _state = STATE_PROLOG_SEEN_LT; return Boolean.FALSE; } ++_inputPtr; _pendingInput = PENDING_STATE_XMLDECL_LTQ; if (_inputPtr >= _inputEnd) { return null; } } if (_pendingInput == PENDING_STATE_XMLDECL_LTQ) { // "= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_VERSION: if (!asyncSkipSpace()) { // not enough input break; } if ((_tokenName = _parseNewXmlDeclName(_nextByte())) == null) { // incomplete _state = STATE_XMLDECL_VERSION; break; } if (!_tokenName.hasPrefixedName("version")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'version'"); } _state = STATE_XMLDECL_AFTER_VERSION; continue main_loop; case STATE_XMLDECL_VERSION: // "= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_VERSION: // "= _inputEnd) { break; } // fall through case STATE_XMLDECL_VERSION_EQ: // "= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_VERSION_VALUE; break; } } verifyAndSetXmlVersion(); _state = STATE_XMLDECL_AFTER_VERSION_VALUE; continue main_loop; case STATE_XMLDECL_VERSION_VALUE: // parsing version value if (!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_VERSION_VALUE; break; } verifyAndSetXmlVersion(); _state = STATE_XMLDECL_AFTER_VERSION_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_VERSION_VALUE: // version got; need space or '?' 
{ byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_ENCODING; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after version value in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_ENCODING: // version, value, space got, need '?' or 'e' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if ((_tokenName = _parseNewXmlDeclName(b)) == null) { // incomplete _state = STATE_XMLDECL_ENCODING; break; } // Can actually also get "standalone" instead... if (_tokenName.hasPrefixedName("encoding")) { _state = STATE_XMLDECL_AFTER_ENCODING; } else if (_tokenName.hasPrefixedName("standalone")) { _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; } else { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'encoding'"); } } continue main_loop; case STATE_XMLDECL_ENCODING: // parsing "encoding" if ((_tokenName = _parseXmlDeclName()) == null) { // incomplete break; } // Can actually also get "standalone" instead... 
if (_tokenName.hasPrefixedName("encoding")) { _state = STATE_XMLDECL_AFTER_ENCODING; } else if (_tokenName.hasPrefixedName("standalone")) { _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; } else { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'encoding'"); } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_ENCODING: // got "encoding"; must get ' ' or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'encoding' in xml declaration)"); } } _state = STATE_XMLDECL_ENCODING_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_ENCODING_EQ: // "encoding=" if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _nextByte(); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' in xml declaration for encoding value)"); } _state = STATE_XMLDECL_ENCODING_VALUE; { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_ENCODING_VALUE; break; } } verifyAndSetXmlEncoding(); _state = STATE_XMLDECL_AFTER_ENCODING_VALUE; break; case STATE_XMLDECL_ENCODING_VALUE: // parsing encoding value if (!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_ENCODING_VALUE; break; } verifyAndSetXmlEncoding(); _state = STATE_XMLDECL_AFTER_ENCODING_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_ENCODING_VALUE: // encoding+value gotten; need space or '?' 
{ byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_STANDALONE; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after encoding value in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_STANDALONE: // after encoding+value+space; get '?' or 's' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if ((_tokenName = _parseNewXmlDeclName(b)) == null) { // incomplete _state = STATE_XMLDECL_STANDALONE; break; } if (!_tokenName.hasPrefixedName("standalone")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'standalone'"); } } _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; case STATE_XMLDECL_STANDALONE: // parsing "standalone" if ((_tokenName = _parseXmlDeclName()) == null) { // incomplete break; } if (!_tokenName.hasPrefixedName("standalone")) { reportInputProblem("Unexpected keyword 'encoding' in XML declaration: expected 'standalone'"); } _state = STATE_XMLDECL_AFTER_STANDALONE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_STANDALONE: // got "standalone"; must get ' ' or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'standalone' in xml declaration)"); } } _state = STATE_XMLDECL_STANDALONE_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_STANDALONE_EQ: // "standalone=" if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _nextByte(); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " 
(expected '\"' or ''' in xml declaration for standalone value)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_STANDALONE_VALUE; break; } } verifyAndSetXmlStandalone(); _state = STATE_XMLDECL_AFTER_STANDALONE_VALUE; continue main_loop; case STATE_XMLDECL_STANDALONE_VALUE: // encoding+value gotten; need space or '?' if (!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_STANDALONE_VALUE; break; } verifyAndSetXmlStandalone(); _state = STATE_XMLDECL_AFTER_STANDALONE_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_STANDALONE_VALUE: // encoding+value gotten; need space or '?' if (!asyncSkipSpace()) { // skip space, if any break; } if (_nextByte() != BYTE_QMARK) { reportPrologUnexpChar(true, decodeCharForError(_prevByte()), " (expected '?>' to end xml declaration)"); } _state = STATE_XMLDECL_ENDQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_ENDQ: // Better clear up decoded name, to avoid later problems (would be taken as PI) _tokenName = null; _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; if (_nextByte() != BYTE_GT) { reportPrologUnexpChar(true, decodeCharForError(_prevByte()), " (expected '>' to end xml declaration)"); } // 03-Apr-2018, tatu: Finally! Done with XML declaration, we know the encoding for sure. _activateEncoding(); return START_DOCUMENT; default: throwInternal(); } } return EVENT_INCOMPLETE; }

    /**
     * Advances parsing of a DOCTYPE declaration over the input currently
     * buffered, resuming from whatever sub-state {@code _state} holds.
     * Each {@code case} either consumes input and moves {@code _state}
     * forward, or leaves {@code _state} at a resumable value when the
     * buffer runs out.
     *
     * @return {@code DTD} once the closing {@code '>'} of the declaration
     *   has been consumed; {@code EVENT_INCOMPLETE} from the early exits
     *   that need more input; otherwise {@code _currToken} when the main
     *   loop ends because the buffer is exhausted
     *
     * @throws XMLStreamException if the declaration is malformed
     */
    private int handleDTD() throws XMLStreamException
    {
        // First: left-over CRs?
        if (_pendingInput == PENDING_STATE_CR) {
            if (!handlePartialCR()) {
                return EVENT_INCOMPLETE;
            }
        }
        // Internal subset is handled outside of the main loop; resume it first
        if (_state == STATE_DTD_INT_SUBSET) {
            if (!handleDTDInternalSubset(false)) {
                return EVENT_INCOMPLETE;
            }
            _state = STATE_DTD_EXPECT_CLOSING_GT; // got it!
        }

        main_loop:
        while (_inputPtr < _inputEnd) {
            switch (_state) {
            case STATE_DEFAULT: // seen 'D'
                _tokenName = parseNewName(BYTE_D);
                if (_tokenName == null) {
                    _state = STATE_DTD_DOCTYPE;
                    return EVENT_INCOMPLETE;
                }
                if (!"DOCTYPE".equals(_tokenName.getPrefixedName())) {
                    reportPrologProblem(true, "expected 'DOCTYPE'");
                }
                _state = STATE_DTD_AFTER_DOCTYPE;
                continue main_loop;

            case STATE_DTD_DOCTYPE:
                _tokenName = parsePName();
                if (_tokenName == null) {
                    _state = STATE_DTD_DOCTYPE;
                    return EVENT_INCOMPLETE;
                }
                if (!"DOCTYPE".equals(_tokenName.getPrefixedName())) {
                    reportPrologProblem(true, "expected 'DOCTYPE'");
                }
                // Keyword fully read: record progress before the bounds check so that
                // a resume can never re-enter name parsing for a finished name
                _state = STATE_DTD_AFTER_DOCTYPE;
                if (_inputPtr >= _inputEnd) {
                    break;
                }
                // fall through
            case STATE_DTD_AFTER_DOCTYPE:
                {
                    byte b = _nextByte();
                    if (_isDtdSpace(b)) {
                        _state = STATE_DTD_BEFORE_ROOT_NAME;
                    } else {
                        reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after 'DOCTYPE')");
                    }
                }
                // fall through (ok to skip bounds checks, async-skip does it)
            case STATE_DTD_BEFORE_ROOT_NAME:
                if (!asyncSkipSpace()) { // not enough input
                    break;
                }
                if ((_tokenName = parseNewName(_nextByte())) == null) { // incomplete
                    _state = STATE_DTD_ROOT_NAME;
                    break;
                }
                // Name is complete, so go straight to the "after" state (as the other
                // parseNewName() call sites do) instead of re-entering parsePName()
                _state = STATE_DTD_AFTER_ROOT_NAME;
                continue main_loop;

            case STATE_DTD_ROOT_NAME:
                if ((_tokenName = parsePName()) == null) { // incomplete
                    break;
                }
                _state = STATE_DTD_AFTER_ROOT_NAME;
                if (_inputPtr >= _inputEnd) {
                    break;
                }
                // fall through
            case STATE_DTD_AFTER_ROOT_NAME:
                {
                    byte b = _nextByte();
                    if (b == BYTE_GT) {
                        _state = STATE_DEFAULT;
                        _nextEvent = EVENT_INCOMPLETE;
                        return DTD;
                    }
                    if (_isDtdSpace(b)) {
                        _state = STATE_DTD_BEFORE_IDS;
                    } else {
                        reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after root name in DOCTYPE declaration)");
                    }
                }
                // fall through (ok to skip bounds checks, async-skip does it)
            case STATE_DTD_BEFORE_IDS:
                if (!asyncSkipSpace()) { // not enough input
                    break;
                }
                {
                    byte b = _nextByte();
                    if (b == BYTE_GT) {
                        _state = STATE_DEFAULT;
                        _nextEvent = EVENT_INCOMPLETE;
                        return DTD;
                    }
                    PName name;
                    if ((name = parseNewName(b)) == null) {
                        _state = STATE_DTD_PUBLIC_OR_SYSTEM;
                        break;
                    }
                    _applyDtdIdKeyword(name);
                }
                continue main_loop;

            case STATE_DTD_PUBLIC_OR_SYSTEM:
                {
                    PName name;
                    if ((name = parsePName()) == null) {
                        _state = STATE_DTD_PUBLIC_OR_SYSTEM;
                        break;
                    }
                    _applyDtdIdKeyword(name);
                }
                continue main_loop;

            case STATE_DTD_AFTER_PUBLIC:
                {
                    byte b = _nextByte();
                    if (_isDtdSpace(b)) {
                        _state = STATE_DTD_BEFORE_PUBLIC_ID;
                    } else {
                        reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after PUBLIC keyword)");
                    }
                }
                continue main_loop;

            case STATE_DTD_AFTER_SYSTEM:
                {
                    byte b = _nextByte();
                    if (_isDtdSpace(b)) {
                        _state = STATE_DTD_BEFORE_SYSTEM_ID;
                    } else {
                        reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after SYSTEM keyword)");
                    }
                }
                continue main_loop;

            case STATE_DTD_BEFORE_PUBLIC_ID:
                if (!asyncSkipSpace()) {
                    break;
                }
                _elemAttrQuote = _nextByte();
                if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) {
                    reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' for PUBLIC ID)");
                }
                {
                    char[] buf = _textBuilder.resetWithEmpty();
                    if (_inputPtr >= _inputEnd || !parseDtdId(buf, 0, false)) {
                        _state = STATE_DTD_PUBLIC_ID;
                        break;
                    }
                }
                verifyAndSetPublicId();
                _state = STATE_DTD_AFTER_PUBLIC_ID;
                continue main_loop;

            case STATE_DTD_PUBLIC_ID:
                if (!parseDtdId(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength(), false)) {
                    break;
                }
                verifyAndSetPublicId();
                _state = STATE_DTD_AFTER_PUBLIC_ID;
                if (_inputPtr >= _inputEnd) {
                    break;
                }
                // fall through
            case STATE_DTD_AFTER_PUBLIC_ID:
                {
                    byte b = _nextByte();
                    if (_isDtdSpace(b)) {
                        _state = STATE_DTD_BEFORE_SYSTEM_ID;
                    } else {
                        reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after PUBLIC ID)");
                    }
                }
                // fall through (ok to skip bounds checks, async-skip does it)
            case STATE_DTD_BEFORE_SYSTEM_ID:
                if (!asyncSkipSpace()) {
                    break;
                }
                _elemAttrQuote = _nextByte();
                if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) {
                    reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' for SYSTEM ID)");
                }
                {
                    char[] buf = _textBuilder.resetWithEmpty();
                    if (_inputPtr >= _inputEnd || !parseDtdId(buf, 0, true)) {
                        _state = STATE_DTD_SYSTEM_ID;
                        break;
                    }
                }
                verifyAndSetSystemId();
                _state = STATE_DTD_AFTER_SYSTEM_ID;
                continue main_loop;

            case STATE_DTD_SYSTEM_ID:
                if (!parseDtdId(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength(), true)) {
                    break;
                }
                verifyAndSetSystemId();
                _state = STATE_DTD_AFTER_SYSTEM_ID;
                if (_inputPtr >= _inputEnd) {
                    break;
                }
                // fall through
            case STATE_DTD_AFTER_SYSTEM_ID:
                if (!asyncSkipSpace()) {
                    break;
                }
                {
                    byte b = _nextByte();
                    if (b == BYTE_GT) {
                        _state = STATE_DEFAULT;
                        _nextEvent = EVENT_INCOMPLETE;
                        return DTD;
                    }
                    if (b != BYTE_LBRACKET) {
                        // Report the byte actually read, not the quote char of the preceding ID
                        reportPrologUnexpChar(true, decodeCharForError(b), " (expected either '[' for internal subset, or '>' to end DOCTYPE)");
                    }
                }
                _state = STATE_DTD_INT_SUBSET;
                if (handleDTDInternalSubset(true)) {
                    _state = STATE_DTD_EXPECT_CLOSING_GT;
                } else {
                    return EVENT_INCOMPLETE;
                }
                // fall through
            case STATE_DTD_EXPECT_CLOSING_GT:
                if (!asyncSkipSpace()) {
                    break;
                }
                {
                    byte b = _nextByte();
                    if (b != BYTE_GT) {
                        // Decode like every other report in this method (a raw byte would be
                        // negative for values above 0x7F), and use the same message shape
                        reportPrologUnexpChar(true, decodeCharForError(b), " (expected '>' to end DTD)");
                    }
                }
                _state = STATE_DEFAULT;
                _nextEvent = EVENT_INCOMPLETE;
                return DTD;

            default:
                throwInternal();
            }
        }
        return _currToken;
    }

    /**
     * Tells whether given byte is one of the four white space bytes
     * (space, CR, LF, tab) accepted as a separator inside DOCTYPE declaration.
     */
    private boolean _isDtdSpace(byte b)
    {
        return b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB;
    }

    /**
     * Moves {@code _state} past a fully parsed {@code PUBLIC} / {@code SYSTEM}
     * keyword, reporting a prolog problem for any other token.
     */
    private void _applyDtdIdKeyword(PName keyword) throws XMLStreamException
    {
        String str = keyword.getPrefixedName();
        if ("PUBLIC".equals(str)) {
            _state = STATE_DTD_AFTER_PUBLIC;
        } else if ("SYSTEM".equals(str)) {
            _state = STATE_DTD_AFTER_SYSTEM;
        } else {
            reportPrologProblem(true, "unexpected token '"+str+"': expected either PUBLIC or SYSTEM");
        }
    }

    /**
     * Copies bytes of a quoted DOCTYPE identifier into the text buffer until
     * the closing quote (the value of {@code _elemAttrQuote}) is seen or the
     * input buffer runs out. Each byte is stored as a single char (masked to
     * 0-255); no multi-byte decoding is done here.
     *
     * @param outputBuffer Buffer to append to
     * @param outputPtr Offset in {@code outputBuffer} to start appending at
     * @param system True for SYSTEM id; false for PUBLIC id, in which case each
     *   byte is also checked with {@code validPublicIdChar}
     *
     * @return True if the closing quote was reached; false if input ran out
     *   first (current length is recorded in both cases so the call can be
     *   repeated with the same buffer)
     */
    private final boolean parseDtdId(char[] outputBuffer, int outputPtr, boolean system) throws XMLStreamException
    {
        final int quote = (int) _elemAttrQuote;
        while (_inputPtr < _inputEnd) {
            int ch = _nextByte() & 0xFF;
            if (ch == quote) {
                _textBuilder.setCurrentLength(outputPtr);
                return true;
            }
            // Only PUBLIC ids are validated here, so the message can name them directly
            if (!system && !validPublicIdChar(ch)) {
                reportPrologUnexpChar(true, decodeCharForError((byte) ch), " (not valid in PUBLIC ID)");
            }
            if (outputPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (char) ch;
        }
        _textBuilder.setCurrentLength(outputPtr);
        return false;
    }

    // // // NOTE: specialized versions of `parsePName`, `parseNewName`, to be
    // // // used in decoding `xml` and pseudo-attributes of XML declaration
    // // // Tricky part here is that it predates possible encoding declaration
    // // // so it is essentially part of bootstrapping

    /**
     * Starts decoding of a new name within XML declaration: resets the quad
     * accumulation state, seeds it with the given first byte and continues
     * with {@link #_parseXmlDeclName}.
     *
     * @param b First byte of the name
     *
     * @return Name decoded, or null if more input is needed
     */
    private final PName _parseNewXmlDeclName(byte b) throws XMLStreamException
    {
        int q = b & 0xFF;
        if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
            throwUnexpectedChar(q, "; expected a name start character");
        }
        _quadCount = 0;
        _currQuad = q;
        _currQuadBytes = 1;
        return _parseXmlDeclName();
    }

    /**
     * Continues decoding of a name within XML declaration, packing up to four
     * bytes into each int "quad". Resumes at the byte position recorded in
     * {@code _currQuadBytes}; when input runs out mid-quad, the partial quad
     * and its byte count are stored back so that the next call can pick up
     * where this one left off.
     *
     * @return Name found via {@link #_findXmlDeclName} once a byte that ends
     *   the name is seen; null if input ran out first
     */
    private final PName _parseXmlDeclName() throws XMLStreamException
    {
        int q = _currQuad;

        while (true) {
            int i;

            switch (_currQuadBytes) {
            case 0:
                if (_inputPtr >= _inputEnd) {
                    return null; // all pointers have been set
                }
                q = _nextByte() & 0xFF;
                // Since name char validity is checked later on, only do quickie lookup
                if (q < 65) { // 'A'
                    if (q < 45 || q > 58 || q == 47) {
                        return _findXmlDeclName(q, 0);
                    }
                }
                // fall through
            case 1:
                if (_inputPtr >= _inputEnd) { // need to store pointers
                    _currQuad = q;
                    _currQuadBytes = 1;
                    return null;
                }
                i = _nextByte() & 0xFF;
                if (i < 65) { // 'A'
                    if (i < 45 || i > 58 || i == 47) {
                        return _findXmlDeclName(q, 1);
                    }
                }
                q = (q << 8) | i;
                // fall through
            case 2:
                if (_inputPtr >= _inputEnd) { // need to store pointers
                    _currQuad = q;
                    _currQuadBytes = 2;
                    return null;
                }
                i = _nextByte() & 0xFF;
                if (i < 65) { // 'A'
                    if (i < 45 || i > 58 || i == 47) {
                        return _findXmlDeclName(q, 2);
                    }
                }
                q = (q << 8) | i;
                // fall through
            case 3:
                if (_inputPtr >= _inputEnd) { // need to store pointers
                    _currQuad = q;
                    _currQuadBytes = 3;
                    return null;
                }
                i = _nextByte() & 0xFF;
                if (i < 65) { // 'A'
                    if (i < 45 || i > 58 || i == 47) {
                        return _findXmlDeclName(q, 3);
                    }
                }
                q = (q << 8) | i;
            }

            // If we get this far, need to add full quad into result array and update state
            if (_quadCount == 0) { // first quad
                _quadBuffer[0] = q;
                _quadCount = 1;
            } else {
                if (_quadCount >= _quadBuffer.length) { // let's just double?
                    _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length);
                }
                _quadBuffer[_quadCount++] = q;
            }
            _currQuadBytes = 0;
        }
    }

    /**
     * Resolves the quads accumulated so far into a name. Names of up to three
     * quads are first looked up with {@code AsyncXmlDeclHelper}; on a match
     * the terminating byte is pushed back and the match returned. Otherwise
     * encoding is activated and lookup delegated to {@code findPName}.
     *
     * @param lastQuad Last, possibly partial, quad of the name
     * @param lastByteCount Number of bytes in {@code lastQuad}; when 0, the
     *   last quad is taken from the quad buffer instead
     */
    protected final PName _findXmlDeclName(int lastQuad, int lastByteCount) throws XMLStreamException
    {
        int qlen = _quadCount;
        // Also: if last quad is empty, will need take last from qbuf.
        if (lastByteCount == 0) {
            lastQuad = _quadBuffer[--qlen];
            // NOTE: do not change since we may need to delegate with original value,
            // and byte count not checked here
            // lastByteCount = 4;
        }
        // First things first: we are very likely to find one of short pseudo-attributes, so:
        PName pname;
        switch (qlen) {
        case 0: // 4-bytes or less; only has 'lastQuad' defined
            pname = AsyncXmlDeclHelper.find(lastQuad);
            break;
        case 1:
            pname = AsyncXmlDeclHelper.find(_quadBuffer[0], lastQuad);
            break;
        case 2:
            pname = AsyncXmlDeclHelper.find(_quadBuffer[0], _quadBuffer[1], lastQuad);
            break;
        default:
            pname = null;
        }
        if (pname != null) {
            // Need to push back the byte read but not used:
            --_inputPtr;
            return pname;
        }
        // Otherwise most likely a processing instruction instead of XML declaration. A few
        // ways we could deal with it, but for now let's finalize symbol table etc, delegate
        _activateEncoding();
        return findPName(lastQuad, lastByteCount);
    }

    /**
     * Method called to try to parse an XML pseudo-attribute value. This is relatively
     * simple, since we can't have linefeeds or entities; and although there are exact
     * rules for what is allowed, we can do coarse parsing and only later on verify
     * validity (for encoding could do stricter parsing in future?)
     *<p>
     * NOTE: pseudo-attribute values required to be 7-bit ASCII so can do crude cast.
     *
     * @param outputBuffer Buffer to append value characters to
     * @param outputPtr Offset in {@code outputBuffer} to start appending at
     *
     * @return True if we managed to parse the whole pseudo-attribute
     */
    protected boolean parseXmlDeclAttr(char[] outputBuffer, int outputPtr) throws XMLStreamException
    {
        final int quote = (int) _elemAttrQuote;
        while (_inputPtr < _inputEnd) {
            int ch = _nextByte() & 0xFF;
            if (ch == quote) {
                _textBuilder.setCurrentLength(outputPtr);
                return true;
            }
            // this is not exact check; but does work for all legal (valid) characters:
            if (ch <= INT_SPACE || ch > INT_z) {
                reportPrologUnexpChar(true, decodeCharForError((byte) ch), " (not valid in XML pseudo-attribute values)");
            }
            if (outputPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (char) ch;
        }
        _textBuilder.setCurrentLength(outputPtr);
        return false;
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy