All downloads are free. Search and download functionality uses the official Maven repository.

com.fasterxml.aalto.async.AsyncByteScanner Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
package com.fasterxml.aalto.async;

import java.io.IOException;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.AsyncInputFeeder;
import com.fasterxml.aalto.AsyncXMLStreamReader;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.in.ByteBasedPNameTable;
import com.fasterxml.aalto.in.ByteBasedScanner;
import com.fasterxml.aalto.in.PName;
import com.fasterxml.aalto.in.ReaderConfig;
import com.fasterxml.aalto.util.CharsetNames;
import com.fasterxml.aalto.util.DataUtil;

public abstract class AsyncByteScanner
    extends ByteBasedScanner
    implements AsyncInputFeeder
{
    protected final static int EVENT_INCOMPLETE = AsyncXMLStreamReader.EVENT_INCOMPLETE;

    /*
    /**********************************************************************
    /* State consts
    /**********************************************************************
     */

    /**
     * Default starting state for many events/contexts -- nothing has been
     * seen so far, no  event incomplete. Not used for all event types.
     */
    protected final static int STATE_DEFAULT = 0;

    // // // States for prolog/epilog major state:

    /**
     * State in which a less-than sign has been seen
     */
    protected final static int STATE_PROLOG_INITIAL = 1; // State before document when we may get xml declaration
    protected final static int STATE_PROLOG_SEEN_LT = 2; // "<" seen after xml declaration
    protected final static int STATE_PROLOG_DECL = 3; // "'
    protected final static int STATE_DTD_BEFORE_IDS = 6; // before "PUBLIC" or "SYSTEM" token
    protected final static int STATE_DTD_PUBLIC_OR_SYSTEM = 7; // parsing "PUBLIC" or "SYSTEM"
    protected final static int STATE_DTD_AFTER_PUBLIC = 8; // "PUBLIC" found, need space
    protected final static int STATE_DTD_AFTER_SYSTEM = 9; // "SYSTEM" found, need space
    protected final static int STATE_DTD_BEFORE_PUBLIC_ID = 10; // after "PUBLIC", space, need quoted public id
    protected final static int STATE_DTD_PUBLIC_ID = 11; // parsing public ID
    protected final static int STATE_DTD_AFTER_PUBLIC_ID = 12; // public ID parsed, need space
    protected final static int STATE_DTD_BEFORE_SYSTEM_ID = 13; // about to parse quoted system id
    protected final static int STATE_DTD_SYSTEM_ID = 14; // parsing system ID
    protected final static int STATE_DTD_AFTER_SYSTEM_ID = 15; // after system ID, optional space, '>' or int subset
    protected final static int STATE_DTD_INT_SUBSET = 16; // parsing internal subset

    protected final static int STATE_DTD_EXPECT_CLOSING_GT = 50; // ']' gotten that should be followed by '>'
    
    // For CHARACTERS, default is the basic (and only)

    // just seen "&"
    protected final static int STATE_TEXT_AMP = 4;
    // just seen "&#"
//    protected final static int STATE_TEXT_AMP_AND_HASH = 5;
    // seen '&' and partial name:
    protected final static int STATE_TEXT_AMP_NAME = 6;

    // For comments, STATE_DEFAULT means "
    protected final static int STATE_PI_AFTER_TARGET_WS = 2; // "
     * If so, this int will store byte(s), in little-endian format
     * (that is, first pending byte is at 0x000000FF, second [if any]
     * at 0x0000FF00, and third at 0x00FF0000). This can be
     * (and is) used to figure out actual number of bytes pending,
     * for multi-byte (UTF-8) character decoding.
     *

* Note: it is assumed that if value is 0, there is no data. * Thus, if 0 needed to be added pending, it has to be masked. */ protected int _pendingInput = 0; /** * Flag that is sent when calling application indicates that there will * be no more input to parse. */ protected boolean _endOfInput = false; /* /********************************************************************** /* Name/entity parsing state /********************************************************************** */ /** * Number of complete quads parsed for current name (quads * themselves are stored in {@link #_quadBuffer}). */ protected int _quadCount; /** * Bytes parsed for the current, incomplete, quad */ protected int _currQuad; /** * Number of bytes pending/buffered, stored in {@link #_currQuad} */ protected int _currQuadBytes = 0; /** * Entity value accumulated so far */ protected int _entityValue = 0; /* /********************************************************************** /* (Start) element parsing state /********************************************************************** */ protected boolean _elemAllNsBound; protected boolean _elemAttrCount; protected byte _elemAttrQuote; protected PName _elemAttrName; /** * Pointer for the next character of currently being parsed value * within attribute value buffer */ protected int _elemAttrPtr; /** * Pointer for the next character of currently being parsed namespace * URI for the current namespace declaration */ protected int _elemNsPtr; /* /********************************************************************** /* Other state /********************************************************************** */ /** * Flag that indicates whether we are inside a declaration during parsing * of internal DTD subset. 
*/ protected boolean _inDtdDeclaration; /* /********************************************************************** /* Life-cycle /********************************************************************** */ protected AsyncByteScanner(ReaderConfig cfg) { super(cfg); } @Override public void endOfInput() { _endOfInput = true; } /** * Since the async scanner has no access to whatever passes content, * there is no input source in same sense as with blocking scanner; * and there is nothing to close. But we can at least mark input * as having ended. */ @Override protected void _closeSource() throws IOException { // nothing to do, we are done. _endOfInput = true; } /* /********************************************************************** /* Shared helper methods /********************************************************************** */ protected void verifyAndSetXmlVersion() throws XMLStreamException { if (_textBuilder.equalsString("1.0")) { _config.setXmlVersion("1.0"); } else if (_textBuilder.equalsString("1.1")) { _config.setXmlVersion("1.1"); } else { reportInputProblem("Unrecognized XML version '"+_textBuilder.contentsAsString()+"' (expected '1.0' or '1.1')"); } } protected void verifyAndSetXmlEncoding() throws XMLStreamException { String enc = CharsetNames.normalize(_textBuilder.contentsAsString()); _config.setXmlEncoding(enc); /* 09-Feb-2011, tatu: For now, we will only accept UTF-8 and ASCII; could * expand in future (Latin-1 should be doable) */ if (CharsetNames.CS_UTF8 != enc && CharsetNames.CS_US_ASCII != enc) { reportInputProblem("Unsupported encoding '"+enc+"': only UTF-8 and US-ASCII support by async parser"); } } protected void verifyAndSetXmlStandalone() throws XMLStreamException { if (_textBuilder.equalsString("yes")) { _config.setXmlStandalone(Boolean.TRUE); } else if (_textBuilder.equalsString("no")) { _config.setXmlStandalone(Boolean.FALSE); } else { reportInputProblem("Invalid standalone value '"+_textBuilder.contentsAsString()+"': can only use 'yes' and 
'no'"); } } protected void verifyAndSetPublicId() throws XMLStreamException { _publicId = _textBuilder.contentsAsString(); } protected void verifyAndSetSystemId() throws XMLStreamException { _systemId = _textBuilder.contentsAsString(); } /* /********************************************************************** /* Second-level parsing; character content (in tree) /********************************************************************** */ @Override protected final void finishToken() throws XMLStreamException { _tokenIncomplete = false; switch (_currToken) { case PROCESSING_INSTRUCTION: finishPI(); break; case CHARACTERS: finishCharacters(); break; case COMMENT: finishComment(); break; case SPACE: finishSpace(); break; case DTD: finishDTD(true); // true -> get text break; case CDATA: finishCData(); break; default: ErrorConsts.throwInternalError(); } } /** * Method called to initialize state for CHARACTERS event, after * just a single byte has been seen. What needs to be done next * depends on whether coalescing mode is set or not: if it is not * set, just a single character needs to be decoded, after which * current event will be incomplete, but defined as CHARACTERS. * In coalescing mode, the whole content must be read before * current event can be defined. The reason for difference is * that when XMLStreamReader.next() returns, no * blocking can occur when calling other methods. 
* * @return Event type detected; either CHARACTERS, if at least * one full character was decoded (and can be returned), * EVENT_INCOMPLETE if not (part of a multi-byte character * split across input buffer boundary) */ protected abstract int startCharacters(byte b) throws XMLStreamException; protected abstract boolean handleAttrValue() throws XMLStreamException; protected abstract boolean handleNsDecl() throws XMLStreamException; /* /********************************************************************** /* Abstract methods from base class, parsing /********************************************************************** */ @Override protected void finishCData() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishComment() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishDTD(boolean copyContents) throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishPI() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishSpace() throws XMLStreamException { // N/A throwInternal(); } // // token-skip methods /** * @return True if the whole characters segment was succesfully * skipped; false if not */ @Override protected abstract boolean skipCharacters() throws XMLStreamException; @Override protected void skipCData() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipComment() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipPI() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipSpace() throws XMLStreamException { // should never be called throwInternal(); } @Override protected boolean loadMore() throws XMLStreamException { // should never get called throwInternal(); return false; // never gets here } @Override protected abstract void finishCharacters() throws XMLStreamException; /* 
/********************************************************************** /* Internal methods, name decoding /********************************************************************** */ /** * Method called to process a sequence of bytes that is likely to * be a PName. At this point we encountered an end marker, and * may either hit a formerly seen well-formed PName; an as-of-yet * unseen well-formed PName; or a non-well-formed sequence (containing * one or more non-name chars without any valid end markers). * * @param lastQuad Word with last 0 to 3 bytes of the PName; not included * in the quad array * @param lastByteCount Number of bytes contained in lastQuad; 0 to 3. * @param firstQuad First 1 to 4 bytes of the PName (4 if length * at least 4 bytes; less only if not). * @param qlen Number of quads in the array, except if less than 2 * (in which case only firstQuad and lastQuad are used) * @param quads Array that contains all the quads, except for the * last one, for names with more than 8 bytes (i.e. more than * 2 quads) */ protected final PName findPName(int lastQuad, int lastByteCount) throws XMLStreamException { // First, need to push back the byte read but not used: --_inputPtr; int qlen = _quadCount; // Also: if last quad is empty, will need take last from qbuf. if (lastByteCount == 0) { lastQuad = _quadBuffer[--qlen]; lastByteCount = 4; } // Separate handling for short names: if (qlen <= 1) { // short name? 
if (qlen == 0) { // 4-bytes or less; only has 'lastQuad' defined int hash = ByteBasedPNameTable.calcHash(lastQuad); PName name = _symbols.findSymbol(hash, lastQuad, 0); if (name == null) { // Let's simplify things a bit, and just use array based one then: _quadBuffer[0] = lastQuad; name = addPName(hash, _quadBuffer, 1, lastByteCount); } return name; } int firstQuad = _quadBuffer[0]; int hash = ByteBasedPNameTable.calcHash(firstQuad, lastQuad); PName name = _symbols.findSymbol(hash, firstQuad, lastQuad); if (name == null) { // As above, let's just use array, then _quadBuffer[1] = lastQuad; name = addPName(hash, _quadBuffer, 2, lastByteCount); } return name; } /* Nope, long (3 quads or more). At this point, the last quad is * not yet in the array, let's add: */ if (qlen >= _quadBuffer.length) { // let's just double? _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[qlen++] = lastQuad; int hash = ByteBasedPNameTable.calcHash(_quadBuffer, qlen); PName name = _symbols.findSymbol(hash, _quadBuffer, qlen); if (name == null) { name = addPName(hash, _quadBuffer, qlen, lastByteCount); } return name; } /* /********************************************************************** /* Internal methods, input validation /********************************************************************** */ /** * Method called to verify validity of given character (from entity) and * append it to the text buffer */ protected void verifyAndAppendEntityCharacter(int charFromEntity) throws XMLStreamException { verifyXmlChar(charFromEntity); // Ok; does it need a surrogate though? 
(over 16 bits) if ((charFromEntity >> 16) != 0) { charFromEntity -= 0x10000; _textBuilder.append((char) (0xD800 | (charFromEntity >> 10))); charFromEntity = 0xDC00 | (charFromEntity & 0x3FF); } _textBuilder.append((char) charFromEntity); } /** * Checks that a character for a PublicId * * @param c A character * @return true if the character is valid for use in the Public ID * of an XML doctype declaration * * @see "http://www.w3.org/TR/xml/#NT-PubidLiteral" */ protected boolean validPublicIdChar(int c) { return c == 0xA || // c == 0xD || // c == 0x20 || // (c >= '0' && c <= '9') || //[0-9] (c >= '@' && c <= 'Z') || //@[A-Z] (c >= 'a' && c <= 'z') || c == '!' || (c >= 0x23 && c <= 0x25) || //#$% (c >= 0x27 && c <= 0x2F) || //'()*+,-./ (c >= ':' && c <= ';') || c == '=' || c == '?' || c == '_'; } /* /********************************************************************** /* Internal methods, error handling /********************************************************************** */ @Override protected int decodeCharForError(byte b) throws XMLStreamException { // !!! TBI return (int) b; } protected void checkPITargetName(PName targetName) throws XMLStreamException { String ln = targetName.getLocalName(); if (ln.length() == 3 && ln.equalsIgnoreCase("xml") && !targetName.hasPrefix()) { reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET); } } protected int throwInternal() { throw new IllegalStateException("Internal error: should never execute this code path"); } protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException { _inputPtr = ptr; reportInvalidOther(mask); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy