com.fasterxml.aalto.async.AsyncByteScanner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
There is a newer version: 1.3.3
package com.fasterxml.aalto.async;

import java.io.IOException;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.AsyncInputFeeder;
import com.fasterxml.aalto.AsyncXMLStreamReader;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.in.ByteBasedPNameTable;
import com.fasterxml.aalto.in.ByteBasedScanner;
import com.fasterxml.aalto.in.PName;
import com.fasterxml.aalto.in.ReaderConfig;
import com.fasterxml.aalto.util.CharsetNames;
import com.fasterxml.aalto.util.DataUtil;

public abstract class AsyncByteScanner
    extends ByteBasedScanner
    implements AsyncInputFeeder
{
    protected final static int EVENT_INCOMPLETE = AsyncXMLStreamReader.EVENT_INCOMPLETE;

    /*
    /**********************************************************************
    /* State consts
    /**********************************************************************
     */

    /**
     * Default starting state for many events/contexts -- nothing has been
     * seen so far, no  event incomplete. Not used for all event types.
     */
    protected final static int STATE_DEFAULT = 0;

    // // // States for prolog/epilog major state:

    /**
     * State in which a less-than sign has been seen
     */
    protected final static int STATE_PROLOG_INITIAL = 1; // State before document when we may get xml declaration
    protected final static int STATE_PROLOG_SEEN_LT = 2; // "<" seen after xml declaration
    protected final static int STATE_PROLOG_DECL = 3; // "'
    protected final static int STATE_DTD_BEFORE_IDS = 6; // before "PUBLIC" or "SYSTEM" token
    protected final static int STATE_DTD_PUBLIC_OR_SYSTEM = 7; // parsing "PUBLIC" or "SYSTEM"
    protected final static int STATE_DTD_AFTER_PUBLIC = 8; // "PUBLIC" found, need space
    protected final static int STATE_DTD_AFTER_SYSTEM = 9; // "SYSTEM" found, need space
    protected final static int STATE_DTD_BEFORE_PUBLIC_ID = 10; // after "PUBLIC", space, need quoted public id
    protected final static int STATE_DTD_PUBLIC_ID = 11; // parsing public ID
    protected final static int STATE_DTD_AFTER_PUBLIC_ID = 12; // public ID parsed, need space
    protected final static int STATE_DTD_BEFORE_SYSTEM_ID = 13; // about to parse quoted system id
    protected final static int STATE_DTD_SYSTEM_ID = 14; // parsing system ID
    protected final static int STATE_DTD_AFTER_SYSTEM_ID = 15; // after system ID, optional space, '>' or int subset
    protected final static int STATE_DTD_INT_SUBSET = 16; // parsing internal subset

    protected final static int STATE_DTD_EXPECT_CLOSING_GT = 50; // ']' gotten that should be followed by '>'
    
    // For CHARACTERS, default is the basic (and only)

    // just seen "&"
    protected final static int STATE_TEXT_AMP = 4;
    // just seen "&#"
//    protected final static int STATE_TEXT_AMP_AND_HASH = 5;
    // seen '&' and partial name:
    protected final static int STATE_TEXT_AMP_NAME = 6;

    // For comments, STATE_DEFAULT means "
    protected final static int STATE_PI_AFTER_TARGET_WS = 2; // "
     * If so, this int will store byte(s), in little-endian format
     * (that is, first pending byte is at 0x000000FF, second [if any]
     * at 0x0000FF00, and third at 0x00FF0000). This can be
     * (and is) used to figure out actual number of bytes pending,
     * for multi-byte (UTF-8) character decoding.
     *
     * Note: it is assumed that if value is 0, there is no data.
     * Thus, if 0 needed to be added pending, it has to be masked.
     */
    protected int _pendingInput = 0;

    /**
     * Flag that is sent when calling application indicates that there will
     * be no more input to parse.
     */
    protected boolean _endOfInput = false;
    
    /*
    /**********************************************************************
    /* Name/entity parsing state
    /**********************************************************************
     */

    /**
     * Number of complete quads parsed for current name (quads
     * themselves are stored in {@link #_quadBuffer}).
     */
    protected int _quadCount;

    /**
     * Bytes parsed for the current, incomplete, quad
     */
    protected int _currQuad;

    /**
     * Number of bytes pending/buffered, stored in {@link #_currQuad}
     */
    protected int _currQuadBytes = 0;

    /**
     * Entity value accumulated so far
     */
    protected int _entityValue = 0;
    
    /*
    /**********************************************************************
    /* (Start) element parsing state
    /**********************************************************************
     */

    protected boolean _elemAllNsBound;

    protected boolean _elemAttrCount;

    protected byte _elemAttrQuote;

    protected PName _elemAttrName;

    /**
     * Pointer for the next character of currently being parsed value
     * within attribute value buffer
     */
    protected int _elemAttrPtr;

    /**
     * Pointer for the next character of currently being parsed namespace
     * URI for the current namespace declaration
     */
    protected int _elemNsPtr;

    /*
    /**********************************************************************
    /* Other state
    /**********************************************************************
     */

    /**
     * Flag that indicates whether we are inside a declaration during parsing
     * of internal DTD subset.
     */
    protected boolean _inDtdDeclaration;

    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */
    
    protected AsyncByteScanner(ReaderConfig cfg) {
        super(cfg);
    }

    @Override
    public void endOfInput() {
        _endOfInput = true;
    }
    
    /**
     * Since the async scanner has no access to whatever passes content,
     * there is no input source in same sense as with blocking scanner;
     * and there is nothing to close. But we can at least mark input
     * as having ended.
     */
    @Override
    protected void _closeSource() throws IOException
    {
        // nothing to do, we are done.
        _endOfInput = true;
    }

    /*
    /**********************************************************************
    /* Shared helper methods
    /**********************************************************************
     */

    protected void verifyAndSetXmlVersion() throws XMLStreamException
    {
        if (_textBuilder.equalsString("1.0")) {
            _config.setXmlVersion("1.0");
        } else if (_textBuilder.equalsString("1.1")) {
            _config.setXmlVersion("1.1");
        } else {
            reportInputProblem("Unrecognized XML version '"+_textBuilder.contentsAsString()+"' (expected '1.0' or '1.1')");
        }
    }

    protected void verifyAndSetXmlEncoding() throws XMLStreamException
    {
        String enc = CharsetNames.normalize(_textBuilder.contentsAsString());
        _config.setXmlEncoding(enc);
        /* 09-Feb-2011, tatu: For now, we will only accept UTF-8 and ASCII; could
         *   expand in future (Latin-1 should be doable)
         */
        if (CharsetNames.CS_UTF8 != enc && CharsetNames.CS_US_ASCII != enc) {
            reportInputProblem("Unsupported encoding '"+enc+"': only UTF-8 and US-ASCII support by async parser");
        }
    }

    protected void verifyAndSetXmlStandalone() throws XMLStreamException
    {
        if (_textBuilder.equalsString("yes")) {
            _config.setXmlStandalone(Boolean.TRUE);
        } else if (_textBuilder.equalsString("no")) {
            _config.setXmlStandalone(Boolean.FALSE);
        } else {
            reportInputProblem("Invalid standalone value '"+_textBuilder.contentsAsString()+"': can only use 'yes' and 'no'");
        }
    }

    protected void verifyAndSetPublicId() throws XMLStreamException {
        _publicId = _textBuilder.contentsAsString();
    }

    protected void verifyAndSetSystemId() throws XMLStreamException {
        _systemId = _textBuilder.contentsAsString();
    }

    /*
    /**********************************************************************
    /* Second-level parsing; character content (in tree)
    /**********************************************************************
     */

    @Override
    protected final void finishToken() throws XMLStreamException
    {
        _tokenIncomplete = false;
        switch (_currToken) {
        case PROCESSING_INSTRUCTION:
            finishPI();
            break;
        case CHARACTERS:
            finishCharacters();
            break;
        case COMMENT:
            finishComment();
            break;
        case SPACE:
            finishSpace();
            break;
        case DTD:
            finishDTD(true); // true -> get text
            break;
        case CDATA:
            finishCData();
            break;
        default:
            ErrorConsts.throwInternalError();
        }
    }
    
    /**
     * Method called to initialize state for CHARACTERS event, after
     * just a single byte has been seen. What needs to be done next
     * depends on whether coalescing mode is set or not: if it is not
     * set, just a single character needs to be decoded, after which
     * current event will be incomplete, but defined as CHARACTERS.
     * In coalescing mode, the whole content must be read before
     * current event can be defined. The reason for difference is
     * that when XMLStreamReader.next() returns, no
     * blocking can occur when calling other methods.
     *
     * @return Event type detected; either CHARACTERS, if at least
     *   one full character was decoded (and can be returned),
     *   EVENT_INCOMPLETE if not (part of a multi-byte character
     *   split across input buffer boundary)
     */
    protected abstract int startCharacters(byte b) throws XMLStreamException;

    protected abstract boolean handleAttrValue() throws XMLStreamException;

    protected abstract boolean handleNsDecl() throws XMLStreamException;

    /*
    /**********************************************************************
    /* Abstract methods from base class, parsing
    /**********************************************************************
     */


    @Override
    protected void finishCData() throws XMLStreamException
    {
        // N/A
        throwInternal();
    }

    @Override
    protected void finishComment() throws XMLStreamException
    {
        // N/A
        throwInternal();
    }

    @Override
    protected void finishDTD(boolean copyContents) throws XMLStreamException
    {
        // N/A
        throwInternal();
    }

    @Override
    protected void finishPI() throws XMLStreamException
    {
        // N/A
        throwInternal();
    }

    @Override
    protected void finishSpace() throws XMLStreamException
    {
        // N/A
        throwInternal();
    }

    // // token-skip methods

    /**
     * @return True if the whole characters segment was succesfully
     *   skipped; false if not
     */
    @Override
    protected abstract boolean skipCharacters()
        throws XMLStreamException;

    @Override
    protected void skipCData() throws XMLStreamException
    {
        // should never be called
        throwInternal();
    }

    @Override
    protected void skipComment() throws XMLStreamException
    {
        // should never be called
        throwInternal();
    }

    @Override
    protected void skipPI() throws XMLStreamException
    {
        // should never be called
        throwInternal();
    }

    @Override
    protected void skipSpace() throws XMLStreamException
    {
        // should never be called
        throwInternal();
    }

    @Override
    protected boolean loadMore() throws XMLStreamException
    {
        // should never get called
        throwInternal();
        return false; // never gets here
    }
    
    @Override
    protected abstract void finishCharacters() throws XMLStreamException;

    /*
    /**********************************************************************
    /* Internal methods, name decoding
    /**********************************************************************
     */

    /**
     * Method called to process a sequence of bytes that is likely to
     * be a PName. At this point we encountered an end marker, and
     * may either hit a formerly seen well-formed PName; an as-of-yet
     * unseen well-formed PName; or a non-well-formed sequence (containing
     * one or more non-name chars without any valid end markers).
     *
     * @param lastQuad Word with last 0 to 3 bytes of the PName; not included
     *   in the quad array
     * @param lastByteCount Number of bytes contained in lastQuad; 0 to 3.
     * @param firstQuad First 1 to 4 bytes of the PName (4 if length
     *    at least 4 bytes; less only if not). 
     * @param qlen Number of quads in the array, except if less than 2
     *    (in which case only firstQuad and lastQuad are used)
     * @param quads Array that contains all the quads, except for the
     *    last one, for names with more than 8 bytes (i.e. more than
     *    2 quads)
     */
    protected final PName findPName(int lastQuad, int lastByteCount) throws XMLStreamException
    {
        // First, need to push back the byte read but not used:
        --_inputPtr;
        int qlen = _quadCount;
        // Also: if last quad is empty, will need take last from qbuf.
        if (lastByteCount == 0) {
            lastQuad = _quadBuffer[--qlen];
            lastByteCount = 4;
        }
        // Separate handling for short names:
        if (qlen <= 1) { // short name?
            if (qlen == 0) { // 4-bytes or less; only has 'lastQuad' defined
                int hash = ByteBasedPNameTable.calcHash(lastQuad);
                PName name = _symbols.findSymbol(hash, lastQuad, 0);
                if (name == null) {
                    // Let's simplify things a bit, and just use array based one then:
                    _quadBuffer[0] = lastQuad;
                    name = addPName(hash, _quadBuffer, 1, lastByteCount);
                }
                return name;
            }
            int firstQuad = _quadBuffer[0];
            int hash = ByteBasedPNameTable.calcHash(firstQuad, lastQuad);
            PName name = _symbols.findSymbol(hash, firstQuad, lastQuad);
            if (name == null) {
                // As above, let's just use array, then
                _quadBuffer[1] = lastQuad;
                name = addPName(hash, _quadBuffer, 2, lastByteCount);
            }
            return name;
        }
        /* Nope, long (3 quads or more). At this point, the last quad is
         * not yet in the array, let's add:
         */
        if (qlen >= _quadBuffer.length) { // let's just double?
            _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length);
        }
        _quadBuffer[qlen++] = lastQuad;
        int hash = ByteBasedPNameTable.calcHash(_quadBuffer, qlen);
        PName name = _symbols.findSymbol(hash, _quadBuffer, qlen);
        if (name == null) {
            name = addPName(hash, _quadBuffer, qlen, lastByteCount);
        }
        return name;
    }

    /*
    /**********************************************************************
    /* Internal methods, input validation
    /**********************************************************************
     */

    /**
     * Method called to verify validity of given character (from entity) and
     * append it to the text buffer
     */
    protected void verifyAndAppendEntityCharacter(int charFromEntity) throws XMLStreamException
    {
        verifyXmlChar(charFromEntity);
        // Ok; does it need a surrogate though? (over 16 bits)
        if ((charFromEntity >> 16) != 0) {
            charFromEntity -= 0x10000;
            _textBuilder.append((char) (0xD800 | (charFromEntity >> 10)));
            charFromEntity = 0xDC00 | (charFromEntity & 0x3FF);
        }
        _textBuilder.append((char) charFromEntity);
    }

    /**
     * Checks that a character for a PublicId
     *
     * @param c A character
     * @return true if the character is valid for use in the Public ID
     * of an XML doctype declaration
     *
     * @see "http://www.w3.org/TR/xml/#NT-PubidLiteral"
     */
    protected boolean validPublicIdChar(int c) {
        return
            c == 0xA ||                     //
            c == 0xD ||                     //
            c == 0x20 ||                    //
            (c >= '0' && c <= '9') ||       //[0-9]
            (c >= '@' && c <= 'Z') ||       //@[A-Z]
            (c >= 'a' && c <= 'z') ||
            c == '!' ||
            (c >= 0x23 && c <= 0x25) ||     //#$%
            (c >= 0x27 && c <= 0x2F) ||     //'()*+,-./
            (c >= ':' && c <= ';') ||
            c == '=' ||
            c == '?' ||
            c == '_';
    }

    /*
    /**********************************************************************
    /* Internal methods, error handling
    /**********************************************************************
     */

    @Override
    protected int decodeCharForError(byte b) throws XMLStreamException {
        // !!! TBI
        return (int) b;
    }

    protected void checkPITargetName(PName targetName) throws XMLStreamException
    {
        String ln = targetName.getLocalName();
        if (ln.length() == 3 && ln.equalsIgnoreCase("xml") &&
            !targetName.hasPrefix()) {
            reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET);
        }
    }

    protected int throwInternal() {
        throw new IllegalStateException("Internal error: should never execute this code path");
    }

    protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException
    {
        _inputPtr = ptr;
        reportInvalidOther(mask);
    }
}