All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ctc.wstx.sr.StreamScanner Maven / Gradle / Ivy

Go to download

Woodstox is a high-performance XML processor that implements Stax (JSR-173) and SAX2 APIs

There is a newer version: 4.4.1
Show newest version
/* Woodstox XML processor
 *
 * Copyright (c) 2004- Tatu Saloranta, [email protected]
 *
 * Licensed under the License specified in file LICENSE, included with
 * the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ctc.wstx.sr;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import javax.xml.stream.Location;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLReporter;
import javax.xml.stream.XMLResolver;
import javax.xml.stream.XMLStreamException;

import org.codehaus.stax2.XMLReporter2;
import org.codehaus.stax2.XMLStreamLocation2;
import org.codehaus.stax2.validation.XMLValidationProblem;

import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ErrorConsts;
import com.ctc.wstx.cfg.InputConfigFlags;
import com.ctc.wstx.cfg.ParsingErrorMsgs;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.dtd.MinimalDTDReader;
import com.ctc.wstx.ent.EntityDecl;
import com.ctc.wstx.ent.IntEntity;
import com.ctc.wstx.exc.*;
import com.ctc.wstx.io.DefaultInputResolver;
import com.ctc.wstx.io.WstxInputData;
import com.ctc.wstx.io.WstxInputLocation;
import com.ctc.wstx.io.WstxInputSource;
import com.ctc.wstx.util.ExceptionUtil;
import com.ctc.wstx.util.SymbolTable;
import com.ctc.wstx.util.TextBuffer;

/**
 * Abstract base class that defines some basic functionality that all
 * Woodstox reader classes (main XML reader, DTD reader) extend from.
 */

public abstract class StreamScanner
    extends WstxInputData
    implements InputProblemReporter,
        InputConfigFlags, ParsingErrorMsgs
{

    // // // Some well-known chars:

    /**
     * Last (highest) char code of the three, LF, CR and NULL
     */
    public final static char CHAR_CR_LF_OR_NULL = (char) 13;

    public final static int INT_CR_LF_OR_NULL = 13;

    /**
     * Character that allows quick check of whether a char can potentially
     * be some kind of markup, WRT input stream processing;
     * has to contain linefeeds, &, < and > (">" only matters when
     * quoting text, as part of "]]>")
     */
    protected final static char CHAR_FIRST_PURE_TEXT = (char) ('>' + 1);


    /**
     * First character in Unicode (ie one with lowest id) that is legal
     * as part of a local name (all valid name chars minus ':'). Used
     * for doing quick check for local name end; usually name ends in
     * a whitespace or equals sign.
     */
    protected final static char CHAR_LOWEST_LEGAL_LOCALNAME_CHAR = '-';

    /*
    ///////////////////////////////////////////////////////////
    // Character validity constants, structs
    ///////////////////////////////////////////////////////////
     */

    /**
     * We will only use validity array for first 256 characters, mostly
     * because after those characters it's easier to do fairly simple
     * block checks.
     */
    private final static int VALID_CHAR_COUNT = 0x100;

    private final static byte NAME_CHAR_INVALID_B = (byte) 0;
    private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1;
    private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1;

    private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT];

    static {
        /* First, since all valid-as-first chars are also valid-as-other chars,
         * we'll initialize common chars:
         */
        sCharValidity['_'] = NAME_CHAR_ALL_VALID_B;
        for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
            sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B;
            sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B;
        }
        for (int i = 0xC0; i < 0xF6; ++i) { // not all are fully valid, but
            sCharValidity[i] = NAME_CHAR_ALL_VALID_B;
        }
        // ... now we can 'revert' ones not fully valid:
        sCharValidity[0xD7] = NAME_CHAR_INVALID_B;
        sCharValidity[0xF7] = NAME_CHAR_INVALID_B;

        /* And then we can proceed with ones only valid-as-other.
         */
        sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B;
        sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B;
        sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B;
        for (int i = '0'; i <= '9'; ++i) {
            sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B;
        }
    }

    /**
     * Public identifiers only use 7-bit ascii range.
     */
    private final static int VALID_PUBID_CHAR_COUNT = 0x80;
    private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT];
//    private final static byte PUBID_CHAR_INVALID_B = (byte) 0;
    private final static byte PUBID_CHAR_VALID_B = (byte) 1;
    static {
        for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
            sPubidValidity['A' + i] = PUBID_CHAR_VALID_B;
            sPubidValidity['a' + i] = PUBID_CHAR_VALID_B;
        }
        for (int i = '0'; i <= '9'; ++i) {
            sPubidValidity[i] = PUBID_CHAR_VALID_B;
        }

        // 3 main white space types are valid
        sPubidValidity[0x0A] = PUBID_CHAR_VALID_B;
        sPubidValidity[0x0D] = PUBID_CHAR_VALID_B;
        sPubidValidity[0x20] = PUBID_CHAR_VALID_B;

        // And many of punctuation/separator ascii chars too:
        sPubidValidity['-'] = PUBID_CHAR_VALID_B;
        sPubidValidity['\''] = PUBID_CHAR_VALID_B;
        sPubidValidity['('] = PUBID_CHAR_VALID_B;
        sPubidValidity[')'] = PUBID_CHAR_VALID_B;
        sPubidValidity['+'] = PUBID_CHAR_VALID_B;
        sPubidValidity[','] = PUBID_CHAR_VALID_B;
        sPubidValidity['.'] = PUBID_CHAR_VALID_B;
        sPubidValidity['/'] = PUBID_CHAR_VALID_B;
        sPubidValidity[':'] = PUBID_CHAR_VALID_B;
        sPubidValidity['='] = PUBID_CHAR_VALID_B;
        sPubidValidity['?'] = PUBID_CHAR_VALID_B;
        sPubidValidity[';'] = PUBID_CHAR_VALID_B;
        sPubidValidity['!'] = PUBID_CHAR_VALID_B;
        sPubidValidity['*'] = PUBID_CHAR_VALID_B;
        sPubidValidity['#'] = PUBID_CHAR_VALID_B;
        sPubidValidity['@'] = PUBID_CHAR_VALID_B;
        sPubidValidity['$'] = PUBID_CHAR_VALID_B;
        sPubidValidity['_'] = PUBID_CHAR_VALID_B;
        sPubidValidity['%'] = PUBID_CHAR_VALID_B;
    }

    /*
    ///////////////////////////////////////////////////////////
    // Basic configuration
    ///////////////////////////////////////////////////////////
     */

    /**
     * Copy of the configuration object passed by the factory.
     * Contains immutable settings for this reader (or in case
     * of DTD parsers, reader that uses it)
     */
    protected final ReaderConfig mConfig;

    // // // Various extracted settings:

    /**
     * If true, Reader is namespace aware, and should do basic checks
     * (usually enforcing limitations on having colons in names)
     */
    protected final boolean mCfgNsEnabled;

    // Extracted standard on/off settings:

    /**
     * note: left non-final on purpose: sub-class may need to modify
     * the default value after construction.
     */
    protected boolean mCfgReplaceEntities;

    /*
    ///////////////////////////////////////////////////////////
    // Symbol handling, if applicable
    ///////////////////////////////////////////////////////////
     */

    final SymbolTable mSymbols;

    /**
     * Local full name for the event, if it has one (note: element events
     * do NOT use this variable; those names are stored in element stack):
     * target for processing instructions.
     *

* Currently used for proc. instr. target, and entity name (at least * when current entity reference is null). *

* Note: this variable is generally not cleared, since it comes from * a symbol table, ie. this won't be the only reference. */ protected String mCurrName; /* /////////////////////////////////////////////////////////// // Input handling /////////////////////////////////////////////////////////// */ /** * Currently active input source; contains link to parent (nesting) input * sources, if any. */ protected WstxInputSource mInput; /** * Top-most input source this reader can use; due to input source * chaining, this is not necessarily the root of all input; for example, * external DTD subset reader's root input still has original document * input as its parent. */ protected final WstxInputSource mRootInput; /** * Custom resolver used to handle external entities that are to be expanded * by this reader (external param/general entity expander) */ XMLResolver mEntityResolver = null; /** * This is the current depth of the input stack (same as what input * element stack would return as its depth). * It is used to enforce input scope constraints for nesting of * elements (for xml reader) and dtd declaration (for dtd reader) * with regards to input block (entity expansion) boundaries. *

* Basically this value is compared to {@link #mInputTopDepth}, which * indicates what was the depth at the point where the currently active * input scope/block was started. */ protected int mCurrDepth = 0; protected int mInputTopDepth = 0; /** * Flag that indicates whether linefeeds in the input data are to * be normalized or not. * Xml specs mandate that the line feeds are only normalized * when they are from the external entities (main doc, external * general/parsed entities), so normalization has to be * suppressed when expanding internal general/parsed entities. */ protected boolean mNormalizeLFs; /* /////////////////////////////////////////////////////////// // Buffer(s) for local name(s) and text content /////////////////////////////////////////////////////////// */ /** * Temporary buffer used if local name can not be just directly * constructed from input buffer (name is on a boundary or such). */ protected char[] mNameBuffer = null; /* /////////////////////////////////////////////////////////// // Information about starting location of event // Reader is pointing to; updated on-demand /////////////////////////////////////////////////////////// */ // // // Location info at point when current token was started /** * Total number of characters read before start of current token. * For big (gigabyte-sized) sizes are possible, needs to be long, * unlike pointers and sizes related to in-memory buffers. */ protected long mTokenInputTotal = 0; /** * Input row on which current token starts, 1-based */ protected int mTokenInputRow = 1; /** * Column on input row that current token starts; 0-based (although * in the end it'll be converted to 1-based) */ protected int mTokenInputCol = 0; /* /////////////////////////////////////////////////////////// // XML document information (from doc decl if one // was found) common to all entities (main xml // document, external DTD subset) /////////////////////////////////////////////////////////// */ /** * Input stream encoding, if known (passed in, or determined by * auto-detection); null if not. */ String mDocInputEncoding = null; /** * Character encoding from xml declaration, if any; null if no * declaration, or it didn't specify encoding. */ String mDocXmlEncoding = null; /** * XML version as declared by the document; one of constants * from {@link XmlConsts} (like {@link XmlConsts#XML_V_10}). */ protected int mDocXmlVersion = XmlConsts.XML_V_UNKNOWN; /** * Cache of internal character entities; */ protected Map mCachedEntities; /** * Flag for whether or not character references should be treated as entities */ protected boolean mCfgTreatCharRefsAsEntities; /** * Entity reference stream currently points to. */ protected EntityDecl mCurrEntity; /* /////////////////////////////////////////////////////////// // Life-cycle /////////////////////////////////////////////////////////// */ /** * Constructor used when creating a complete new (main-level) reader that * does not share its input buffers or state with another reader. */ protected StreamScanner(WstxInputSource input, ReaderConfig cfg, XMLResolver res) { super(); mInput = input; // 17-Jun-2004, TSa: Need to know root-level input source mRootInput = input; mConfig = cfg; mSymbols = cfg.getSymbols(); int cf = cfg.getConfigFlags(); mCfgNsEnabled = (cf & CFG_NAMESPACE_AWARE) != 0; mCfgReplaceEntities = (cf & CFG_REPLACE_ENTITY_REFS) != 0; mNormalizeLFs = mConfig.willNormalizeLFs(); mInputBuffer = null; mInputPtr = mInputEnd = 0; mEntityResolver = res; mCfgTreatCharRefsAsEntities = mConfig.willTreatCharRefsAsEnts(); mCachedEntities = mCfgTreatCharRefsAsEntities ? new HashMap() : Collections.EMPTY_MAP; } /* /////////////////////////////////////////////////////////// // Package API /////////////////////////////////////////////////////////// */ /** * Method that returns location of the last character returned by this * reader; that is, location "one less" than the currently pointed to * location. */ protected WstxInputLocation getLastCharLocation() { return mInput.getLocation(mCurrInputProcessed + mInputPtr - 1, mCurrInputRow, mInputPtr - mCurrInputRowStart); } protected URL getSource() { return mInput.getSource(); } protected String getSystemId() { return mInput.getSystemId(); } /* /////////////////////////////////////////////////////////// // Partial LocationInfo implementation (not implemented // by this base class, but is by some sub-classes) /////////////////////////////////////////////////////////// */ /** * Returns location of last properly parsed token; as per StAX specs, * apparently needs to be the end of current event, which is the same * as the start of the following event (or EOF if that's next). */ public abstract Location getLocation(); public XMLStreamLocation2 getStartLocation() { // note: +1 is used as columns are 1-based... return mInput.getLocation(mTokenInputTotal, mTokenInputRow, mTokenInputCol + 1); } public XMLStreamLocation2 getCurrentLocation() { return mInput.getLocation(mCurrInputProcessed + mInputPtr, mCurrInputRow, mInputPtr - mCurrInputRowStart + 1); } /* /////////////////////////////////////////////////////////// // InputProblemReporter implementation /////////////////////////////////////////////////////////// */ public WstxException throwWfcException(String msg, boolean deferErrors) throws WstxException { WstxException ex = constructWfcException(msg); if (!deferErrors) { throw ex; } return ex; } public void throwParseError(String msg) throws XMLStreamException { throwParseError(msg, null, null); } /** * Throws generic parse error with specified message and current parsing * location. *

* Note: public access only because core code in other packages needs * to access it. */ public void throwParseError(String format, Object arg, Object arg2) throws XMLStreamException { String msg = (arg != null || arg2 != null) ? MessageFormat.format(format, new Object[] { arg, arg2 }) : format; throw constructWfcException(msg); } public void reportProblem(String probType, String format, Object arg, Object arg2) throws XMLStreamException { XMLReporter rep = mConfig.getXMLReporter(); if (rep != null) { _reportProblem(rep, probType, MessageFormat.format(format, new Object[] { arg, arg2 }), null); } } public void reportProblem(Location loc, String probType, String format, Object arg, Object arg2) throws XMLStreamException { XMLReporter rep = mConfig.getXMLReporter(); if (rep != null) { String msg = (arg != null || arg2 != null) ? MessageFormat.format(format, new Object[] { arg, arg2 }) : format; _reportProblem(rep, probType, msg, loc); } } protected void _reportProblem(XMLReporter rep, String probType, String msg, Location loc) throws XMLStreamException { if (loc == null) { loc = getLastCharLocation(); } _reportProblem(rep, new XMLValidationProblem(loc, msg, XMLValidationProblem.SEVERITY_ERROR, probType)); } protected void _reportProblem(XMLReporter rep, XMLValidationProblem prob) throws XMLStreamException { if (rep != null) { Location loc = prob.getLocation(); if (loc == null) { loc = getLastCharLocation(); prob.setLocation(loc); } // Backwards-compatibility fix: add non-null type, if missing: if (prob.getType() == null) { prob.setType(ErrorConsts.WT_VALIDATION); } // [WSTX-154]: was catching and dropping thrown exception: shouldn't. // [WTSX-157]: need to support XMLReporter2 if (rep instanceof XMLReporter2) { ((XMLReporter2) rep).report(prob); } else { rep.report(prob.getMessage(), prob.getType(), prob, loc); } } } /** *

* Note: this is the base implementation used for implementing * ValidationContext */ public void reportValidationProblem(XMLValidationProblem prob) throws XMLStreamException { // !!! TBI: Fail-fast vs. deferred modes? /* For now let's implement basic functionality: warnings get * reported via XMLReporter, errors and fatal errors result in * immediate exceptions. */ /* 27-May-2008, TSa: [WSTX-153] Above is incorrect: as per Stax * javadocs for XMLReporter, both warnings and non-fatal errors * (which includes all validation errors) should be reported via * XMLReporter interface, and only fatals should cause an * immediate stream exception (by-passing reporter) */ if (prob.getSeverity() > XMLValidationProblem.SEVERITY_ERROR) { throw WstxValidationException.create(prob); } XMLReporter rep = mConfig.getXMLReporter(); if (rep != null) { _reportProblem(rep, prob); } else { /* If no reporter, regular non-fatal errors are to be reported * as exceptions as well, for backwards compatibility */ if (prob.getSeverity() >= XMLValidationProblem.SEVERITY_ERROR) { throw WstxValidationException.create(prob); } } } public void reportValidationProblem(String msg, int severity) throws XMLStreamException { reportValidationProblem(new XMLValidationProblem(getLastCharLocation(), msg, severity)); } public void reportValidationProblem(String msg) throws XMLStreamException { reportValidationProblem(new XMLValidationProblem(getLastCharLocation(), msg, XMLValidationProblem.SEVERITY_ERROR)); } public void reportValidationProblem(Location loc, String msg) throws XMLStreamException { reportValidationProblem(new XMLValidationProblem(loc, msg)); } public void reportValidationProblem(String format, Object arg, Object arg2) throws XMLStreamException { reportValidationProblem(MessageFormat.format(format, new Object[] { arg, arg2 })); } /* /////////////////////////////////////////////////////////// // Other error reporting methods /////////////////////////////////////////////////////////// */ protected WstxException constructWfcException(String msg) { return new WstxParsingException(msg, getLastCharLocation()); } /** * Construct and return a {@link XMLStreamException} to throw * as a result of a failed Typed Access operation (but one not * caused by a Well-Formedness Constraint or Validation Constraint * problem) */ /* protected WstxException _constructTypeException(String msg) { // Hmmh. Should there be a distinct sub-type? return new WstxParsingException(msg, getLastCharLocation()); } */ protected WstxException constructFromIOE(IOException ioe) { return new WstxIOException(ioe); } protected WstxException constructNullCharException() { return new WstxUnexpectedCharException("Illegal character (NULL, unicode 0) encountered: not valid in any content", getLastCharLocation(), CHAR_NULL); } protected void throwUnexpectedChar(int i, String msg) throws WstxException { char c = (char) i; String excMsg = "Unexpected character "+getCharDesc(c)+msg; throw new WstxUnexpectedCharException(excMsg, getLastCharLocation(), c); } protected void throwNullChar() throws WstxException { throw constructNullCharException(); } protected void throwInvalidSpace(int i) throws WstxException { throwInvalidSpace(i, false); } protected WstxException throwInvalidSpace(int i, boolean deferErrors) throws WstxException { char c = (char) i; WstxException ex; if (c == CHAR_NULL) { ex = constructNullCharException(); } else { String msg = "Illegal character ("+getCharDesc(c)+")"; if (mXml11) { msg += " [note: in XML 1.1, it could be included via entity expansion]"; } ex = new WstxUnexpectedCharException(msg, getLastCharLocation(), c); } if (!deferErrors) { throw ex; } return ex; } protected void throwUnexpectedEOF(String msg) throws WstxException { throw new WstxEOFException("Unexpected EOF" +(msg == null ? "" : msg), getLastCharLocation()); } /** * Similar to {@link #throwUnexpectedEOF}, but only indicates ending * of an input block. Used when reading a token that can not span * input block boundaries (ie. can not continue past end of an * entity expansion). */ protected void throwUnexpectedEOB(String msg) throws WstxException { throw new WstxEOFException("Unexpected end of input block" +(msg == null ? "" : msg), getLastCharLocation()); } protected void throwFromIOE(IOException ioe) throws WstxException { throw new WstxIOException(ioe); } protected void throwFromStrE(XMLStreamException strex) throws WstxException { if (strex instanceof WstxException) { throw (WstxException) strex; } WstxException newEx = new WstxException(strex); ExceptionUtil.setInitCause(newEx, strex); throw newEx; } /** * Method called to report an error, when caller's signature only * allows runtime exceptions to be thrown. */ protected void throwLazyError(Exception e) { if (e instanceof XMLStreamException) { WstxLazyException.throwLazily((XMLStreamException) e); } ExceptionUtil.throwRuntimeException(e); } protected String tokenTypeDesc(int type) { return ErrorConsts.tokenTypeDesc(type); } /* /////////////////////////////////////////////////////////// // Input buffer handling /////////////////////////////////////////////////////////// */ /** * Returns current input source this source uses. *

* Note: public only because some implementations are on different * package. */ public final WstxInputSource getCurrentInput() { return mInput; } protected final int inputInBuffer() { return mInputEnd - mInputPtr; } protected final int getNext() throws XMLStreamException { if (mInputPtr >= mInputEnd) { if (!loadMore()) { return -1; } } return (int) mInputBuffer[mInputPtr++]; } /** * Similar to {@link #getNext}, but does not advance pointer * in input buffer. *

* Note: this method only peeks within current input source; * it does not close it and check nested input source (if any). * This is necessary when checking keywords, since they can never * cross input block boundary. */ protected final int peekNext() throws XMLStreamException { if (mInputPtr >= mInputEnd) { if (!loadMoreFromCurrent()) { return -1; } } return (int) mInputBuffer[mInputPtr]; } protected final char getNextChar(String errorMsg) throws XMLStreamException { if (mInputPtr >= mInputEnd) { loadMore(errorMsg); } return mInputBuffer[mInputPtr++]; } /** * Similar to {@link #getNextChar}, but will not read more characters * from parent input source(s) if the current input source doesn't * have more content. This is often needed to prevent "runaway" content, * such as comments that start in an entity but do not have matching * close marker inside entity; XML specification specifically states * such markup is not legal. */ protected final char getNextCharFromCurrent(String errorMsg) throws XMLStreamException { if (mInputPtr >= mInputEnd) { loadMoreFromCurrent(errorMsg); } return mInputBuffer[mInputPtr++]; } /** * Method that will skip through zero or more white space characters, * and return either the character following white space, or -1 to * indicate EOF (end of the outermost input source)/ */ protected final int getNextAfterWS() throws XMLStreamException { if (mInputPtr >= mInputEnd) { if (!loadMore()) { return -1; } } char c = mInputBuffer[mInputPtr++]; while (c <= CHAR_SPACE) { // Linefeed? if (c == '\n' || c == '\r') { skipCRLF(c); } else if (c != CHAR_SPACE && c != '\t') { throwInvalidSpace(c); } // Still a white space? if (mInputPtr >= mInputEnd) { if (!loadMore()) { return -1; } } c = mInputBuffer[mInputPtr++]; } return (int) c; } protected final char getNextCharAfterWS(String errorMsg) throws XMLStreamException { if (mInputPtr >= mInputEnd) { loadMore(errorMsg); } char c = mInputBuffer[mInputPtr++]; while (c <= CHAR_SPACE) { // Linefeed? if (c == '\n' || c == '\r') { skipCRLF(c); } else if (c != CHAR_SPACE && c != '\t') { throwInvalidSpace(c); } // Still a white space? if (mInputPtr >= mInputEnd) { loadMore(errorMsg); } c = mInputBuffer[mInputPtr++]; } return c; } protected final char getNextInCurrAfterWS(String errorMsg) throws XMLStreamException { return getNextInCurrAfterWS(errorMsg, getNextCharFromCurrent(errorMsg)); } protected final char getNextInCurrAfterWS(String errorMsg, char c) throws XMLStreamException { while (c <= CHAR_SPACE) { // Linefeed? if (c == '\n' || c == '\r') { skipCRLF(c); } else if (c != CHAR_SPACE && c != '\t') { throwInvalidSpace(c); } // Still a white space? if (mInputPtr >= mInputEnd) { loadMoreFromCurrent(errorMsg); } c = mInputBuffer[mInputPtr++]; } return c; } /** * Method called when a CR has been spotted in input; checks if next * char is LF, and if so, skips it. Note that next character has to * come from the current input source, to qualify; it can never come * from another (nested) input source. * * @return True, if passed in char is '\r' and next one is '\n'. */ protected final boolean skipCRLF(char c) throws XMLStreamException { boolean result; if (c == '\r' && peekNext() == '\n') { ++mInputPtr; result = true; } else { result = false; } ++mCurrInputRow; mCurrInputRowStart = mInputPtr; return result; } protected final void markLF() { ++mCurrInputRow; mCurrInputRowStart = mInputPtr; } protected final void markLF(int inputPtr) { ++mCurrInputRow; mCurrInputRowStart = inputPtr; } /** * Method to push back last character read; can only be called once, * that is, no more than one char can be guaranteed to be succesfully * returned. */ protected final void pushback() { --mInputPtr; } /* /////////////////////////////////////////////////////////// // Sub-class overridable input handling methods /////////////////////////////////////////////////////////// */ /** * Method called when an entity has been expanded (new input source * has been created). Needs to initialize location information and change * active input source. * * @param entityId Name of the entity being expanded */ protected void initInputSource(WstxInputSource newInput, boolean isExt, String entityId) throws XMLStreamException { mInput = newInput; // Let's make sure new input will be read next time input is needed: mInputPtr = 0; mInputEnd = 0; /* Plus, reset the input location so that'll be accurate for * error reporting etc. */ mInputTopDepth = mCurrDepth; mInput.initInputLocation(this, mCurrDepth); /* 21-Feb-2006, TSa: Linefeeds are NOT normalized when expanding * internal entities (XML, 2.11) */ if (isExt) { mNormalizeLFs = true; } else { mNormalizeLFs = false; } } /** * Method that will try to read one or more characters from currently * open input sources; closing input sources if necessary. * * @return true if reading succeeded (or may succeed), false if * we reached EOF. */ protected boolean loadMore() throws XMLStreamException { WstxInputSource input = mInput; do { /* Need to make sure offsets are properly updated for error * reporting purposes, and do this now while previous amounts * are still known. */ mCurrInputProcessed += mInputEnd; if (mCurrInputProcessed > mConfig.getMaxCharacters()) { throw new XMLStreamException("Character limit ("+mConfig.getMaxCharacters()+") exceeded"); } mCurrInputRowStart -= mInputEnd; int count; try { count = input.readInto(this); if (count > 0) { return true; } input.close(); } catch (IOException ioe) { throw constructFromIOE(ioe); } if (input == mRootInput) { /* Note: no need to check entity/input nesting in this * particular case, since it will be handled by higher level * parsing code (results in an unexpected EOF) */ return false; } WstxInputSource parent = input.getParent(); if (parent == null) { // sanity check! throwNullParent(input); } /* 13-Feb-2006, TSa: Ok, do we violate a proper nesting constraints * with this input block closure? */ if (mCurrDepth != input.getScopeId()) { handleIncompleteEntityProblem(input); } mInput = input = parent; input.restoreContext(this); mInputTopDepth = input.getScopeId(); /* 21-Feb-2006, TSa: Since linefeed normalization needs to be * suppressed for internal entity expansion, we may need to * change the state... */ if (!mNormalizeLFs) { mNormalizeLFs = !input.fromInternalEntity(); } // Maybe there are leftovers from that input in buffer now? } while (mInputPtr >= mInputEnd); return true; } protected final boolean loadMore(String errorMsg) throws XMLStreamException { if (!loadMore()) { throwUnexpectedEOF(errorMsg); } return true; } protected boolean loadMoreFromCurrent() throws XMLStreamException { // Need to update offsets properly mCurrInputProcessed += mInputEnd; mCurrInputRowStart -= mInputEnd; if (mCurrInputProcessed > mConfig.getMaxCharacters()) { throw new XMLStreamException("Character limit ("+mConfig.getMaxCharacters()+") exceeded"); } try { int count = mInput.readInto(this); return (count > 0); } catch (IOException ie) { throw constructFromIOE(ie); } } protected final boolean loadMoreFromCurrent(String errorMsg) throws XMLStreamException { if (!loadMoreFromCurrent()) { throwUnexpectedEOB(errorMsg); } return true; } /** * Method called to make sure current main-level input buffer has at * least specified number of characters available consequtively, * without having to call {@link #loadMore}. It can only be called * when input comes from main-level buffer; further, call can shift * content in input buffer, so caller has to flush any data still * pending. In short, caller has to know exactly what it's doing. :-) *

* Note: method does not check for any other input sources than the * current one -- if current source can not fulfill the request, a * failure is indicated. * * @return true if there's now enough data; false if not (EOF) */ protected boolean ensureInput(int minAmount) throws XMLStreamException { int currAmount = mInputEnd - mInputPtr; if (currAmount >= minAmount) { return true; } try { return mInput.readMore(this, minAmount); } catch (IOException ie) { throw constructFromIOE(ie); } } protected void closeAllInput(boolean force) throws XMLStreamException { WstxInputSource input = mInput; while (true) { try { if (force) { input.closeCompletely(); } else { input.close(); } } catch (IOException ie) { throw constructFromIOE(ie); } if (input == mRootInput) { break; } WstxInputSource parent = input.getParent(); if (parent == null) { // sanity check! throwNullParent(input); } mInput = input = parent; } } protected void throwNullParent(WstxInputSource curr) { throw new IllegalStateException(ErrorConsts.ERR_INTERNAL); //throw new IllegalStateException("Internal error: null parent for input source '"+curr+"'; should never occur (should have stopped at root input '"+mRootInput+"')."); } /* /////////////////////////////////////////////////////////// // Entity resolution /////////////////////////////////////////////////////////// */ /** * Method that tries to resolve a character entity, or (if caller so * specifies), a pre-defined internal entity (lt, gt, amp, apos, quot). * It will succeed iff: *

    *
  1. Entity in question is a simple character entity (either one of * 5 pre-defined ones, or using decimal/hex notation), AND *
  2. *
  3. Entity fits completely inside current input buffer. *
  4. *
* If so, character value of entity is returned. Character 0 is returned * otherwise; if so, caller needs to do full resolution. *

* Note: On entry we are guaranteed there are at least 3 more characters * in this buffer; otherwise we shouldn't be called. * * @param checkStd If true, will check pre-defined internal entities * (gt, lt, amp, apos, quot); if false, will only check actual * character entities. * * @return (Valid) character value, if entity is a character reference, * and could be resolved from current input buffer (does not span * buffer boundary); null char (code 0) if not (either non-char * entity, or spans input buffer boundary). */ protected int resolveSimpleEntity(boolean checkStd) throws XMLStreamException { char[] buf = mInputBuffer; int ptr = mInputPtr; char c = buf[ptr++]; // Numeric reference? if (c == '#') { c = buf[ptr++]; int value = 0; int inputLen = mInputEnd; if (c == 'x') { // hex while (ptr < inputLen) { c = buf[ptr++]; if (c == ';') { break; } value = value << 4; if (c <= '9' && c >= '0') { value += (c - '0'); } else if (c >= 'a' && c <= 'f') { value += (10 + (c - 'a')); } else if (c >= 'A' && c <= 'F') { value += (10 + (c - 'A')); } else { mInputPtr = ptr; // so error points to correct char throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); } /* Need to check for overflow; easiest to do right as * it happens... */ if (value > MAX_UNICODE_CHAR) { reportUnicodeOverflow(); } } } else { // numeric (decimal) while (c != ';') { if (c <= '9' && c >= '0') { value = (value * 10) + (c - '0'); // Overflow? if (value > MAX_UNICODE_CHAR) { reportUnicodeOverflow(); } } else { mInputPtr = ptr; // so error points to correct char throwUnexpectedChar(c, "; expected a decimal number."); } if (ptr >= inputLen) { break; } c = buf[ptr++]; } } /* We get here either if we got it all, OR if we ran out of * input in current buffer. */ if (c == ';') { // got the full thing mInputPtr = ptr; validateChar(value); return value; } /* If we ran out of input, need to just fall back, gets * resolved via 'full' resolution mechanism. */ } else if (checkStd) { /* Caller may not want to resolve these quite yet... * (when it wants separate events for non-char entities) */ if (c == 'a') { // amp or apos? c = buf[ptr++]; if (c == 'm') { // amp? if (buf[ptr++] == 'p') { if (ptr < mInputEnd && buf[ptr++] == ';') { mInputPtr = ptr; return '&'; } } } else if (c == 'p') { // apos? if (buf[ptr++] == 'o') { int len = mInputEnd; if (ptr < len && buf[ptr++] == 's') { if (ptr < len && buf[ptr++] == ';') { mInputPtr = ptr; return '\''; } } } } } else if (c == 'g') { // gt? if (buf[ptr++] == 't' && buf[ptr++] == ';') { mInputPtr = ptr; return '>'; } } else if (c == 'l') { // lt? if (buf[ptr++] == 't' && buf[ptr++] == ';') { mInputPtr = ptr; return '<'; } } else if (c == 'q') { // quot? if (buf[ptr++] == 'u' && buf[ptr++] == 'o') { int len = mInputEnd; if (ptr < len && buf[ptr++] == 't') { if (ptr < len && buf[ptr++] == ';') { mInputPtr = ptr; return '"'; } } } } } return 0; } /** * Method called to resolve character entities, and only character * entities (except that pre-defined char entities -- amp, apos, lt, * gt, quote -- MAY be "char entities" in this sense, depending on * arguments). * Otherwise it is to return the null char; if so, * the input pointer will point to the same point as when method * entered (char after ampersand), plus the ampersand itself is * guaranteed to be in the input buffer (so caller can just push it * back if necessary). *

* Most often this method is called when reader is not to expand * non-char entities automatically, but to return them as separate * events. *

* Main complication here is that we need to do 5-char lookahead. This * is problematic if chars are on input buffer boundary. This is ok * for the root level input buffer, but not for some nested buffers. * However, according to XML specs, such split entities are actually * illegal... so we can throw an exception in those cases. * * @param checkStd If true, will check pre-defined internal entities * (gt, lt, amp, apos, quot) as character entities; if false, will only * check actual 'real' character entities. * * @return (Valid) character value, if entity is a character reference, * and could be resolved from current input buffer (does not span * buffer boundary); null char (code 0) if not (either non-char * entity, or spans input buffer boundary). */ protected int resolveCharOnlyEntity(boolean checkStd) throws XMLStreamException { //int avail = inputInBuffer(); int avail = mInputEnd - mInputPtr; if (avail < 6) { // split entity, or buffer boundary /* Don't want to lose leading '&' (in case we can not expand * the entity), so let's push it back first */ --mInputPtr; /* Shortest valid reference would be 3 chars ('&a;'); which * would only be legal from an expanded entity... */ if (!ensureInput(6)) { avail = inputInBuffer(); if (avail < 3) { throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF); } } else { avail = 6; } // ... and now we can move pointer back as well: ++mInputPtr; } /* Ok, now we have one more character to check, and that's enough * to determine type decisively. */ char c = mInputBuffer[mInputPtr]; // A char reference? if (c == '#') { // yup ++mInputPtr; return resolveCharEnt(null); } // nope... except may be a pre-def? if (checkStd) { if (c == 'a') { char d = mInputBuffer[mInputPtr+1]; if (d == 'm') { if (avail >= 4 && mInputBuffer[mInputPtr+2] == 'p' && mInputBuffer[mInputPtr+3] == ';') { mInputPtr += 4; return '&'; } } else if (d == 'p') { if (avail >= 5 && mInputBuffer[mInputPtr+2] == 'o' && mInputBuffer[mInputPtr+3] == 's' && mInputBuffer[mInputPtr+4] == ';') { mInputPtr += 5; return '\''; } } } else if (c == 'l') { if (avail >= 3 && mInputBuffer[mInputPtr+1] == 't' && mInputBuffer[mInputPtr+2] == ';') { mInputPtr += 3; return '<'; } } else if (c == 'g') { if (avail >= 3 && mInputBuffer[mInputPtr+1] == 't' && mInputBuffer[mInputPtr+2] == ';') { mInputPtr += 3; return '>'; } } else if (c == 'q') { if (avail >= 5 && mInputBuffer[mInputPtr+1] == 'u' && mInputBuffer[mInputPtr+2] == 'o' && mInputBuffer[mInputPtr+3] == 't' && mInputBuffer[mInputPtr+4] == ';') { mInputPtr += 5; return '"'; } } } return 0; } /** * Reverse of {@link #resolveCharOnlyEntity}; will only resolve entity * if it is NOT a character entity (or pre-defined 'generic' entity; * amp, apos, lt, gt or quot). Only used in cases where entities * are to be separately returned unexpanded (in non-entity-replacing * mode); which means it's never called from dtd handler. */ protected EntityDecl resolveNonCharEntity() throws XMLStreamException { //int avail = inputInBuffer(); int avail = mInputEnd - mInputPtr; if (avail < 6) { // split entity, or buffer boundary /* Don't want to lose leading '&' (in case we can not expand * the entity), so let's push it back first */ --mInputPtr; /* Shortest valid reference would be 3 chars ('&a;'); which * would only be legal from an expanded entity... */ if (!ensureInput(6)) { avail = inputInBuffer(); if (avail < 3) { throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF); } } else { avail = 6; } // ... and now we can move pointer back as well: ++mInputPtr; } // We don't care about char entities: char c = mInputBuffer[mInputPtr]; if (c == '#') { return null; } /* 19-Aug-2004, TSa: Need special handling for pre-defined * entities; they are not counted as 'real' general parsed * entities, but more as character entities... */ // have chars at least up to mInputPtr+4 by now if (c == 'a') { char d = mInputBuffer[mInputPtr+1]; if (d == 'm') { if (avail >= 4 && mInputBuffer[mInputPtr+2] == 'p' && mInputBuffer[mInputPtr+3] == ';') { // If not automatically expanding: //return sEntityAmp; // mInputPtr += 4; return null; } } else if (d == 'p') { if (avail >= 5 && mInputBuffer[mInputPtr+2] == 'o' && mInputBuffer[mInputPtr+3] == 's' && mInputBuffer[mInputPtr+4] == ';') { return null; } } } else if (c == 'l') { if (avail >= 3 && mInputBuffer[mInputPtr+1] == 't' && mInputBuffer[mInputPtr+2] == ';') { return null; } } else if (c == 'g') { if (avail >= 3 && mInputBuffer[mInputPtr+1] == 't' && mInputBuffer[mInputPtr+2] == ';') { return null; } } else if (c == 'q') { if (avail >= 5 && mInputBuffer[mInputPtr+1] == 'u' && mInputBuffer[mInputPtr+2] == 'o' && mInputBuffer[mInputPtr+3] == 't' && mInputBuffer[mInputPtr+4] == ';') { return null; } } // Otherwise, let's just parse in generic way: ++mInputPtr; // since we already read the first letter String id = parseEntityName(c); mCurrName = id; return findEntity(id, null); } /** * Method that does full resolution of an entity reference, be it * character entity, internal entity or external entity, including * updating of input buffers, and depending on whether result is * a character entity (or one of 5 pre-defined entities), returns * char in question, or null character (code 0) to indicate it had * to change input source. * * @param allowExt If true, is allowed to expand external entities * (expanding text); if false, is not (expanding attribute value). * * @return Either single-character replacement (which is NOT to be * reparsed), or null char (0) to indicate expansion is done via * input source. */ protected int fullyResolveEntity(boolean allowExt) throws XMLStreamException { char c = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF); // Do we have a (numeric) character entity reference? if (c == '#') { // numeric final StringBuffer originalSurface = new StringBuffer("#"); int ch = resolveCharEnt(originalSurface); if (mCfgTreatCharRefsAsEntities) { final char[] originalChars = new char[originalSurface.length()]; originalSurface.getChars(0, originalSurface.length(), originalChars, 0); mCurrEntity = getIntEntity(ch, originalChars); return 0; } return ch; } String id = parseEntityName(c); // Perhaps we have a pre-defined char reference? c = id.charAt(0); /* * 16-May-2004, TSa: Should custom entities (or ones defined in int/ext subset) override * pre-defined settings for these? */ char d = CHAR_NULL; if (c == 'a') { // amp or apos? if (id.equals("amp")) { d = '&'; } else if (id.equals("apos")) { d = '\''; } } else if (c == 'g') { // gt? if (id.length() == 2 && id.charAt(1) == 't') { d = '>'; } } else if (c == 'l') { // lt? if (id.length() == 2 && id.charAt(1) == 't') { d = '<'; } } else if (c == 'q') { // quot? if (id.equals("quot")) { d = '"'; } } if (d != CHAR_NULL) { if (mCfgTreatCharRefsAsEntities) { final char[] originalChars = new char[id.length()]; id.getChars(0, id.length(), originalChars, 0); mCurrEntity = getIntEntity(d, originalChars); return 0; } return d; } final EntityDecl e = expandEntity(id, allowExt, null); if (mCfgTreatCharRefsAsEntities) { mCurrEntity = e; } return 0; } /** * Returns an entity (possibly from cache) for the argument character using the encoded * representation in mInputBuffer[entityStartPos ... mInputPtr-1]. */ protected EntityDecl getIntEntity(int ch, final char[] originalChars) { String cacheKey = new String(originalChars); IntEntity entity = (IntEntity) mCachedEntities.get(cacheKey); if (entity == null) { String repl; if (ch <= 0xFFFF) { repl = Character.toString((char) ch); } else { StringBuffer sb = new StringBuffer(2); ch -= 0x10000; sb.append((char) ((ch >> 10) + 0xD800)); sb.append((char) ((ch & 0x3FF) + 0xDC00)); repl = sb.toString(); } entity = IntEntity.create(new String(originalChars), repl); mCachedEntities.put(cacheKey, entity); } return entity; } /** * Helper method that will try to expand a parsed entity (parameter or * generic entity). *

* note: called by sub-classes (dtd parser), needs to be protected. * * @param id Name of the entity being expanded * @param allowExt Whether external entities can be expanded or not; if * not, and the entity to expand would be external one, an exception * will be thrown */ protected EntityDecl expandEntity(String id, boolean allowExt, Object extraArg) throws XMLStreamException { mCurrName = id; EntityDecl ed = findEntity(id, extraArg); if (ed == null) { /* 30-Sep-2005, TSa: As per [WSTX-5], let's only throw exception * if we have to resolve it (otherwise it's just best-effort, * and null is ok) */ /* 02-Oct-2005, TSa: Plus, [WSTX-4] adds "undeclared entity * resolver" */ if (mCfgReplaceEntities) { mCurrEntity = expandUnresolvedEntity(id); } return null; } if (!mCfgTreatCharRefsAsEntities || this instanceof MinimalDTDReader) { expandEntity(ed, allowExt); } return ed; } /** *

* note: defined as private for documentation, ie. it's just called * from within this class (not sub-classes), from one specific method * (see above) * * @param ed Entity to be expanded * @param allowExt Whether external entities are allowed or not. */ private void expandEntity(EntityDecl ed, boolean allowExt) throws XMLStreamException { String id = ed.getName(); /* Very first thing; we can immediately check if expanding * this entity would result in infinite recursion: */ if (mInput.isOrIsExpandedFrom(id)) { throwRecursionError(id); } /* Should not refer unparsed entities from attribute values * or text content (except via notation mechanism, but that's * not parsed here) */ if (!ed.isParsed()) { throwParseError("Illegal reference to unparsed external entity \"{0}\"", id, null); } // 28-Jun-2004, TSa: Do we support external entity expansion? boolean isExt = ed.isExternal(); if (isExt) { if (!allowExt) { // never ok in attribute value... throwParseError("Encountered a reference to external parsed entity \"{0}\" when expanding attribute value: not legal as per XML 1.0/1.1 #3.1", id, null); } if (!mConfig.willSupportExternalEntities()) { throwParseError("Encountered a reference to external entity \"{0}\", but stream reader has feature \"{1}\" disabled", id, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES); } } // First, let's give current context chance to save its stuff WstxInputSource oldInput = mInput; oldInput.saveContext(this); WstxInputSource newInput = null; try { newInput = ed.expand(oldInput, mEntityResolver, mConfig, mDocXmlVersion); } catch (FileNotFoundException fex) { /* Let's catch and rethrow this just so we get more meaningful * description (with input source position etc) */ throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage()); } catch (IOException ioe) { throw constructFromIOE(ioe); } /* And then we'll need to make sure new input comes from the new * input source */ initInputSource(newInput, isExt, id); } /** *

* note: only called from the local expandEntity() method */ private EntityDecl expandUnresolvedEntity(String id) throws XMLStreamException { XMLResolver resolver = mConfig.getUndeclaredEntityResolver(); if (resolver != null) { /* Ok, we can check for recursion here; but let's only do that * if there is any chance that it might get resolved by * the special resolver (it must have been resolved this way * earlier, too...) */ if (mInput.isOrIsExpandedFrom(id)) { throwRecursionError(id); } WstxInputSource oldInput = mInput; oldInput.saveContext(this); // null, null -> no public or system ids int xmlVersion = mDocXmlVersion; // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0 if (xmlVersion == XmlConsts.XML_V_UNKNOWN) { xmlVersion = XmlConsts.XML_V_10; } WstxInputSource newInput; try { newInput = DefaultInputResolver.resolveEntityUsing (oldInput, id, null, null, resolver, mConfig, xmlVersion); if (mCfgTreatCharRefsAsEntities) { return new IntEntity(WstxInputLocation.getEmptyLocation(), newInput.getEntityId(), newInput.getSource(), new char[]{}, WstxInputLocation.getEmptyLocation()); } } catch (IOException ioe) { throw constructFromIOE(ioe); } if (newInput != null) { // true -> is external initInputSource(newInput, true, id); return null; } } handleUndeclaredEntity(id); return null; } /* /////////////////////////////////////////////////////////// // Abstract methods for sub-classes to implement /////////////////////////////////////////////////////////// */ /** * Abstract method for sub-classes to implement, for finding * a declared general or parsed entity. * * @param id Identifier of the entity to find * @param arg Optional argument passed from caller; needed by DTD * reader. */ protected abstract EntityDecl findEntity(String id, Object arg) throws XMLStreamException; /** * This method gets called if a declaration for an entity was not * found in entity expanding mode (enabled by default for xml reader, * always enabled for dtd reader). */ protected abstract void handleUndeclaredEntity(String id) throws XMLStreamException; protected abstract void handleIncompleteEntityProblem(WstxInputSource closing) throws XMLStreamException; /* /////////////////////////////////////////////////////////// // Basic tokenization /////////////////////////////////////////////////////////// */ /** * Method that will parse name token (roughly equivalent to XML specs; * although bit lenier for more efficient handling); either uri prefix, * or local name. *

* Much of complexity in this method has to do with the intention to * try to avoid any character copies. In this optimal case algorithm * would be fairly simple. However, this only works if all data is * already in input buffer... if not, copy has to be made halfway * through parsing, and that complicates things. *

* One thing to note is that String returned has been canonicalized * and (if necessary) added to symbol table. It can thus be compared * against other such (usually id) Strings, with simple equality operator. * * @param c First character of the name; not yet checked for validity * * @return Canonicalized name String (which may have length 0, if * EOF or non-name-start char encountered) */ protected String parseLocalName(char c) throws XMLStreamException { /* Has to start with letter, or '_' (etc); we won't allow ':' as that * is taken as namespace separator; no use trying to optimize * heavily as it's 98% likely it is a valid char... */ if (!isNameStartChar(c)) { if (c == ':') { throwUnexpectedChar(c, " (missing namespace prefix?)"); } throwUnexpectedChar(c, " (expected a name start character)"); } int ptr = mInputPtr; int hash = (int) c; final int inputLen = mInputEnd; int startPtr = ptr-1; // already read previous char final char[] inputBuf = mInputBuffer; /* After which there may be zero or more name chars * we have to consider */ while (true) { if (ptr >= inputLen) { /* Ok, identifier may continue past buffer end, need * to continue with part 2 (separate method, as this is * not as common as having it all in buffer) */ mInputPtr = ptr; return parseLocalName2(startPtr, hash); } // Ok, we have the char... is it a name char? c = inputBuf[ptr]; if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { break; } if (!isNameChar(c)) { break; } hash = (hash * 31) + (int) c; ++ptr; } mInputPtr = ptr; return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash); } /** * Second part of name token parsing; called when name can continue * past input buffer end (so only part was read before calling this * method to read the rest). *

* Note that this isn't heavily optimized, on assumption it's not * called very often. */ protected String parseLocalName2(int start, int hash) throws XMLStreamException { int ptr = mInputEnd - start; // Let's assume fairly short names char[] outBuf = getNameBuffer(ptr+8); if (ptr > 0) { System.arraycopy(mInputBuffer, start, outBuf, 0, ptr); } int outLen = outBuf.length; while (true) { // note: names can not cross input block (entity) boundaries... if (mInputPtr >= mInputEnd) { if (!loadMoreFromCurrent()) { break; } } char c = mInputBuffer[mInputPtr]; if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { break; } if (!isNameChar(c)) { break; } ++mInputPtr; if (ptr >= outLen) { mNameBuffer = outBuf = expandBy50Pct(outBuf); outLen = outBuf.length; } outBuf[ptr++] = c; hash = (hash * 31) + (int) c; } // Still need to canonicalize the name: return mSymbols.findSymbol(outBuf, 0, ptr, hash); } /** * Method that will parse 'full' name token; what full means depends on * whether reader is namespace aware or not. If it is, full name means * local name with no namespace prefix (PI target, entity/notation name); * if not, name can contain arbitrary number of colons. Note that * element and attribute names are NOT parsed here, so actual namespace * prefix separation can be handled properly there. *

* Similar to {@link #parseLocalName}, much of complexity stems from * trying to avoid copying name characters from input buffer. *

* Note that returned String will be canonicalized, similar to * {@link #parseLocalName}, but without separating prefix/local name. * * @return Canonicalized name String (which may have length 0, if * EOF or non-name-start char encountered) */ protected String parseFullName() throws XMLStreamException { if (mInputPtr >= mInputEnd) { loadMoreFromCurrent(); } return parseFullName(mInputBuffer[mInputPtr++]); } protected String parseFullName(char c) throws XMLStreamException { // First char has special handling: if (!isNameStartChar(c)) { if (c == ':') { // no name.... generally an error: if (mCfgNsEnabled) { throwNsColonException(parseFNameForError()); } // Ok, that's fine actually } else { if (c <= CHAR_SPACE) { throwUnexpectedChar(c, " (missing name?)"); } throwUnexpectedChar(c, " (expected a name start character)"); } } int ptr = mInputPtr; int hash = (int) c; int inputLen = mInputEnd; int startPtr = ptr-1; // to account for the first char /* After which there may be zero or more name chars * we have to consider */ while (true) { if (ptr >= inputLen) { /* Ok, identifier may continue past buffer end, need * to continue with part 2 (separate method, as this is * not as common as having it all in buffer) */ mInputPtr = ptr; return parseFullName2(startPtr, hash); } c = mInputBuffer[ptr]; if (c == ':') { // colon only allowed in non-NS mode if (mCfgNsEnabled) { mInputPtr = ptr; throwNsColonException(new String(mInputBuffer, startPtr, ptr - startPtr) + parseFNameForError()); } } else { if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { break; } if (!isNameChar(c)) { break; } } hash = (hash * 31) + (int) c; ++ptr; } mInputPtr = ptr; return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash); } protected String parseFullName2(int start, int hash) throws XMLStreamException { int ptr = mInputEnd - start; // Let's assume fairly short names char[] outBuf = getNameBuffer(ptr+8); if (ptr > 0) { System.arraycopy(mInputBuffer, start, outBuf, 0, ptr); } int outLen = outBuf.length; while (true) { /* 06-Sep-2004, TSa: Name tokens are not allowed to continue * past entity expansion ranges... that is, all characters * have to come from the same input source. Thus, let's only * load things from same input level */ if (mInputPtr >= mInputEnd) { if (!loadMoreFromCurrent()) { break; } } char c = mInputBuffer[mInputPtr]; if (c == ':') { // colon only allowed in non-NS mode if (mCfgNsEnabled) { throwNsColonException(new String(outBuf, 0, ptr) + c + parseFNameForError()); } } else if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) { break; } else if (!isNameChar(c)) { break; } ++mInputPtr; if (ptr >= outLen) { mNameBuffer = outBuf = expandBy50Pct(outBuf); outLen = outBuf.length; } outBuf[ptr++] = c; hash = (hash * 31) + (int) c; } // Still need to canonicalize the name: return mSymbols.findSymbol(outBuf, 0, ptr, hash); } /** * Method called to read in full name, including unlimited number of * namespace separators (':'), for the purpose of displaying name in * an error message. Won't do any further validations, and parsing * is not optimized: main need is just to get more meaningful error * messages. */ protected String parseFNameForError() throws XMLStreamException { StringBuffer sb = new StringBuffer(100); while (true) { char c; if (mInputPtr < mInputEnd) { c = mInputBuffer[mInputPtr++]; } else { // can't error here, so let's accept EOF for now: int i = getNext(); if (i < 0) { break; } c = (char) i; } if (c != ':' && !isNameChar(c)) { --mInputPtr; break; } sb.append(c); } return sb.toString(); } protected final String parseEntityName(char c) throws XMLStreamException { String id = parseFullName(c); // Needs to be followed by a semi-colon, too.. from same input source: if (mInputPtr >= mInputEnd) { if (!loadMoreFromCurrent()) { throwParseError("Missing semicolon after reference for entity \"{0}\"", id, null); } } c = mInputBuffer[mInputPtr++]; if (c != ';') { throwUnexpectedChar(c, "; expected a semi-colon after the reference for entity '"+id+"'"); } return id; } /** * Note: does not check for number of colons, amongst other things. * Main idea is to skip through what superficially seems like a valid * id, nothing more. This is only done when really skipping through * something we do not care about at all: not even whether names/ids * would be valid (for example, when ignoring internal DTD subset). * * @return Length of skipped name. */ protected int skipFullName(char c) throws XMLStreamException { if (!isNameStartChar(c)) { --mInputPtr; return 0; } /* After which there may be zero or more name chars * we have to consider */ int count = 1; while (true) { c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] : getNextChar(SUFFIX_EOF_EXP_NAME); if (c != ':' && !isNameChar(c)) { break; } ++count; } return count; } /** * Simple parsing method that parses system ids, which are generally * used in entities (from DOCTYPE declaration to internal/external * subsets). *

* NOTE: returned String is not canonicalized, on assumption that * external ids may be longish, and are not shared all that often, as * they are generally just used for resolving paths, if anything. *
* Also note that this method is not heavily optimized, as it's not * likely to be a bottleneck for parsing. */ protected final String parseSystemId(char quoteChar, boolean convertLFs, String errorMsg) throws XMLStreamException { char[] buf = getNameBuffer(-1); int ptr = 0; while (true) { char c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] : getNextChar(errorMsg); if (c == quoteChar) { break; } /* ??? 14-Jun-2004, TSa: Should we normalize linefeeds or not? * It seems like we should, for all input... so that's the way it * works. */ if (c == '\n') { markLF(); } else if (c == '\r') { if (peekNext() == '\n') { ++mInputPtr; if (!convertLFs) { /* The only tricky thing; need to preserve 2-char LF; need to * output one char from here, then can fall back to default: */ if (ptr >= buf.length) { buf = expandBy50Pct(buf); } buf[ptr++] = '\r'; } c = '\n'; } else if (convertLFs) { c = '\n'; } } // Other than that, let's just append it: if (ptr >= buf.length) { buf = expandBy50Pct(buf); } buf[ptr++] = c; } return (ptr == 0) ? "" : new String(buf, 0, ptr); } /** * Simple parsing method that parses system ids, which are generally * used in entities (from DOCTYPE declaration to internal/external * subsets). *

* As per xml specs, the contents are actually normalized. *

* NOTE: returned String is not canonicalized, on assumption that * external ids may be longish, and are not shared all that often, as * they are generally just used for resolving paths, if anything. *
* Also note that this method is not heavily optimized, as it's not * likely to be a bottleneck for parsing. */ protected final String parsePublicId(char quoteChar, String errorMsg) throws XMLStreamException { char[] buf = getNameBuffer(-1); int ptr = 0; boolean spaceToAdd = false; while (true) { char c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] : getNextChar(errorMsg); if (c == quoteChar) { break; } if (c == '\n') { markLF(); spaceToAdd = true; continue; } else if (c == '\r') { if (peekNext() == '\n') { ++mInputPtr; } spaceToAdd = true; continue; } else if (c == CHAR_SPACE) { spaceToAdd = true; continue; } else { // Verify it's a legal pubid char (see XML spec, #13, from 2.3) if ((c >= VALID_PUBID_CHAR_COUNT) || sPubidValidity[c] != PUBID_CHAR_VALID_B) { throwUnexpectedChar(c, " in public identifier"); } } // Other than that, let's just append it: if (ptr >= buf.length) { buf = expandBy50Pct(buf); } /* Space-normalization means scrapping leading and trailing * white space, and coalescing remaining ws into single spaces. */ if (spaceToAdd) { // pending white space to add? if (c == CHAR_SPACE) { // still a space; let's skip continue; } /* ok: if we have non-space, we'll either forget about * space(s) (if nothing has been output, ie. leading space), * or output a single space (in-between non-white space) */ spaceToAdd = false; if (ptr > 0) { buf[ptr++] = CHAR_SPACE; if (ptr >= buf.length) { buf = expandBy50Pct(buf); } } } buf[ptr++] = c; } return (ptr == 0) ? "" : new String(buf, 0, ptr); } protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs, String errorMsg) throws XMLStreamException { // Let's first ensure we have some data in there... if (mInputPtr >= mInputEnd) { loadMore(errorMsg); } while (true) { // Let's loop consequtive 'easy' spans: char[] inputBuf = mInputBuffer; int inputLen = mInputEnd; int ptr = mInputPtr; int startPtr = ptr; while (ptr < inputLen) { char c = inputBuf[ptr++]; if (c == endChar) { int thisLen = ptr - startPtr - 1; if (thisLen > 0) { tb.append(inputBuf, startPtr, thisLen); } mInputPtr = ptr; return; } if (c == '\n') { mInputPtr = ptr; // markLF() requires this markLF(); } else if (c == '\r') { if (!convertLFs && ptr < inputLen) { if (inputBuf[ptr] == '\n') { ++ptr; } mInputPtr = ptr; markLF(); } else { int thisLen = ptr - startPtr - 1; if (thisLen > 0) { tb.append(inputBuf, startPtr, thisLen); } mInputPtr = ptr; c = getNextChar(errorMsg); if (c != '\n') { --mInputPtr; // pusback tb.append(convertLFs ? '\n' : '\r'); } else { if (convertLFs) { tb.append('\n'); } else { tb.append('\r'); tb.append('\n'); } } startPtr = ptr = mInputPtr; markLF(); } } } int thisLen = ptr - startPtr; if (thisLen > 0) { tb.append(inputBuf, startPtr, thisLen); } loadMore(errorMsg); startPtr = ptr = mInputPtr; inputBuf = mInputBuffer; inputLen = mInputEnd; } } /* /////////////////////////////////////////////////////////// // Internal methods /////////////////////////////////////////////////////////// */ private int resolveCharEnt(StringBuffer originalCharacters) throws XMLStreamException { int value = 0; char c = getNextChar(SUFFIX_IN_ENTITY_REF); if (originalCharacters != null) { originalCharacters.append(c); } if (c == 'x') { // hex while (true) { c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF); if (c == ';') { break; } if (originalCharacters != null) { originalCharacters.append(c); } value = value << 4; if (c <= '9' && c >= '0') { value += (c - '0'); } else if (c >= 'a' && c <= 'f') { value += 10 + (c - 'a'); } else if (c >= 'A' && c <= 'F') { value += 10 + (c - 'A'); } else { throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); } // Overflow? if (value > MAX_UNICODE_CHAR) { reportUnicodeOverflow(); } } } else { // numeric (decimal) while (c != ';') { if (c <= '9' && c >= '0') { value = (value * 10) + (c - '0'); // Overflow? if (value > MAX_UNICODE_CHAR) { reportUnicodeOverflow(); } } else { throwUnexpectedChar(c, "; expected a decimal number."); } c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF); if (originalCharacters != null && c != ';') { originalCharacters.append(c); } } } validateChar(value); return value; } /** * Method that will verify that expanded Unicode codepoint is a valid * XML content character. */ private final void validateChar(int value) throws XMLStreamException { /* 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic, * need to be reported by a surrogate pair.. */ if (value >= 0xD800) { if (value < 0xE000) { // no surrogates via entity expansion reportIllegalChar(value); } if (value > 0xFFFF) { // Within valid range at all? if (value > MAX_UNICODE_CHAR) { reportUnicodeOverflow(); } } else if (value >= 0xFFFE) { // 0xFFFE and 0xFFFF are illegal too reportIllegalChar(value); } // Ok, fine as is } else if (value < 32) { if (value == 0) { throwParseError("Invalid character reference: null character not allowed in XML content."); } // XML 1.1 allows most other chars; 1.0 does not: if (!mXml11 && (value != 0x9 && value != 0xA && value != 0xD)) { reportIllegalChar(value); } } } protected final char[] getNameBuffer(int minSize) { char[] buf = mNameBuffer; if (buf == null) { mNameBuffer = buf = new char[(minSize > 48) ? (minSize+16) : 64]; } else if (minSize >= buf.length) { // let's allow one char extra... int len = buf.length; len += (len >> 1); // grow by 50% mNameBuffer = buf = new char[(minSize >= len) ? (minSize+16) : len]; } return buf; } protected final char[] expandBy50Pct(char[] buf) { int len = buf.length; char[] newBuf = new char[len + (len >> 1)]; System.arraycopy(buf, 0, newBuf, 0, len); return newBuf; } /** * Method called to throw an exception indicating that a name that * should not be namespace-qualified (PI target, entity/notation name) * is one, and reader is namespace aware. */ private void throwNsColonException(String name) throws XMLStreamException { throwParseError("Illegal name \"{0}\" (PI target, entity/notation name): can not contain a colon (XML Namespaces 1.0#6)", name, null); } private void throwRecursionError(String entityName) throws XMLStreamException { throwParseError("Illegal entity expansion: entity \"{0}\" expands itself recursively.", entityName, null); } private void reportUnicodeOverflow() throws XMLStreamException { throwParseError("Illegal character entity: value higher than max allowed (0x{0})", Integer.toHexString(MAX_UNICODE_CHAR), null); } private void reportIllegalChar(int value) throws XMLStreamException { throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy