All downloads are free. Search and download functionality uses the official Maven repository.

com.ctc.wstx.sr.ValidatingStreamReader Maven / Gradle / Ivy

Go to download

Woodstox is a high-performance XML processor that implements Stax (JSR-173) and SAX2 APIs

The newest version!
/* Woodstox XML processor
 *
 * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ctc.wstx.sr;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.*;

import javax.xml.stream.XMLStreamException;

import org.codehaus.stax2.XMLInputFactory2;
import org.codehaus.stax2.validation.*;

import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ErrorConsts;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.io.*;
import com.ctc.wstx.dtd.DTDId;
import com.ctc.wstx.dtd.DTDSubset;
import com.ctc.wstx.dtd.DTDValidatorBase;
import com.ctc.wstx.dtd.FullDTDReader;
import com.ctc.wstx.util.URLUtil;

/**
 * Implementation of {@link org.codehaus.stax2.XMLStreamReader2}
 * that builds on {@link TypedStreamReader} and adds full DTD-handling
 * including DTD validation
 *
 * @author Tatu Saloranta
 * @author Benson Margulies
 */
public class ValidatingStreamReader
    extends TypedStreamReader
{
    /*
    ///////////////////////////////////////////////////////////////////////
    // Constants for standard StAX properties:
    ///////////////////////////////////////////////////////////////////////
    */

    final static String STAX_PROP_ENTITIES = "javax.xml.stream.entities";

    final static String STAX_PROP_NOTATIONS = "javax.xml.stream.notations";

    /*
    ///////////////////////////////////////////////////////////////////////
    // Validation (DTD) information (entities, ...)
    ///////////////////////////////////////////////////////////////////////
     */

    // // // Note: some members that logically belong here, are actually
    // // // part of superclass

    /**
     * Combined DTD set, constructed from parsed internal and external
     * entities (which may have been set via override DTD functionality).
     */
    DTDValidationSchema mDTD = null;

    /**
     * Validating reader keeps of automatically created DTD-based
     * validator, since its handling may differ from that of application
     * managed validators.
     */
    XMLValidator mAutoDtdValidator = null;

    /**
     * Flag that indicates whether a DTD validator has been automatically
     * set (as per DOCTYPE declaration or override)
     */
    boolean mDtdValidatorSet = false;

    /**
     * Custom validation problem handler, if any.
     */
    protected ValidationProblemHandler mVldProbHandler = null;

    /*
    ///////////////////////////////////////////////////////////////////////
    // Life-cycle (ctors)
    ///////////////////////////////////////////////////////////////////////
     */

    private ValidatingStreamReader(InputBootstrapper bs,
                                   BranchingReaderSource input, ReaderCreator owner,
                                   ReaderConfig cfg, InputElementStack elemStack,
                                   boolean forER)
        throws XMLStreamException
    {
        super(bs, input, owner, cfg, elemStack, forER);
    }

    /**
     * Factory method for constructing readers.
     *
     * @param owner "Owner" of this reader, factory that created the reader;
     *   needed for returning updated symbol table information after parsing.
     * @param input Input source used to read the XML document.
     * @param cfg Object that contains reader configuration info.
     * @param bs Bootstrapper to use, for reading xml declaration etc.
     * @param forER True if this reader is to be (configured to be) used by
     *   an event reader. Will cause some changes to default settings, as
     *   required by contracts Woodstox XMLEventReader implementation has
     *   (with respect to lazy parsing, short text segments etc)
     */
    public static ValidatingStreamReader createValidatingStreamReader
        (BranchingReaderSource input, ReaderCreator owner,
         ReaderConfig cfg, InputBootstrapper bs, boolean forER)
        throws XMLStreamException
    {
        ValidatingStreamReader sr = new ValidatingStreamReader
            (bs, input, owner, cfg, createElementStack(cfg), forER);
        return sr;
    }

    /*
    ///////////////////////////////////////////////////////////////////////
    // Public API, configuration
    ///////////////////////////////////////////////////////////////////////
     */

    public Object getProperty(String name)
    {
        // DTD-specific properties...
        if (name.equals(STAX_PROP_ENTITIES)) {
            safeEnsureFinishToken();
            if (mDTD == null || !(mDTD instanceof DTDSubset)) {
                return null;
            }
            List l = ((DTDSubset) mDTD).getGeneralEntityList();
            /* Let's make a copy, so that caller can not modify
             * DTD's internal list instance
             */
            return new ArrayList(l);
        }
        if (name.equals(STAX_PROP_NOTATIONS)) {
            safeEnsureFinishToken();
            if (mDTD == null || !(mDTD instanceof DTDSubset)) {
                return null;
            }
            /* Let's make a copy, so that caller can not modify
             * DTD's internal list instance
             */
            List l = ((DTDSubset) mDTD).getNotationList();
            return new ArrayList(l);
        }
        return super.getProperty(name);
    }

    /*
    ///////////////////////////////////////////////////////////////////////
    // XMLStreamReader2 (StAX2) implementation
    ///////////////////////////////////////////////////////////////////////
     */

    // // // StAX2, per-reader configuration

    // no additional readable features
    //public Object getFeature(String name)

    public void setFeature(String name, Object value)
    {
        // Referring to DTD-related features?
        if (name.equals(FEATURE_DTD_OVERRIDE)) {
            /* !!! 06-Feb-2007, TSa: Null with 4.0 will actually mean
             *  'remove any overrides'; which is different from earlier
             *  meaning (which was use a dummy/empty override). 
             *  Should we throw an exception, or warn, or something...?
             */
            if (value != null && !(value instanceof DTDValidationSchema)) {
                throw new IllegalArgumentException("Value to set for feature "+name+" not of type DTDValidationSchema");
            }
            mConfig.setProperty(XMLInputFactory2.P_DTD_OVERRIDE, (DTDValidationSchema) value);
        } else {
            super.setFeature(name, value);
        }
    }

    /*
    ///////////////////////////////////////////////////////////////////////
    // DTDInfo implementation (StAX 2)
    ///////////////////////////////////////////////////////////////////////
     */

    public Object getProcessedDTD() {
        return getProcessedDTDSchema();
    }

    public DTDValidationSchema getProcessedDTDSchema() {
        DTDValidationSchema dtd = mConfig.getDTDOverride();
        if (dtd == null) {
            dtd = mDTD;
        }
        return mDTD;
    }

    /*
    ///////////////////////////////////////////////////////////////////////
    // Stax2 validation
    ///////////////////////////////////////////////////////////////////////
     */

    // @Override
    public XMLValidator validateAgainst(XMLValidationSchema schema)
        throws XMLStreamException
    {
        return mElementStack.validateAgainst(schema);
    }

    // @Override
    public XMLValidator stopValidatingAgainst(XMLValidationSchema schema)
        throws XMLStreamException
    {
        return mElementStack.stopValidatingAgainst(schema);
    }

    // @Override
    public XMLValidator stopValidatingAgainst(XMLValidator validator)
        throws XMLStreamException
    {
        return mElementStack.stopValidatingAgainst(validator);
    }

    // @Override
    public ValidationProblemHandler setValidationProblemHandler(ValidationProblemHandler h)
    {
        ValidationProblemHandler oldH = mVldProbHandler;
        mVldProbHandler = h;
        return oldH;
    }

    /*
    ///////////////////////////////////////////////////////////////////////
    // Private methods, DOCTYPE handling
    ///////////////////////////////////////////////////////////////////////
     */

    /**
     * This method gets called to handle remainder of DOCTYPE declaration,
     * essentially the optional internal subset. Internal subset, if such
     * exists, is always read, but whether its contents are added to the
     * read buffer depend on passed-in argument.
     *

* NOTE: Since this method overrides the default implementation, make * sure you do NOT change the method signature. * * @param copyContents If true, will copy contents of the internal * subset of DOCTYPE declaration * in the text buffer (in addition to parsing it for actual use); if * false, will only do parsing. */ protected void finishDTD(boolean copyContents) throws XMLStreamException { if (!hasConfigFlags(CFG_SUPPORT_DTD)) { super.finishDTD(copyContents); return; } /* We know there are no spaces, as this char was read and pushed * back earlier... */ char c = getNextChar(SUFFIX_IN_DTD); DTDSubset intSubset = null; /* Do we have an internal subset? Note that we have earlier checked * that it has to be either '[' or closing '>'. */ if (c == '[') { // Do we need to copy the contents of int. subset in the buffer? if (copyContents) { ((BranchingReaderSource) mInput).startBranch(mTextBuffer, mInputPtr, mNormalizeLFs); } try { intSubset = FullDTDReader.readInternalSubset(this, mInput, mConfig, hasConfigFlags(CFG_VALIDATE_AGAINST_DTD), mDocXmlVersion); } finally { /* Let's close branching in any and every case (may allow * graceful recovery in error cases in future */ if (copyContents) { /* Need to "push back" ']' got in the succesful case * (that's -1 part below); * in error case it'll just be whatever last char was. */ ((BranchingReaderSource) mInput).endBranch(mInputPtr-1); } } // And then we need closing '>' c = getNextCharAfterWS(SUFFIX_IN_DTD_INTERNAL); } if (c != '>') { throwUnexpectedChar(c, "; expected '>' to finish DOCTYPE declaration."); } /* But, then, we also may need to read the external subset, if * one was defined: */ /* 19-Sep-2004, TSa: That does not need to be done, however, if * there's a DTD override set. 
*/ mDTD = mConfig.getDTDOverride(); if (mDTD != null) { // We have earlier override that's already parsed } else { // Nope, no override DTDSubset extSubset = null; /* 05-Mar-2006, TSa: If standalone was specified as "yes", we * should not rely on any external declarations, so shouldn't * we really just skip the external subset? */ /* Alas: SAX (Xerces) still tries to read it... should we * do the Right Thing, or follow the leader? For now, let's * just follow the wrong example. */ //if (mDocStandalone != DOC_STANDALONE_YES) { if (true) { if (mDtdPublicId != null || mDtdSystemId != null) { extSubset = findDtdExtSubset(mDtdPublicId, mDtdSystemId, intSubset); } } if (intSubset == null) { mDTD = extSubset; } else if (extSubset == null) { mDTD = intSubset; } else { mDTD = intSubset.combineWithExternalSubset(this, extSubset); } } if (mDTD == null) { // only if specifically overridden not to have any mGeneralEntities = null; } else { if (mDTD instanceof DTDSubset) { mGeneralEntities = ((DTDSubset) mDTD).getGeneralEntityMap(); } else { /* Also, let's warn if using non-native DTD implementation, * since entities and notations can not be accessed */ _reportProblem(mConfig.getXMLReporter(), ErrorConsts.WT_DT_DECL, "Value to set for feature "+FEATURE_DTD_OVERRIDE+" not a native Woodstox DTD implementation (but "+mDTD.getClass()+"): can not access full entity or notation information", null); } /* 16-Jan-2006, TSa: Actually, we have both fully-validating mode, * and non-validating-but-DTD-aware mode. In latter case, we'll * still need to add a validator, but just to get type info * and to add attribute default values if necessary. 
*/ mAutoDtdValidator = mDTD.createValidator(/*(ValidationContext)*/ mElementStack); mDtdValidatorSet = true; // so we won't get nags NsDefaultProvider nsDefs = null; if (mAutoDtdValidator instanceof DTDValidatorBase) { DTDValidatorBase dtdv = (DTDValidatorBase) mAutoDtdValidator; dtdv.setAttrValueNormalization(true); // Do we have any attribute defaults for 'xmlns' or 'xmlns:*'? if (dtdv.hasNsDefaults()) { nsDefs = dtdv; } } mElementStack.setAutomaticDTDValidator(mAutoDtdValidator, nsDefs); } } /** * If there is an error handler established, call it. */ // @Override public void reportValidationProblem(XMLValidationProblem prob) throws XMLStreamException { if (mVldProbHandler != null) { // Fix for [WSTX-209] mVldProbHandler.reportProblem(prob); } else { super.reportValidationProblem(prob); } } /** * Method called right before handling the root element, by the base * class. This allows for some initialization and checks to be done * (not including ones that need access to actual element name) */ protected void initValidation() throws XMLStreamException { if (hasConfigFlags(CFG_VALIDATE_AGAINST_DTD) && !mDtdValidatorSet) { /* It's ok to miss it, but it may not be what caller wants. Either * way, let's pass the info and continue */ reportProblem(null, ErrorConsts.WT_DT_DECL, ErrorConsts.W_MISSING_DTD, null, null); } } /* /////////////////////////////////////////////////////////////////////// // Private methods, external subset access /////////////////////////////////////////////////////////////////////// */ /** * Method called by finishDTD, to locate the specified * external DTD subset. Subset may be obtained from a cache, if cached * copy exists and is compatible; if not, it will be read from the * source identified by the public and/or system identifier passed. 
*/ private DTDSubset findDtdExtSubset(String pubId, String sysId, DTDSubset intSubset) throws XMLStreamException { boolean cache = hasConfigFlags(CFG_CACHE_DTDS); DTDId dtdId; try { dtdId = constructDtdId(pubId, sysId); } catch (IOException ioe) { throw constructFromIOE(ioe); } if (cache) { DTDSubset extSubset = findCachedSubset(dtdId, intSubset); if (extSubset != null) { return extSubset; } } // No useful cached copy? Need to read it then. /* For now, we do require system identifier; otherwise we don't * know how to resolve DTDs by public id. In future should * probably also have some simple catalog resolving facility? */ if (sysId == null) { throwParseError("Can not resolve DTD with public id \"{0}\"; missing system identifier", mDtdPublicId, null); } WstxInputSource src = null; try { int xmlVersion = mDocXmlVersion; // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0 if (xmlVersion == XmlConsts.XML_V_UNKNOWN) { xmlVersion = XmlConsts.XML_V_10; } /* null -> no explicit path context, use parent's * null -> not an entity expansion, no name. * Note, too, that we can NOT just pass mEntityResolver, since * that's the one used for general entities, whereas ext subset * should be resolved by the param entity resolver. 
*/ src = DefaultInputResolver.resolveEntity (mInput, null, null, pubId, sysId, mConfig.getDtdResolver(), mConfig, xmlVersion); } catch (FileNotFoundException fex) { /* Let's catch and rethrow this just so we get more meaningful * description (with input source position etc) */ throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage()); } catch (IOException ioe) { throwFromIOE(ioe); } DTDSubset extSubset = FullDTDReader.readExternalSubset(src, mConfig, intSubset, hasConfigFlags(CFG_VALIDATE_AGAINST_DTD), mDocXmlVersion); if (cache) { /* Ok; can be cached, but only if it does NOT refer to * parameter entities defined in the internal subset (if * it does, there's no easy/efficient to check if it could * be used later on, plus it's unlikely it could be) */ if (extSubset.isCachable()) { mOwner.addCachedDTD(dtdId, extSubset); } } return extSubset; } private DTDSubset findCachedSubset(DTDId id, DTDSubset intSubset) throws XMLStreamException { DTDSubset extSubset = mOwner.findCachedDTD(id); /* Ok, now; can use the cached copy iff it does not refer to * any parameter entities internal subset (if one exists) * defines: */ if (extSubset != null) { if (intSubset == null || extSubset.isReusableWith(intSubset)) { return extSubset; } } return null; } /** * Method called to resolve path to external DTD subset, given * system identifier. */ private URI resolveExtSubsetPath(String systemId) throws IOException { // Do we have a context to use for resolving? URL ctxt = (mInput == null) ? null : mInput.getSource(); /* Ok, either got a context or not; let's create the URL based on * the id, and optional context: */ if (ctxt == null) { /* Call will try to figure out if system id has the protocol * in it; if not, create a relative file, if it does, try to * resolve it. */ return URLUtil.uriFromSystemId(systemId); } URL url = URLUtil.urlFromSystemId(systemId, ctxt); try { return new URI(url.toExternalForm()); } catch (URISyntaxException e) { // should never occur... 
throw new IOException("Failed to construct URI for external subset, URL = "+url.toExternalForm()+": "+e.getMessage()); } } protected DTDId constructDtdId(String pubId, String sysId) throws IOException { /* Following settings will change what gets stored as DTD, so * they need to separate cached instances too: */ int significantFlags = mConfigFlags & (CFG_NAMESPACE_AWARE /* Let's optimize non-validating case; DTD info we need * is less if so (no need to store content specs for one)... * plus, eventual functionality may be different too. */ | CFG_VALIDATE_AGAINST_DTD /* Also, whether we support dtd++ or not may change construction * of settings... (currently does not, but could) */ | CFG_SUPPORT_DTDPP /* Also, basic xml:id support does matter -- xml:id attribute * type is verified only if it's enabled */ | CFG_XMLID_TYPING ); URI sysRef = (sysId == null || sysId.length() == 0) ? null : resolveExtSubsetPath(sysId); /* 29-Mar-2006, TSa: Apparently public ids are not always very * unique and/or can be mismatched with system ids, resulting * in false matches if using public ids. As a result, by default * Woodstox does NOT rely on public ids, when matching. */ boolean usePublicId = (mConfigFlags & CFG_CACHE_DTDS_BY_PUBLIC_ID) != 0; if (usePublicId && pubId != null && pubId.length() > 0) { return DTDId.construct(pubId, sysRef, significantFlags, mXml11); } if (sysRef == null) { return null; } return DTDId.constructFromSystemId(sysRef, significantFlags, mXml11); } protected DTDId constructDtdId(URI sysId) throws IOException { int significantFlags = mConfigFlags & (CFG_NAMESPACE_AWARE /* Let's optimize non-validating case; DTD info we need * is less if so (no need to store content specs for one) */ | CFG_VALIDATE_AGAINST_DTD /* Also, whether we support dtd++ or not may change construction * of settings... 
(currently does not, but could) */ | CFG_SUPPORT_DTDPP ); return DTDId.constructFromSystemId(sysId, significantFlags, mXml11); } /* /////////////////////////////////////////////////////////////////////// // Private methods, DTD validation support /////////////////////////////////////////////////////////////////////// */ /** * Method called by lower-level parsing code when invalid content * (anything inside element with 'empty' content spec; text inside * non-mixed element etc) is found during basic scanning. Note * that actual DTD element structure problems are not reported * through this method. */ protected void reportInvalidContent(int evtType) throws XMLStreamException { switch (mVldContent) { case XMLValidator.CONTENT_ALLOW_NONE: reportValidationProblem(ErrorConsts.ERR_VLD_EMPTY, mElementStack.getTopElementDesc(), ErrorConsts.tokenTypeDesc(evtType)); break; case XMLValidator.CONTENT_ALLOW_WS: case XMLValidator.CONTENT_ALLOW_WS_NONSTRICT: // should this ever occur? reportValidationProblem(ErrorConsts.ERR_VLD_NON_MIXED, mElementStack.getTopElementDesc(), null); break; case XMLValidator.CONTENT_ALLOW_VALIDATABLE_TEXT: case XMLValidator.CONTENT_ALLOW_ANY_TEXT: /* Not 100% sure if this should ever happen... depends on * interpretation of 'any' content model? */ reportValidationProblem(ErrorConsts.ERR_VLD_ANY, mElementStack.getTopElementDesc(), ErrorConsts.tokenTypeDesc(evtType)); break; default: // should never occur: throwParseError("Internal error: trying to report invalid content for "+evtType); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy