// com.ctc.wstx.sr.ValidatingStreamReader Maven / Gradle / Ivy
/* Woodstox XML processor
*
* Copyright (c) 2004- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ctc.wstx.sr;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.*;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.NotationDeclaration;
import org.codehaus.stax2.XMLInputFactory2;
import org.codehaus.stax2.validation.*;
import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ErrorConsts;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.io.*;
import com.ctc.wstx.dtd.DTDId;
import com.ctc.wstx.dtd.DTDSubset;
import com.ctc.wstx.dtd.DTDValidatorBase;
import com.ctc.wstx.dtd.FullDTDReader;
import com.ctc.wstx.ent.EntityDecl;
import com.ctc.wstx.util.URLUtil;
/**
* Implementation of {@link org.codehaus.stax2.XMLStreamReader2}
* that builds on {@link TypedStreamReader} and adds full DTD-handling
* including DTD validation
*
* @author Tatu Saloranta
* @author Benson Margulies
*/
public class ValidatingStreamReader
extends TypedStreamReader
{
/*
///////////////////////////////////////////////////////////////////////
// Constants for standard StAX properties:
///////////////////////////////////////////////////////////////////////
*/
/**
 * Name of the property for which {@link #getProperty} returns a copy of
 * the general entity list of the processed DTD.
 */
final static String STAX_PROP_ENTITIES = "javax.xml.stream.entities";
/**
 * Name of the property for which {@link #getProperty} returns a copy of
 * the notation list of the processed DTD.
 */
final static String STAX_PROP_NOTATIONS = "javax.xml.stream.notations";
/*
///////////////////////////////////////////////////////////////////////
// Validation (DTD) information (entities, ...)
///////////////////////////////////////////////////////////////////////
*/
// // // Note: some members that logically belong here, are actually
// // // part of superclass
/**
 * DTD in effect for the document, assigned by <code>finishDTD</code>:
 * either the override DTD from configuration, or the parsed internal
 * subset, the external subset, or the combination of the two.
 * Remains null until a DOCTYPE declaration has been processed.
 */
DTDValidationSchema mDTD = null;
/**
 * Validating reader keeps track of the automatically created DTD-based
 * validator, since its handling may differ from that of application
 * managed validators.
 */
XMLValidator mAutoDtdValidator = null;
/**
 * Flag that indicates whether a DTD validator has been automatically
 * set (as per DOCTYPE declaration or override); checked by
 * <code>initValidation</code> to decide whether to warn about a
 * missing DTD.
 */
boolean mDtdValidatorSet = false;
/**
 * Custom validation problem handler, if any. When non-null, validation
 * problems are passed to it instead of the default handling of the
 * super class.
 */
protected ValidationProblemHandler mVldProbHandler = null;
/*
///////////////////////////////////////////////////////////////////////
// Life-cycle (ctors)
///////////////////////////////////////////////////////////////////////
*/
/**
 * Private constructor; instances are created through
 * {@link #createValidatingStreamReader}. All arguments are passed
 * as-is to the {@link TypedStreamReader} constructor.
 */
private ValidatingStreamReader(InputBootstrapper bs,
BranchingReaderSource input, ReaderCreator owner,
ReaderConfig cfg, InputElementStack elemStack,
boolean forER)
throws XMLStreamException
{
super(bs, input, owner, cfg, elemStack, forER);
}
/**
 * Static factory used for creating validating reader instances. The
 * element stack the reader needs is built from the given configuration
 * before the reader itself is constructed.
 *
 * @param input Input source used to read the XML document.
 * @param owner "Owner" of this reader, factory that created the reader;
 *   needed for returning updated symbol table information after parsing.
 * @param cfg Object that contains reader configuration info.
 * @param bs Bootstrapper to use, for reading xml declaration etc.
 * @param forER True if this reader is to be (configured to be) used by
 *   an event reader. Will cause some changes to default settings, as
 *   required by contracts Woodstox XMLEventReader implementation has
 *   (with respect to lazy parsing, short text segments etc)
 *
 * @return Newly constructed reader
 */
public static ValidatingStreamReader createValidatingStreamReader
    (BranchingReaderSource input, ReaderCreator owner,
     ReaderConfig cfg, InputBootstrapper bs, boolean forER)
    throws XMLStreamException
{
    final InputElementStack elemStack = createElementStack(cfg);
    return new ValidatingStreamReader(bs, input, owner, cfg, elemStack, forER);
}
/*
///////////////////////////////////////////////////////////////////////
// Public API, configuration
///////////////////////////////////////////////////////////////////////
*/
/**
 * Adds handling of the two DTD-specific StAX properties
 * ({@link #STAX_PROP_ENTITIES} and {@link #STAX_PROP_NOTATIONS}) on
 * top of the properties the super class knows about.
 *
 * @param name Name of the property to access
 *
 * @return For the two DTD properties, a fresh copy of the matching
 *   declaration list (so that caller can not modify DTD's internal list
 *   instance), or null if there is no DTD or it is not a native
 *   {@link DTDSubset}; for any other name, whatever the super class
 *   returns
 */
@Override
public Object getProperty(String name)
{
    // DTD-specific properties...
    final boolean wantEntities = name.equals(STAX_PROP_ENTITIES);
    if (wantEntities || name.equals(STAX_PROP_NOTATIONS)) {
        safeEnsureFinishToken();
        // instanceof is false for null, so no separate null check needed
        if (!(mDTD instanceof DTDSubset)) {
            return null;
        }
        final DTDSubset subset = (DTDSubset) mDTD;
        final List<?> decls = wantEntities
            ? subset.getGeneralEntityList()
            : subset.getNotationList();
        /* Let's make a copy, so that caller can not modify
         * DTD's internal list instance
         */
        return new ArrayList<Object>(decls);
    }
    return super.getProperty(name);
}
/*
///////////////////////////////////////////////////////////////////////
// XMLStreamReader2 (StAX2) implementation
///////////////////////////////////////////////////////////////////////
*/
// // // StAX2, per-reader configuration
/*
///////////////////////////////////////////////////////////////////////
// DTDInfo implementation (StAX 2)
///////////////////////////////////////////////////////////////////////
*/
/**
 * Untyped accessor for the DTD; simply returns whatever
 * {@link #getProcessedDTDSchema} returns.
 */
@Override
public Object getProcessedDTD() {
return getProcessedDTDSchema();
}
/**
 * Accessor for the DTD schema in effect for this reader.
 *
 * @return The override DTD from the reader configuration, if one has
 *   been set; otherwise the DTD constructed when the DOCTYPE declaration
 *   was processed (null if none has been processed)
 */
@Override
public DTDValidationSchema getProcessedDTDSchema() {
    final DTDValidationSchema override = mConfig.getDTDOverride();
    // Earlier code computed this fallback but then returned mDTD regardless
    return (override != null) ? override : mDTD;
}
/*
///////////////////////////////////////////////////////////////////////
// Stax2 validation
///////////////////////////////////////////////////////////////////////
*/
/**
 * Starts validation against the given schema; the work is delegated
 * to the element stack, and its result is returned as is.
 */
@Override
public XMLValidator validateAgainst(XMLValidationSchema schema)
throws XMLStreamException
{
return mElementStack.validateAgainst(schema);
}
/**
 * Stops validation against the given schema; the work is delegated
 * to the element stack, and its result is returned as is.
 */
@Override
public XMLValidator stopValidatingAgainst(XMLValidationSchema schema)
throws XMLStreamException
{
return mElementStack.stopValidatingAgainst(schema);
}
/**
 * Stops validation done by the given validator instance; the work is
 * delegated to the element stack, and its result is returned as is.
 */
@Override
public XMLValidator stopValidatingAgainst(XMLValidator validator)
throws XMLStreamException
{
return mElementStack.stopValidatingAgainst(validator);
}
/**
 * Replaces the custom validation problem handler.
 *
 * @param h Handler to use from now on; null removes the custom handler
 *
 * @return Handler that was in use before this call, if any
 */
@Override
public ValidationProblemHandler setValidationProblemHandler(ValidationProblemHandler h)
{
    final ValidationProblemHandler previous = mVldProbHandler;
    mVldProbHandler = h;
    return previous;
}
/*
///////////////////////////////////////////////////////////////////////
// Private methods, DOCTYPE handling
///////////////////////////////////////////////////////////////////////
*/
/**
 * This method gets called to handle remainder of DOCTYPE declaration,
 * essentially the optional internal subset. Internal subset, if such
 * exists, is always read, but whether its contents are added to the
 * read buffer depends on passed-in argument.
 *<p>
 * When DTD support is disabled (<code>CFG_SUPPORT_DTD</code> not set),
 * handling is left to the super class. Otherwise the method, in order:
 * reads the internal subset (if any); checks for the closing '&gt;';
 * determines the DTD to use (override from configuration, or internal
 * and/or external subset); updates the general entity map; and finally
 * creates and installs the automatic DTD validator.
 *<p>
 * NOTE: Since this method overrides the default implementation, make
 * sure you do NOT change the method signature.
 *
 * @param copyContents If true, will copy contents of the internal
 *   subset of DOCTYPE declaration
 *   in the text buffer (in addition to parsing it for actual use); if
 *   false, will only do parsing.
 *
 * @throws XMLStreamException For problems reported while reading or
 *   parsing the subsets
 */
@Override
protected void finishDTD(boolean copyContents)
throws XMLStreamException
{
// No DTD support? Then the default (super class) handling is used as is
if (!hasConfigFlags(CFG_SUPPORT_DTD)) {
super.finishDTD(copyContents);
return;
}
/* We know there are no spaces, as this char was read and pushed
 * back earlier...
 */
char c = getNextChar(SUFFIX_IN_DTD);
DTDSubset intSubset = null;
/* Do we have an internal subset? Note that we have earlier checked
 * that it has to be either '[' or closing '>'.
 */
if (c == '[') {
// Do we need to copy the contents of int. subset in the buffer?
if (copyContents) {
((BranchingReaderSource) mInput).startBranch(mTextBuffer, mInputPtr, mNormalizeLFs);
}
try {
intSubset = FullDTDReader.readInternalSubset(this, mInput, mConfig,
hasConfigFlags(CFG_VALIDATE_AGAINST_DTD),
mDocXmlVersion);
} finally {
/* Let's close branching in any and every case (may allow
 * graceful recovery in error cases in future)
 */
if (copyContents) {
/* Need to "push back" ']' got in the successful case
 * (that's -1 part below);
 * in error case it'll just be whatever last char was.
 */
((BranchingReaderSource) mInput).endBranch(mInputPtr-1);
}
}
// And then we need closing '>'
c = getNextCharAfterWS(SUFFIX_IN_DTD_INTERNAL);
}
if (c != '>') {
throwUnexpectedChar(c, "; expected '>' to finish DOCTYPE declaration.");
}
/* But, then, we also may need to read the external subset, if
 * one was defined:
 */
/* 19-Sep-2004, TSa: That does not need to be done, however, if
 * there's a DTD override set.
 */
mDTD = mConfig.getDTDOverride();
if (mDTD != null) {
// We have earlier override that's already parsed; nothing to read or combine
} else { // Nope, no override
DTDSubset extSubset = null;
/* 05-Mar-2006, TSa: If standalone was specified as "yes", we
 * should not rely on any external declarations, so shouldn't
 * we really just skip the external subset?
 */
/* Alas: SAX (Xerces) still tries to read it... should we
 * do the Right Thing, or follow the leader? For now, let's
 * just follow the wrong example.
 */
// (the always-true check below is a placeholder for the standalone check)
//if (mDocStandalone != DOC_STANDALONE_YES) {
if (true) {
if (mDtdPublicId != null || mDtdSystemId != null) {
extSubset = findDtdExtSubset(mDtdPublicId, mDtdSystemId, intSubset);
}
}
// Use whichever subset exists; combine them if both do
if (intSubset == null) {
mDTD = extSubset;
} else if (extSubset == null) {
mDTD = intSubset;
} else {
mDTD = intSubset.combineWithExternalSubset(this, extSubset);
}
}
if (mDTD == null) { // only if specifically overridden not to have any
mGeneralEntities = null;
} else {
if (mDTD instanceof DTDSubset) {
mGeneralEntities = ((DTDSubset) mDTD).getGeneralEntityMap();
} else {
/* Also, let's warn if using non-native DTD implementation,
 * since entities and notations can not be accessed
 */
_reportProblem(mConfig.getXMLReporter(), ErrorConsts.WT_DT_DECL,
"Value to set for property '"+XMLInputFactory2.P_DTD_OVERRIDE
+"' not a native Woodstox DTD implementation (but "+mDTD.getClass()+"): can not access full entity or notation information", null);
}
/* 16-Jan-2006, TSa: Actually, we have both fully-validating mode,
 * and non-validating-but-DTD-aware mode. In latter case, we'll
 * still need to add a validator, but just to get type info
 * and to add attribute default values if necessary.
 */
mAutoDtdValidator = mDTD.createValidator(/*(ValidationContext)*/ mElementStack);
mDtdValidatorSet = true; // so we won't get nags
NsDefaultProvider nsDefs = null;
// Only the native validator can be configured further, and asked about ns defaults
if (mAutoDtdValidator instanceof DTDValidatorBase) {
DTDValidatorBase dtdv = (DTDValidatorBase) mAutoDtdValidator;
dtdv.setAttrValueNormalization(true);
// Do we have any attribute defaults for 'xmlns' or 'xmlns:*'?
if (dtdv.hasNsDefaults()) {
nsDefs = dtdv;
}
}
mElementStack.setAutomaticDTDValidator(mAutoDtdValidator, nsDefs);
}
}
/**
 * Routes a validation problem to the custom problem handler if one has
 * been set; without one, the default handling of the super class is
 * used.
 *
 * @param prob Validation problem to report
 */
@Override
public void reportValidationProblem(XMLValidationProblem prob)
    throws XMLStreamException
{
    if (mVldProbHandler == null) {
        super.reportValidationProblem(prob);
        return;
    }
    // Fix for [WSTX-209]
    mVldProbHandler.reportProblem(prob);
}
/**
 * Called by the base class right before the root element gets handled,
 * to allow initialization and checks that do not need the actual
 * element name. Here the only check is for DTD validation having been
 * requested without any DTD validator having been set.
 */
@Override
protected void initValidation() throws XMLStreamException
{
    // Nothing to report unless validation was asked for, but no DTD was found
    if (!hasConfigFlags(CFG_VALIDATE_AGAINST_DTD) || mDtdValidatorSet) {
        return;
    }
    /* Missing DTD is acceptable as such, but may not be what the
     * caller expects: so let's pass the information and go on.
     */
    reportProblem(null, ErrorConsts.WT_DT_DECL, ErrorConsts.W_MISSING_DTD, null, null);
}
/*
///////////////////////////////////////////////////////////////////////
// Private methods, external subset access
///////////////////////////////////////////////////////////////////////
*/
/**
 * Method called by <code>finishDTD</code>, to locate the specified
 * external DTD subset. Subset may be obtained from a cache, if cached
 * copy exists and is compatible; if not, it will be read from the
 * source identified by the public and/or system identifier passed.
 *<p>
 * Caching (both lookup and storing) is only done if DTD caching is
 * enabled AND a cache key could be constructed for the identifiers:
 * {@link #constructDtdId(String, String)} may return null, and a null
 * key must not be used since it would be shared by unrelated DTDs.
 *
 * @param pubId Public identifier of the external subset, if any
 * @param sysId System identifier of the external subset; needed for
 *   actually reading the subset
 * @param intSubset Internal subset of the document, if any
 *
 * @return External subset; either a cached copy, or a newly read one
 */
private DTDSubset findDtdExtSubset(String pubId, String sysId,
        DTDSubset intSubset)
    throws XMLStreamException
{
    DTDId dtdId;
    try {
        dtdId = constructDtdId(pubId, sysId);
    } catch (IOException ioe) {
        throw constructFromIOE(ioe);
    }
    // No id, no caching: null must never end up being used as the key
    final boolean cache = hasConfigFlags(CFG_CACHE_DTDS) && (dtdId != null);

    if (cache) {
        DTDSubset cached = findCachedSubset(dtdId, intSubset);
        if (cached != null) {
            return cached;
        }
    }

    // No useful cached copy? Need to read it then.
    /* For now, we do require system identifier; otherwise we don't
     * know how to resolve DTDs by public id. In future should
     * probably also have some simple catalog resolving facility?
     */
    if (sysId == null) {
        throwParseError("Can not resolve DTD with public id \"{0}\"; missing system identifier", pubId, null);
    }
    WstxInputSource src = null;

    try {
        int xmlVersion = mDocXmlVersion;
        // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0
        if (xmlVersion == XmlConsts.XML_V_UNKNOWN) {
            xmlVersion = XmlConsts.XML_V_10;
        }
        /* null -> no explicit path context, use parent's
         * null -> not an entity expansion, no name.
         * Note, too, that we can NOT just pass mEntityResolver, since
         * that's the one used for general entities, whereas ext subset
         * should be resolved by the param entity resolver.
         */
        src = DefaultInputResolver.resolveEntity
            (mInput, null, null, pubId, sysId, mConfig.getDtdResolver(),
             mConfig, xmlVersion);
    } catch (FileNotFoundException fex) {
        /* Let's catch and rethrow this just so we get more meaningful
         * description (with input source position etc)
         */
        throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage());
    } catch (IOException ioe) {
        throwFromIOE(ioe);
    }

    DTDSubset extSubset = FullDTDReader.readExternalSubset(src, mConfig, intSubset,
            hasConfigFlags(CFG_VALIDATE_AGAINST_DTD),
            mDocXmlVersion);

    /* Ok; can be cached, but only if it does NOT refer to
     * parameter entities defined in the internal subset (if
     * it does, there's no easy/efficient way to check if it could
     * be used later on, plus it's unlikely it could be)
     */
    if (cache && extSubset.isCachable()) {
        mOwner.addCachedDTD(dtdId, extSubset);
    }
    return extSubset;
}
/**
 * Looks up an external subset from the DTD cache of the owner of this
 * reader.
 *
 * @param id Key to use for the cache lookup
 * @param intSubset Internal subset of the current document, if any
 *
 * @return Cached external subset, if one was found and it either does
 *   not need to be checked against an internal subset (none exists) or
 *   reports being reusable with the given one; null otherwise
 */
private DTDSubset findCachedSubset(DTDId id, DTDSubset intSubset)
    throws XMLStreamException
{
    final DTDSubset cached = mOwner.findCachedDTD(id);
    if (cached == null) {
        return null;
    }
    /* Cached copy is only usable iff it does not refer to any
     * parameter entities that the internal subset (if one exists)
     * defines:
     */
    final boolean reusable = (intSubset == null) || cached.isReusableWith(intSubset);
    return reusable ? cached : null;
}
/**
 * Method called to resolve path to external DTD subset, given
 * system identifier.
 *
 * @param systemId System identifier of the external subset
 *
 * @return URI constructed from the system identifier; resolved against
 *   the source URL of the current input, if one is available
 *
 * @throws IOException If the system identifier can not be converted;
 *   a URI syntax problem is included as the cause
 */
private URI resolveExtSubsetPath(String systemId) throws IOException
{
    // Do we have a context to use for resolving?
    URL ctxt = (mInput == null) ? null : mInput.getSource();

    /* Ok, either got a context or not; let's create the URL based on
     * the id, and optional context:
     */
    if (ctxt == null) {
        /* Call will try to figure out if system id has the protocol
         * in it; if not, create a relative file, if it does, try to
         * resolve it.
         */
        return URLUtil.uriFromSystemId(systemId);
    }
    URL url = URLUtil.urlFromSystemId(systemId, ctxt);
    try {
        return new URI(url.toExternalForm());
    } catch (URISyntaxException e) { // should never occur...
        // Chain the original exception so that its stack trace is not lost
        throw new IOException("Failed to construct URI for external subset, URL = "+url.toExternalForm()+": "+e.getMessage(), e);
    }
}
/**
 * Constructs the identifier used as the key when caching external DTD
 * subsets, from the public and/or system identifier of the subset.
 *
 * @param pubId Public identifier, if any; only used if caching by
 *   public id has been enabled (<code>CFG_CACHE_DTDS_BY_PUBLIC_ID</code>)
 * @param sysId System identifier, if any
 *
 * @return Identifier to use; or null if neither a usable public id
 *   nor a non-empty system id is available
 *
 * @throws IOException If the system identifier can not be resolved
 */
protected DTDId constructDtdId(String pubId, String sysId) throws IOException
{
    int significantFlags = significantDtdIdFlags();
    URI sysRef = (sysId == null || sysId.length() == 0) ? null :
        resolveExtSubsetPath(sysId);

    /* 29-Mar-2006, TSa: Apparently public ids are not always very
     *   unique and/or can be mismatched with system ids, resulting
     *   in false matches if using public ids. As a result, by default
     *   Woodstox does NOT rely on public ids, when matching.
     */
    boolean usePublicId = (mConfigFlags & CFG_CACHE_DTDS_BY_PUBLIC_ID) != 0;
    if (usePublicId && pubId != null && pubId.length() > 0) {
        return DTDId.construct(pubId, sysRef, significantFlags, mXml11);
    }
    if (sysRef == null) {
        return null;
    }
    return DTDId.constructFromSystemId(sysRef, significantFlags, mXml11);
}

/**
 * Constructs the identifier used as the key when caching external DTD
 * subsets, from an already resolved system identifier. Uses the same
 * set of significant configuration flags as
 * {@link #constructDtdId(String, String)}, so that identifiers built
 * by the two methods for the same subset and settings agree.
 *
 * @param sysId Resolved system identifier of the subset
 *
 * @return Identifier to use
 */
protected DTDId constructDtdId(URI sysId) throws IOException
{
    return DTDId.constructFromSystemId(sysId, significantDtdIdFlags(), mXml11);
}

/**
 * Helper method that masks out the configuration flags that change
 * what gets stored as DTD, and that therefore need to separate cached
 * instances too.
 */
private int significantDtdIdFlags()
{
    return mConfigFlags &
        (CFG_NAMESPACE_AWARE
         /* Let's optimize non-validating case; DTD info we need
          * is less if so (no need to store content specs for one)...
          * plus, eventual functionality may be different too.
          */
         | CFG_VALIDATE_AGAINST_DTD
         /* Also, whether we support dtd++ or not may change construction
          * of settings... (currently does not, but could)
          */
         | CFG_SUPPORT_DTDPP
         /* Also, basic xml:id support does matter -- xml:id attribute
          * type is verified only if it's enabled
          */
         | CFG_XMLID_TYPING
         );
}
/*
///////////////////////////////////////////////////////////////////////
// Private methods, DTD validation support
///////////////////////////////////////////////////////////////////////
*/
/**
 * Method called by lower-level parsing code when invalid content
 * (anything inside element with 'empty' content spec; text inside
 * non-mixed element etc) is found during basic scanning. Note
 * that actual DTD element structure problems are not reported
 * through this method.
 *<p>
 * Which problem gets reported depends on the content type currently
 * allowed (<code>mVldContent</code>).
 *
 * @param evtType Type of the event (token) that was found to be invalid
 */
@Override
protected void reportInvalidContent(int evtType)
    throws XMLStreamException
{
    final int allowed = mVldContent;

    if (allowed == XMLValidator.CONTENT_ALLOW_NONE) {
        reportValidationProblem(ErrorConsts.ERR_VLD_EMPTY,
                mElementStack.getTopElementDesc(),
                ErrorConsts.tokenTypeDesc(evtType));
    } else if (allowed == XMLValidator.CONTENT_ALLOW_WS
            || allowed == XMLValidator.CONTENT_ALLOW_WS_NONSTRICT) { // should latter ever occur?
        reportValidationProblem(ErrorConsts.ERR_VLD_NON_MIXED,
                mElementStack.getTopElementDesc(), null);
    } else if (allowed == XMLValidator.CONTENT_ALLOW_VALIDATABLE_TEXT
            || allowed == XMLValidator.CONTENT_ALLOW_ANY_TEXT) {
        /* Not 100% sure if this should ever happen... depends on
         * interpretation of 'any' content model?
         */
        reportValidationProblem(ErrorConsts.ERR_VLD_ANY,
                mElementStack.getTopElementDesc(),
                ErrorConsts.tokenTypeDesc(evtType));
    } else { // should never occur:
        throwParseError("Internal error: trying to report invalid content for "+evtType);
    }
}
}