// src.java.com.ctc.wstx.dtd.FullDTDReader Maven / Gradle / Ivy
/* Woodstox XML processor
*
* Copyright (c) 2004- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in file LICENSE, included with
* the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ctc.wstx.dtd;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.net.URL;
import java.text.MessageFormat;
import java.util.*;
import javax.xml.stream.Location;
import javax.xml.stream.XMLReporter;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.validation.XMLValidationProblem;
import org.codehaus.stax2.validation.XMLValidator;
import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ErrorConsts;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.compat.JdkFeatures;
import com.ctc.wstx.ent.*;
import com.ctc.wstx.exc.WstxException;
import com.ctc.wstx.io.WstxInputData;
import com.ctc.wstx.io.WstxInputSource;
import com.ctc.wstx.util.*;
/**
* Reader that reads in DTD information from internal or external subset.
*
* There are 2 main modes for DTDReader, depending on whether it is parsing
* internal or external subset. Parsing of internal subset is somewhat
* simpler, since no dependency checking is needed. For external subset,
* handling of parameter entities is a bit more complicated, as care has to
* be taken to distinguish between using PEs defined in int. subset, and
* ones defined in ext. subset itself. This determines cachability of
* external subsets.
*
* Reader also implements simple stand-alone functionality for flattening
* DTD files (expanding all references to their eventual textual form);
* this is sometimes useful when optimizing modularized DTDs
* (which are more maintainable) into single monolithic DTDs (which in
* general can be more performant).
*/
public class FullDTDReader
extends MinimalDTDReader
{
/**
* Flag that can be changed to enable or disable interning of shared
* names; shared names are used for enumerated values to reduce
* memory usage.
*/
final static boolean INTERN_SHARED_NAMES = false;
/**
* Expected maximum length of internal entities; balanced to reduce
* likelihood of having to grow the array, versus allocating too
* big chunks.
*/
final static int EXP_ENTITY_VALUE_LEN = 500;
/**
* Expected length of attribute (default) values; presumably a sizing
* hint used the same way as {@link #EXP_ENTITY_VALUE_LEN} (its use is
* not visible in this part of the file -- TODO confirm).
*/
final static int EXP_ATTR_VALUE_LEN = 200;
// // // Entity expansion types:
/**
* Marker value for the "general entity" (GE) expansion type.
*/
final static Boolean ENTITY_EXP_GE = Boolean.FALSE;
/**
* Marker value for the "parameter entity" (PE) expansion type.
*/
final static Boolean ENTITY_EXP_PE = Boolean.TRUE;
/*
//////////////////////////////////////////////////
// Configuration
//////////////////////////////////////////////////
*/
/**
* Configuration bit flags, as returned by
* <code>ReaderConfig.getConfigFlags()</code> when this reader was
* constructed.
*/
final int mConfigFlags;
// Extracted wstx-specific settings:
/**
* True if the <code>CFG_NORMALIZE_ATTR_VALUES</code> bit was set in
* the configuration flags.
*/
final boolean mCfgNormAttrs;
/**
* True if the <code>CFG_SUPPORT_DTDPP</code> bit was set in the
* configuration flags.
*/
final boolean mCfgSupportDTDPP;
/**
* This flag indicates whether we should build a validating 'real'
* validator (true, the usual case),
* or a simpler pseudo-validator that can do all non-validation tasks
* that are based on DTD info (entity expansion, notation references,
* default attribute values). Latter is used in non-validating mode.
*<p>
* Value is the <code>constructFully</code> argument passed to the
* constructor.
*/
final boolean mCfgFullyValidating;
/*
//////////////////////////////////////////////////
// Entity handling, parameter entities (PEs)
//////////////////////////////////////////////////
*/
/**
* Set of parameter entities defined so far in the currently parsed
* subset. Note: the first definition sticks, entities can not be
* redefined.
*<p>
* Keys are entity name Strings; values are instances of EntityDecl.
* Initialized to null by the constructor.
*/
HashMap mParamEntities;
/**
* Set of parameter entities already defined for the subset being
* parsed; namely, PEs defined in the internal subset passed when
* parsing matching external subset. Null when parsing internal
* subset, and also when the internal subset passed defined no PEs.
*/
final HashMap mPredefdPEs;
/**
* Set of parameter entities (ids) that have been referenced by this
* DTD; only maintained for external subsets, and only as long as
* no pre-defined PE has been referenced.
*/
Set mRefdPEs;
/*
//////////////////////////////////////////////////
// Entity handling, general entities (GEs)
//////////////////////////////////////////////////
*/
/**
* Set of general entities defined so far in this subset.
* As with parameter entities, the first definition sticks.
*<p>
* Keys are entity name Strings; values are instances of EntityDecl.
* Initialized to null by the constructor.
*<p>
* Note: this Map only contains entities declared and defined in the
* subset being parsed; no previously defined values are passed.
*/
HashMap mGeneralEntities;
/**
* Set of general entities already defined for the subset being
* parsed; namely, GEs defined in the internal subset passed when
* parsing matching external subset. Null when parsing internal
* subset, and also when the internal subset passed defined no GEs.
* Such entities are only needed directly for one purpose;
* to be expanded when reading attribute default value definitions.
*/
final HashMap mPredefdGEs;
/**
* Set of general entities (ids) that have been referenced by this
* DTD; only maintained for external subsets, and only as long as
* no pre-defined GEs have been referenced.
*/
Set mRefdGEs;
/*
//////////////////////////////////////////////////
// Entity handling, both PEs and GEs
//////////////////////////////////////////////////
*/
/**
* Flag used to keep track of whether current (external) subset
* has referenced at least one entity that was pre-defined.
* When set, the external subset is not considered cachable
* (see {@link #parseDTD}).
*/
boolean mUsesPredefdEntities = false;
/*
//////////////////////////////////////////////////
// Notation settings
//////////////////////////////////////////////////
*/
/**
* Set of notations defined so far. Since it's illegal to (try to)
* redefine notations, there's no specific precedence.
*<p>
* Keys are notation name Strings; values are instances of
* NotationDecl objects.
*/
HashMap mNotations;
/**
* Notations already parsed before current subset; that is,
* notations from the internal subset if we are currently
* parsing matching external subset. Null if there are none.
*/
final HashMap mPredefdNotations;
/**
* Flag used to keep track of whether current (external) subset
* has referenced at least one notation that was defined in internal
* subset. If so, can not cache the external subset
* (see {@link #parseDTD}).
*/
boolean mUsesPredefdNotations = false;
/*
//////////////////////////////////////////////////
// Element specifications
//////////////////////////////////////////////////
*/
/**
* Map used to share NameKey instances, to reduce memory usage
* of (qualified) element and attribute names.
*/
HashMap mSharedNames = null;
/**
* Contains definition of elements and matching content specifications.
* Also contains temporary placeholders for elements that are indirectly
* "created" by ATTLIST declarations that precede actual declaration
* for the ELEMENT referred to.
*/
HashMap mElements;
/**
* Map used for sharing legal enumeration values; used since oftentimes
* same enumeration values are used with multiple attributes.
*/
HashMap mSharedEnumValues = null;
/*
//////////////////////////////////////////////////
// Entity expansion state:
//////////////////////////////////////////////////
*/
/**
* This is the attribute default value that is currently being parsed.
* Needs to be a global member due to the way entity expansion failures
* are reported: problems need to be attached to this object, even
* though the default value itself will not be passed through.
*/
DefaultAttrValue mCurrAttrDefault = null;
/**
* Flag that indicates if the currently expanding (or last expanded)
* entity is a Parameter Entity (true) or General Entity (false).
*/
boolean mExpandingPE = false;
/**
* Text buffer used for constructing expansion value of the internal
* entities, and for default attribute values.
* Lazily constructed when needed, reused.
*/
TextBuffer mValueBuffer = null;
/**
* Second character of a surrogate pair returned, if any; CHAR_NULL
* to indicate none.
*/
char mSurrogateSecond = CHAR_NULL;
/*
//////////////////////////////////////////////////
// Reader state
//////////////////////////////////////////////////
*/
/**
* Nesting count for conditionally included (INCLUDE) sections; 0 means
* that we are not inside such a section. Note that conditional IGNORE
* sections are handled separately.
*/
int mIncludeCount = 0;
/**
* This flag is used to catch uses of PEs in the internal subset
* within declarations (full declarations are ok, but not other types).
* Reset at the start of each main-level iteration in
* {@link #parseDTD}, and set there when a directive starts in the
* internal subset proper.
*/
boolean mCheckForbiddenPEs = false;
/**
* Keyword of the declaration being currently parsed (if any). Used
* for error reporting purposes.
*/
String mCurrDeclaration;
/*
//////////////////////////////////////////////////
// DTD++ support information
//////////////////////////////////////////////////
*/
/**
* Flag that indicates if any DTD++ features have been encountered
* (in DTD++-supporting mode).
*/
boolean mAnyDTDppFeatures = false;
/**
* Currently active default namespace URI; starts out as empty String.
*/
String mDefaultNsURI = "";
/**
* Prefix-to-NsURI mappings for this DTD, if any; lazily
* constructed when needed, null until then.
*/
HashMap mNamespaces = null;
/*
//////////////////////////////////////////////////
// Additional support for creating expanded output
// of processed DTD.
//////////////////////////////////////////////////
*/
/**
* Writer wrapper used for outputting the flattened version of the DTD
* being read; null unless one has been set via
* {@link #setFlattenWriter}. When non-null, {@link #parseDTD} handles
* directives via {@link #parseDirectiveFlattened}.
*/
DTDWriter mFlattenWriter = null;
/*
//////////////////////////////////////////////////
// Support for SAX API impl:
//////////////////////////////////////////////////
*/
/**
* Listener obtained from the reader configuration
* (<code>getDTDEventListener()</code>) at construction; code using
* it checks for null, so it may be missing.
*/
final DTDEventListener mEventListener;
/**
* Buffer lazily created (and then reused) by {@link #getTextBuffer}.
*/
transient TextBuffer mTextBuffer = null;
/*
//////////////////////////////////////////////////
// Life-cycle
//////////////////////////////////////////////////
*/
/**
 * Creates a reader for processing (reading or skipping) an internal
 * DTD subset. Delegates to the common constructor, marking the subset
 * as non-external and passing no pre-existing internal subset.
 *
 * @param input Input source to read the subset from
 * @param cfg Reader configuration to use
 * @param constructFully Whether a fully validating subset should be built
 * @param xmlVersion Xml version of the main document
 */
private FullDTDReader(WstxInputSource input, ReaderConfig cfg,
                      boolean constructFully, int xmlVersion)
{
    this(input, cfg,
         /* isExt */ false, /* intSubset */ null,
         constructFully, xmlVersion);
}
/**
 * Creates a reader for processing an external DTD subset. Delegates
 * to the common constructor (marking the subset as external), and then
 * asks the input source to initialize this reader's input location.
 *
 * @param input Input source to read the external subset from
 * @param cfg Reader configuration to use
 * @param intSubset Internal subset already read, if any (may be null)
 * @param constructFully Whether a fully validating subset should be built
 * @param xmlVersion Xml version of the main document
 */
private FullDTDReader(WstxInputSource input, ReaderConfig cfg,
                      DTDSubset intSubset,
                      boolean constructFully, int xmlVersion)
{
    this(input, cfg,
         /* isExt */ true, intSubset,
         constructFully, xmlVersion);
    // Line/column offsets need to be set up to match the input source
    input.initInputLocation(this, mCurrDepth);
}
/**
 * Shared initialization used by both the internal and the external
 * subset constructors.
 *
 * @param input Input source to read the subset from
 * @param cfg Reader configuration to use
 * @param isExt True when reading an external subset, false for internal
 * @param intSubset Internal subset that was read before the external
 *   subset being read, if any; null otherwise
 * @param constructFully Whether a fully validating subset should be built
 * @param xmlVersion Xml version of the main document
 */
private FullDTDReader(WstxInputSource input, ReaderConfig cfg,
                      boolean isExt, DTDSubset intSubset,
                      boolean constructFully, int xmlVersion)
{
    super(input, cfg, isExt);

    // Xml conformance level is determined by the main xml document
    mDocXmlVersion = xmlVersion;
    mXml11 = cfg.isXml11();

    // Configuration flags, plus settings extracted from them
    final int flags = cfg.getConfigFlags();
    mConfigFlags = flags;
    mCfgNormAttrs = (flags & CFG_NORMALIZE_ATTR_VALUES) != 0;
    mCfgSupportDTDPP = (flags & CFG_SUPPORT_DTDPP) != 0;
    mCfgFullyValidating = constructFully;

    // Nothing defined or referenced yet
    mUsesPredefdEntities = false;
    mParamEntities = null;
    mRefdPEs = null;
    mRefdGEs = null;
    mGeneralEntities = null;

    /* Pre-defined maps come from the internal subset (if one was
     * passed); empty maps are treated the same as missing ones.
     */

    // Existing parameter entities?
    final HashMap predefPEs = (intSubset != null) ?
        intSubset.getParameterEntityMap() : null;
    mPredefdPEs = (predefPEs != null && !predefPEs.isEmpty()) ? predefPEs : null;

    // General entities (needed only for attribute default values)
    final HashMap predefGEs = (intSubset != null) ?
        intSubset.getGeneralEntityMap() : null;
    mPredefdGEs = (predefGEs != null && !predefGEs.isEmpty()) ? predefGEs : null;

    // And notations
    final HashMap predefNotations = (intSubset != null) ?
        intSubset.getNotationMap() : null;
    mPredefdNotations = (predefNotations != null && !predefNotations.isEmpty()) ?
        predefNotations : null;

    mEventListener = mConfig.getDTDEventListener();
}
/**
 * Reads the internal DTD subset, continuing from the buffer state of
 * the given input data object.
 *
 * @param srcData Object whose buffer state is copied into the DTD reader
 *   before parsing, and which gets the reader's state copied back
 *   afterwards (whether or not parsing succeeded)
 * @param input Input source to construct the DTD reader with
 * @param cfg Reader configuration to use
 * @param constructFully Whether a fully validating subset should be built
 * @param xmlVersion Xml version of the main document
 *
 * @return Subset that was parsed
 */
public static DTDSubset readInternalSubset(WstxInputData srcData,
                                           WstxInputSource input,
                                           ReaderConfig cfg,
                                           boolean constructFully,
                                           int xmlVersion)
    throws IOException, XMLStreamException
{
    final FullDTDReader reader = new FullDTDReader(input, cfg, constructFully, xmlVersion);
    // Reading has to continue from where the owner's low-level state is
    reader.copyBufferStateFrom(srcData);
    try {
        return reader.parseDTD();
    } finally {
        /* Changes (line numbers etc) must always be propagated back
         * to the owner; this also effectively means that we'll stop
         * reading external DTD subset, if so.
         */
        srcData.copyBufferStateFrom(reader);
    }
}
/**
 * Reads the external DTD subset from given input source.
 *
 * @param src Input source to read the external subset from
 * @param cfg Reader configuration to use
 * @param intSubset Internal subset read earlier, if any (may be null)
 * @param constructFully Whether a fully validating subset should be built
 * @param xmlVersion Xml version of the main document
 *
 * @return Subset that was parsed
 */
public static DTDSubset readExternalSubset
    (WstxInputSource src, ReaderConfig cfg, DTDSubset intSubset,
     boolean constructFully, int xmlVersion)
    throws IOException, XMLStreamException
{
    return new FullDTDReader(src, cfg, intSubset, constructFully, xmlVersion)
        .parseDTD();
}
/**
 * Method that will parse, process and output contents of an external
 * DTD subset. It will do processing similar to
 * {@link #readExternalSubset}, but additionally will copy its processed
 * ("flattened") input to specified writer.
 *<p>
 * The writer is flushed, but not closed, once parsing has completed.
 *
 * @param src Input source used to read the main external subset
 * @param flattenWriter Writer to output processed DTD content to
 * @param inclComments If true, will pass comments to the writer; if false,
 *   will strip comments out
 * @param inclConditionals If true, will include conditional block markers,
 *   as well as intervening content; if false, will strip out both markers
 *   and ignorable sections.
 * @param inclPEs If true, will output parameter entity declarations; if
 *   false will parse and use them, but not output.
 *
 * @return Subset that was parsed while flattening
 *
 * @throws IOException if reading the input or writing the output fails
 * @throws XMLStreamException if the DTD content can not be parsed
 */
public static DTDSubset flattenExternalSubset(WstxInputSource src, Writer flattenWriter,
                                              boolean inclComments, boolean inclConditionals,
                                              boolean inclPEs)
    throws IOException, XMLStreamException
{
    ReaderConfig cfg = ReaderConfig.createFullDefaults();
    // Need to create a non-shared copy to populate symbol table field
    cfg = cfg.createNonShared(new SymbolTable());

    /* Let's actually not normalize LFs (or attribute values); it's
     * likely caller wouldn't really want any such changes....
     */
    cfg.clearConfigFlag(CFG_NORMALIZE_LFS);
    cfg.clearConfigFlag(CFG_NORMALIZE_ATTR_VALUES);

    /* No internal subset, fully validating; and since the xml version
     * of the (non-existing) main document is not known, it is passed
     * as unknown. Can be taken as an arg later on, if we truly care.
     */
    FullDTDReader r = new FullDTDReader(src, cfg, null, true, XmlConsts.XML_V_UNKNOWN);
    r.setFlattenWriter(flattenWriter, inclComments, inclConditionals,
                       inclPEs);
    DTDSubset ss = r.parseDTD();
    // Whatever remains buffered needs to be pushed out
    r.flushFlattenWriter();
    flattenWriter.flush();
    return ss;
}
/**
 * Accessor for the reusable text buffer: the buffer is created on
 * first call, and reset to empty on subsequent calls.
 *
 * @return The (shared) buffer instance, ready for use
 */
private TextBuffer getTextBuffer()
{
    TextBuffer buf = mTextBuffer;
    if (buf != null) { // already exists; just needs to be cleared
        buf.resetWithEmpty();
        return buf;
    }
    // First use; construct and initialize
    mTextBuffer = buf = TextBuffer.createTemporaryBuffer(200);
    buf.resetInitialized();
    return buf;
}
/*
//////////////////////////////////////////////////
// Configuration
//////////////////////////////////////////////////
*/
/**
 * Method for defining the 'flattening writer': the writer that gets
 * the flattened version of the DTD being read. This is
 * similar to running a C-preprocessor on C-sources, except that
 * defining the writer will not prevent normal parsing of the DTD itself.
 *
 * @param w Writer to output flattened DTD content to
 * @param inclComments Whether comments are to be included in output
 * @param inclConditionals Whether conditional sections are to be
 *   included in output
 * @param inclPEs Whether parameter entity declarations are to be
 *   included in output
 */
public void setFlattenWriter(Writer w, boolean inclComments,
                             boolean inclConditionals, boolean inclPEs)
{
    final DTDWriter flattener = new DTDWriter(w, inclComments,
                                              inclConditionals, inclPEs);
    mFlattenWriter = flattener;
}
/**
 * Flushes the flatten writer, passing it the current input buffer
 * and input pointer. A flatten writer has to have been set before
 * this method is called.
 */
private void flushFlattenWriter()
    throws IOException
{
    final DTDWriter flattener = mFlattenWriter;
    flattener.flush(mInputBuffer, mInputPtr);
}
/*
//////////////////////////////////////////////////
// Internal API
//////////////////////////////////////////////////
*/
/**
 * Method that may need to be called by attribute default value
 * validation code, during parsing....
 *<p>
 * Pre-defined general entities (ones from the internal subset, when
 * parsing the external subset) are checked first; then entities
 * declared in the subset being parsed.
 *<p>
 * Note: see base class for some additional remarks about this
 * method.
 *
 * @param entName Name of the general entity to find
 *
 * @return Declaration of the entity, if one was found; null if not
 */
public EntityDecl findEntity(String entName)
{
    if (mPredefdGEs != null) {
        EntityDecl decl = (EntityDecl) mPredefdGEs.get(entName);
        if (decl != null) {
            return decl;
        }
    }
    /* The map of locally declared entities is initialized to null by
     * the constructor, so it may not exist yet if no general entity
     * has been declared so far; that just means "not found".
     */
    if (mGeneralEntities == null) {
        return null;
    }
    return (EntityDecl) mGeneralEntities.get(entName);
}
/*
//////////////////////////////////////////////////
// Main-level parsing methods
//////////////////////////////////////////////////
*/
/**
* Main-level parsing loop for a DTD subset: handles parameter entity
* references, directives (anything starting with '&lt;') and end markers
* of conditional INCLUDE sections, until the end of the subset is
* reached; then constructs the subset object from what was collected.
*<p>
* Internal subset ends at a ']' encountered outside of INCLUDE sections;
* external subset ends at the end of input.
*
* @return Subset constructed via <code>DTDSubsetImpl.constructInstance</code>
*
* @throws IOException declared for problems reading the input
* @throws XMLStreamException declared for parsing problems (presumably
*   what the various throwXxx helper methods raise)
*/
protected DTDSubset parseDTD()
throws IOException, XMLStreamException
{
while (true) {
mCheckForbiddenPEs = false; // PEs are ok at this point
int i = getNextAfterWS();
// Negative value is handled as end-of-input
if (i < 0) {
if (mIsExternal) { // ok for external DTDs
break;
}
// Error for internal subset
throwUnexpectedEOF(SUFFIX_IN_DTD_INTERNAL);
}
if (i == '%') { // parameter entity
expandPE();
continue;
}
/* First, let's keep track of start of the directive; needed for
* entity and notation declaration events.
*/
mTokenInputTotal = mCurrInputProcessed + mInputPtr;
mTokenInputRow = mCurrInputRow;
mTokenInputCol = mInputPtr - mCurrInputRowStart;
if (i == '<') {
// PEs not allowed within declarations, in the internal subset proper
mCheckForbiddenPEs = !mIsExternal && (mInput == mRootInput);
// Separate method is used when flattened output is being produced
if (mFlattenWriter == null) {
parseDirective();
} else {
parseDirectiveFlattened();
}
continue;
}
if (i == ']') {
if (mIncludeCount == 0 && !mIsExternal) { // End of internal subset
break;
}
if (mIncludeCount > 0) { // active INCLUDE block(s) open?
/* When flattening without conditional markers, output collected
* so far (excluding the ']' just read) is flushed, and output
* disabled so that the end marker itself is not written.
*/
boolean suppress = (mFlattenWriter != null) && !mFlattenWriter.includeConditionals();
if (suppress) {
mFlattenWriter.flush(mInputBuffer, mInputPtr-1);
mFlattenWriter.disableOutput();
}
try {
// ]]> needs to be a token, can not come from PE:
char c = dtdNextFromCurr();
if (c == ']') {
c = dtdNextFromCurr();
if (c == '>') {
// Ok, fine, conditional include section ended.
--mIncludeCount;
continue;
}
}
throwDTDUnexpectedChar(c, "; expected ']]>' to close conditional include section");
} finally {
// Runs on all exit paths from the try block, including the 'continue' above
if (suppress) {
mFlattenWriter.enableOutput(mInputPtr);
}
}
}
// otherwise will fall through, and give an error
}
// Not a character that can legally appear here; message depends on subset type
if (mIsExternal) {
throwDTDUnexpectedChar(i, "; expected a '<' to start a directive");
}
throwDTDUnexpectedChar(i, "; expected a '<' to start a directive, or \"]>\" to end internal subset");
}
/* 05-Feb-2006, TSa: Not allowed to have unclosed INCLUDE/IGNORE
* blocks...
*/
if (mIncludeCount > 0) { // active INCLUDE block(s) open?
String suffix = (mIncludeCount == 1) ? "an INCLUDE block" : (""+mIncludeCount+" INCLUDE blocks");
throwUnexpectedEOF(getErrorMsg()+"; expected closing marker for "+suffix);
}
// Ok; time to construct and return DTD data object.
DTDSubset ss;
// There are more settings for ext. subsets:
if (mIsExternal) {
/* External subsets are cachable if they did not refer to any
* PEs or GEs defined in internal subset passed in (if any),
* nor to any notations.
* We don't care about PEs it defined itself (hence null is passed
* for the PE map), but need to pass
* in Set of PEs it refers to, to check if cached copy can be
* used with different int. subsets.
* We need not worry about notations referred, since they are
* not allowed to be re-defined.
*/
boolean cachable = !mUsesPredefdEntities && !mUsesPredefdNotations;
ss = DTDSubsetImpl.constructInstance(cachable,
mGeneralEntities, mRefdGEs,
null, mRefdPEs,
mNotations, mElements,
mCfgFullyValidating);
} else {
/* Internal subsets are not cachable (no unique way to refer
* to unique internal subsets), and there can be no references
* to pre-defined PEs, as none were passed.
*/
ss = DTDSubsetImpl.constructInstance(false, mGeneralEntities, null,
mParamEntities, null,
mNotations, mElements,
mCfgFullyValidating);
}
return ss;
}
/**
 * Method called after the opening '&lt;' of a main-level directive
 * has been read: determines the type of the directive (processing
 * instruction, comment, conditional section or declaration) and
 * calls the matching handler method.
 */
protected void parseDirective()
    throws IOException, XMLStreamException
{
    /* Starting '!' (or '?') is not expected to come from a PE; and
     * it has to come from the same input source too (no splits)
     */
    char ch = dtdNextFromCurr();

    // Processing instruction (or xml declaration)?
    if (ch == '?') {
        readPI();
        return;
    }
    // If not, only '!' is acceptable
    if (ch != '!') {
        throwDTDUnexpectedChar(ch, "; expected '!' to start a directive");
    }

    /* Conditional section, comment, or declaration; token is still
     * read from the same input section
     */
    ch = dtdNextFromCurr();

    if (ch == '[') { // conditional section
        checkInclusion();
        return;
    }
    if (ch == '-') { // comment; needs the second hyphen too
        ch = dtdNextFromCurr();
        if (ch != '-') {
            throwDTDUnexpectedChar(ch, "; expected '-' for a comment");
        }
        final boolean reportComment = (mEventListener != null)
            && mEventListener.dtdReportComments();
        if (reportComment) {
            readComment(mEventListener);
        } else {
            skipComment();
        }
        return;
    }
    if (ch >= 'A' && ch <= 'Z') { // declaration keyword
        handleDeclaration(ch);
        return;
    }
    throwDTDUnexpectedChar(ch, ErrorConsts.ERR_DTD_MAINLEVEL_KEYWORD);
}
/**
* Method similar to {@link #parseDirective}, but one that takes care
* to properly output dtd contents via {@link com.ctc.wstx.dtd.DTDWriter}
* as necessary.
* Separated to simplify both methods; otherwise would end up with
* 'if (... flatten...) ... else ...' spaghetti code.
*/
protected void parseDirectiveFlattened()
throws IOException, XMLStreamException
{
/* First, need to flush any flattened output there may be, at
* this point (except for opening lt char): and then need to
* temporarily disable more output until we know the type and
* whether it should be output or not:
*/
mFlattenWriter.flush(mInputBuffer, mInputPtr-1);
mFlattenWriter.disableOutput();
/* Let's determine type here, and call appropriate skip/parse
* methods.
*/
char c = dtdNextFromCurr();
if (c == '?') { // xml decl?
mFlattenWriter.enableOutput(mInputPtr);
mFlattenWriter.output("");
readPI();
//throwDTDUnexpectedChar(c, " expected '!' to start a directive");
return;
}
if (c != '!') { // nothing valid
throwDTDUnexpectedChar(c, ErrorConsts.ERR_DTD_MAINLEVEL_KEYWORD);
}
// ignore/include, comment, or directive
c = dtdNextFromCurr();
if (c == '-') { // plain comment
c = dtdNextFromCurr();
if (c != '-') {
throwDTDUnexpectedChar(c, "; expected '-' for a comment");
}
boolean comm = mFlattenWriter.includeComments();
if (comm) {
mFlattenWriter.enableOutput(mInputPtr);
mFlattenWriter.output("