// com.ctc.wstx.sr.StreamScanner Maven / Gradle / Ivy
/* Woodstox XML processor
*
* Copyright (c) 2004- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in file LICENSE, included with
* the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ctc.wstx.sr;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import javax.xml.stream.Location;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLReporter;
import javax.xml.stream.XMLResolver;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.XMLReporter2;
import org.codehaus.stax2.XMLStreamLocation2;
import org.codehaus.stax2.validation.XMLValidationProblem;
import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ErrorConsts;
import com.ctc.wstx.cfg.InputConfigFlags;
import com.ctc.wstx.cfg.ParsingErrorMsgs;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.dtd.MinimalDTDReader;
import com.ctc.wstx.ent.EntityDecl;
import com.ctc.wstx.ent.IntEntity;
import com.ctc.wstx.exc.*;
import com.ctc.wstx.io.DefaultInputResolver;
import com.ctc.wstx.io.WstxInputData;
import com.ctc.wstx.io.WstxInputLocation;
import com.ctc.wstx.io.WstxInputSource;
import com.ctc.wstx.util.ExceptionUtil;
import com.ctc.wstx.util.SymbolTable;
import com.ctc.wstx.util.TextBuffer;
/**
* Abstract base class that defines some basic functionality that all
* Woodstox reader classes (main XML reader, DTD reader) extend from.
*/
public abstract class StreamScanner
extends WstxInputData
implements InputProblemReporter,
InputConfigFlags, ParsingErrorMsgs
{
// // // Some well-known chars:
/**
* Last (highest) char code of the three, LF, CR and NULL
*/
public final static char CHAR_CR_LF_OR_NULL = (char) 13;
/**
 * Same code as {@link #CHAR_CR_LF_OR_NULL} (13), typed as an int.
 */
public final static int INT_CR_LF_OR_NULL = 13;
/**
* Character that allows quick check of whether a char can potentially
* be some kind of markup, WRT input stream processing;
* has to contain linefeeds, &, < and > (">" only matters when
* quoting text, as part of "]]>")
*/
protected final static char CHAR_FIRST_PURE_TEXT = (char) ('>' + 1);
/**
* First character in Unicode (ie one with lowest id) that is legal
* as part of a local name (all valid name chars minus ':'). Used
* for doing quick check for local name end; usually name ends in
* a whitespace or equals sign.
*/
protected final static char CHAR_LOWEST_LEGAL_LOCALNAME_CHAR = '-';
/*
///////////////////////////////////////////////////////////
// Character validity constants, structs
///////////////////////////////////////////////////////////
*/
/**
* We will only use validity array for first 256 characters, mostly
* because after those characters it's easier to do fairly simple
* block checks.
*/
private final static int VALID_CHAR_COUNT = 0x100;
// Marker values stored in sCharValidity, one entry per char code:
/** Character is not legal in a name (same as the array default, 0) */
private final static byte NAME_CHAR_INVALID_B = (byte) 0;
/** Character is legal both as the first and as a subsequent name character */
private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1;
/** Character is legal in a name, but not as its first character */
private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1;
/** Validity markers for char codes below VALID_CHAR_COUNT; populated by the static initializer that follows */
private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT];
static {
    /* Populates the name character validity table for codes 0x00 - 0xFF.
     *
     * First, since all valid-as-first chars are also valid-as-other chars,
     * we'll initialize common chars:
     */
    sCharValidity['_'] = NAME_CHAR_ALL_VALID_B;
    for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
        sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B;
        sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B;
    }
    /* Latin-1 letters: XML 1.0 allows 0xC0-0xD6, 0xD8-0xF6 and 0xF8-0xFF
     * in names. Mark the whole 0xC0-0xFF range here (the loop used to stop
     * at 0xF5, leaving 0xF6 and 0xF8-0xFF flagged as invalid); the two
     * non-letter codes are reverted right below.
     */
    for (int i = 0xC0; i <= 0xFF; ++i) {
        sCharValidity[i] = NAME_CHAR_ALL_VALID_B;
    }
    // ... now we can 'revert' ones not fully valid (multiplication and division signs):
    sCharValidity[0xD7] = NAME_CHAR_INVALID_B;
    sCharValidity[0xF7] = NAME_CHAR_INVALID_B;
    /* And then we can proceed with ones only valid-as-other.
     */
    sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B;
    sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B;
    sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B;
    for (int i = '0'; i <= '9'; ++i) {
        sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B;
    }
}
/**
* Public identifiers only use 7-bit ascii range.
*/
private final static int VALID_PUBID_CHAR_COUNT = 0x80;
private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT];
// private final static byte PUBID_CHAR_INVALID_B = (byte) 0;
private final static byte PUBID_CHAR_VALID_B = (byte) 1;
static {
    // Populates the public identifier validity table (7-bit ascii only).
    // Ascii letters, both cases
    for (char upper = 'A', lower = 'a'; lower <= 'z'; ++upper, ++lower) {
        sPubidValidity[upper] = PUBID_CHAR_VALID_B;
        sPubidValidity[lower] = PUBID_CHAR_VALID_B;
    }
    // Decimal digits
    for (char digit = '0'; digit <= '9'; ++digit) {
        sPubidValidity[digit] = PUBID_CHAR_VALID_B;
    }
    // 3 main white space types are valid: LF, CR and space
    for (char ws : new char[] { 0x0A, 0x0D, 0x20 }) {
        sPubidValidity[ws] = PUBID_CHAR_VALID_B;
    }
    // And many of punctuation/separator ascii chars too:
    for (char punct : "-'()+,./:=?;!*#@$_%".toCharArray()) {
        sPubidValidity[punct] = PUBID_CHAR_VALID_B;
    }
}
/*
///////////////////////////////////////////////////////////
// Basic configuration
///////////////////////////////////////////////////////////
*/
/**
* Copy of the configuration object passed by the factory.
* Contains immutable settings for this reader (or in case
* of DTD parsers, reader that uses it)
*/
protected final ReaderConfig mConfig;
// // // Various extracted settings:
/**
* If true, Reader is namespace aware, and should do basic checks
* (usually enforcing limitations on having colons in names)
*/
protected final boolean mCfgNsEnabled;
// Extracted standard on/off settings:
/**
* note: left non-final on purpose: sub-class may need to modify
* the default value after construction.
*/
protected boolean mCfgReplaceEntities;
/*
///////////////////////////////////////////////////////////
// Symbol handling, if applicable
///////////////////////////////////////////////////////////
*/
/**
 * Symbol table used by this scanner; obtained from the reader
 * configuration when the scanner is constructed.
 */
final SymbolTable mSymbols;
/**
* Local full name for the event, if it has one (note: element events
* do NOT use this variable; those names are stored in element stack):
* target for processing instructions.
*
* Currently used for proc. instr. target, and entity name (at least
* when current entity reference is null).
*
* Note: this variable is generally not cleared, since it comes from
* a symbol table, ie. this won't be the only reference.
*/
protected String mCurrName;
/*
///////////////////////////////////////////////////////////
// Input handling
///////////////////////////////////////////////////////////
*/
/**
* Currently active input source; contains link to parent (nesting) input
* sources, if any.
*/
protected WstxInputSource mInput;
/**
* Top-most input source this reader can use; due to input source
* chaining, this is not necessarily the root of all input; for example,
* external DTD subset reader's root input still has original document
* input as its parent.
*/
protected final WstxInputSource mRootInput;
/**
* Custom resolver used to handle external entities that are to be expanded
* by this reader (external param/general entity expander)
*/
protected XMLResolver mEntityResolver = null;
/**
* This is the current depth of the input stack (same as what input
* element stack would return as its depth).
* It is used to enforce input scope constraints for nesting of
* elements (for xml reader) and dtd declaration (for dtd reader)
* with regards to input block (entity expansion) boundaries.
*
* Basically this value is compared to {@link #mInputTopDepth}, which
* indicates what was the depth at the point where the currently active
* input scope/block was started.
*/
protected int mCurrDepth;
/**
 * Value of {@link #mCurrDepth} recorded when the currently active input
 * source was entered; when a nested source is closed, it is reset to the
 * scope id of the parent source that becomes active again.
 */
protected int mInputTopDepth;
/**
* Number of times a parsed general entity has been expanded; used for
* (optionally) limiting number of expansion to guard against
* denial-of-service attacks like "Billion Laughs".
*
* @since 4.3
*/
protected int mEntityExpansionCount;
/**
* Flag that indicates whether linefeeds in the input data are to
* be normalized or not.
* Xml specs mandate that the line feeds are only normalized
* when they are from the external entities (main doc, external
* general/parsed entities), so normalization has to be
* suppressed when expanding internal general/parsed entities.
*/
protected boolean mNormalizeLFs;
/*
///////////////////////////////////////////////////////////
// Buffer(s) for local name(s) and text content
///////////////////////////////////////////////////////////
*/
/**
* Temporary buffer used if local name can not be just directly
* constructed from input buffer (name is on a boundary or such).
*/
protected char[] mNameBuffer = null;
/*
///////////////////////////////////////////////////////////
// Information about starting location of event
// Reader is pointing to; updated on-demand
///////////////////////////////////////////////////////////
*/
// // // Location info at point when current token was started
/**
* Total number of characters read before start of current token.
* Since big (gigabyte-sized) inputs are possible, this needs to be long,
* unlike pointers and sizes related to in-memory buffers.
*/
protected long mTokenInputTotal = 0;
/**
* Input row on which current token starts, 1-based
*/
protected int mTokenInputRow = 1;
/**
* Column on input row that current token starts; 0-based (although
* in the end it'll be converted to 1-based)
*/
protected int mTokenInputCol = 0;
/*
///////////////////////////////////////////////////////////
// XML document information (from doc decl if one
// was found) common to all entities (main xml
// document, external DTD subset)
///////////////////////////////////////////////////////////
*/
/**
* Input stream encoding, if known (passed in, or determined by
* auto-detection); null if not.
*/
protected String mDocInputEncoding = null;
/**
* Character encoding from xml declaration, if any; null if no
* declaration, or it didn't specify encoding.
*/
protected String mDocXmlEncoding = null;
/**
* XML version as declared by the document; one of constants
* from {@link XmlConsts} (like {@link XmlConsts#XML_V_10}).
*/
protected int mDocXmlVersion = XmlConsts.XML_V_UNKNOWN;
/**
* Cache of internal character entities;
*/
protected Map mCachedEntities;
/**
* Flag for whether or not character references should be treated as entities
*/
protected boolean mCfgTreatCharRefsAsEntities;
/**
* Entity reference stream currently points to.
*/
protected EntityDecl mCurrEntity;
/*
///////////////////////////////////////////////////////////
// Life-cycle
///////////////////////////////////////////////////////////
*/
/**
 * Creates a scanner for a new main-level reader; one that does not
 * share its input buffers or state with any other reader.
 *
 * @param input Input source to start reading from; also recorded as the
 *   root input of this scanner
 * @param cfg Reader configuration from which settings and the symbol
 *   table are taken
 * @param res Resolver to store for expanding external entities
 */
protected StreamScanner(WstxInputSource input, ReaderConfig cfg,
        XMLResolver res)
{
    super();
    mInput = input;
    // 17-Jun-2004, TSa: Need to know root-level input source
    mRootInput = input;
    mConfig = cfg;
    mSymbols = cfg.getSymbols();

    final int flags = cfg.getConfigFlags();
    mCfgNsEnabled = (flags & CFG_NAMESPACE_AWARE) != 0;
    mCfgReplaceEntities = (flags & CFG_REPLACE_ENTITY_REFS) != 0;
    mNormalizeLFs = cfg.willNormalizeLFs();

    // Nothing buffered yet
    mInputBuffer = null;
    mInputEnd = 0;
    mInputPtr = 0;

    mEntityResolver = res;
    mCfgTreatCharRefsAsEntities = cfg.willTreatCharRefsAsEnts();
    // A mutable cache is only allocated when char refs are treated as entities
    if (!mCfgTreatCharRefsAsEntities) {
        mCachedEntities = Collections.emptyMap();
    } else {
        mCachedEntities = new HashMap();
    }
}
/*
///////////////////////////////////////////////////////////
// Package API
///////////////////////////////////////////////////////////
*/
/**
 * Returns the location of the character most recently handed out by this
 * reader; that is, the position just before the current read pointer.
 */
protected WstxInputLocation getLastCharLocation()
{
    final long charOffset = mCurrInputProcessed + mInputPtr - 1;
    final int column = mInputPtr - mCurrInputRowStart;
    return mInput.getLocation(charOffset, mCurrInputRow, column);
}
/**
 * Returns the source URL reported by the currently active input source.
 */
protected URL getSource() throws IOException {
    final WstxInputSource active = mInput;
    return active.getSource();
}
/**
 * Returns the system identifier reported by the currently active input source.
 */
protected String getSystemId() {
    final WstxInputSource active = mInput;
    return active.getSystemId();
}
/*
///////////////////////////////////////////////////////////
// Partial LocationInfo implementation (not implemented
// by this base class, but is by some sub-classes)
///////////////////////////////////////////////////////////
*/
/**
 * Returns location of last properly parsed token; as per StAX specs,
 * apparently needs to be the end of current event, which is the same
 * as the start of the following event (or EOF if that's next).
 * <p>
 * Declared abstract here: this base class does not implement it.
 */
@Override
public abstract Location getLocation();
/**
 * Returns the location at which the current token started, built from
 * the recorded token offset, row and column.
 */
public XMLStreamLocation2 getStartLocation()
{
    // stored column is 0-based, reported columns are 1-based
    final int column = mTokenInputCol + 1;
    return mInput.getLocation(mTokenInputTotal, mTokenInputRow, column);
}
/**
 * Returns the location of the current read position (the next character
 * to be read), with a 1-based column.
 */
public XMLStreamLocation2 getCurrentLocation()
{
    final long charOffset = mCurrInputProcessed + mInputPtr;
    final int column = (mInputPtr - mCurrInputRowStart) + 1;
    return mInput.getLocation(charOffset, mCurrInputRow, column);
}
/*
///////////////////////////////////////////////////////////
// InputProblemReporter implementation
///////////////////////////////////////////////////////////
*/
/**
 * Constructs a well-formedness exception for given message, and either
 * throws it right away or hands it back to the caller.
 *
 * @param msg Error message
 * @param deferErrors If true, exception is returned instead of thrown
 *
 * @return The constructed exception (only when errors are deferred)
 */
public WstxException throwWfcException(String msg, boolean deferErrors)
    throws WstxException
{
    final WstxException problem = constructWfcException(msg);
    if (deferErrors) {
        return problem;
    }
    throw problem;
}
/**
 * Throws a parse error with given message as is (no format arguments).
 */
@Override
public void throwParseError(String msg) throws XMLStreamException {
    // delegate to the formatting variant, without arguments
    throwParseError(msg, null, null);
}
/**
 * Throws generic parse error with specified message and current parsing
 * location. Message is run through {@link MessageFormat} only if at
 * least one of the arguments is non-null; otherwise it is used as is.
 *
 * Note: public access only because core code in other packages needs
 * to access it.
 */
@Override
public void throwParseError(String format, Object arg, Object arg2)
    throws XMLStreamException
{
    final boolean noArgs = (arg == null) && (arg2 == null);
    final String msg = noArgs ? format : MessageFormat.format(format, arg, arg2);
    throw constructWfcException(msg);
}
/**
 * Reports a problem through the configured {@link XMLReporter}, if there
 * is one; does nothing otherwise. The message is always formatted with
 * the two arguments, and no explicit location is passed along.
 */
public void reportProblem(String probType, String format, Object arg, Object arg2)
    throws XMLStreamException
{
    final XMLReporter reporter = mConfig.getXMLReporter();
    if (reporter == null) {
        return;
    }
    final String msg = MessageFormat.format(format, arg, arg2);
    _reportProblem(reporter, probType, msg, null);
}
/**
 * Reports a problem at given location through the configured
 * {@link XMLReporter}, if there is one; does nothing otherwise.
 * Message is formatted only if at least one argument is non-null.
 */
@Override
public void reportProblem(Location loc, String probType,
        String format, Object arg, Object arg2)
    throws XMLStreamException
{
    final XMLReporter reporter = mConfig.getXMLReporter();
    if (reporter == null) {
        return;
    }
    final boolean noArgs = (arg == null) && (arg2 == null);
    final String msg = noArgs ? format : MessageFormat.format(format, arg, arg2);
    _reportProblem(reporter, probType, msg, loc);
}
/**
 * Wraps given message into an error-severity {@link XMLValidationProblem}
 * and passes it on for reporting; location of the last character read
 * is used if no location is given.
 */
protected void _reportProblem(XMLReporter rep, String probType, String msg, Location loc)
    throws XMLStreamException
{
    final Location where = (loc != null) ? loc : getLastCharLocation();
    final XMLValidationProblem problem = new XMLValidationProblem(where, msg,
            XMLValidationProblem.SEVERITY_ERROR, probType);
    _reportProblem(rep, problem);
}
/**
 * Passes given problem to the reporter (no-op if reporter is null),
 * first filling in location and type if the problem lacks them.
 * Exceptions thrown by the reporter are propagated to the caller.
 */
protected void _reportProblem(XMLReporter rep, XMLValidationProblem prob)
    throws XMLStreamException
{
    if (rep == null) {
        return;
    }
    Location where = prob.getLocation();
    if (where == null) {
        where = getLastCharLocation();
        prob.setLocation(where);
    }
    // Backwards-compatibility fix: add non-null type, if missing:
    if (prob.getType() == null) {
        prob.setType(ErrorConsts.WT_VALIDATION);
    }
    // [WSTX-154]: was catching and dropping thrown exception: shouldn't.
    // [WTSX-157]: need to support XMLReporter2
    if (rep instanceof XMLReporter2) {
        ((XMLReporter2) rep).report(prob);
        return;
    }
    rep.report(prob.getMessage(), prob.getType(), prob, where);
}
/**
 * Base implementation used for implementing ValidationContext.
 * <p>
 * Problems more severe than {@code SEVERITY_ERROR} always result in an
 * immediate exception. Others are handed to the configured
 * {@link XMLReporter}; if none is configured, error-level problems are
 * thrown as exceptions too (for backwards compatibility), and anything
 * less severe is dropped.
 */
@Override
public void reportValidationProblem(XMLValidationProblem prob)
    throws XMLStreamException
{
    // !!! TBI: Fail-fast vs. deferred modes?
    /* 27-May-2008, TSa: [WSTX-153] As per Stax javadocs for XMLReporter,
     *   both warnings and non-fatal errors (which includes all validation
     *   errors) should be reported via XMLReporter interface, and only
     *   fatals should cause an immediate stream exception (by-passing
     *   reporter)
     */
    final int severity = prob.getSeverity();
    if (severity > XMLValidationProblem.SEVERITY_ERROR) {
        throw WstxValidationException.create(prob);
    }
    final XMLReporter reporter = mConfig.getXMLReporter();
    if (reporter != null) {
        _reportProblem(reporter, prob);
        return;
    }
    /* If no reporter, regular non-fatal errors are to be reported
     * as exceptions as well, for backwards compatibility
     */
    if (severity >= XMLValidationProblem.SEVERITY_ERROR) {
        throw WstxValidationException.create(prob);
    }
}
/**
 * Reports a validation problem of given severity, located at the last
 * character read.
 */
public void reportValidationProblem(String msg, int severity)
    throws XMLStreamException
{
    final XMLValidationProblem problem =
            new XMLValidationProblem(getLastCharLocation(), msg, severity);
    reportValidationProblem(problem);
}
/**
 * Reports an error-severity validation problem, located at the last
 * character read.
 */
@Override
public void reportValidationProblem(String msg)
    throws XMLStreamException
{
    final XMLValidationProblem problem = new XMLValidationProblem(
            getLastCharLocation(), msg, XMLValidationProblem.SEVERITY_ERROR);
    reportValidationProblem(problem);
}
/**
 * Reports a validation problem with given message at given location;
 * severity is whatever the two-argument {@link XMLValidationProblem}
 * constructor assigns.
 */
public void reportValidationProblem(Location loc, String msg)
    throws XMLStreamException
{
    final XMLValidationProblem problem = new XMLValidationProblem(loc, msg);
    reportValidationProblem(problem);
}
/**
 * Formats the message with the two arguments (always, even if both are
 * null), and reports it as an error-severity validation problem.
 */
@Override
public void reportValidationProblem(String format, Object arg, Object arg2)
    throws XMLStreamException
{
    final String msg = MessageFormat.format(format, arg, arg2);
    reportValidationProblem(msg);
}
/*
///////////////////////////////////////////////////////////
// Other error reporting methods
///////////////////////////////////////////////////////////
*/
/**
 * Constructs (but does not throw) a parsing exception with given
 * message, located at the last character read.
 */
protected WstxException constructWfcException(String msg)
{
    final WstxInputLocation where = getLastCharLocation();
    return new WstxParsingException(msg, where);
}
/**
* Construct and return a {@link XMLStreamException} to throw
* as a result of a failed Typed Access operation (but one not
* caused by a Well-Formedness Constraint or Validation Constraint
* problem)
*/
/*
protected WstxException _constructTypeException(String msg)
{
// Hmmh. Should there be a distinct sub-type?
return new WstxParsingException(msg, getLastCharLocation());
}
*/
/**
 * Wraps given I/O exception into a {@link WstxIOException}, without
 * throwing it.
 */
protected WstxException constructFromIOE(IOException ioe)
{
    final WstxException wrapped = new WstxIOException(ioe);
    return wrapped;
}
/**
 * Constructs (but does not throw) the exception used for reporting a
 * NULL character in input, located at the last character read.
 */
protected WstxException constructNullCharException()
{
    final String msg = "Illegal character (NULL, unicode 0) encountered: not valid in any content";
    return new WstxUnexpectedCharException(msg, getLastCharLocation(), CHAR_NULL);
}
/**
 * Throws an exception reporting an unexpected character at the last
 * character location.
 *
 * @param i Code of the offending character (truncated to a char)
 * @param msg Text appended right after the character description
 */
protected void throwUnexpectedChar(int i, String msg) throws WstxException
{
    final char unexpected = (char) i;
    final String desc = getCharDesc(unexpected);
    throw new WstxUnexpectedCharException("Unexpected character " + desc + msg,
            getLastCharLocation(), unexpected);
}
/**
 * Throws the exception built by {@link #constructNullCharException}.
 */
protected void throwNullChar() throws WstxException {
    final WstxException problem = constructNullCharException();
    throw problem;
}
/**
 * Reports an illegal white space (control) character by throwing an
 * exception right away; errors are not deferred.
 */
protected void throwInvalidSpace(int i) throws WstxException {
    final boolean deferErrors = false;
    throwInvalidSpace(i, deferErrors);
}
/**
 * Builds the exception for an illegal white space (control) character,
 * and either throws it or returns it to the caller.
 *
 * @param i Code of the offending character (truncated to a char)
 * @param deferErrors If true, exception is returned instead of thrown
 *
 * @return The constructed exception (only when errors are deferred)
 */
protected WstxException throwInvalidSpace(int i, boolean deferErrors)
    throws WstxException
{
    final char ch = (char) i;
    final WstxException problem;
    if (ch != CHAR_NULL) {
        // XML 1.1 documents get an extra hint appended
        final String note = mXml11
                ? " [note: in XML 1.1, it could be included via entity expansion]"
                : "";
        final String msg = "Illegal character (" + getCharDesc(ch) + ")" + note;
        problem = new WstxUnexpectedCharException(msg, getLastCharLocation(), ch);
    } else {
        problem = constructNullCharException();
    }
    if (deferErrors) {
        return problem;
    }
    throw problem;
}
/**
 * Throws an exception indicating an unexpected end of input.
 *
 * @param msg Optional text appended to the base message; may be null
 */
protected void throwUnexpectedEOF(String msg)
    throws WstxException
{
    final String suffix = (msg != null) ? msg : "";
    throw new WstxEOFException("Unexpected EOF" + suffix, getLastCharLocation());
}
/**
 * Similar to {@link #throwUnexpectedEOF}, but only indicates ending
 * of an input block. Used when reading a token that can not span
 * input block boundaries (ie. can not continue past end of an
 * entity expansion).
 *
 * @param msg Optional text appended to the base message; may be null
 */
protected void throwUnexpectedEOB(String msg)
    throws WstxException
{
    final String suffix = (msg != null) ? msg : "";
    throw new WstxEOFException("Unexpected end of input block" + suffix,
            getLastCharLocation());
}
/**
 * Wraps given I/O exception into a {@link WstxIOException} and throws it.
 */
protected void throwFromIOE(IOException ioe) throws WstxException {
    final WstxIOException wrapped = new WstxIOException(ioe);
    throw wrapped;
}
/**
 * Rethrows given stream exception as a {@link WstxException}: as is if
 * it already is one, wrapped in a new one otherwise.
 */
protected void throwFromStrE(XMLStreamException strex)
    throws WstxException
{
    if (!(strex instanceof WstxException)) {
        throw new WstxException(strex);
    }
    throw (WstxException) strex;
}
/**
 * Method called to report an error, when caller's signature only
 * allows runtime exceptions to be thrown.
 * <p>
 * Stream exceptions are first handed to
 * {@code WstxLazyException.throwLazily} (presumably always throws --
 * confirm against that class); anything that gets past that goes to
 * {@code ExceptionUtil.throwRuntimeException}.
 */
protected void throwLazyError(Exception e)
{
    final boolean isStreamProblem = (e instanceof XMLStreamException);
    if (isStreamProblem) {
        WstxLazyException.throwLazily((XMLStreamException) e);
    }
    ExceptionUtil.throwRuntimeException(e);
}
/**
 * Returns description of given token type, as produced by
 * {@code ErrorConsts.tokenTypeDesc}.
 */
protected String tokenTypeDesc(int type) {
    final String desc = ErrorConsts.tokenTypeDesc(type);
    return desc;
}
/*
///////////////////////////////////////////////////////////
// Input buffer handling
///////////////////////////////////////////////////////////
*/
/**
 * Accessor for the input source this scanner is currently reading from.
 *
 * Note: public only because some implementations are on different
 * package.
 */
public final WstxInputSource getCurrentInput() {
    final WstxInputSource active = mInput;
    return active;
}
/**
 * Returns the number of characters between the read pointer and the
 * end of content in the input buffer.
 */
protected final int inputInBuffer() {
    final int end = mInputEnd;
    return end - mInputPtr;
}
/**
 * Reads and returns the next character, loading more input (possibly
 * from parent input sources) if the buffer has been exhausted.
 *
 * @return Next character; or -1 if no more input is available
 */
@SuppressWarnings("cast")
protected final int getNext() throws XMLStreamException
{
    if (mInputPtr < mInputEnd || loadMore()) {
        return (int) mInputBuffer[mInputPtr++];
    }
    return -1;
}
/**
 * Similar to {@link #getNext}, but does not advance pointer
 * in input buffer.
 *
 * Note: this method only peeks within current input source;
 * it does not close it and check nested input source (if any).
 * This is necessary when checking keywords, since they can never
 * cross input block boundary.
 *
 * @return Next character of the current input source; or -1 if that
 *   source has no more content
 */
@SuppressWarnings("cast")
protected final int peekNext()
    throws XMLStreamException
{
    if (mInputPtr < mInputEnd || loadMoreFromCurrent()) {
        return (int) mInputBuffer[mInputPtr];
    }
    return -1;
}
/**
 * Reads and returns the next character; running out of input is
 * reported as an unexpected EOF.
 *
 * @param errorMsg Text to append to the EOF error message
 */
protected final char getNextChar(String errorMsg)
    throws XMLStreamException
{
    final boolean bufferEmpty = (mInputPtr >= mInputEnd);
    if (bufferEmpty) {
        loadMore(errorMsg);
    }
    final char next = mInputBuffer[mInputPtr++];
    return next;
}
/**
 * Similar to {@link #getNextChar}, but will not read more characters
 * from parent input source(s) if the current input source doesn't
 * have more content. This is often needed to prevent "runaway" content,
 * such as comments that start in an entity but do not have matching
 * close marker inside entity; XML specification specifically states
 * such markup is not legal.
 *
 * @param errorMsg Text to append to the end-of-input-block error message
 */
protected final char getNextCharFromCurrent(String errorMsg)
    throws XMLStreamException
{
    final boolean bufferEmpty = (mInputPtr >= mInputEnd);
    if (bufferEmpty) {
        loadMoreFromCurrent(errorMsg);
    }
    final char next = mInputBuffer[mInputPtr++];
    return next;
}
/**
 * Method that will skip through zero or more white space characters,
 * and return either the character following white space, or -1 to
 * indicate EOF (end of the outermost input source).
 * Control characters other than tab, CR and LF are reported as errors.
 */
@SuppressWarnings("cast")
protected final int getNextAfterWS()
    throws XMLStreamException
{
    for (;;) {
        if (mInputPtr >= mInputEnd && !loadMore()) {
            return -1;
        }
        final char ch = mInputBuffer[mInputPtr++];
        if (ch > CHAR_SPACE) {
            return (int) ch;
        }
        // Some kind of white space (or illegal control char)
        if (ch == '\n' || ch == '\r') {
            skipCRLF(ch);
        } else if (ch != CHAR_SPACE && ch != '\t') {
            throwInvalidSpace(ch);
        }
    }
}
/**
 * Skips zero or more white space characters, and returns the first
 * character after them; running out of input is reported as an
 * unexpected EOF. Control characters other than tab, CR and LF are
 * reported as errors.
 *
 * @param errorMsg Text to append to the EOF error message
 */
protected final char getNextCharAfterWS(String errorMsg)
    throws XMLStreamException
{
    for (;;) {
        if (mInputPtr >= mInputEnd) {
            loadMore(errorMsg);
        }
        final char ch = mInputBuffer[mInputPtr++];
        if (ch > CHAR_SPACE) {
            return ch;
        }
        // Some kind of white space (or illegal control char)
        if (ch == '\n' || ch == '\r') {
            skipCRLF(ch);
        } else if (ch != CHAR_SPACE && ch != '\t') {
            throwInvalidSpace(ch);
        }
    }
}
/**
 * Reads next character from the current input source, and then skips
 * any white space, staying within that source.
 *
 * @param errorMsg Text to append to the end-of-input-block error message
 */
protected final char getNextInCurrAfterWS(String errorMsg)
    throws XMLStreamException
{
    final char first = getNextCharFromCurrent(errorMsg);
    return getNextInCurrAfterWS(errorMsg, first);
}
/**
 * Starting with the given (already read) character, skips white space
 * within the current input source and returns the first character that
 * is not white space. Control characters other than tab, CR and LF are
 * reported as errors.
 *
 * @param errorMsg Text to append to the end-of-input-block error message
 * @param c Character that was read last
 */
protected final char getNextInCurrAfterWS(String errorMsg, char c)
    throws XMLStreamException
{
    for (char curr = c; ; curr = mInputBuffer[mInputPtr++]) {
        if (curr > CHAR_SPACE) {
            return curr;
        }
        // Some kind of white space (or illegal control char)
        if (curr == '\n' || curr == '\r') {
            skipCRLF(curr);
        } else if (curr != CHAR_SPACE && curr != '\t') {
            throwInvalidSpace(curr);
        }
        // Make sure there is a character for the next round
        if (mInputPtr >= mInputEnd) {
            loadMoreFromCurrent(errorMsg);
        }
    }
}
/**
 * Method called when a linefeed character (CR or LF) has been spotted
 * in input; if it was a CR and the next character is LF, that LF gets
 * skipped as well. Note that next character has to come from the
 * current input source, to qualify; it can never come from another
 * (nested) input source. In all cases row information is advanced.
 *
 * @return True, if passed in char is '\r' and next one is '\n'.
 */
protected final boolean skipCRLF(char c)
    throws XMLStreamException
{
    final boolean crLfPair = (c == '\r') && (peekNext() == '\n');
    if (crLfPair) {
        ++mInputPtr;
    }
    // new row starts at the current pointer
    ++mCurrInputRow;
    mCurrInputRowStart = mInputPtr;
    return crLfPair;
}
/**
 * Records that a new input row starts at the current read pointer.
 */
protected final void markLF() {
    mCurrInputRowStart = mInputPtr;
    ++mCurrInputRow;
}
/**
 * Records that a new input row starts at given buffer offset.
 *
 * @param inputPtr Offset in input buffer where the new row starts
 */
protected final void markLF(int inputPtr) {
    mCurrInputRowStart = inputPtr;
    ++mCurrInputRow;
}
/**
 * Method to push back last character read; can only be called once,
 * that is, no more than one char can be guaranteed to be successfully
 * returned.
 */
protected final void pushback() {
    mInputPtr -= 1;
}
/*
///////////////////////////////////////////////////////////
// Sub-class overridable input handling methods
///////////////////////////////////////////////////////////
*/
/**
 * Method called when an entity has been expanded (new input source
 * has been created). Needs to initialize location information and change
 * active input source.
 *
 * @param newInput Input source that becomes the active one
 * @param isExt Whether the new source is external; linefeeds are only
 *   normalized for external sources
 * @param entityId Name of the entity being expanded
 */
protected void initInputSource(WstxInputSource newInput, boolean isExt,
        String entityId)
    throws XMLStreamException
{
    // Empty the buffer, so that new input will be read next time input is needed:
    mInputPtr = mInputEnd = 0;
    /* Plus, reset the input location so that'll be accurate for
     * error reporting etc.
     */
    mInputTopDepth = mCurrDepth;

    // [WSTX-296]: Check for entity expansion depth against configurable limit
    // (depth is based on the source that was active up to this point)
    final int entityDepth = 1 + mInput.getEntityDepth();
    verifyLimit("Maximum entity expansion depth", mConfig.getMaxEntityDepth(), entityDepth);

    mInput = newInput;
    newInput.initInputLocation(this, mCurrDepth, entityDepth);

    /* 21-Feb-2006, TSa: Linefeeds are NOT normalized when expanding
     *   internal entities (XML, 2.11)
     */
    mNormalizeLFs = isExt;
}
/**
 * Method that will try to read one or more characters from currently
 * open input sources; closing input sources if necessary.
 * <p>
 * When the active source has nothing more to give, it is closed and its
 * parent becomes the active source again; this repeats until the buffer
 * has content, or the root source itself has been exhausted.
 *
 * @return true if reading succeeded (or may succeed), false if
 * we reached EOF.
 *
 * @throws XMLStreamException if reading fails (IOExceptions are converted
 *   using {@link #constructFromIOE}), or if raised by the character limit
 *   check or by the nesting problem handler
 */
protected boolean loadMore()
throws XMLStreamException
{
WstxInputSource input = mInput;
do {
/* Need to make sure offsets are properly updated for error
* reporting purposes, and do this now while previous amounts
* are still known.
*/
mCurrInputProcessed += mInputEnd;
verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
// Row start becomes relative to the start of the buffer contents read next
mCurrInputRowStart -= mInputEnd;
int count;
try {
count = input.readInto(this);
if (count > 0) {
return true;
}
// Nothing more from this source: close it before moving on
input.close();
} catch (IOException ioe) {
throw constructFromIOE(ioe);
}
if (input == mRootInput) {
/* Note: no need to check entity/input nesting in this
* particular case, since it will be handled by higher level
* parsing code (results in an unexpected EOF)
*/
return false;
}
WstxInputSource parent = input.getParent();
if (parent == null) { // sanity check!
throwNullParent(input);
}
/* 13-Feb-2006, TSa: Ok, do we violate a proper nesting constraints
* with this input block closure?
*/
if (mCurrDepth != input.getScopeId()) {
handleIncompleteEntityProblem(input);
}
// Parent becomes the active source; let it restore the state it saved
mInput = input = parent;
input.restoreContext(this);
mInputTopDepth = input.getScopeId();
/* 21-Feb-2006, TSa: Since linefeed normalization needs to be
* suppressed for internal entity expansion, we may need to
* change the state...
*/
if (!mNormalizeLFs) {
mNormalizeLFs = !input.fromInternalEntity();
}
// Maybe there are leftovers from that input in buffer now?
} while (mInputPtr >= mInputEnd);
return true;
}
/**
 * Variant of {@link #loadMore()} that reports running out of input as
 * an unexpected EOF, instead of returning false.
 *
 * @param errorMsg Text to append to the EOF error message
 *
 * @return Always true
 */
protected final boolean loadMore(String errorMsg)
    throws XMLStreamException
{
    final boolean gotMore = loadMore();
    if (!gotMore) {
        throwUnexpectedEOF(errorMsg);
    }
    return true;
}
/**
 * Tries to read more content from the currently active input source
 * only; parent sources are never consulted, and the source is not
 * closed here.
 *
 * @return true if at least one character was read; false otherwise
 */
protected boolean loadMoreFromCurrent()
    throws XMLStreamException
{
    // Offsets need to account for the buffer contents being replaced
    final int consumed = mInputEnd;
    mCurrInputProcessed += consumed;
    mCurrInputRowStart -= consumed;
    verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
    try {
        return mInput.readInto(this) > 0;
    } catch (IOException ioe) {
        throw constructFromIOE(ioe);
    }
}
/**
 * Variant of {@link #loadMoreFromCurrent()} that reports running out of
 * input as an unexpected end of input block, instead of returning false.
 *
 * @param errorMsg Text to append to the error message
 *
 * @return Always true
 */
protected final boolean loadMoreFromCurrent(String errorMsg)
    throws XMLStreamException
{
    final boolean gotMore = loadMoreFromCurrent();
    if (!gotMore) {
        throwUnexpectedEOB(errorMsg);
    }
    return true;
}
/**
 * Method called to make sure current main-level input buffer has at
 * least specified number of characters available consecutively,
 * without having to call {@link #loadMore}. It can only be called
 * when input comes from main-level buffer; further, call can shift
 * content in input buffer, so caller has to flush any data still
 * pending. In short, caller has to know exactly what it's doing. :-)
 *
 * Note: method does not check for any other input sources than the
 * current one -- if current source can not fulfill the request, a
 * failure is indicated.
 *
 * @param minAmount Number of characters that need to be available
 *
 * @return true if there's now enough data; false if not (EOF)
 */
protected boolean ensureInput(int minAmount)
    throws XMLStreamException
{
    // Already enough buffered?
    if (inputInBuffer() >= minAmount) {
        return true;
    }
    try {
        return mInput.readMore(this, minAmount);
    } catch (IOException ioe) {
        throw constructFromIOE(ioe);
    }
}
/**
 * Closes the active input source and then each of its ancestors, up to
 * and including the root input source; the active source is updated
 * along the way.
 *
 * @param force If true, sources are closed using
 *   {@code closeCompletely()}; otherwise using {@code close()}
 */
protected void closeAllInput(boolean force)
    throws XMLStreamException
{
    for (WstxInputSource src = mInput; ; ) {
        try {
            if (force) {
                src.closeCompletely();
            } else {
                src.close();
            }
        } catch (IOException ioe) {
            throw constructFromIOE(ioe);
        }
        if (src == mRootInput) {
            return;
        }
        final WstxInputSource parent = src.getParent();
        if (parent == null) { // sanity check!
            throwNullParent(src);
        }
        src = parent;
        mInput = parent;
    }
}
/**
 * Reports the internal error of a non-root input source that has no
 * parent, by throwing an {@link IllegalStateException}.
 *
 * @param curr Input source currently in use (not included in the message)
 */
protected void throwNullParent(WstxInputSource curr)
{
    // A more descriptive message (naming 'curr' and the root input) used to exist:
    //throw new IllegalStateException("Internal error: null parent for input source '"+curr+"'; should never occur (should have stopped at root input '"+mRootInput+"').");
    throw new IllegalStateException(ErrorConsts.ERR_INTERNAL);
}
/*
///////////////////////////////////////////////////////////
// Entity resolution
///////////////////////////////////////////////////////////
*/
/**
* Method that tries to resolve a character entity, or (if caller so
* specifies), a pre-defined internal entity (lt, gt, amp, apos, quot).
* It will succeed iff:
* <ol>
* <li>Entity in question is a simple character entity (either one of
* 5 pre-defined ones, or using decimal/hex notation), AND
* </li>
* <li>Entity fits completely inside current input buffer.
* </li>
* </ol>
*
* If so, character value of entity is returned. Character 0 is returned
* otherwise; if so, caller needs to do full resolution.
*
* Note: On entry we are guaranteed there are at least 3 more characters
* in this buffer; otherwise we shouldn't be called.
*
* @param checkStd If true, will check pre-defined internal entities
* (gt, lt, amp, apos, quot); if false, will only check actual
* character entities.
*
* @return (Valid) character value, if entity is a character reference,
* and could be resolved from current input buffer (does not span
* buffer boundary); null char (code 0) if not (either non-char
* entity, or spans input buffer boundary).
*/
protected int resolveSimpleEntity(boolean checkStd)
throws XMLStreamException
{
char[] buf = mInputBuffer;
int ptr = mInputPtr;
char c = buf[ptr++];
// Numeric reference?
if (c == '#') {
c = buf[ptr++];
int value = 0;
int inputLen = mInputEnd;
if (c == 'x') { // hex
while (ptr < inputLen) {
c = buf[ptr++];
if (c == ';') {
break;
}
value = value << 4;
if (c <= '9' && c >= '0') {
value += (c - '0');
} else if (c >= 'a' && c <= 'f') {
value += (10 + (c - 'a'));
} else if (c >= 'A' && c <= 'F') {
value += (10 + (c - 'A'));
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
}
/* Need to check for overflow; easiest to do right as
* it happens...
*/
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
}
}
} else { // numeric (decimal)
while (c != ';') {
if (c <= '9' && c >= '0') {
value = (value * 10) + (c - '0');
// Overflow?
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
}
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a decimal number.");
}
if (ptr >= inputLen) {
break;
}
c = buf[ptr++];
}
}
/* We get here either if we got it all, OR if we ran out of
* input in current buffer.
*/
if (c == ';') { // got the full thing
mInputPtr = ptr;
validateChar(value);
return value;
}
/* If we ran out of input, need to just fall back, gets
* resolved via 'full' resolution mechanism.
*/
} else if (checkStd) {
/* Caller may not want to resolve these quite yet...
* (when it wants separate events for non-char entities)
*/
if (c == 'a') { // amp or apos?
c = buf[ptr++];
if (c == 'm') { // amp?
if (buf[ptr++] == 'p') {
if (ptr < mInputEnd && buf[ptr++] == ';') {
mInputPtr = ptr;
return '&';
}
}
} else if (c == 'p') { // apos?
if (buf[ptr++] == 'o') {
int len = mInputEnd;
if (ptr < len && buf[ptr++] == 's') {
if (ptr < len && buf[ptr++] == ';') {
mInputPtr = ptr;
return '\'';
}
}
}
}
} else if (c == 'g') { // gt?
if (buf[ptr++] == 't' && buf[ptr++] == ';') {
mInputPtr = ptr;
return '>';
}
} else if (c == 'l') { // lt?
if (buf[ptr++] == 't' && buf[ptr++] == ';') {
mInputPtr = ptr;
return '<';
}
} else if (c == 'q') { // quot?
if (buf[ptr++] == 'u' && buf[ptr++] == 'o') {
int len = mInputEnd;
if (ptr < len && buf[ptr++] == 't') {
if (ptr < len && buf[ptr++] == ';') {
mInputPtr = ptr;
return '"';
}
}
}
}
}
return 0;
}
/**
 * Method called to resolve character entities, and only character
 * entities (except that pre-defined char entities -- amp, apos, lt,
 * gt, quote -- MAY be "char entities" in this sense, depending on
 * arguments).
 * Otherwise it is to return the null char; if so,
 * the input pointer will point to the same point as when method
 * entered (char after ampersand), plus the ampersand itself is
 * guaranteed to be in the input buffer (so caller can just push it
 * back if necessary).
 *<p>
 * Most often this method is called when reader is not to expand
 * non-char entities automatically, but to return them as separate
 * events.
 *<p>
 * Main complication here is that we need to do 5-char lookahead. This
 * is problematic if chars are on input buffer boundary. This is ok
 * for the root level input buffer, but not for some nested buffers.
 * However, according to XML specs, such split entities are actually
 * illegal... so we can throw an exception in those cases.
 *
 * @param checkStd If true, will check pre-defined internal entities
 *   (gt, lt, amp, apos, quot) as character entities; if false, will only
 *   check actual 'real' character entities.
 *
 * @return (Valid) character value, if entity is a character reference,
 *   and could be resolved from current input buffer (does not span
 *   buffer boundary); null char (code 0) if not (either non-char
 *   entity, or spans input buffer boundary).
 */
protected int resolveCharOnlyEntity(boolean checkStd)
    throws XMLStreamException
{
    // Number of chars available starting at mInputPtr (char after '&')
    int avail = mInputEnd - mInputPtr;
    if (avail < 6) {
        // split entity, or buffer boundary
        /* Don't want to lose leading '&' (in case we can not expand
         * the entity), so let's push it back first
         */
        --mInputPtr;
        /* Shortest valid reference would be 3 chars ('&a;'); which
         * would only be legal from an expanded entity...
         */
        if (!ensureInput(6)) {
            if (inputInBuffer() < 3) {
                throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
            }
        }
        // ... and now we can move pointer back as well:
        ++mInputPtr;
        /* Re-measure only after the pointer is past '&' again: the
         * checks below index relative to mInputPtr, so counting the
         * pushed-back '&' would allow a read one past mInputEnd.
         */
        avail = mInputEnd - mInputPtr;
    }
    /* Ok, now we have one more character to check, and that's enough
     * to determine type decisively.
     */
    char c = mInputBuffer[mInputPtr];
    // A char reference?
    if (c == '#') { // yup
        ++mInputPtr;
        return resolveCharEnt(null);
    }
    // nope... except may be a pre-def?
    if (checkStd) {
        if (c == 'a') {
            // at least 2 chars are available here, so this read is safe
            char d = mInputBuffer[mInputPtr+1];
            if (d == 'm') {
                if (avail >= 4
                    && mInputBuffer[mInputPtr+2] == 'p'
                    && mInputBuffer[mInputPtr+3] == ';') {
                    mInputPtr += 4;
                    return '&';
                }
            } else if (d == 'p') {
                if (avail >= 5
                    && mInputBuffer[mInputPtr+2] == 'o'
                    && mInputBuffer[mInputPtr+3] == 's'
                    && mInputBuffer[mInputPtr+4] == ';') {
                    mInputPtr += 5;
                    return '\'';
                }
            }
        } else if (c == 'l') {
            if (avail >= 3
                && mInputBuffer[mInputPtr+1] == 't'
                && mInputBuffer[mInputPtr+2] == ';') {
                mInputPtr += 3;
                return '<';
            }
        } else if (c == 'g') {
            if (avail >= 3
                && mInputBuffer[mInputPtr+1] == 't'
                && mInputBuffer[mInputPtr+2] == ';') {
                mInputPtr += 3;
                return '>';
            }
        } else if (c == 'q') {
            if (avail >= 5
                && mInputBuffer[mInputPtr+1] == 'u'
                && mInputBuffer[mInputPtr+2] == 'o'
                && mInputBuffer[mInputPtr+3] == 't'
                && mInputBuffer[mInputPtr+4] == ';') {
                mInputPtr += 5;
                return '"';
            }
        }
    }
    return 0;
}
/**
 * Reverse of {@link #resolveCharOnlyEntity}; will only resolve entity
 * if it is NOT a character entity (or pre-defined 'generic' entity;
 * amp, apos, lt, gt or quot). Only used in cases where entities
 * are to be separately returned unexpanded (in non-entity-replacing
 * mode); which means it's never called from dtd handler.
 *
 * @return Result of the entity lookup for a general entity reference;
 *   null if the reference is a character reference or one of the
 *   pre-defined entities (in which case input pointer is not advanced)
 */
protected EntityDecl resolveNonCharEntity()
    throws XMLStreamException
{
    // Number of chars available starting at mInputPtr (char after '&')
    int avail = mInputEnd - mInputPtr;
    if (avail < 6) {
        // split entity, or buffer boundary
        /* Don't want to lose leading '&' (in case we can not expand
         * the entity), so let's push it back first
         */
        --mInputPtr;
        /* Shortest valid reference would be 3 chars ('&a;'); which
         * would only be legal from an expanded entity...
         */
        if (!ensureInput(6)) {
            if (inputInBuffer() < 3) {
                throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
            }
        }
        // ... and now we can move pointer back as well:
        ++mInputPtr;
        /* Re-measure only after the pointer is past '&' again: the
         * checks below index relative to mInputPtr, so counting the
         * pushed-back '&' would allow a read one past mInputEnd.
         */
        avail = mInputEnd - mInputPtr;
    }
    // We don't care about char entities:
    char c = mInputBuffer[mInputPtr];
    if (c == '#') {
        return null;
    }
    /* 19-Aug-2004, TSa: Need special handling for pre-defined
     *   entities; they are not counted as 'real' general parsed
     *   entities, but more as character entities...
     */
    // At least 2 chars are available here; anything further is guarded by 'avail'
    if (c == 'a') {
        char d = mInputBuffer[mInputPtr+1];
        if (d == 'm') {
            if (avail >= 4
                && mInputBuffer[mInputPtr+2] == 'p'
                && mInputBuffer[mInputPtr+3] == ';') {
                // pre-defined: left unconsumed for the char-entity path
                return null;
            }
        } else if (d == 'p') {
            if (avail >= 5
                && mInputBuffer[mInputPtr+2] == 'o'
                && mInputBuffer[mInputPtr+3] == 's'
                && mInputBuffer[mInputPtr+4] == ';') {
                return null;
            }
        }
    } else if (c == 'l') {
        if (avail >= 3
            && mInputBuffer[mInputPtr+1] == 't'
            && mInputBuffer[mInputPtr+2] == ';') {
            return null;
        }
    } else if (c == 'g') {
        if (avail >= 3
            && mInputBuffer[mInputPtr+1] == 't'
            && mInputBuffer[mInputPtr+2] == ';') {
            return null;
        }
    } else if (c == 'q') {
        if (avail >= 5
            && mInputBuffer[mInputPtr+1] == 'u'
            && mInputBuffer[mInputPtr+2] == 'o'
            && mInputBuffer[mInputPtr+3] == 't'
            && mInputBuffer[mInputPtr+4] == ';') {
            return null;
        }
    }
    // Otherwise, let's just parse in generic way:
    ++mInputPtr; // since we already read the first letter
    String id = parseEntityName(c);
    mCurrName = id;
    return findEntity(id, null);
}
/**
 * Fully resolves an entity reference of any kind: numeric character
 * reference, one of the 5 pre-defined entities, or a declared
 * (internal or external) entity, updating input buffers as needed.
 *<p>
 * When <code>mCfgTreatCharRefsAsEntities</code> is set, character references
 * and pre-defined entities are not returned as characters; instead
 * <code>mCurrEntity</code> is set and 0 is returned.
 *
 * @param allowExt If true, is allowed to expand external entities
 *   (expanding text); if false, is not (expanding attribute value).
 *
 * @return Either single-character replacement (which is NOT to be
 *   reparsed), or null char (0) to indicate expansion is done via
 *   input source.
 */
protected int fullyResolveEntity(boolean allowExt)
    throws XMLStreamException
{
    final char first = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);

    // Numeric character reference?
    if (first == '#') {
        final StringBuffer surface = new StringBuffer("#");
        final int code = resolveCharEnt(surface);
        if (!mCfgTreatCharRefsAsEntities) {
            return code;
        }
        mCurrEntity = getIntEntity(code, surface.toString().toCharArray());
        return 0;
    }

    final String id = parseEntityName(first);

    /* Pre-defined entity? (16-May-2004, TSa: open question whether custom
     * declarations should be able to override these.)
     */
    char predefined = CHAR_NULL;
    switch (id.charAt(0)) {
    case 'a':
        if (id.equals("amp")) {
            predefined = '&';
        } else if (id.equals("apos")) {
            predefined = '\'';
        }
        break;
    case 'g':
        if (id.equals("gt")) {
            predefined = '>';
        }
        break;
    case 'l':
        if (id.equals("lt")) {
            predefined = '<';
        }
        break;
    case 'q':
        if (id.equals("quot")) {
            predefined = '"';
        }
        break;
    default:
        break;
    }

    if (predefined != CHAR_NULL) {
        if (!mCfgTreatCharRefsAsEntities) {
            return predefined;
        }
        mCurrEntity = getIntEntity(predefined, id.toCharArray());
        return 0;
    }

    // A declared entity: expansion happens through the input source
    final EntityDecl decl = expandEntity(id, allowExt, null);
    if (mCfgTreatCharRefsAsEntities) {
        mCurrEntity = decl;
    }
    return 0;
}
/**
 * Returns an internal entity (possibly from cache) whose replacement
 * text is the character given. Entries in <code>mCachedEntities</code> are keyed
 * by the textual form of the reference, which is also used as the name
 * of a newly created entity.
 *
 * @param ch Code point the entity expands to; code points above 0xFFFF
 *   are stored as a surrogate pair
 * @param originalChars Characters of the reference as supplied by the
 *   caller (for example "#x41" or "amp")
 *
 * @return Cached or newly created entity for the reference
 */
protected EntityDecl getIntEntity(int ch, final char[] originalChars)
{
    final String cacheKey = new String(originalChars);
    IntEntity entity = mCachedEntities.get(cacheKey);
    if (entity == null) {
        // One char for BMP code points, a surrogate pair for supplementary ones
        final String repl = new String(Character.toChars(ch));
        // Strings are immutable, so the key can double as the entity name
        entity = IntEntity.create(cacheKey, repl);
        mCachedEntities.put(cacheKey, entity);
    }
    return entity;
}
/**
 * Helper method that will try to expand a parsed entity (parameter or
 * generic entity), looked up by name.
 *<p>
 * note: called by sub-classes (dtd parser), needs to be protected.
 *
 * @param id Name of the entity being expanded
 * @param allowExt Whether external entities can be expanded or not; if
 *   not, and the entity to expand would be external one, an exception
 *   will be thrown
 * @param extraArg Passed through as-is to {@link #findEntity}
 *
 * @return Declaration found for the name; null if there is none
 */
protected EntityDecl expandEntity(String id, boolean allowExt,
                                  Object extraArg)
    throws XMLStreamException
{
    mCurrName = id;

    final EntityDecl decl = findEntity(id, extraArg);
    if (decl != null) {
        // In "char refs as entities" mode only the DTD reader expands here
        if (this instanceof MinimalDTDReader || !mCfgTreatCharRefsAsEntities) {
            expandEntity(decl, allowExt);
        }
        return decl;
    }

    /* 30-Sep-2005, TSa: As per [WSTX-5], only a problem if we actually
     *   have to resolve it (otherwise best-effort, and null is ok).
     * 02-Oct-2005, TSa: Plus, [WSTX-4] adds "undeclared entity resolver"
     */
    if (mCfgReplaceEntities) {
        mCurrEntity = expandUnresolvedEntity(id);
    }
    return null;
}
/**
 * Expands the given declared entity by switching input to a new input
 * source created from it, after verifying that the expansion is legal.
 *<p>
 * note: defined as private for documentation, ie. it's just called
 * from within this class (not sub-classes), from one specific method
 * (see above)
 *
 * @param ed Entity to be expanded
 * @param allowExt Whether external entities are allowed or not.
 */
private void expandEntity(EntityDecl ed, boolean allowExt)
    throws XMLStreamException
{
    final String name = ed.getName();

    // Cheapest check first: would expansion recurse into itself?
    if (mInput.isOrIsExpandedFrom(name)) {
        throwRecursionError(name);
    }

    // Unparsed entities can not be referenced from attribute values or
    // text content (notation-based access is not handled here)
    if (!ed.isParsed()) {
        throwParseError("Illegal reference to unparsed external entity \"{0}\"", name, null);
    }

    // 28-Jun-2004, TSa: Do we support external entity expansion?
    final boolean external = ed.isExternal();
    if (external) {
        if (!allowExt) { // never ok in attribute value...
            throwParseError("Encountered a reference to external parsed entity \"{0}\" when expanding attribute value: not legal as per XML 1.0/1.1 #3.1", name, null);
        }
        if (!mConfig.willSupportExternalEntities()) {
            throwParseError("Encountered a reference to external entity \"{0}\", but stream reader has feature \"{1}\" disabled",
                            name, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
        }
    }
    verifyLimit("Maximum entity expansion count", mConfig.getMaxEntityCount(), ++mEntityExpansionCount);

    // Let the current context save its state before input is switched
    final WstxInputSource parentInput = mInput;
    parentInput.saveContext(this);

    WstxInputSource expanded = null;
    try {
        expanded = ed.expand(parentInput, mEntityResolver, mConfig, mDocXmlVersion);
    } catch (FileNotFoundException fex) {
        // Rethrown as parse error to get a more meaningful description
        // (with input source position etc)
        throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage());
    } catch (IOException ioe) {
        throw constructFromIOE(ioe);
    }

    // From here on, input comes from the new source
    initInputSource(expanded, external, name);
}
/**
 * Handles a reference to an entity for which no declaration was found:
 * gives the configured "undeclared entity resolver" (if any) a chance to
 * supply input for it, and otherwise reports the entity as undeclared
 * via {@link #handleUndeclaredEntity}.
 *<p>
 * note: only called from the local expandEntity() method
 *
 * @param id Name of the undeclared entity
 *
 * @return A placeholder entity when the resolver produced input and
 *   <code>mCfgTreatCharRefsAsEntities</code> is set; null otherwise
 */
private EntityDecl expandUnresolvedEntity(String id)
    throws XMLStreamException
{
    XMLResolver resolver = mConfig.getUndeclaredEntityResolver();
    if (resolver != null) {
        /* Ok, we can check for recursion here; but let's only do that
         * if there is any chance that it might get resolved by
         * the special resolver (it must have been resolved this way
         * earlier, too...)
         */
        if (mInput.isOrIsExpandedFrom(id)) {
            throwRecursionError(id);
        }
        WstxInputSource oldInput = mInput;
        oldInput.saveContext(this);
        int xmlVersion = mDocXmlVersion;
        // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0
        if (xmlVersion == XmlConsts.XML_V_UNKNOWN) {
            xmlVersion = XmlConsts.XML_V_10;
        }
        WstxInputSource newInput;
        try {
            // null, null -> no public or system ids
            newInput = DefaultInputResolver.resolveEntityUsing
                (oldInput, id, null, null, resolver, mConfig, xmlVersion);
            /* The resolver may fail to resolve the entity, leaving
             * newInput null; only build the placeholder entity when
             * there is an input source to describe. A null result
             * falls through to the "undeclared entity" handling below.
             */
            if (newInput != null && mCfgTreatCharRefsAsEntities) {
                return new IntEntity(WstxInputLocation.getEmptyLocation(), newInput.getEntityId(),
                        newInput.getSource(), new char[]{}, WstxInputLocation.getEmptyLocation());
            }
        } catch (IOException ioe) {
            throw constructFromIOE(ioe);
        }
        if (newInput != null) {
            // true -> is external
            initInputSource(newInput, true, id);
            return null;
        }
    }
    handleUndeclaredEntity(id);
    return null;
}
/*
///////////////////////////////////////////////////////////
// Abstract methods for sub-classes to implement
///////////////////////////////////////////////////////////
*/
/**
 * Abstract method for sub-classes to implement, for finding
 * a declared general or parsed entity.
 *<p>
 * Callers in this class treat a null return value as "no declaration
 * found" for the given name.
 *
 * @param id Identifier of the entity to find
 * @param arg Optional argument passed from caller; needed by DTD
 *   reader.
 *
 * @return Declaration of the entity; null if there is none
 */
protected abstract EntityDecl findEntity(String id, Object arg)
throws XMLStreamException;
/**
 * This method gets called if a declaration for an entity was not
 * found in entity expanding mode (enabled by default for xml reader,
 * always enabled for dtd reader).
 *<p>
 * Within this class it is invoked when no undeclared-entity resolver is
 * configured, or the configured one did not produce an input source.
 *
 * @param id Name of the entity that has no declaration
 */
protected abstract void handleUndeclaredEntity(String id)
throws XMLStreamException;
/**
 * Abstract callback for sub-classes to implement.
 *<p>
 * NOTE(review): presumably invoked when an entity's input source ends
 * while a construct started within it is still incomplete -- the call
 * site is outside this part of the file; confirm against callers.
 *
 * @param closing Input source involved in the problem (by its name,
 *   the one being closed -- TODO confirm)
 */
protected abstract void handleIncompleteEntityProblem(WstxInputSource closing)
throws XMLStreamException;
/*
///////////////////////////////////////////////////////////
// Basic tokenization
///////////////////////////////////////////////////////////
*/
/**
 * Parses a name token that contains no colon: either a namespace prefix
 * or a local name (roughly as per XML specs, although a bit more lenient
 * for efficiency).
 *<p>
 * The fast path scans directly in the input buffer so that no characters
 * need to be copied; only when the name reaches the end of the buffer is
 * the work handed over to {@link #parseLocalName2}.
 *<p>
 * The returned String comes from the symbol table, so it is canonicalized
 * and can be compared against other such Strings with identity equality.
 *
 * @param c First character of the name; not yet checked for validity
 *
 * @return Canonicalized name String
 */
protected String parseLocalName(char c)
    throws XMLStreamException
{
    // First char must be a name start char; ':' gets a more specific
    // message since it is the namespace separator
    if (!isNameStartChar(c)) {
        if (c == ':') {
            throwUnexpectedChar(c, " (missing namespace prefix?)");
        }
        throwUnexpectedChar(c, " (expected a name start character)");
    }

    final char[] buf = mInputBuffer;
    final int end = mInputEnd;
    final int nameStart = mInputPtr - 1; // first char was already consumed
    int hash = c;
    int pos = mInputPtr;

    for (; pos < end; ++pos) {
        final char next = buf[pos];
        if (next < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR || !isNameChar(next)) {
            // Name ends inside the buffer: no copying needed
            mInputPtr = pos;
            return mSymbols.findSymbol(buf, nameStart, pos - nameStart, hash);
        }
        hash = (hash * 31) + next;
    }

    // Name may continue past the end of the buffer; less common, so
    // handled by a separate method
    mInputPtr = pos;
    return parseLocalName2(nameStart, hash);
}
/**
 * Slow-path continuation of {@link #parseLocalName}: called when the name
 * reached the end of the input buffer. Copies what was read so far into
 * the name buffer, then keeps appending name characters, loading more
 * input from the current source as needed.
 *<p>
 * Not heavily optimized, on the assumption it is not called very often.
 *
 * @param start Offset in the input buffer where the name starts
 * @param hash Hash computed over the characters read so far
 *
 * @return Canonicalized name String
 */
protected String parseLocalName2(int start, int hash)
    throws XMLStreamException
{
    int len = mInputEnd - start;
    // Let's assume fairly short names
    char[] out = getNameBuffer(len + 8);
    if (len > 0) {
        System.arraycopy(mInputBuffer, start, out, 0, len);
    }
    int capacity = out.length;

    for (;;) {
        // note: names can not cross input block (entity) boundaries...
        if (mInputPtr >= mInputEnd && !loadMoreFromCurrent()) {
            break;
        }
        final char ch = mInputBuffer[mInputPtr];
        if (ch < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR || !isNameChar(ch)) {
            break;
        }
        ++mInputPtr;
        if (len >= capacity) {
            out = expandBy50Pct(out);
            mNameBuffer = out;
            capacity = out.length;
        }
        out[len++] = ch;
        hash = (hash * 31) + ch;
    }
    // Still need to canonicalize the name:
    return mSymbols.findSymbol(out, 0, len, hash);
}
/**
 * Method that will parse 'full' name token; what full means depends on
 * whether reader is namespace aware or not. If it is, full name means
 * local name with no namespace prefix (PI target, entity/notation name);
 * if not, name can contain arbitrary number of colons. Note that
 * element and attribute names are NOT parsed here, so actual namespace
 * prefix separation can be handled properly there.
 *<p>
 * Similar to {@link #parseLocalName}, much of complexity stems from
 * trying to avoid copying name characters from input buffer.
 *<p>
 * Note that returned String will be canonicalized, similar to
 * {@link #parseLocalName}, but without separating prefix/local name.
 *<p>
 * The first character is read from the current input source only; if
 * that source has no more input, an unexpected-EOF error is reported
 * rather than reading a stale buffer position.
 *
 * @return Canonicalized name String
 */
protected String parseFullName()
    throws XMLStreamException
{
    final char first = (mInputPtr < mInputEnd)
        ? mInputBuffer[mInputPtr++]
        : getNextCharFromCurrent(SUFFIX_EOF_EXP_NAME);
    return parseFullName(first);
}
/**
 * Parses a 'full' name whose first character has already been read.
 * In namespace-aware mode a colon anywhere in the name is an error;
 * otherwise colons are accepted as regular name characters.
 *
 * @param c First character of the name; not yet checked for validity
 *
 * @return Canonicalized name String
 */
protected String parseFullName(char c)
    throws XMLStreamException
{
    // First char has special handling:
    if (!isNameStartChar(c)) {
        if (c != ':') {
            if (c <= CHAR_SPACE) {
                throwUnexpectedChar(c, " (missing name?)");
            }
            throwUnexpectedChar(c, " (expected a name start character)");
        } else if (mCfgNsEnabled) { // leading colon only ok in non-NS mode
            throwNsColonException(parseFNameForError());
        }
    }

    final int nameStart = mInputPtr - 1; // to account for the first char
    final int end = mInputEnd;
    int hash = c;
    int pos = mInputPtr;

    for (; pos < end; ++pos) {
        final char ch = mInputBuffer[pos];
        if (ch == ':') { // colon only allowed in non-NS mode
            if (mCfgNsEnabled) {
                mInputPtr = pos;
                throwNsColonException(new String(mInputBuffer, nameStart, pos - nameStart) + parseFNameForError());
            }
        } else if (ch < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR || !isNameChar(ch)) {
            // Name ends inside the buffer: no copying needed
            mInputPtr = pos;
            return mSymbols.findSymbol(mInputBuffer, nameStart, pos - nameStart, hash);
        }
        hash = (hash * 31) + ch;
    }

    // Name may continue past the end of the buffer; less common, so
    // handled by a separate method
    mInputPtr = pos;
    return parseFullName2(nameStart, hash);
}
/**
 * Slow-path continuation of {@link #parseFullName(char)}: called when the
 * name reached the end of the input buffer. Copies what was read so far
 * into the name buffer and keeps appending name characters, loading more
 * input from the current source as needed.
 *
 * @param start Offset in the input buffer where the name starts
 * @param hash Hash computed over the characters read so far
 *
 * @return Canonicalized name String
 */
protected String parseFullName2(int start, int hash)
    throws XMLStreamException
{
    int ptr = mInputEnd - start;
    // Let's assume fairly short names
    char[] outBuf = getNameBuffer(ptr+8);
    if (ptr > 0) {
        System.arraycopy(mInputBuffer, start, outBuf, 0, ptr);
    }
    int outLen = outBuf.length;
    while (true) {
        /* 06-Sep-2004, TSa: Name tokens are not allowed to continue
         *   past entity expansion ranges... that is, all characters
         *   have to come from the same input source. Thus, let's only
         *   load things from same input level
         */
        if (mInputPtr >= mInputEnd) {
            if (!loadMoreFromCurrent()) {
                break;
            }
        }
        char c = mInputBuffer[mInputPtr];
        if (c == ':') { // colon only allowed in non-NS mode
            if (mCfgNsEnabled) {
                /* The colon has not been consumed yet, so the helper
                 * below starts reading at it; appending it here as well
                 * would show it twice in the reported name.
                 */
                throwNsColonException(new String(outBuf, 0, ptr) + parseFNameForError());
            }
        } else if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
            break;
        } else if (!isNameChar(c)) {
            break;
        }
        ++mInputPtr;
        if (ptr >= outLen) {
            mNameBuffer = outBuf = expandBy50Pct(outBuf);
            outLen = outBuf.length;
        }
        outBuf[ptr++] = c;
        hash = (hash * 31) + c;
    }
    // Still need to canonicalize the name:
    return mSymbols.findSymbol(outBuf, 0, ptr, hash);
}
/**
 * Reads a complete name, accepting any number of namespace separators
 * (':'), purely so that it can be shown in an error message. No further
 * validation is done and nothing is optimized; the character that ends
 * the name is pushed back.
 *
 * @return Name characters read (possibly empty)
 */
protected String parseFNameForError()
    throws XMLStreamException
{
    final StringBuilder name = new StringBuilder(100);
    for (;;) {
        final char ch;
        if (mInputPtr >= mInputEnd) {
            // can't error here, so let's accept EOF for now:
            final int next = getNext();
            if (next < 0) {
                break;
            }
            ch = (char) next;
        } else {
            ch = mInputBuffer[mInputPtr++];
        }
        if (ch != ':' && !isNameChar(ch)) {
            --mInputPtr; // push back the terminating char
            break;
        }
        name.append(ch);
    }
    return name.toString();
}
/**
 * Parses the name of an entity reference and consumes the semicolon that
 * must follow it, from the same input source.
 *
 * @param c First character of the name; not yet checked for validity
 *
 * @return Canonicalized entity name
 */
protected final String parseEntityName(char c)
    throws XMLStreamException
{
    final String name = parseFullName(c);
    // Needs to be followed by a semi-colon, too.. from same input source:
    if (mInputPtr >= mInputEnd && !loadMoreFromCurrent()) {
        throwParseError("Missing semicolon after reference for entity \"{0}\"", name, null);
    }
    final char next = mInputBuffer[mInputPtr++];
    if (next != ';') {
        throwUnexpectedChar(next, "; expected a semi-colon after the reference for entity '"+name+"'");
    }
    return name;
}
/**
 * Skips over what superficially looks like a name, without checking the
 * number of colons or anything else about validity. Meant for content
 * that is being ignored altogether (for example, when ignoring internal
 * DTD subset).
 *<p>
 * If the given character can not start a name, it is pushed back and 0
 * is returned. Otherwise the character that ends the name has been
 * consumed when this method returns.
 *
 * @param c First character of the (possible) name
 *
 * @return Length of skipped name.
 */
protected int skipFullName(char c)
    throws XMLStreamException
{
    if (!isNameStartChar(c)) {
        --mInputPtr;
        return 0;
    }
    // First char counted; then zero or more name chars (or colons)
    for (int count = 1; ; ++count) {
        final char next;
        if (mInputPtr < mInputEnd) {
            next = mInputBuffer[mInputPtr++];
        } else {
            next = getNextChar(SUFFIX_EOF_EXP_NAME);
        }
        if (next != ':' && !isNameChar(next)) {
            return count;
        }
    }
}
/**
 * Simple parsing method that parses system ids, which are generally
 * used in entities (from DOCTYPE declaration to internal/external
 * subsets).
 *<p>
 * NOTE: returned String is not canonicalized, on assumption that
 * external ids may be longish, and are not shared all that often, as
 * they are generally just used for resolving paths, if anything.
 *<p>
 * Also note that this method is not heavily optimized, as it's not
 * likely to be a bottleneck for parsing.
 *
 * @param quoteChar Quote character that ends the literal
 * @param convertLFs If true, "\r\n" and lone '\r' are output as a single
 *   '\n'; if false, they are output unchanged
 * @param errorMsg Passed on to the character-reading helper, for
 *   reporting a premature end of input
 *
 * @return Contents of the literal, without the quotes
 */
protected final String parseSystemId(char quoteChar, boolean convertLFs,
                                     String errorMsg)
    throws XMLStreamException
{
    char[] buf = getNameBuffer(-1);
    int ptr = 0;
    while (true) {
        char c = (mInputPtr < mInputEnd) ?
            mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
        if (c == quoteChar) {
            break;
        }
        /* ??? 14-Jun-2004, TSa: Should we normalize linefeeds or not?
         *   It seems like we should, for all input... so that's the way it
         *   works.
         */
        if (c == '\n') {
            markLF();
        } else if (c == '\r') {
            if (peekNext() == '\n') {
                ++mInputPtr;
                if (!convertLFs) {
                    /* The only tricky thing; need to preserve 2-char LF; need to
                     * output one char from here, then can fall back to default:
                     */
                    if (ptr >= buf.length) {
                        buf = expandBy50Pct(buf);
                    }
                    buf[ptr++] = '\r';
                }
                c = '\n';
            } else if (convertLFs) {
                c = '\n';
            }
            // '\r' and "\r\n" end a row just like '\n' does; record it
            // once the pointer is past the whole linefeed
            markLF();
        }
        // Other than that, let's just append it:
        if (ptr >= buf.length) {
            buf = expandBy50Pct(buf);
        }
        buf[ptr++] = c;
    }
    return (ptr == 0) ? "" : new String(buf, 0, ptr);
}
/**
 * Simple parsing method that parses public ids, which are generally
 * used in entities (from DOCTYPE declaration to internal/external
 * subsets).
 *<p>
 * As per xml specs, the contents are actually normalized: leading and
 * trailing white space is dropped, and each remaining run of white
 * space (space, '\n', '\r') is output as a single space.
 *<p>
 * NOTE: returned String is not canonicalized, on assumption that
 * external ids may be longish, and are not shared all that often, as
 * they are generally just used for resolving paths, if anything.
 *<p>
 * Also note that this method is not heavily optimized, as it's not
 * likely to be a bottleneck for parsing.
 *
 * @param quoteChar Quote character that ends the literal
 * @param errorMsg Passed on to the character-reading helper, for
 *   reporting a premature end of input
 *
 * @return Normalized contents of the literal, without the quotes
 */
protected final String parsePublicId(char quoteChar, String errorMsg)
    throws XMLStreamException
{
    char[] buf = getNameBuffer(-1);
    int ptr = 0;
    // Set when white space was seen and not yet output
    boolean spaceToAdd = false;
    while (true) {
        char c = (mInputPtr < mInputEnd) ?
            mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
        if (c == quoteChar) {
            break;
        }
        if (c == '\n') {
            markLF();
            spaceToAdd = true;
            continue;
        }
        if (c == '\r') {
            if (peekNext() == '\n') {
                ++mInputPtr;
            }
            // '\r' and "\r\n" end a row just like '\n' does
            markLF();
            spaceToAdd = true;
            continue;
        }
        if (c == CHAR_SPACE) {
            spaceToAdd = true;
            continue;
        }
        // Verify it's a legal pubid char (see XML spec, #13, from 2.3)
        if ((c >= VALID_PUBID_CHAR_COUNT)
            || sPubidValidity[c] != PUBID_CHAR_VALID_B) {
            throwUnexpectedChar(c, " in public identifier");
        }
        /* Pending white space is either forgotten (nothing output yet,
         * ie. leading space), or output as a single space (in-between
         * non-white space). Trailing space is never output since this
         * point is only reached for a non-space character.
         */
        if (spaceToAdd) {
            spaceToAdd = false;
            if (ptr > 0) {
                if (ptr >= buf.length) {
                    buf = expandBy50Pct(buf);
                }
                buf[ptr++] = CHAR_SPACE;
            }
        }
        if (ptr >= buf.length) {
            buf = expandBy50Pct(buf);
        }
        buf[ptr++] = c;
    }
    return (ptr == 0) ? "" : new String(buf, 0, ptr);
}
/**
 * Appends input to the given text buffer up to, but not including, the
 * first occurrence of the end character; the end character itself is
 * consumed. Linefeeds are tracked for location information on the way.
 *
 * @param tb Buffer that the characters read are appended to
 * @param endChar Character that ends the span
 * @param convertLFs If true, "\r\n" and lone '\r' are appended as a
 *   single '\n'; if false, they are appended unchanged
 * @param errorMsg Passed on to the input-loading helpers, for reporting
 *   a premature end of input
 */
protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
                                String errorMsg)
    throws XMLStreamException
{
    // Let's first ensure we have some data in there...
    if (mInputPtr >= mInputEnd) {
        loadMore(errorMsg);
    }
    while (true) {
        // Let's loop consecutive 'easy' spans:
        char[] inputBuf = mInputBuffer;
        int inputLen = mInputEnd;
        int ptr = mInputPtr;
        int startPtr = ptr;
        while (ptr < inputLen) {
            char c = inputBuf[ptr++];
            if (c == endChar) {
                int thisLen = ptr - startPtr - 1;
                if (thisLen > 0) {
                    tb.append(inputBuf, startPtr, thisLen);
                }
                mInputPtr = ptr;
                return;
            }
            if (c == '\n') {
                mInputPtr = ptr; // markLF() requires this
                markLF();
            } else if (c == '\r') {
                if (!convertLFs && ptr < inputLen) {
                    // No conversion and lookahead available: chars stay
                    // part of the current span as they are
                    if (inputBuf[ptr] == '\n') {
                        ++ptr;
                    }
                    mInputPtr = ptr;
                    markLF();
                } else {
                    // Flush the span before '\r', then handle the
                    // linefeed one char at a time
                    int thisLen = ptr - startPtr - 1;
                    if (thisLen > 0) {
                        tb.append(inputBuf, startPtr, thisLen);
                    }
                    mInputPtr = ptr;
                    c = getNextChar(errorMsg);
                    if (c != '\n') {
                        --mInputPtr; // pushback
                        tb.append(convertLFs ? '\n' : '\r');
                    } else {
                        if (convertLFs) {
                            tb.append('\n');
                        } else {
                            tb.append('\r');
                            tb.append('\n');
                        }
                    }
                    /* getNextChar() may have refilled the input buffer
                     * (when '\r' was the last char available), so the
                     * cached buffer and its end must be re-read along
                     * with the pointer; otherwise the scan would go on
                     * against the old length.
                     */
                    inputBuf = mInputBuffer;
                    inputLen = mInputEnd;
                    startPtr = ptr = mInputPtr;
                    markLF();
                }
            }
        }
        // Ran out of buffered input: flush what we have and load more
        int thisLen = ptr - startPtr;
        if (thisLen > 0) {
            tb.append(inputBuf, startPtr, thisLen);
        }
        loadMore(errorMsg);
    }
}
/*
///////////////////////////////////////////////////////////
// Internal methods
///////////////////////////////////////////////////////////
*/
/**
 * Decodes a numeric character reference; input is positioned right after
 * the '#'. Both hexadecimal ("x...;") and decimal notations are handled,
 * and the resulting code point is validated before being returned.
 *
 * @param originalCharacters If not null, characters of the reference
 *   (excluding a terminating ';' met after the first character) are
 *   appended to it as they are read
 *
 * @return Code point the reference stands for
 */
private int resolveCharEnt(StringBuffer originalCharacters)
    throws XMLStreamException
{
    final boolean record = (originalCharacters != null);
    int code = 0;
    char ch = getNextChar(SUFFIX_IN_ENTITY_REF);
    if (record) {
        originalCharacters.append(ch);
    }

    if (ch == 'x') { // hexadecimal
        for (;;) {
            if (mInputPtr < mInputEnd) {
                ch = mInputBuffer[mInputPtr++];
            } else {
                ch = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
            }
            if (ch == ';') {
                break;
            }
            if (record) {
                originalCharacters.append(ch);
            }
            code <<= 4;
            if (ch >= '0' && ch <= '9') {
                code += (ch - '0');
            } else if (ch >= 'a' && ch <= 'f') {
                code += 10 + (ch - 'a');
            } else if (ch >= 'A' && ch <= 'F') {
                code += 10 + (ch - 'A');
            } else {
                throwUnexpectedChar(ch, "; expected a hex digit (0-9a-fA-F).");
            }
            // Checking after every digit keeps the accumulator from overflowing
            if (code > MAX_UNICODE_CHAR) {
                reportUnicodeOverflow();
            }
        }
    } else { // decimal
        while (ch != ';') {
            if (ch < '0' || ch > '9') {
                throwUnexpectedChar(ch, "; expected a decimal number.");
            } else {
                code = (code * 10) + (ch - '0');
                if (code > MAX_UNICODE_CHAR) {
                    reportUnicodeOverflow();
                }
            }
            if (mInputPtr < mInputEnd) {
                ch = mInputBuffer[mInputPtr++];
            } else {
                ch = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
            }
            if (record && ch != ';') {
                originalCharacters.append(ch);
            }
        }
    }
    validateChar(code);
    return code;
}
/**
 * Verifies that a code point obtained from a character reference is a
 * valid XML content character; reports an error if it is not.
 *
 * @param value Code point to check
 */
private final void validateChar(int value)
    throws XMLStreamException
{
    if (value < 0xD800) {
        if (value >= 32) {
            return; // fine as is
        }
        if (value == 0) {
            throwParseError("Invalid character reference: null character not allowed in XML content.");
        }
        // XML 1.1 allows most other chars; 1.0 only tab, LF and CR:
        if (!mXml11
            && value != 0x9 && value != 0xA && value != 0xD) {
            reportIllegalChar(value);
        }
        return;
    }

    /* 24-Jan-2006, TSa: "high" Unicode chars are problematic, since they
     *   have to be represented by a surrogate pair...
     */
    if (value < 0xE000) { // no surrogates via entity expansion
        reportIllegalChar(value);
    }
    if (value > 0xFFFF) {
        // Within valid range at all?
        if (value > MAX_UNICODE_CHAR) {
            reportUnicodeOverflow();
        }
    } else if (value >= 0xFFFE) { // 0xFFFE and 0xFFFF are illegal too
        reportIllegalChar(value);
    }
}
/**
 * Returns the shared name buffer, allocating or replacing it if needed
 * so that it is longer than the requested size. A replacement buffer is
 * a fresh array: existing contents are not carried over.
 *
 * @param minSize Number of characters the caller needs room for; a
 *   negative value just returns whatever buffer there is
 *
 * @return Buffer to use; also stored in <code>mNameBuffer</code>
 */
protected final char[] getNameBuffer(int minSize)
{
    char[] current = mNameBuffer;
    if (current == null) {
        final int size = (minSize > 48) ? (minSize + 16) : 64;
        current = new char[size];
        mNameBuffer = current;
    } else if (minSize >= current.length) { // let's allow one char extra...
        final int grown = current.length + (current.length >> 1); // grow by 50%
        final int size = (minSize >= grown) ? (minSize + 16) : grown;
        current = new char[size];
        mNameBuffer = current;
    }
    return current;
}
/**
 * Returns a copy of the given buffer that is 50% longer, with existing
 * contents preserved at the same offsets.
 *<p>
 * Callers use this when a buffer is full and store into the result right
 * away, so the result is always longer than the input by at least one
 * element -- also for lengths 0 and 1, where half the length rounds down
 * to zero.
 *
 * @param buf Buffer to expand; not modified
 *
 * @return Newly allocated, longer buffer
 */
protected final char[] expandBy50Pct(char[] buf)
{
    final int len = buf.length;
    final int growth = Math.max(len >> 1, 1);
    final char[] newBuf = new char[len + growth];
    System.arraycopy(buf, 0, newBuf, 0, len);
    return newBuf;
}
/**
 * Reports that a name which must not be namespace-qualified (PI target,
 * entity/notation name) contains a colon, while the reader is namespace
 * aware.
 *
 * @param name Offending name, for the error message
 */
private void throwNsColonException(String name)
    throws XMLStreamException
{
    final String format = "Illegal name \"{0}\" (PI target, entity/notation name): can not contain a colon (XML Namespaces 1.0#6)";
    throwParseError(format, name, null);
}
/**
 * Reports that expanding the named entity would recurse into itself.
 *
 * @param entityName Name of the entity, for the error message
 */
private void throwRecursionError(String entityName)
    throws XMLStreamException
{
    final String format = "Illegal entity expansion: entity \"{0}\" expands itself recursively.";
    throwParseError(format, entityName, null);
}
/**
 * Reports that a character reference has a value above the highest
 * code point allowed (<code>MAX_UNICODE_CHAR</code>).
 */
private void reportUnicodeOverflow()
    throws XMLStreamException
{
    final String maxAsHex = Integer.toHexString(MAX_UNICODE_CHAR);
    throwParseError("Illegal character entity: value higher than max allowed (0x{0})", maxAsHex, null);
}
/**
 * Reports that a character reference expands to a code point that is
 * not allowed as XML content.
 *
 * @param value Offending code point; shown in hexadecimal in the message
 */
private void reportIllegalChar(int value)
    throws XMLStreamException
{
    // Message used to end right after the code, with the parenthesis left open
    throwParseError("Illegal character entity: expansion character (code 0x{0}) not a valid XML character", Integer.toHexString(value), null);
}
/**
 * Checks a counter against its configured maximum, and fails with the
 * exception built by {@link #constructLimitViolation} if it is exceeded.
 *
 * @param type Description of the limit, for the error message
 * @param maxValue Highest value allowed
 * @param currentValue Value to check
 */
protected void verifyLimit(String type, long maxValue, long currentValue)
    throws XMLStreamException
{
    if (currentValue <= maxValue) {
        return;
    }
    throw constructLimitViolation(type, maxValue);
}
/**
 * Builds (but does not throw) the exception used to report that a
 * processing limit was exceeded.
 *
 * @param type Description of the limit
 * @param limit Value of the limit that was exceeded
 *
 * @return Exception with message "[type] limit ([limit]) exceeded"
 */
protected XMLStreamException constructLimitViolation(String type, long limit)
    throws XMLStreamException
{
    final StringBuilder msg = new StringBuilder();
    msg.append(type).append(" limit (").append(limit).append(") exceeded");
    return new XMLStreamException(msg.toString());
}
}