// com.sun.xml.parser.Parser Maven / Gradle / Ivy
/*
* $Id: Parser.java,v 1.13 1999/05/14 16:50:22 mode Exp $
*
* Copyright (c) 1998-1999 Sun Microsystems, Inc. All Rights Reserved.
*
* This software is the confidential and proprietary information of Sun
* Microsystems, Inc. ("Confidential Information"). You shall not
* disclose such Confidential Information and shall use it only in
* accordance with the terms of the license agreement you entered into
* with Sun.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE
* SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
* PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR ANY DAMAGES
* SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
* THIS SOFTWARE OR ITS DERIVATIVES.
*/
package com.sun.xml.parser;
import java.io.IOException;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Vector;
import org.xml.sax.*;
import com.sun.xml.util.MessageCatalog;
import com.sun.xml.util.XmlChars;
//
// NOTE: when maintaining this code, take care to keep the message
// catalogue(s) up to date!! It's important that the diagnostics
// be informative.
//
/**
* This implements a fast non-validating SAX parser. This one always
* processes external parsed entities, strictly adheres to the XML 1.0
* specification, and provides useful diagnostics. It supports an optimization
* allowing faster processing of valid standalone XML documents. For
* multi-language applications (such as web servers using XML processing
* to create dynamic content), a method supports choosing a locale for
* parser diagnostics which is both understood by the message recipient
* and supported by the parser.
*
* This conforms to the XML 1.0 specification. To configure an XML
* processor which tests document conformance against XML Namespaces,
* provide a DtdEventListener which examines declarations of
* entities and notations, and have your document listener check other
* constraints such as ensuring xmlns* attribute values properly
* declare all namespace prefixes. (Only element and attribute names may
* contain colons, and even then the name prefix before the colon must be
* properly declared.)
*
* <P>
* SAX parsers produce a stream of parse events, which applications
* process to create an object model which is specific to their tasks.
* Applications which do not want to process event streams in that way
* should use an API producing a standardized object model, such as the
* W3C's Document Object Model (DOM). This parser supports
* building fully conformant DOM Document objects, through
* use of DtdEventListener extensions to SAX in conjunction with an
* appropriate implementation of a SAX DocumentHandler. In
* addition, it supports some features (exposing comments, CDATA sections,
* and entity references) which are allowed by DOM but not required to
* be reported by conformant XML processors. (As usual, the default
* handler for parsing events other than fatal errors ignores them.)
*
* @see ValidatingParser
*
* @author David Brownell
* @version $Revision: 1.13 $
*/
public class Parser implements org.xml.sax.Parser
{
// stack of input entities being merged; set up by parseInternal() and
// cleared again (after close()) in its finally block
private InputEntity in;
// temporaries reused during parsing: (re)allocated by init() before each
// document and dropped in parseInternal()'s finally block
private AttributeListImpl attTmp;
private StringBuffer strTmp;
private char nameTmp [];
private NameCache nameCache;
// filled by surrogatesToCharTmp() when a character reference is above
// 0xffff (see parseLiteral())
private char charTmp [] = new char [2];
// NOTE: odd heap behavior, at least with classic VM: if "strTmp" is
// reused, LOTS of extra memory is consumed in some simple situations.
// JVM bug filed; it's no longer a win to reuse it as much, in any case.
// parsing modes
private boolean isValidating = false;
private boolean fastStandalone = false;
// true only while parseLiteral() is scanning an attribute value
private boolean isInAttribute = false;
// temporary DTD parsing state
private boolean inExternalPE;
private boolean doLexicalPE;
private boolean donePrologue;
// info about the document
private boolean isStandalone;
private String rootElementName;
// DTD state, used during parsing; the tables are cleared by init() and
// again when parseInternal() finishes
private boolean ignoreDeclarations;
private SimpleHashtable elements = new SimpleHashtable (47);
private SimpleHashtable params = new SimpleHashtable (7);
// exposed to package-private subclass
Hashtable notations = new Hashtable (7);
SimpleHashtable entities = new SimpleHashtable (17);
// stuff associated with SAX; null handler slots are filled with the
// shared default handler by setHandlers()
private DocumentHandler docHandler;
private DTDHandler dtdHandler;
private EntityResolver resolver;
private ErrorHandler errHandler;
private Locale locale;
private Locator locator;
// extended parser API support
private DtdEventListener dtdListener;
private LexicalEventListener lexicalListener;
// Compile time option: disable validation support for a better
// fit in memory-critical environments (P-Java etc). Doing that
// and removing the validating parser support saves (at this time)
// about 15% in size.
private static final boolean supportValidation = true;
// string constants -- use these copies so "==" works
// package private
static final String strANY = "ANY";
static final String strEMPTY = "EMPTY";
////////////////////////////////////////////////////////////////
//
// PARSER methods
//
////////////////////////////////////////////////////////////////
/**
 * Constructs a SAX parser object.  The document locator is created
 * here, and every handler slot starts out bound to the shared default
 * handler until the application registers its own.
 */
public Parser ()
{
    this.locator = new DocLocator ();

    // bind all still-unset handlers to the shared default
    setHandlers ();
}
/**
 * <em>SAX:</em> Used by applications to request the locale in which
 * diagnostics are produced.
 *
 * @param l The locale to use, or null to use system defaults
 *	(which may include only message IDs).
 * @throws SAXException If the message catalog reports that no
 *	diagnostic messages are available in that locale; the
 *	previously selected locale is left unchanged in that case.
 */
public void setLocale (Locale l)
    throws SAXException
{
    // null is always accepted; anything else must be known to the catalog
    boolean unsupported = (l != null)
        && !messages.isLocaleSupported (l.toString ());

    if (unsupported) {
        // the complaint itself is formatted using the current locale
        String text = messages.getMessage (locale, "P-078",
                new Object [] { l });
        throw new SAXException (text);
    }
    locale = l;
}
/**
 * Returns the locale used for diagnostics.
 *
 * @return the locale last assigned through <code>setLocale()</code>;
 *	if none was assigned, <code>init()</code> substitutes the
 *	platform default when a parse begins, so this may be null
 *	before the first parse.
 */
public Locale getLocale ()
{
    return locale;
}
/**
 * Chooses a client locale to use for diagnostics, using the first
 * language specified in the list that is supported by this parser.
 * That locale is then set using <code>setLocale()</code>.  Such a list
 * could be provided by a variety of user preference mechanisms,
 * including the HTTP <em>Accept-Language</em> header field.
 *
 * @see com.sun.xml.util.MessageCatalog
 *
 * @param languages Array of language specifiers, ordered with the most
 *	preferable one at the front.  For example, "en-ca" then "fr-ca",
 *	followed by "zh_CN".  Both RFC 1766 and Java styles are supported.
 * @return The chosen locale, or null if the message catalog selected
 *	none (the current locale is then left untouched).
 * @throws SAXException propagated from <code>setLocale()</code>.
 */
public Locale chooseLocale (String languages [])
    throws SAXException
{
    Locale chosen = messages.chooseLocale (languages);

    // nothing acceptable: keep whatever locale is already in effect
    if (chosen == null)
        return null;

    setLocale (chosen);
    return chosen;
}
/**
 * <em>SAX:</em> Lets applications control entity resolution.
 *
 * @param r the resolver to consult; if this is null when a parse
 *	starts, <code>init()</code> installs a default
 *	<code>Resolver</code>.
 */
public void setEntityResolver (EntityResolver r)
{
    this.resolver = r;
}
/**
 * Returns the object used to resolve entities.
 *
 * @return the resolver currently in effect; may be null if none was
 *	set and no parse has been started yet.
 */
public EntityResolver getEntityResolver ()
{
    return this.resolver;
}
/**
 * <em>SAX:</em> Used by applications to see unparsed entity information,
 * this assigns the handler for the basic SAX DTD events as well as
 * the extended "DtdEventListener" events.  If the specified handler
 * supports the extended events, it receives those events; otherwise,
 * they are ignored.
 *
 * @param handler the handler to register; null selects the shared
 *	default handler.
 * @see DtdEventListener
 */
public void setDTDHandler (DTDHandler handler)
{
    // null means "go back to the default"
    DTDHandler effective = (handler != null) ? handler : defaultHandler;

    dtdHandler = effective;

    // extended events go to the same object only when it understands them
    dtdListener = (effective instanceof DtdEventListener)
        ? (DtdEventListener) effective
        : defaultHandler;
}
/**
 * Returns the handler used to deliver unparsed entity information.
 *
 * @return the registered DTD handler (the shared default handler when
 *	the application supplied none).
 */
public DTDHandler getDTDHandler ()
{
    return this.dtdHandler;
}
/**
 * <em>SAX:</em> The primary application hook into the parser, this
 * assigns the handler for the basic SAX document events as well as
 * the extended "lexical" events.  If the specified handler supports
 * the extended events, it receives those events; otherwise, they
 * are ignored.
 *
 * @param handler the handler to register; null selects the shared
 *	default handler.
 * @see LexicalEventListener
 */
public void setDocumentHandler (DocumentHandler handler)
{
    // null means "go back to the default"
    DocumentHandler effective = (handler != null) ? handler : defaultHandler;

    docHandler = effective;

    // lexical events go to the same object only when it understands them
    lexicalListener = (effective instanceof LexicalEventListener)
        ? (LexicalEventListener) effective
        : defaultHandler;
}
/**
 * Returns the application being driven by the parser.
 *
 * @return the registered document handler (the shared default handler
 *	when the application supplied none).
 */
public DocumentHandler getDocumentHandler ()
{
    return this.docHandler;
}
/**
 * <em>SAX:</em> Used to override default error handling; for example, to
 * ensure that validity errors abort parsing, or to report
 * errors through the correct channels.
 *
 * <P> As with <code>setDTDHandler()</code> and
 * <code>setDocumentHandler()</code>, passing null restores the shared
 * default handler rather than leaving the parser without one.  SAX
 * allows handlers to be replaced in the middle of a parse, so storing
 * a raw null here would leave later diagnostics with no handler to
 * report to.
 *
 * @param handler the error handler to register, or null to restore
 *	the default handler.
 */
public void setErrorHandler (ErrorHandler handler)
{
    // never store null: error reporting needs a handler at all times
    if (handler == null)
        handler = defaultHandler;
    errHandler = handler;
}
/**
 * Returns the object used for error handling.
 *
 * @return the error handler currently registered with this parser.
 */
public ErrorHandler getErrorHandler ()
{
    return this.errHandler;
}
/**
 * <em>SAX:</em> Parse a document from an already constructed input
 * source.  Parser state is reset first, so one parser object can be
 * used for several documents in sequence.
 *
 * @param in the source to read the document entity from.
 * @throws SAXException for parsing errors, including those raised by
 *	application handlers.
 * @throws IOException for I/O problems reading the input.
 */
public void parse (InputSource in)
    throws SAXException, IOException
{
    // NB: the parameter hides the "in" entity-stack field in this method

    init ();		// back to "before a document"
    parseInternal (in);
}
/**
 * <em>SAX:</em> Parse the document identified by a URI.  The entity
 * resolver gets the first chance to map the URI to an input source;
 * if it declines, the URI is opened directly as a URL.
 *
 * @param uri identifies the document entity.
 * @throws SAXException for parsing errors, including those raised by
 *	application handlers or by the resolver.
 * @throws IOException for I/O problems, including a URI that cannot
 *	be turned into a URL.
 */
public void parse (String uri)
    throws SAXException, IOException
{
    init ();

    InputSource source = resolver.resolveEntity (null, uri);

    if (source == null) {
        // custom resolver punted resolution to the parser: handle it
        source = Resolver.createInputSource (new java.net.URL (uri), false);
    } else if (source.getSystemId () == null) {
        // custom resolver built an input source without a system ID:
        // patch it up enough so relative URIs work, and issue a
        // warning to minimize later confusion.
        warning ("P-065", null);
        source.setSystemId (uri);
    }

    parseInternal (source);
}
/**
 * Setting this flag enables faster processing of valid standalone
 * documents: external DTD information is not processed, and no
 * attribute normalization or defaulting is done.  This optimization
 * is only permitted in non-validating parsers; for validating
 * parsers, this mode is silently disabled.
 *
 * <P> For documents which are declared as standalone, but which are
 * not valid, a fatal error may be reported for references to externally
 * defined entities.  That could happen in any nonvalidating parser which
 * did not read externally defined entities.  Also, if any attribute
 * values need normalization or defaulting, it will not be done.
 *
 * @param value true to request the optimization.
 */
public void setFastStandalone (boolean value)
{
    // validation always wins over the fast path
    if (isValidating)
        fastStandalone = false;
    else
        fastStandalone = value;
}
/**
 * Returns true if standalone documents skip processing of
 * all external DTD information.
 *
 * @return the current setting of the "fast standalone" mode; always
 *	false once validation has been switched on.
 */
public boolean isFastStandalone ()
{
    return this.fastStandalone;
}
/**
 * In support of the HTML DOM model of client side
 * <code>&lt;xhtml:script&gt;</code> tag processing, this method permits
 * data to be spliced into the input stream.  This method would
 * normally be called from an <code>endElement</code> callback to put
 * the buffered result of calls such as DOM
 * <code>HTMLDocument.write</code> into the input stream.
 *
 * @param buf holds the characters to splice in.
 * @param offset index of the first character to use.
 * @param len number of characters to use; nothing happens when this
 *	is zero or negative.
 * @throws SAXException propagated from pushing the new input.
 */
public void pushInputBuffer (char buf [], int offset, int len)
    throws SAXException
{
    if (len <= 0)
        return;

    char chars [] = buf;
    boolean wholeArray = (offset == 0) && (len == buf.length);

    // arraycopy is inelegant, but that's the worst penalty for now;
    // when the caller hands over exactly the whole array, that array
    // itself is pushed without copying
    if (!wholeArray) {
        chars = new char [len];
        System.arraycopy (buf, offset, chars, 0, len);
    }
    pushReader (chars, null, false);
}
// package private
// Switches validation on or off.  When this build was compiled without
// validation support the request is refused with a RuntimeException
// (message "V-000").  Turning validation on also cancels the "fast
// standalone" optimization, which is for non-validating use only.
void setIsValidating (boolean value)
{
    if (!supportValidation)
        throw new RuntimeException (messages.getMessage (locale, "V-000"));

    isValidating = value;
    if (value)
        fastStandalone = false;
}
// makes sure the parser's reset to "before a document": fresh scratch
// buffers, cleared document/DTD state, the five predefined entities,
// and defaults for any locale, resolver, or handler not yet supplied
private void init ()
{
    // no input entity until parseInternal() sets one up
    in = null;

    // alloc temporary data used in parsing
    attTmp = new AttributeListImpl ();
    strTmp = new StringBuffer ();
    nameTmp = new char [20];
    nameCache = new NameCache ();

    // reset doc info and transient parsing flags
    rootElementName = null;
    isStandalone = false;
    isInAttribute = false;
    inExternalPE = false;
    doLexicalPE = false;
    donePrologue = false;
    ignoreDeclarations = false;

    // forget whatever DTD the previous document declared
    entities.clear ();
    notations.clear ();
    params.clear ();
    elements.clear ();

    // initialize predefined references ... re-interpreted later
    String predefined [][] = {
        { "amp", "&" },
        { "lt", "<" },
        { "gt", ">" },
        { "quot", "\"" },
        { "apos", "'" }
    };
    for (int i = 0; i < predefined.length; i++)
        builtin (predefined [i][0], predefined [i][1]);

    // defaults for whatever the application left unset
    if (locale == null)
        locale = Locale.getDefault ();
    if (resolver == null)
        resolver = new Resolver ();
    setHandlers ();
}
// single shared handler used for every handler slot the application
// has not filled in
private static final ListenerBase defaultHandler = new ListenerBase ();

// binds each handler slot that is still null to the shared default;
// slots that already have a handler are left alone
private void setHandlers ()
{
    // basic SAX handlers
    if (docHandler == null) {
        docHandler = defaultHandler;
    }
    if (dtdHandler == null) {
        dtdHandler = defaultHandler;
    }
    if (errHandler == null) {
        errHandler = defaultHandler;
    }

    // extended event listeners
    if (lexicalListener == null) {
        lexicalListener = defaultHandler;
    }
    if (dtdListener == null) {
        dtdListener = defaultHandler;
    }
}
// registers one predefined general entity (such as "amp") in the
// entity table as an internal entity with the given replacement text
private void builtin (String entityName, String entityValue)
{
    entities.put (entityName,
            new InternalEntity (entityName, entityValue.toCharArray ()));
}
////////////////////////////////////////////////////////////////
//
// parsing is by recursive descent, code roughly
// following the BNF rules except tweaked for simple
// lookahead. rules are more or less in numeric order,
// except where code sharing suggests other structures.
//
// a classic benefit of recursive descent parsers: it's
// relatively easy to get diagnostics that make sense.
//
////////////////////////////////////////////////////////////////
//
// CHAPTER 2: Documents
//
// Drives the parse of one whole document: sets up the document input
// entity, reports startDocument, parses prolog / root element / trailing
// Misc, reports endDocument, and finally releases all per-document state
// whether or not parsing succeeded.  Callers run init() first.
private void parseInternal (InputSource input)
throws SAXException, IOException
{
// no input at all is reported as fatal error "P-000"
if (input == null)
fatal ("P-000");
try {
in = InputEntity.getInputEntity (errHandler, locale);
in.init (input, null, null, false);
//
// doc handler sees the locator, lots of PIs, DTD info
// about external entities and notations, then the body.
// The locator must be handed over only after the InputEntity
// is initialized, because the locator uses the InputEntity's
// system ID, public ID, line number, etc.
docHandler.setDocumentLocator (locator);
docHandler.startDocument ();
// [1] document ::= prolog element Misc*
// [22] prolog ::= XMLDecl? Misc* (DoctypeDecl Misc *)?
maybeXmlDecl ();
maybeMisc (false);
if (!maybeDoctypeDecl ()) {
// a validating parser warns ("V-001") when there is no DOCTYPE
if (supportValidation && isValidating)
warning ("V-001", null);
}
maybeMisc (false);
donePrologue = true;
//
// One root element ... then basically PIs before EOF.
//
if (!in.peekc ('<') || !maybeElement (null))
fatal ("P-067");
// Hook for the subclass; used for validation of ID refs.
afterRoot ();
maybeMisc (true);
// anything left over after the trailing Misc is an error; the
// offending character is reported in hex
if (!in.isEOF ())
fatal ("P-001", new Object []
{ Integer.toHexString (((int)getc ())) } );
docHandler.endDocument ();
} catch (EndOfInputException e) {
// input ran out in the middle of something: "P-002" names the
// entity when that happened inside a nested entity, "P-003" is
// used when the document entity itself ended early
if (!in.isDocument ()) {
String name = in.getName ();
do { // force a relevant URI and line number
in = in.pop ();
} while (in.isInternal ());
fatal ("P-002", new Object []
{ name },
e);
} else
fatal ("P-003", null, e);
} catch (RuntimeException e) {
// Don't discard location that triggered the exception:
// rewrap it with the locator's current position, using the
// exception's class name when it carries no message
throw new SAXParseException (
e.getMessage () != null
? e.getMessage ()
: e.getClass ().getName (),
locator.getPublicId (), locator.getSystemId (),
locator.getLineNumber (), locator.getColumnNumber (),
e);
} finally {
// recycle temporary data used during parsing
strTmp = null;
attTmp = null;
nameTmp = null;
nameCache = null;
// ditto input sources etc
if (in != null) {
in.close ();
in = null;
}
// get rid of all DTD info ... some of it would be
// useful for editors etc, investigate later.
params.clear ();
entities.clear ();
notations.clear ();
elements.clear ();
// subclass hook, invoked last on both success and failure paths
afterDocument ();
}
}
// package private -- for subclass
// Hook called by parseInternal() right after the root element has been
// parsed; this class does nothing here.
void afterRoot ()
    throws SAXException
{
    // intentionally empty
}
// package private -- for subclass
// Hook called at the very end of parseInternal()'s cleanup, whether or
// not the parse succeeded; this class does nothing here.
void afterDocument ()
{
    // intentionally empty
}
// Requires whitespace at the current position; reports fatal error
// "P-004" when there is none.  "roleId" is a message catalog key that
// describes, for the diagnostic only, what the whitespace separates.
private void whitespace (String roleId) throws IOException, SAXException
// [3] S ::= (#x20 | #x9 | #xd | #xa)+
{
    if (maybeWhitespace ())
        return;

    String role = messages.getMessage (locale, roleId);
    fatal ("P-004", new Object [] { role });
}
// S?
// Skips optional whitespace and returns true if any was seen.
private boolean maybeWhitespace () throws IOException, SAXException
{
    // Unless we're in an external PE with lexical PE processing on,
    // the input entity can skip the whitespace by itself.
    if (!inExternalPE || !doLexicalPE)
        return in.maybeWhitespace ();

    // see getc() for the PE logic -- this lets us splice
    // expansions of PEs in "anywhere".  getc() has smarts,
    // so for external PEs we don't bypass it.

    // XXX we can marginally speed PE handling, and certainly
    // be cleaner (hence potentially more correct), by using
    // the observations that expanded PEs only start and stop
    // where whitespace is allowed.  getc wouldn't need any
    // "lexical" PE expansion logic, and no other method needs
    // to handle termination of PEs.  (parsing of literals would
    // still need to pop entities, but not parsing of references
    // in content.)

    boolean found = false;

    for (char ch = getc ();
            ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
            ch = getc ()) {
        found = true;

        // this gracefully ends things when we stop playing
        // with internal parameters.  caller should have a
        // grammar rule allowing whitespace at end of entity.
        if (in.isEOF () && !in.isInternal ())
            return true;
    }

    // the character that ended the run isn't ours; give it back
    ungetc ();
    return found;
}
// Parses a Name if one starts at the current position.  Returns the
// name string from the cache entry, or null when no name starts here.
private String maybeGetName ()
    throws IOException, SAXException
{
    NameCacheEntry found = maybeGetNameCacheEntry ();

    if (found != null)
        return found.name;
    return null;
}
// Parses a Name if one starts at the current position and returns its
// name cache entry.  When the next character cannot start a name it
// is pushed back and null is returned.
private NameCacheEntry maybeGetNameCacheEntry ()
    throws IOException, SAXException
{
    // [5] Name ::= (Letter|'_'|':') (Namechar)*
    char first = getc ();
    boolean startsName = XmlChars.isLetter (first)
        || first == ':'
        || first == '_';

    if (startsName)
        return nameCharString (first);

    ungetc ();
    return null;
}
// Used when parsing enumerations.  Unlike a Name, a name token may
// begin with any name character; anything else at the current position
// is reported as fatal error "P-006" together with the character.
private String getNmtoken ()
    throws SAXException, IOException
{
    // [7] Nmtoken ::= (Namechar)+
    char first = getc ();

    if (!XmlChars.isNameChar (first)) {
        Object args [] = { new Character (first) };
        fatal ("P-006", args);
    }
    return nameCharString (first).name;
}
// Collects the rest of a name whose first character "c" was already
// consumed, and returns the matching name cache entry.
//
// n.b. this gets used when parsing attribute values (for
// internal references) so we can't use strTmp; it's also
// a hotspot for CPU and memory in the parser (called at least
// once for each element) so this has been optimized a bit.
private NameCacheEntry nameCharString (char c)
    throws IOException, SAXException
{
    int count = 1;
    char next;

    nameTmp [0] = c;

    // getNameChar() yields 0 once the next input char isn't a name char
    while ((next = in.getNameChar ()) != 0) {
        if (count >= nameTmp.length) {
            // scratch buffer is full: grow it by ten more chars
            char grown [] = new char [nameTmp.length + 10];

            System.arraycopy (nameTmp, 0, grown, 0, nameTmp.length);
            nameTmp = grown;
        }
        nameTmp [count++] = next;
    }
    return nameCache.lookupEntry (nameTmp, count);
}
//
// much similarity between parsing entity values in DTD
// and attribute values (in DTD or content) ... both follow
// literal parsing rules, newline canonicalization, etc
//
// leaves value in 'strTmp' ... either a "replacement text" (4.5),
// or else partially normalized attribute value (the first bit
// of 3.3.3's spec, without the "if not CDATA" bits).
//
// isEntityValue selects the grammar: true for an EntityValue [9]
// (general entity references are kept as text, PE references are
// expanded), false for an AttValue [10] (general entity references are
// expanded, whitespace becomes a space, '<' is rejected).
//
private void parseLiteral (boolean isEntityValue)
throws IOException, SAXException
{
// [9] EntityValue ::=
// '"' ([^"&%] | Reference | PEReference)* '"'
// | "'" ([^'&%] | Reference | PEReference)* "'"
// [10] AttValue ::=
// '"' ([^"&] | Reference )* '"'
// | "'" ([^'&] | Reference )* "'"
char quote = getc ();
char c;
// remember the entity the literal started in: only a quote read
// from this same entity may close the literal
InputEntity source = in;
if (quote != '\'' && quote != '"')
fatal ("P-007");
// don't report entity expansions within attributes,
// they're reported "fully expanded" via SAX
isInAttribute = !isEntityValue;
// get value into strTmp
strTmp = new StringBuffer ();
// scan, allowing entity push/pop wherever ...
// expanded entities can't terminate the literal!
for (;;) {
if (in != source && in.isEOF ()) {
// we don't report end of parsed entities
// within attributes (no SAX hooks)
in = in.pop ();
continue;
}
// a matching quote ends the literal only at the original level
if ((c = getc ()) == quote && in == source)
break;
//
// Basically the "reference in attribute value"
// row of the chart in section 4.4 of the spec
//
if (c == '&') {
String entityName = maybeGetName ();
if (entityName != null) {
// a general entity reference must end with ';' ("F-020")
nextChar (';', "F-020", entityName);
// 4.4 says: bypass these here ... we'll catch
// forbidden refs to unparsed entities on use
if (isEntityValue) {
strTmp.append ('&');
strTmp.append (entityName);
strTmp.append (';');
continue;
}
expandEntityInLiteral (entityName, entities, isEntityValue);
// character references are always included immediately
} else if ((c = getc ()) == '#') {
int tmp = parseCharNumber ();
// values above 0xffff don't fit one char: charTmp gets the
// converted form and the returned count says whether a
// second char must be appended too
if (tmp > 0xffff) {
tmp = surrogatesToCharTmp (tmp);
strTmp.append (charTmp [0]);
if (tmp == 2)
strTmp.append (charTmp [1]);
} else
strTmp.append ((char) tmp);
} else
// '&' followed by neither a name nor '#'
fatal ("P-009");
continue;
}
// expand parameter entities only within entity value literals
if (c == '%' && isEntityValue) {
String entityName = maybeGetName ();
if (entityName != null) {
// a PE reference must end with ';' ("F-021")
nextChar (';', "F-021", entityName);
// PE references inside a literal are expanded only while in
// an external PE; otherwise "P-010" is reported
if (inExternalPE)
expandEntityInLiteral (entityName,
params, isEntityValue);
else
fatal ("P-010", new Object [] { entityName });
continue;
} else
// '%' not followed by a name
fatal ("P-011");
}
// For attribute values ...
if (!isEntityValue) {
// 3.3.3 says whitespace normalizes to space...
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
strTmp.append (' ');
continue;
}
// "<" not legal in parsed literals ...
if (c == '<')
fatal ("P-012");
}
strTmp.append (c);
}
// literal is complete; leave "inside attribute" mode
isInAttribute = false;
}
// does a SINGLE expansion of the entity (often reparsed later):
// looks "name" up in "table" (general entities or parameter entities)
// and pushes its replacement text as new input.  An undeclared name is
// a fatal error: "V-022" for the parameter entity table, else "P-014".
private void expandEntityInLiteral (
    String name,
    SimpleHashtable table,
    boolean isEntityValue
) throws SAXException, IOException
{
    Object found = table.get (name);

    if (found == null) {
        //
        // Note:  much confusion about whether spec requires such
        // errors to be fatal in many cases, but none about whether
        // it allows "normal" errors to be unrecoverable!
        //
        String id = (table == params) ? "V-022" : "P-014";

        fatal (id, new Object [] { name });
        return;
    }

    //
    // Note:  if entity is a PE (value.isPE) there is an XML
    // requirement that the content be "markupdecl", but that error
    // is ignored here (as permitted by the XML spec).
    //
    if (found instanceof InternalEntity) {
        InternalEntity value = (InternalEntity) found;

        // validity: a standalone document may not rely on an entity
        // declared outside the internal subset
        if (supportValidation && isValidating
                && isStandalone
                && !value.isFromInternalSubset)
            error ("V-002", new Object [] { name });
        pushReader (value.buf, name, !value.isPE);
        return;
    }

    if (found instanceof ExternalEntity) {
        if (!isEntityValue)	// must be a PE ...
            fatal ("P-013", new Object [] { name });
        // XXX if this returns false ...
        pushReader ((ExternalEntity) found);
    }
}
// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
// for PUBLIC and SYSTEM literals, also "<?xml ...type='literal'?>'
// NOTE:  XML spec should explicitly say that PE ref syntax is
// ignored in PIs, comments, SystemLiterals, and Pubid Literal
// values ... can't process the XML spec's own DTD without doing
// that for comments.
//
// Reads a quoted string verbatim and returns the text between the
// quotes; the same text is left in strTmp.  "type" (a message catalog
// key) and "extra" (its argument) are used only to build the "P-015"
// diagnostic when no opening quote is found.
private String getQuotedString (String type, String extra)
    throws IOException, SAXException
{
    // use in.getc to bypass PE processing
    char delimiter = in.getc ();

    if (!(delimiter == '\'' || delimiter == '"')) {
        Object label = messages.getMessage (locale, type,
                new Object [] { extra });

        fatal ("P-015", new Object [] { label });
    }

    strTmp = new StringBuffer ();
    for (char ch = in.getc (); ch != delimiter; ch = in.getc ())
        strTmp.append (ch);
    return strTmp.toString ();
}
// Reads a quoted public identifier, rejects (fatal "P-016") any
// character outside the PubidChar set, and returns the result of
// normalize(false) applied to the literal placed in strTmp.
private String parsePublicId ()
    throws IOException, SAXException
{
    // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
    // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
    String literal = getQuotedString ("F-033", null);
    int length = literal.length ();

    for (int pos = 0; pos < length; pos++) {
        char ch = literal.charAt (pos);
        boolean isAsciiLetter = (ch >= 'A' && ch <= 'Z')
            || (ch >= 'a' && ch <= 'z');

        // digits, space, CR, LF and the permitted punctuation
        if (!isAsciiLetter
                && " \r\n-'()+,./:=?;!*#@$_%0123456789".indexOf (ch) == -1)
            fatal ("P-016", new Object [] { new Character (ch) });
    }

    // hand the literal to normalize() through a fresh strTmp
    strTmp = new StringBuffer ();
    strTmp.append (literal);
    return normalize (false);
}
// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
// handled by: InputEntity.parsedContent()
private boolean maybeComment (boolean skipStart)
throws IOException, SAXException
{
// [15] Comment ::= ''
if (!in.peek (skipStart ? "!--" : "