All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sun.xml.parser.Parser Maven / Gradle / Ivy

The newest version!
/*
 * $Id: Parser.java,v 1.13 1999/05/14 16:50:22 mode Exp $
 * 
 * Copyright (c) 1998-1999 Sun Microsystems, Inc. All Rights Reserved.
 * 
 * This software is the confidential and proprietary information of Sun
 * Microsystems, Inc. ("Confidential Information").  You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into
 * with Sun.
 * 
 * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE
 * SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR ANY DAMAGES
 * SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
 * THIS SOFTWARE OR ITS DERIVATIVES.
 */


package com.sun.xml.parser;

import java.io.IOException;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Vector;

import org.xml.sax.*;

import com.sun.xml.util.MessageCatalog;
import com.sun.xml.util.XmlChars;


//
// NOTE:  when maintaining this code, take care to keep the message
// catalogue(s) up to date!!  It's important that the diagnostics
// be informative.
//


/**
 * This implements a fast non-validating SAX parser.  This one always 
 * processes external parsed entities, strictly adheres to the XML 1.0
 * specification, and provides useful diagnostics.  It supports an optimization
 * allowing faster processing of valid standalone XML documents.  For
 * multi-language applications (such as web servers using XML processing
 * to create dynamic content), a method supports choosing a locale for
 * parser diagnostics which is both understood by the message recipient
 * and supported by the parser.
 *
 * 

 * <P> This conforms to the XML 1.0 specification.  To configure an XML
 * processor which tests document conformance against XML Namespaces,
 * provide a DtdEventListener which examines declarations of
 * entities and notations, and have your document listener check other
 * constraints such as ensuring xmlns* attribute values properly
 * declare all namespace prefixes.  (Only element and attribute names may
 * contain colons, and even then the name prefix before the colon must be
 * properly declared.)
 *
 * <P>

SAX parsers produce a stream of parse events, which applications * process to create an object model which is specific to their tasks. * Applications which do not want to process event streams in that way * should use an API producing a standardized object model, such as the * W3C's Document Object Model (DOM). This parser supports * building fully conformant DOM Document objects, through * use of DtdEventListener extensions to SAX in conjunction with an * appropriate implementation of a SAX DocumentHandler. In * addition, it supports some features (exposing comments, CDATA sections, * and entity references) which are allowed by DOM but not required to * be reported by conformant XML processors. (As usual, the default * handler for parsing events other than fatal errors ignores them.) * * @see ValidatingParser * * @author David Brownell * @version $Revision: 1.13 $ */ public class Parser implements org.xml.sax.Parser { // stack of input entities being merged private InputEntity in; // temporaries reused during parsing private AttributeListImpl attTmp; private StringBuffer strTmp; private char nameTmp []; private NameCache nameCache; private char charTmp [] = new char [2]; // NOTE: odd heap behavior, at least with classic VM: if "strTmp" is // reused, LOTS of extra memory is consumed in some simple situations. // JVM bug filed; it's no longer a win to reuse it as much, in any case. 
// parsing modes private boolean isValidating = false; private boolean fastStandalone = false; private boolean isInAttribute = false; // temporary DTD parsing state private boolean inExternalPE; private boolean doLexicalPE; private boolean donePrologue; // info about the document private boolean isStandalone; private String rootElementName; // DTD state, used during parsing private boolean ignoreDeclarations; private SimpleHashtable elements = new SimpleHashtable (47); private SimpleHashtable params = new SimpleHashtable (7); // exposed to package-private subclass Hashtable notations = new Hashtable (7); SimpleHashtable entities = new SimpleHashtable (17); // stuff associated with SAX private DocumentHandler docHandler; private DTDHandler dtdHandler; private EntityResolver resolver; private ErrorHandler errHandler; private Locale locale; private Locator locator; // extended parser API support private DtdEventListener dtdListener; private LexicalEventListener lexicalListener; // Compile time option: disable validation support for a better // fit in memory-critical environments (P-Java etc). Doing that // and removing the validating parser support saves (at this time) // about 15% in size. private static final boolean supportValidation = true; // string constants -- use these copies so "==" works // package private static final String strANY = "ANY"; static final String strEMPTY = "EMPTY"; //////////////////////////////////////////////////////////////// // // PARSER methods // //////////////////////////////////////////////////////////////// /** Constructs a SAX parser object. */ public Parser () { locator = new DocLocator (); setHandlers (); } /** * SAX: Used by applications to request locale for diagnostics. * * @param l The locale to use, or null to use system defaults * (which may include only message IDs). * @throws SAXException If no diagnostic messages are available * in that locale. 
*/ public void setLocale (Locale l) throws SAXException { if (l != null && !messages.isLocaleSupported (l.toString ())) throw new SAXException (messages.getMessage (locale, "P-078", new Object [] { l })); locale = l; } /** Returns the diagnostic locale. */ public Locale getLocale () { return locale; } /** * Chooses a client locale to use for diagnostics, using the first * language specified in the list that is supported by this parser. * That locale is then set using * setLocale(). Such a list could be provided by a variety of user * preference mechanisms, including the HTTP Accept-Language * header field. * * @see com.sun.xml.util.MessageCatalog * * @param languages Array of language specifiers, ordered with the most * preferable one at the front. For example, "en-ca" then "fr-ca", * followed by "zh_CN". Both RFC 1766 and Java styles are supported. * @return The chosen locale, or null. */ public Locale chooseLocale (String languages []) throws SAXException { Locale l = messages.chooseLocale (languages); if (l != null) setLocale (l); return l; } /** SAX: Lets applications control entity resolution. */ public void setEntityResolver (EntityResolver r) { resolver = r; } /** Returns the object used to resolve entities */ public EntityResolver getEntityResolver () { return resolver; } /** * SAX: Used by applications to see unparsed entity information, * this assigns the handler for the basic SAX DTD events as well as * the extended "DtdEventListener" events. If the specified handler * supports the extended events, it receives those events; otherwise, * they are ignored. * * @see DtdEventListener */ public void setDTDHandler (DTDHandler handler) { if (handler == null) handler = defaultHandler; dtdHandler = handler; if (handler instanceof DtdEventListener) dtdListener = (DtdEventListener) handler; else dtdListener = defaultHandler; } /** Returns the handler used to deliver unparsed entity information. 
*/ public DTDHandler getDTDHandler () { return dtdHandler; } /** * SAX: The primary application hook into the parser, this * assigns the handler for the basic SAX document events as well as * the extended "lexical" events. If the specified handler supports * the extended events, it receives those events; otherwise, they * are ignored. * * @see LexicalEventListener */ public void setDocumentHandler (DocumentHandler handler) { if (handler == null) handler = defaultHandler; docHandler = handler; if (handler instanceof LexicalEventListener) lexicalListener = (LexicalEventListener) handler; else lexicalListener = defaultHandler; } /** Returns the application being driven by the parser. */ public DocumentHandler getDocumentHandler () { return docHandler; } /** * SAX: Used to override default error handling; for example, to * ensure that validity errors abort parsing, or to report * errors through the correct channels. */ public void setErrorHandler (ErrorHandler handler) { errHandler = handler; } /** Returns the object used for error handling */ public ErrorHandler getErrorHandler () { return errHandler; } /** SAX: Parse a document. */ public void parse (InputSource in) throws SAXException, IOException { init (); parseInternal (in); } /** SAX: Parse a document. */ public void parse (String uri) throws SAXException, IOException { InputSource in; init (); // System.out.println ("parse (\"" + uri + "\")"); in = resolver.resolveEntity (null, uri); // If custom resolver punts resolution to parser, handle it ... if (in == null) in = Resolver.createInputSource (new java.net.URL (uri), false); // ... or if custom resolver doesn't correctly construct the // input entity, patch it up enough so relative URIs work, and // issue a warning to minimize later confusion. 
else if (in.getSystemId () == null) { warning ("P-065", null); in.setSystemId (uri); } parseInternal (in); } /** * Setting this flag enables faster processing of valid standalone * documents: external DTD information is not processed, and no * attribute normalization or defaulting is done. This optimization * is only permitted in non-validating parsers; for validating * parsers, this mode is silently disabled. * *

For documents which are declared as standalone, but which are
 * not valid, a fatal error may be reported for references to externally
 * defined entities.  That could happen in any nonvalidating parser which
 * did not read externally defined entities.  Also, if any attribute
 * values need normalization or defaulting, it will not be done.
 */
public void setFastStandalone (boolean value)
{
    // never enabled while validating
    fastStandalone = value && !isValidating;
}

/**
 * Returns true if standalone documents skip processing of
 * all external DTD information.
 */
public boolean isFastStandalone ()
{
    return fastStandalone;
}

/**
 * In support of the HTML DOM model of client side
 * <em>&lt;xhtml:script&gt;</em> tag processing, this method permits
 * data to be spliced into the input stream.  This method would
 * normally be called from an <em>endElement</em> callback to put the
 * buffered result of calls such as DOM <em>HTMLDocument.write</em>
 * into the input stream.
 *
 * @param buf characters to splice in
 * @param offset index of the first character in buf to use
 * @param len how many characters to use; nothing is pushed when
 *	this is zero or negative
 */
public void pushInputBuffer (char buf [], int offset, int len)
throws SAXException
{
    if (len <= 0)
	return;

    // pushReader takes a whole array, so a sub-range must be copied.
    // arraycopy is inelegant, but that's the worst penalty for now
    if (offset != 0 || len != buf.length) {
	char tmp [] = new char [len];
	System.arraycopy (buf, offset, tmp, 0, len);
	buf = tmp;
    }
    pushReader (buf, null, false);
}

// package private
// Turning validation on also turns the "fast standalone" mode off.
void setIsValidating (boolean value)
{
    if (supportValidation)
	isValidating = value;
    else
	throw new RuntimeException (messages.getMessage (locale, "V-000"));
    if (value)
	fastStandalone = false;
}

// makes sure the parser's reset to "before a document"
private void init ()
{
    in = null;

    // alloc temporary data used in parsing
    attTmp = new AttributeListImpl ();
    strTmp = new StringBuffer ();
    nameTmp = new char [20];
    nameCache = new NameCache ();

    // reset doc info
    isStandalone = false;
    rootElementName = null;
    isInAttribute = false;

    inExternalPE = false;
    doLexicalPE = false;
    donePrologue = false;

    entities.clear ();
    notations.clear ();
    params.clear ();
    elements.clear ();
    ignoreDeclarations = false;

    // initialize predefined references ... re-interpreted later
    //
    // Entity expansions are pushed back onto the input and scanned
    // again (expandEntityInLiteral hands the replacement text to
    // pushReader), so "amp" and "lt" must expand to character
    // references as XML 1.0 section 4.6 requires.  A bare "&" or "<"
    // would be rescanned as the start of a reference or of markup.
    builtin ("amp", "&#38;");
    builtin ("lt", "&#60;");
    builtin ("gt", ">");
    builtin ("quot", "\"");
    builtin ("apos", "'");

    if (locale == null)
	locale = Locale.getDefault ();
    if (resolver == null)
	resolver = new Resolver ();

    setHandlers ();
}

// shared do-nothing target for every handler slot left unset
private static final ListenerBase defaultHandler = new ListenerBase ();

// fills each handler slot that is still null with the default handler
private void setHandlers ()
{
    if (dtdHandler == null)
	dtdHandler = defaultHandler;
    if (dtdListener == null)
	dtdListener = defaultHandler;
    if (errHandler == null)
	errHandler = defaultHandler;
    if (docHandler == null)
	docHandler = defaultHandler;
    if (lexicalListener == null)
	lexicalListener = defaultHandler;
}

// registers one predefined general entity under the given name
private void builtin (String entityName, String entityValue)
{
    InternalEntity entity;

    entity = new InternalEntity (entityName, entityValue.toCharArray ());
    entities.put (entityName, entity);
}


////////////////////////////////////////////////////////////////
//
// parsing is by recursive descent, code roughly
// following the BNF rules except tweaked for simple
// lookahead.  rules are more or less in numeric order,
// except where code sharing suggests other structures.
//
// a classic benefit of recursive descent parsers:  it's
// relatively easy to get diagnostics that make sense.
//
////////////////////////////////////////////////////////////////


//
// CHAPTER 2:  Documents
//

// Drives one complete parse:  prolog, root element, trailing Misc,
// then end-of-document.  Whatever happens, the finally clause drops
// the per-document temporaries, closes the input and clears the DTD
// tables before calling afterDocument ().
private void parseInternal (InputSource input)
throws SAXException, IOException
{
    if (input == null)
	fatal ("P-000");

    try {
	in = InputEntity.getInputEntity (errHandler, locale);
	in.init (input, null, null, false);

	//
	// doc handler sees the locator, lots of PIs, DTD info
	// about external entities and notations, then the body.
	// Need to initialize this after InputEntity cos locator uses
	// InputEntity's systemid, publicid, line no. etc
	//
	docHandler.setDocumentLocator (locator);
	docHandler.startDocument ();

	// [1] document ::= prolog element Misc*
	// [22] prolog ::= XMLDecl? Misc* (DoctypeDecl Misc *)?

	maybeXmlDecl ();
	maybeMisc (false);

	// a missing doctype only matters (as a warning) when validating
	if (!maybeDoctypeDecl () && supportValidation && isValidating)
	    warning ("V-001", null);

	maybeMisc (false);
	donePrologue = true;

	//
	// One root element ... then basically PIs before EOF.
	//
	if (!in.peekc ('<') || !maybeElement (null))
	    fatal ("P-067");

	// Check subclass.  Used for validation of id refs.
	afterRoot ();

	maybeMisc (true);
	if (!in.isEOF ())
	    fatal ("P-001", new Object []
		    { Integer.toHexString (((int)getc ())) } );
	docHandler.endDocument ();

    } catch (EndOfInputException e) {
	if (!in.isDocument ()) {
	    String name = in.getName ();
	    do {	// force a relevant URI and line number
		in = in.pop ();
	    } while (in.isInternal ());
	    fatal ("P-002", new Object [] { name }, e);
	} else
	    fatal ("P-003", null, e);

    } catch (RuntimeException e) {
	// Don't discard location that triggered the exception
	throw new SAXParseException (
	    e.getMessage () != null
		? e.getMessage ()
		: e.getClass ().getName (),
	    locator.getPublicId (), locator.getSystemId (),
	    locator.getLineNumber (), locator.getColumnNumber (),
	    e);

    } finally {
	// recycle temporary data used during parsing
	strTmp = null;
	attTmp = null;
	nameTmp = null;
	nameCache = null;

	// ditto input sources etc
	if (in != null) {
	    in.close ();
	    in = null;
	}

	// get rid of all DTD info ... some of it would be
	// useful for editors etc, investigate later.
	params.clear ();
	entities.clear ();
	notations.clear ();
	elements.clear ();

	afterDocument ();
    }
}

// package private -- for subclass
// called once the root element has been parsed; does nothing here
void afterRoot () throws SAXException { }

// package private -- for subclass
// called at the very end of parseInternal's cleanup; does nothing here
void afterDocument () { }

// role is for diagnostics
private void whitespace (String roleId)
throws IOException, SAXException
// [3] S ::= (#x20 | #x9 | #xd | #xa)+
{
    if (!maybeWhitespace ())
	fatal ("P-004", new Object []
		{ messages.getMessage (locale, roleId) });
}

// S?
private boolean maybeWhitespace () throws IOException, SAXException { if (!(inExternalPE && doLexicalPE)) return in.maybeWhitespace (); // see getc() for the PE logic -- this lets us splice // expansions of PEs in "anywhere". getc() has smarts, // so for external PEs we don't bypass it. // XXX we can marginally speed PE handling, and certainly // be cleaner (hence potentially more correct), by using // the observations that expanded PEs only start and stop // where whitespace is allowed. getc wouldn't need any // "lexical" PE expansion logic, and no other method needs // to handle termination of PEs. (parsing of literals would // still need to pop entities, but not parsing of references // in content.) char c = getc(); boolean saw = false; while (c == ' ' || c == '\t' || c == '\n' || c == '\r') { saw = true; // this gracefully ends things when we stop playing // with internal parameters. caller should have a // grammar rule allowing whitespace at end of entity. if (in.isEOF () && !in.isInternal ()) return saw; c = getc (); } ungetc (); return saw; } private String maybeGetName () throws IOException, SAXException { NameCacheEntry entry = maybeGetNameCacheEntry (); return (entry == null) ? null : entry.name; } private NameCacheEntry maybeGetNameCacheEntry () throws IOException, SAXException { // [5] Name ::= (Letter|'_'|':') (Namechar)* char c = getc (); if (!XmlChars.isLetter (c) && c != ':' && c != '_') { ungetc (); return null; } return nameCharString (c); } // Used when parsing enumerations private String getNmtoken () throws SAXException, IOException { // [7] Nmtoken ::= (Namechar)+ char c = getc (); if (!XmlChars.isNameChar (c)) fatal ("P-006", new Object [] { new Character (c) }); return nameCharString (c).name; } // n.b. this gets used when parsing attribute values (for // internal references) so we can't use strTmp; it's also // a hotspot for CPU and memory in the parser (called at least // once for each element) so this has been optimized a bit. 
private NameCacheEntry nameCharString (char c) throws IOException, SAXException { int i = 1; nameTmp [0] = c; for (;;) { if ((c = in.getNameChar ()) == 0) break; if (i >= nameTmp.length) { char tmp [] = new char [nameTmp.length + 10]; System.arraycopy (nameTmp, 0, tmp, 0, nameTmp.length); nameTmp = tmp; } nameTmp [i++] = c; } return nameCache.lookupEntry (nameTmp, i); } // // much similarity between parsing entity values in DTD // and attribute values (in DTD or content) ... both follow // literal parsing rules, newline canonicalization, etc // // leaves value in 'strTmp' ... either a "replacement text" (4.5), // or else partially normalized attribute value (the first bit // of 3.3.3's spec, without the "if not CDATA" bits). // private void parseLiteral (boolean isEntityValue) throws IOException, SAXException { // [9] EntityValue ::= // '"' ([^"&%] | Reference | PEReference)* '"' // | "'" ([^'&%] | Reference | PEReference)* "'" // [10] AttValue ::= // '"' ([^"&] | Reference )* '"' // | "'" ([^'&] | Reference )* "'" char quote = getc (); char c; InputEntity source = in; if (quote != '\'' && quote != '"') fatal ("P-007"); // don't report entity expansions within attributes, // they're reported "fully expanded" via SAX isInAttribute = !isEntityValue; // get value into strTmp strTmp = new StringBuffer (); // scan, allowing entity push/pop wherever ... // expanded entities can't terminate the literal! for (;;) { if (in != source && in.isEOF ()) { // we don't report end of parsed entities // within attributes (no SAX hooks) in = in.pop (); continue; } if ((c = getc ()) == quote && in == source) break; // // Basically the "reference in attribute value" // row of the chart in section 4.4 of the spec // if (c == '&') { String entityName = maybeGetName (); if (entityName != null) { nextChar (';', "F-020", entityName); // 4.4 says: bypass these here ... 
we'll catch // forbidden refs to unparsed entities on use if (isEntityValue) { strTmp.append ('&'); strTmp.append (entityName); strTmp.append (';'); continue; } expandEntityInLiteral (entityName, entities, isEntityValue); // character references are always included immediately } else if ((c = getc ()) == '#') { int tmp = parseCharNumber (); if (tmp > 0xffff) { tmp = surrogatesToCharTmp (tmp); strTmp.append (charTmp [0]); if (tmp == 2) strTmp.append (charTmp [1]); } else strTmp.append ((char) tmp); } else fatal ("P-009"); continue; } // expand parameter entities only within entity value literals if (c == '%' && isEntityValue) { String entityName = maybeGetName (); if (entityName != null) { nextChar (';', "F-021", entityName); if (inExternalPE) expandEntityInLiteral (entityName, params, isEntityValue); else fatal ("P-010", new Object [] { entityName }); continue; } else fatal ("P-011"); } // For attribute values ... if (!isEntityValue) { // 3.3.3 says whitespace normalizes to space... if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { strTmp.append (' '); continue; } // "<" not legal in parsed literals ... if (c == '<') fatal ("P-012"); } strTmp.append (c); } isInAttribute = false; } // does a SINGLE expansion of the entity (often reparsed later) private void expandEntityInLiteral ( String name, SimpleHashtable table, boolean isEntityValue ) throws SAXException, IOException { Object entity = table.get (name); // // Note: if entity is a PE (value.isPE) there is an XML // requirement that the content be "markkupdecl", but that error // is ignored here (as permitted by the XML spec). // if (entity instanceof InternalEntity) { InternalEntity value = (InternalEntity) entity; if (supportValidation && isValidating && isStandalone && !value.isFromInternalSubset) error ("V-002", new Object [] { name }); pushReader (value.buf, name, !value.isPE); } else if (entity instanceof ExternalEntity) { if (!isEntityValue) // must be a PE ... 
fatal ("P-013", new Object [] { name }); // XXX if this returns false ... pushReader ((ExternalEntity) entity); } else if (entity == null) { // // Note: much confusion about whether spec requires such // errors to be fatal in many cases, but none about whether // it allows "normal" errors to be unrecoverable! // fatal ( (table == params) ? "V-022" : "P-014", new Object [] { name }); } } // [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") // for PUBLIC and SYSTEM literals, also "' // NOTE: XML spec should explicitly say that PE ref syntax is // ignored in PIs, comments, SystemLiterals, and Pubid Literal // values ... can't process the XML spec's own DTD without doing // that for comments. private String getQuotedString (String type, String extra) throws IOException, SAXException { // use in.getc to bypass PE processing char quote = in.getc (); if (quote != '\'' && quote != '"') fatal ("P-015", new Object [] { messages.getMessage (locale, type, new Object [] { extra }) }); char c; strTmp = new StringBuffer (); while ((c = in.getc ()) != quote) strTmp.append ((char)c); return strTmp.toString (); } private String parsePublicId () throws IOException, SAXException { // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'") // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%] String retval = getQuotedString ("F-033", null); for (int i = 0; i < retval.length (); i++) { char c = retval.charAt (i); if (" \r\n-'()+,./:=?;!*#@$_%0123456789".indexOf(c) == -1 && !(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z')) fatal ("P-016", new Object [] { new Character (c) }); } strTmp = new StringBuffer (); strTmp.append (retval); return normalize (false); } // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) // handled by: InputEntity.parsedContent() private boolean maybeComment (boolean skipStart) throws IOException, SAXException { // [15] Comment ::= '' if (!in.peek (skipStart ? "!--" : "