// Source listing: com.sun.msv.scanner.dtd.DTDParser (Maven / Gradle / Ivy) -- the newest version.
/*
* Copyright (c) 1998-2013 Oracle and/or its affiliates. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of Oracle nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.sun.msv.scanner.dtd;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
/**
* This implements parsing of XML 1.0 DTDs.
*
* This conforms to the portion of the XML 1.0 specification related
* to the external DTD subset.
*
* For multi-language applications (such as web servers using XML
* processing to create dynamic content), a method supports choosing
* a locale for parser diagnostics which is both understood by the
* message recipient and supported by the parser.
*
* This parser produces a stream of parse events. It supports some
* features (exposing comments, CDATA sections, and entity references)
* which are not required to be reported by conformant XML processors.
*
* @author David Brownell
* @author Janet Koenig
* @author Kohsuke KAWAGUCHI
* @version $Id: DTDParser.java 1793 2013-02-18 12:52:53Z snajper $
*/
public class DTDParser {
// Attribute type names exposed as shared constants.
// NOTE(review): presumably these are the type strings handed to the
// DTDEventListener for attribute declarations -- the code that uses them
// is outside this section; confirm there.
public final static String TYPE_CDATA = "CDATA";
public final static String TYPE_ID = "ID";
public final static String TYPE_IDREF = "IDREF";
public final static String TYPE_IDREFS = "IDREFS";
public final static String TYPE_ENTITY = "ENTITY";
public final static String TYPE_ENTITIES = "ENTITIES";
public final static String TYPE_NMTOKEN = "NMTOKEN";
public final static String TYPE_NMTOKENS = "NMTOKENS";
public final static String TYPE_NOTATION = "NOTATION";
public final static String TYPE_ENUMERATION = "ENUMERATION";
// stack of input entities being merged
// (the current top of the stack; set to null by init() and again once
// parseInternal() has closed it)
private InputEntity in;
// temporaries reused during parsing
// (allocated in init() and set back to null when parseInternal() ends)
private StringBuffer strTmp;
private char nameTmp [];
private NameCache nameCache;
// two-slot scratch array; parseLiteral() reads the chars of a character
// reference above 0xffff from it after calling surrogatesToCharTmp()
private char charTmp [] = new char [2];
// temporary DTD parsing state
// (while false, maybeWhitespace() delegates straight to the current
// input entity instead of going through getc())
private boolean doLexicalPE;
// DTD state, used during parsing
// private SimpleHashtable elements = new SimpleHashtable (47);
protected final Set declaredElements = new java.util.HashSet();
// looked up for "%name;" references inside entity value literals
private SimpleHashtable params = new SimpleHashtable (7);
// exposed to package-private subclass
Hashtable notations = new Hashtable (7);
// looked up for "&name;" references; init() seeds it with the five
// predefined entities (amp, lt, gt, quot, apos)
SimpleHashtable entities = new SimpleHashtable (17);
// values are read back as Boolean by afterRoot(), which reports every
// key whose value is FALSE
private SimpleHashtable ids = new SimpleHashtable ();
// listeners for DTD parsing events
// (init() installs defaults for any of these three that are still null)
private DTDEventListener dtdHandler;
private EntityResolver resolver;
private Locale locale;
// string constants -- use these copies so "==" works
// package private
static final String strANY = "ANY";
static final String strEMPTY = "EMPTY";
/**
* Used by applications to request locale for diagnostics.
*
* @param l The locale to use, or null to use system defaults
* (which may include only message IDs).
*/
public void setLocale (Locale l) throws SAXException {
    // null is always accepted; any other locale has to be one the
    // message catalog says it supports.
    final boolean rejected =
        (l != null) && !messages.isLocaleSupported (l.toString ());

    if (rejected) {
        // The complaint is formatted with the locale that is still in
        // effect, since the requested one is being refused.
        final Object [] details = new Object [] { l };
        final String complaint = messages.getMessage (locale, "P-078", details);
        throw new SAXException (complaint);
    }

    locale = l;
}
/**
* Returns the diagnostic locale.
*/
public Locale getLocale () {
    // Stays null until setLocale() stores a value or init() substitutes
    // the platform default at the start of a parse.
    final Locale current = this.locale;
    return current;
}
/**
* Chooses a client locale to use for diagnostics, using the first
* language specified in the list that is supported by this parser.
* That locale is then set using
* setLocale(). Such a list could be provided by a variety of user
* preference mechanisms, including the HTTP Accept-Language
* header field.
*
* @see MessageCatalog
*
* @param languages Array of language specifiers, ordered with the most
* preferable one at the front. For example, "en-ca" then "fr-ca",
* followed by "zh_CN". Both RFC 1766 and Java styles are supported.
* @return The chosen locale, or null.
*/
public Locale chooseLocale (String languages [])
    throws SAXException {
    final Locale picked = messages.chooseLocale (languages);

    // No match: report that to the caller and leave the current
    // diagnostic locale untouched.
    if (picked == null) {
        return null;
    }

    // Route through setLocale() so the usual support check applies.
    setLocale (picked);
    return picked;
}
/**
* Lets applications control entity resolution.
*/
public void setEntityResolver (EntityResolver r) {
    // Stored as-is; when this is still null at parse time, init()
    // installs a default Resolver instead.
    this.resolver = r;
}
/**
* Returns the object used to resolve entities
*/
public EntityResolver getEntityResolver () {
    // Either the application's resolver or the default that init() put
    // in place; null if neither has happened yet.
    final EntityResolver current = this.resolver;
    return current;
}
/**
* Used by applications to set handling of DTD parsing events.
*/
public void setDtdHandler (DTDEventListener handler) {
    dtdHandler = handler;

    // A null handler is simply recorded; there is nobody to give a
    // locator to.
    if (handler == null) {
        return;
    }

    // Live view of the parser's position: every query is forwarded to
    // the enclosing parser at the time it is asked.
    final Locator position = new Locator () {
        public String getPublicId () {
            return DTDParser.this.getPublicId ();
        }

        public String getSystemId () {
            return DTDParser.this.getSystemId ();
        }

        public int getLineNumber () {
            return DTDParser.this.getLineNumber ();
        }

        public int getColumnNumber () {
            return DTDParser.this.getColumnNumber ();
        }
    };
    handler.setDocumentLocator (position);
}
/**
* Returns the handler used for DTD parsing events.
*/
public DTDEventListener getDtdHandler () {
    // Either the application's handler or the DTDHandlerBase that
    // init() installs when none was set.
    final DTDEventListener current = this.dtdHandler;
    return current;
}
/**
* Parse a DTD.
*/
public void parse (InputSource in)
    throws IOException, SAXException {
    // The parameter shadows the parser's own 'in' field, so within this
    // method the bare name means the caller's source.
    // Reset the parser state first, then run the actual parse.
    this.init ();
    this.parseInternal (in);
}
/**
* Parse a DTD.
*/
public void parse (String uri)
    throws IOException, SAXException
{
    // init() comes first: it guarantees 'resolver' is non-null.
    init ();

    InputSource source = resolver.resolveEntity (null, uri);

    if (source != null) {
        // The resolver answered but may have left out the system ID.
        // Patch it so relative URIs still work, and warn so the
        // omission does not cause confusion later.
        if (source.getSystemId () == null) {
            warning ("P-065", null);
            source.setSystemId (uri);
        }
    } else {
        // The resolver declined; open the URI ourselves.
        source = Resolver.createInputSource (new java.net.URL (uri), false);
    }

    parseInternal (source);
}
// makes sure the parser is reset to "before a document"
private void init ()
{
    // Puts the parser back into its "before a document" state.

    // no input entity is active yet
    in = null;

    // scratch storage used while parsing
    strTmp = new StringBuffer ();
    nameTmp = new char [20];
    nameCache = new NameCache ();

    // per-document state
    doLexicalPE = false;
    entities.clear ();
    notations.clear ();
    params.clear ();
    declaredElements.clear ();

    // predefined references ... re-interpreted later; registered in the
    // same order as always: amp, lt, gt, quot, apos
    final String [][] predefined = {
        { "amp", "&" },
        { "lt", "<" },
        { "gt", ">" },
        { "quot", "\"" },
        { "apos", "'" },
    };
    for (int i = 0; i < predefined.length; i++) {
        builtin (predefined [i][0], predefined [i][1]);
    }

    // defaults for whatever the application did not configure
    if (locale == null) {
        locale = Locale.getDefault ();
    }
    if (resolver == null) {
        resolver = new Resolver ();
    }
    if (dtdHandler == null) {
        dtdHandler = new DTDHandlerBase ();
    }
}
private void builtin (String entityName, String entityValue) {
    // Registers an internal entity under its own name in the general
    // entity table, with the given text as its value.
    final char [] replacement = entityValue.toCharArray ();
    entities.put (entityName, new InternalEntity (entityName, replacement));
}
////////////////////////////////////////////////////////////////
//
// parsing is by recursive descent, code roughly
// following the BNF rules except tweaked for simple
// lookahead. rules are more or less in numeric order,
// except where code sharing suggests other structures.
//
// a classic benefit of recursive descent parsers: it's
// relatively easy to get diagnostics that make sense.
//
////////////////////////////////////////////////////////////////
private void parseInternal (InputSource input)
    throws IOException, SAXException {
    // Runs one complete parse of an external DTD subset read from
    // 'input', bracketing it with startDTD()/endDTD() on the handler.
    // Whatever the outcome, the finally block releases the scratch
    // buffers, closes the input and clears all collected DTD tables.
    //
    // A null 'input' is reported through fatal("P-000").
    // NOTE(review): this relies on fatal() not returning -- confirm.
    if (input == null)
        fatal ("P-000");

    try {
        in = InputEntity.getInputEntity (dtdHandler, locale);
        in.init (input, null, null, false);

        dtdHandler.startDTD (in);

        // [30] extSubset ::= TextDecl? extSubsetDecl
        // [31] extSubsetDecl ::= ( markupdecl | conditionalSect
        //                        | PEReference | S )*
        //    ... same as [79] extPE, which is where the code is

        ExternalEntity externalSubset = new ExternalEntity (in);
        externalParameterEntity (externalSubset);

        // anything left over is reported with the offending character
        // rendered in hexadecimal
        if (!in.isEOF ()) {
            fatal ("P-001", new Object []
                { Integer.toHexString (((int) getc ())) });
        }
        afterRoot ();
        dtdHandler.endDTD ();

    } catch (EndOfInputException e) {
        if (!in.isDocument ()) {
            String name = in.getName ();
            do {    // force a relevant URI and line number
                in = in.pop ();
            } while (in.isInternal ());
            fatal ("P-002", new Object [] { name });
        } else {
            fatal ("P-003", null);
        }
    } catch (RuntimeException e) {
        // Don't discard the location that triggered the exception, and
        // don't discard the exception either: it travels as the cause of
        // the SAXParseException, so callers get the full stack trace
        // without this library writing to System.err.
        String message = (e.getMessage () != null)
            ? e.getMessage ()
            : e.getClass ().getName ();
        throw new SAXParseException (message,
            getPublicId (), getSystemId (),
            getLineNumber (), getColumnNumber (),
            e);

    } finally {
        // recycle temporary data used during parsing
        strTmp = null;
        nameTmp = null;
        nameCache = null;

        // ditto input sources etc
        if (in != null) {
            in.close ();
            in = null;
        }

        // get rid of all DTD info ... some of it would be
        // useful for editors etc, investigate later.
        params.clear ();
        entities.clear ();
        notations.clear ();
        declaredElements.clear ();
        ids.clear ();
    }
}
void afterRoot () throws SAXException
{
    // Make sure all IDREFs match declared ID attributes.  We scan
    // after the document element is parsed, since XML allows forward
    // references, and only now can we know if they're all resolved.
    //
    // Every key of 'ids' whose value is a false Boolean is reported as
    // error V-024.
    for (Enumeration e = ids.keys (); e.hasMoreElements (); ) {
        String id = (String) e.nextElement ();
        Boolean value = (Boolean) ids.get (id);

        // Compare by value, not by reference: a false Boolean that is
        // not the canonical Boolean.FALSE instance must still count.
        // equals() also copes with a null value.
        if (Boolean.FALSE.equals (value))
            error ("V-024", new Object [] { id });
    }
}
// role is for diagnostics
private void whitespace (String roleId)
    throws IOException, SAXException {
    // [3] S ::= (#x20 | #x9 | #xd | #xa)+
    // Mandatory whitespace: satisfied as soon as any is consumed.
    if (maybeWhitespace ()) {
        return;
    }

    // None found.  roleId is a message key naming what was being
    // parsed; its text becomes the argument of the P-004 diagnostic.
    final String role = messages.getMessage (locale, roleId);
    fatal ("P-004", new Object [] { role });
}
// S?
private boolean maybeWhitespace ()
    throws IOException, SAXException {
    // S? -- skips optional whitespace and reports whether any was seen.

    // Without lexical PE handling the input entity does it all.
    if (!doLexicalPE) {
        return in.maybeWhitespace ();
    }

    // Otherwise read through getc() (see it for the PE logic) so that
    // parameter entity expansions can be spliced in "anywhere"; getc()
    // has smarts, so external PEs are not bypassed either.
    //
    // XXX PE handling could be marginally faster, and certainly
    // cleaner (hence potentially more correct), by relying on the
    // observation that expanded PEs only start and stop where
    // whitespace is allowed.  getc() would then need no "lexical" PE
    // expansion logic and no other method would need to handle
    // termination of PEs.  (Parsing of literals would still need to
    // pop entities, but not parsing of references in content.)
    boolean sawSpace = false;

    for (char ch = getc ();
            ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
            ch = getc ()) {
        sawSpace = true;

        // Stop gracefully once a non-internal entity is exhausted; the
        // caller should have a grammar rule allowing whitespace at the
        // end of an entity.
        if (in.isEOF () && !in.isInternal ()) {
            return sawSpace;
        }
    }

    // the character that ended the run is not ours to consume
    ungetc ();
    return sawSpace;
}
private String maybeGetName ()
    throws IOException, SAXException {
    // String-only view of maybeGetNameCacheEntry(): the name that was
    // read, or null when the input does not start with a name.
    final NameCacheEntry hit = maybeGetNameCacheEntry ();
    if (hit == null) {
        return null;
    }
    return hit.name;
}
private NameCacheEntry maybeGetNameCacheEntry ()
    throws IOException, SAXException {
    // [5] Name ::= (Letter|'_'|':') (Namechar)*
    final char first = getc ();
    final boolean startsName =
        XmlChars.isLetter (first) || first == ':' || first == '_';

    if (startsName) {
        return nameCharString (first);
    }

    // not a name: give the character back and say so
    ungetc ();
    return null;
}
// Used when parsing enumerations
private String getNmtoken ()
    throws IOException, SAXException {
    // [7] Nmtoken ::= (Namechar)+
    // Unlike a Name, any name character may come first.
    final char first = getc ();

    if (!XmlChars.isNameChar (first)) {
        // the offending character is the diagnostic's only argument
        final Object [] details = { new Character (first) };
        fatal ("P-006", details);
    }

    return nameCharString (first).name;
}
// n.b. this gets used when parsing attribute values (for
// internal references) so we can't use strTmp; it's also
// a hotspot for CPU and memory in the parser (called at least
// once for each element) so this has been optimized a bit.
private NameCacheEntry nameCharString (char c)
    throws IOException, SAXException {
    // Collects 'c' plus every following name character into nameTmp and
    // looks the result up in the name cache.
    nameTmp [0] = c;
    int length = 1;

    char next;
    // getNameChar() yields 0 once the next character is not part of a name
    while ((next = in.getNameChar ()) != 0) {
        if (length >= nameTmp.length) {
            // buffer is full: grow it by ten slots, keeping the contents
            final char [] grown = new char [nameTmp.length + 10];
            System.arraycopy (nameTmp, 0, grown, 0, nameTmp.length);
            nameTmp = grown;
        }
        nameTmp [length++] = next;
    }

    return nameCache.lookupEntry (nameTmp, length);
}
//
// much similarity between parsing entity values in DTD
// and attribute values (in DTD or content) ... both follow
// literal parsing rules, newline canonicalization, etc
//
// leaves value in 'strTmp' ... either a "replacement text" (4.5),
// or else partially normalized attribute value (the first bit
// of 3.3.3's spec, without the "if not CDATA" bits).
//
private void parseLiteral (boolean isEntityValue)
throws IOException, SAXException {
// Reads one quoted literal and leaves its text in the 'strTmp' field;
// nothing is returned.  isEntityValue selects the grammar: true parses
// an EntityValue [9], false an AttValue [10].
// NOTE(review): several branches call fatal() and then carry on as if it
// never returns -- presumably it always throws; confirm.
// [9] EntityValue ::=
// '"' ([^"&%] | Reference | PEReference)* '"'
// | "'" ([^'&%] | Reference | PEReference)* "'"
// [10] AttValue ::=
// '"' ([^"&] | Reference )* '"'
// | "'" ([^'&] | Reference )* "'"
char quote = getc ();
char c;
// remember which entity the opening quote came from; only a matching
// quote read from this same entity can close the literal
InputEntity source = in;
if (quote != '\'' && quote != '"') {
fatal ("P-007");
}
// don't report entity expansions within attributes,
// they're reported "fully expanded" via SAX
// isInAttribute = !isEntityValue;
// get value into strTmp
strTmp = new StringBuffer ();
// scan, allowing entity push/pop wherever ...
// expanded entities can't terminate the literal!
for (;;) {
// an entity pushed during this literal has run dry: drop back to the
// one underneath and keep scanning
if (in != source && in.isEOF ()) {
// we don't report end of parsed entities
// within attributes (no SAX hooks)
in = in.pop ();
continue;
}
// the closing quote only counts when read from the original entity
if ((c = getc ()) == quote && in == source) {
break;
}
//
// Basically the "reference in attribute value"
// row of the chart in section 4.4 of the spec
//
if (c == '&') {
String entityName = maybeGetName ();
if (entityName != null) {
// a general entity reference: "&name;"
nextChar (';', "F-020", entityName);
// 4.4 says: bypass these here ... we'll catch
// forbidden refs to unparsed entities on use
if (isEntityValue) {
// inside an entity value the reference is copied through unexpanded
strTmp.append ('&');
strTmp.append (entityName);
strTmp.append (';');
continue;
}
expandEntityInLiteral (entityName, entities, isEntityValue);
// character references are always included immediately
} else if ((c = getc ()) == '#') {
int tmp = parseCharNumber ();
if (tmp > 0xffff) {
// too large for a single char: surrogatesToCharTmp() fills charTmp
// and its result is compared with 2 below -- presumably the number
// of chars it produced; confirm
tmp = surrogatesToCharTmp (tmp);
strTmp.append (charTmp [0]);
if (tmp == 2)
strTmp.append (charTmp [1]);
} else
strTmp.append ((char) tmp);
} else
fatal ("P-009");
continue;
}
// expand parameter entities only within entity value literals
if (c == '%' && isEntityValue) {
String entityName = maybeGetName ();
if (entityName != null) {
nextChar (';', "F-021", entityName);
// the expansion is pushed as input and read by later iterations
expandEntityInLiteral (entityName, params, isEntityValue);
continue;
} else
fatal ("P-011");
}
// For attribute values ...
if (!isEntityValue) {
// 3.3.3 says whitespace normalizes to space...
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
strTmp.append (' ');
continue;
}
// "<" not legal in parsed literals ...
if (c == '<')
fatal ("P-012");
}
// any other character is taken literally
strTmp.append (c);
}
// isInAttribute = false;
}
// does a SINGLE expansion of the entity (often reparsed later)
private void expandEntityInLiteral( String name, SimpleHashtable table,
        boolean isEntityValue)
    throws IOException, SAXException {
    // Looks 'name' up in 'table' and, when found, pushes the entity's
    // content as new input so the caller's scan continues inside it.
    // Only this one level is expanded here.
    final Object found = table.get (name);

    if (found == null) {
        //
        // Note:  much confusion about whether spec requires such
        // errors to be fatal in many cases, but none about whether
        // it allows "normal" errors to be unrecoverable!
        //
        // The diagnostic depends on which table was searched.
        final String code = (table == params) ? "V-022" : "P-014";
        fatal (code, new Object [] { name });
        return;
    }

    if (found instanceof InternalEntity) {
        final InternalEntity internal = (InternalEntity) found;
        pushReader (internal.buf, name, !internal.isPE);
        return;
    }

    if (found instanceof ExternalEntity) {
        // outside an entity value this must be a PE ...
        if (!isEntityValue)
            fatal ("P-013", new Object [] { name });
        // XXX if this returns false ...
        pushReader ((ExternalEntity) found);
    }
}
// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
// for PUBLIC and SYSTEM literals, also "'
// NOTE: XML spec should explicitly say that PE ref syntax is
// ignored in PIs, comments, SystemLiterals, and Pubid Literal
// values ... can't process the XML spec's own DTD without doing
// that for comments.
private String getQuotedString (String type, String extra)
    throws IOException, SAXException {
    // Reads a quoted literal verbatim.  Characters come from in.getc()
    // directly, which bypasses PE processing.
    final char delimiter = in.getc ();
    final boolean quoted = (delimiter == '\'') || (delimiter == '"');

    if (!quoted) {
        // 'type' is a message key and 'extra' its single argument; the
        // formatted text names what was expected in the P-015 diagnostic
        final String expected =
            messages.getMessage (locale, type, new Object [] { extra });
        fatal ("P-015", new Object [] { expected });
    }

    // The shared buffer is replaced, so it still holds the literal's
    // text after this method returns.
    strTmp = new StringBuffer ();
    for (char ch = in.getc (); ch != delimiter; ch = in.getc ()) {
        strTmp.append (ch);
    }
    return strTmp.toString ();
}
private String parsePublicId () throws IOException, SAXException {
    // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
    // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
    final String literal = getQuotedString ("F-033", null);

    // Every character must be an ASCII letter or appear in the list of
    // digits, punctuation and whitespace that PubidChar allows.
    final int length = literal.length ();
    for (int pos = 0; pos < length; pos++) {
        final char ch = literal.charAt (pos);
        final boolean listed =
            " \r\n-'()+,./:=?;!*#@$_%0123456789".indexOf (ch) != -1;
        final boolean upper = (ch >= 'A') && (ch <= 'Z');
        final boolean lower = (ch >= 'a') && (ch <= 'z');

        if (!(listed || upper || lower))
            fatal ("P-016", new Object [] { new Character (ch) });
    }

    // normalize() works on strTmp, so load the literal into it first
    strTmp = new StringBuffer (literal);
    return normalize (false);
}
// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
// handled by: InputEntity.parsedContent()
private boolean maybeComment (boolean skipStart)
throws IOException, SAXException {
// [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
if (!in.peek (skipStart ? "!--" : "