All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sun.xml.dtdparser.DTDParser Maven / Gradle / Ivy

There is a newer version: 4.0.5
Show newest version
/*
 * Copyright (c) 1998, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0, which is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package com.sun.xml.dtdparser;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This implements parsing of XML 1.0 DTDs.
 * <p>
 * This conforms to the portion of the XML 1.0 specification related to the
 * external DTD subset.
 * <p>
 * For multi-language applications (such as web servers using XML processing to
 * create dynamic content), a method supports choosing a locale for parser
 * diagnostics which is both understood by the message recipient and supported
 * by the parser.
 * <p>
 * This parser produces a stream of parse events. It supports some features
 * (exposing comments, CDATA sections, and entity references) which are not
 * required to be reported by conformant XML processors.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @author Kohsuke KAWAGUCHI
 * @version $Id: DTDParser.java,v 1.2 2009-04-16 15:25:49 snajper Exp $
 */
public class DTDParser {

    // Names of the attribute types.
    public final static String TYPE_CDATA = "CDATA";
    public final static String TYPE_ID = "ID";
    public final static String TYPE_IDREF = "IDREF";
    public final static String TYPE_IDREFS = "IDREFS";
    public final static String TYPE_ENTITY = "ENTITY";
    public final static String TYPE_ENTITIES = "ENTITIES";
    public final static String TYPE_NMTOKEN = "NMTOKEN";
    public final static String TYPE_NMTOKENS = "NMTOKENS";
    public final static String TYPE_NOTATION = "NOTATION";
    public final static String TYPE_ENUMERATION = "ENUMERATION";

    // stack of input entities being merged
    private InputEntity in;

    // temporaries reused during parsing
    // strTmp: accumulator for literal values (see parseLiteral / getQuotedString)
    private StringBuffer strTmp;
    // nameTmp: scratch buffer for name characters, grown on demand in nameCharString
    private char nameTmp[];
    // nameCache: resolves the characters in nameTmp to a NameCacheEntry
    private NameCache nameCache;
    // charTmp: receives the one or two chars produced by surrogatesToCharTmp
    private char charTmp[] = new char[2];

    // temporary DTD parsing state
    // when true, maybeWhitespace() reads through getc() so parameter entity
    // expansions can be spliced in; when false it defers to the input entity
    private boolean doLexicalPE;

    // DTD state, used during parsing
    // private SimpleHashtable elements = new SimpleHashtable (47);
    protected final Set declaredElements = new java.util.HashSet();
    // parameter entities, looked up by name when expanding "%name;"
    private SimpleHashtable params = new SimpleHashtable(7);

    // exposed to package-private subclass
    Hashtable notations = new Hashtable(7);
    // general entities, looked up by name when expanding "&name;"
    SimpleHashtable entities = new SimpleHashtable(17);

    // ID bookkeeping: afterRoot() reports every key whose value is
    // Boolean.FALSE as an error (V-024)
    private SimpleHashtable ids = new SimpleHashtable();

    // listeners for DTD parsing events
    private DTDEventListener dtdHandler;

    private EntityResolver resolver;
    // locale used for diagnostics; init() falls back to Locale.getDefault()
    private Locale locale;

    // string constants -- use these copies so "==" works
    // package private
    static final String strANY = "ANY";
    static final String strEMPTY = "EMPTY";

    private static final Logger LOGGER = Logger.getLogger(DTDParser.class.getName());

    /**
     * Used by applications to request locale for
diagnostics. * * @param l The locale to use, or null to use system defaults (which may * include only message IDs). */ public void setLocale(Locale l) throws SAXException { if (l != null && !messages.isLocaleSupported(l.toString())) { throw new SAXException(messages.getMessage(locale, "P-078", new Object[]{l})); } locale = l; } /** * Returns the diagnostic locale. */ public Locale getLocale() { return locale; } /** * Chooses a client locale to use for diagnostics, using the first language * specified in the list that is supported by this parser. That locale is * then set using setLocale(). * Such a list could be provided by a variety of user preference mechanisms, * including the HTTP Accept-Language header field. * * @param languages Array of language specifiers, ordered with the most * preferable one at the front. For example, "en-ca" then "fr-ca", followed * by "zh_CN". Both RFC 1766 and Java styles are supported. * @return The chosen locale, or null. * @see MessageCatalog */ public Locale chooseLocale(String languages[]) throws SAXException { Locale l = messages.chooseLocale(languages); if (l != null) { setLocale(l); } return l; } /** * Lets applications control entity resolution. */ public void setEntityResolver(EntityResolver r) { resolver = r; } /** * Returns the object used to resolve entities */ public EntityResolver getEntityResolver() { return resolver; } /** * Used by applications to set handling of DTD parsing events. */ public void setDtdHandler(DTDEventListener handler) { dtdHandler = handler; if (handler != null) { handler.setDocumentLocator(new Locator() { @Override public String getPublicId() { return DTDParser.this.getPublicId(); } @Override public String getSystemId() { return DTDParser.this.getSystemId(); } @Override public int getLineNumber() { return DTDParser.this.getLineNumber(); } @Override public int getColumnNumber() { return DTDParser.this.getColumnNumber(); } }); } } /** * Returns the handler used to for DTD parsing events. 
*/
    public DTDEventListener getDtdHandler() {
        return dtdHandler;
    }

    /**
     * Parse a DTD. The parser state is reset first, then the given source is
     * handed to the internal parsing routine.
     *
     * @param in the DTD to read
     * @throws IOException  propagated from the internal parsing routine
     * @throws SAXException propagated from the internal parsing routine
     */
    public void parse(InputSource in) throws IOException, SAXException {
        init();
        parseInternal(in);
    }

    /**
     * Parse a DTD identified by URI. The URI is first offered to the
     * configured entity resolver; see the inline comments for the fallbacks.
     *
     * @param uri system identifier of the DTD
     * @throws IOException  propagated from resolution or parsing
     * @throws SAXException propagated from resolution or parsing
     */
    public void parse(String uri) throws IOException, SAXException {
        init();     // guarantees that 'resolver' is non-null below

        InputSource inSource = resolver.resolveEntity(null, uri);

        // If custom resolver punts resolution to parser, handle it ...
        if (inSource == null) {
            inSource = Resolver.createInputSource(new java.net.URL(uri), false);

            // ... or if custom resolver doesn't correctly construct the
            // input entity, patch it up enough so relative URIs work, and
            // issue a warning to minimize later confusion.
        } else if (inSource.getSystemId() == null) {
            warning("P-065", null);
            inSource.setSystemId(uri);
        }

        parseInternal(inSource);
    }

    // makes sure the parser is reset to "before a document"
    private void init() {
        in = null;

        // alloc temporary data used in parsing
        strTmp = new StringBuffer();
        nameTmp = new char[20];
        nameCache = new NameCache();

        // reset doc info
        doLexicalPE = false;

        entities.clear();
        notations.clear();
        params.clear();
        declaredElements.clear();

        // initialize predefined references ... re-interpreted later.
        // "amp" and "lt" must be stored as character references: the
        // replacement text is pushed back onto the input and re-scanned by
        // parseLiteral(), where a bare '&' would start another reference
        // and a bare '<' is rejected with P-012.
        builtin("amp", "&#38;");
        builtin("lt", "&#60;");
        builtin("gt", ">");
        builtin("quot", "\"");
        builtin("apos", "'");

        // fill in defaults for anything the application left unset
        if (locale == null) {
            locale = Locale.getDefault();
        }
        if (resolver == null) {
            resolver = new Resolver();
        }
        if (dtdHandler == null) {
            dtdHandler = new DTDHandlerBase();
        }
    }

    // registers a predefined general entity under the given name
    private void builtin(String entityName, String entityValue) {
        InternalEntity entity;
        entity = new InternalEntity(entityName, entityValue.toCharArray());
        entities.put(entityName, entity);
    }

    ////////////////////////////////////////////////////////////////
    //
    // parsing is by recursive descent, code roughly
    // following the BNF rules except tweaked for simple
    // lookahead.
rules are more or less in numeric order, // except where code sharing suggests other structures. // // a classic benefit of recursive descent parsers: it's // relatively easy to get diagnostics that make sense. // //////////////////////////////////////////////////////////////// @SuppressWarnings("CallToThreadDumpStack") private void parseInternal(InputSource input) throws IOException, SAXException { if (input == null) { fatal("P-000"); } try { in = InputEntity.getInputEntity(dtdHandler, locale); in.init(input, null, null, false); dtdHandler.startDTD(in); // [30] extSubset ::= TextDecl? extSubsetDecl // [31] extSubsetDecl ::= ( markupdecl | conditionalSect // | PEReference | S )* // ... same as [79] extPE, which is where the code is ExternalEntity externalSubset = new ExternalEntity(in); externalParameterEntity(externalSubset); if (!in.isEOF()) { fatal("P-001", new Object[]{Integer.toHexString(((int) getc()))}); } afterRoot(); dtdHandler.endDTD(); } catch (EndOfInputException e) { if (!in.isDocument()) { String name = in.getName(); do { // force a relevant URI and line number in = in.pop(); } while (in.isInternal()); fatal("P-002", new Object[]{name}); } else { fatal("P-003", null); } } catch (RuntimeException e) { LOGGER.log(Level.SEVERE, "Internal DTD parser error.", e); throw new SAXParseException(e.getMessage() != null ? e.getMessage() : e.getClass().getName(), getPublicId(), getSystemId(), getLineNumber(), getColumnNumber()); } finally { // recycle temporary data used during parsing strTmp = null; nameTmp = null; nameCache = null; // ditto input sources etc if (in != null) { in.close(); in = null; } // get rid of all DTD info ... some of it would be // useful for editors etc, investigate later. params.clear(); entities.clear(); notations.clear(); declaredElements.clear(); // elements.clear(); ids.clear(); } } void afterRoot() throws SAXException { // Make sure all IDREFs match declared ID attributes. 
We scan // after the document element is parsed, since XML allows forward // references, and only now can we know if they're all resolved. for (Enumeration e = ids.keys(); e.hasMoreElements();) { String id = (String) e.nextElement(); Boolean value = (Boolean) ids.get(id); if (Boolean.FALSE.equals(value)) { error("V-024", new Object[]{id}); } } } // role is for diagnostics private void whitespace(String roleId) throws IOException, SAXException { // [3] S ::= (#x20 | #x9 | #xd | #xa)+ if (!maybeWhitespace()) { fatal("P-004", new Object[]{messages.getMessage(locale, roleId)}); } } // S? private boolean maybeWhitespace() throws IOException, SAXException { if (!doLexicalPE) { return in.maybeWhitespace(); } // see getc() for the PE logic -- this lets us splice // expansions of PEs in "anywhere". getc() has smarts, // so for external PEs we don't bypass it. // XXX we can marginally speed PE handling, and certainly // be cleaner (hence potentially more correct), by using // the observations that expanded PEs only start and stop // where whitespace is allowed. getc wouldn't need any // "lexical" PE expansion logic, and no other method needs // to handle termination of PEs. (parsing of literals would // still need to pop entities, but not parsing of references // in content.) char c = getc(); boolean saw = false; while (c == ' ' || c == '\t' || c == '\n' || c == '\r') { saw = true; // this gracefully ends things when we stop playing // with internal parameters. caller should have a // grammar rule allowing whitespace at end of entity. if (in.isEOF() && !in.isInternal()) { return saw; } c = getc(); } ungetc(); return saw; } private String maybeGetName() throws IOException, SAXException { NameCacheEntry entry = maybeGetNameCacheEntry(); return (entry == null) ? 
null : entry.name; } private NameCacheEntry maybeGetNameCacheEntry() throws IOException, SAXException { // [5] Name ::= (Letter|'_'|':') (Namechar)* char c = getc(); if (!XmlChars.isLetter(c) && c != ':' && c != '_') { ungetc(); return null; } return nameCharString(c); } // Used when parsing enumerations private String getNmtoken() throws IOException, SAXException { // [7] Nmtoken ::= (Namechar)+ char c = getc(); if (!XmlChars.isNameChar(c)) { fatal("P-006", new Object[]{Character.valueOf(c)}); } return nameCharString(c).name; } // n.b. this gets used when parsing attribute values (for // internal references) so we can't use strTmp; it's also // a hotspot for CPU and memory in the parser (called at least // once for each element) so this has been optimized a bit. private NameCacheEntry nameCharString(char c) throws IOException, SAXException { int i = 1; nameTmp[0] = c; for (;;) { if ((c = in.getNameChar()) == 0) { break; } if (i >= nameTmp.length) { char tmp[] = new char[nameTmp.length + 10]; System.arraycopy(nameTmp, 0, tmp, 0, nameTmp.length); nameTmp = tmp; } nameTmp[i++] = c; } return nameCache.lookupEntry(nameTmp, i); } // // much similarity between parsing entity values in DTD // and attribute values (in DTD or content) ... both follow // literal parsing rules, newline canonicalization, etc // // leaves value in 'strTmp' ... either a "replacement text" (4.5), // or else partially normalized attribute value (the first bit // of 3.3.3's spec, without the "if not CDATA" bits). 
//
    // isEntityValue: true when scanning an EntityValue (general references
    // are copied through unexpanded and "%name;" is expanded), false when
    // scanning an AttValue (general references are expanded, whitespace
    // becomes a space, and '<' is fatal).
    @SuppressWarnings("UnusedAssignment")
    private void parseLiteral(boolean isEntityValue)
            throws IOException, SAXException {

        // [9] EntityValue ::=
        //    '"' ([^"&%] | Reference | PEReference)* '"'
        //    | "'" ([^'&%] | Reference | PEReference)* "'"
        // [10] AttValue ::=
        //    '"' ([^"&] | Reference )* '"'
        //    | "'" ([^'&] | Reference )* "'"
        char quote = getc();
        char c;
        // remember which entity the literal started in; only a quote read
        // while this entity is current can close the literal
        InputEntity source = in;

        if (quote != '\'' && quote != '"') {
            fatal("P-007");
        }

        // don't report entity expansions within attributes,
        // they're reported "fully expanded" via SAX
        // isInAttribute = !isEntityValue;

        // get value into strTmp
        strTmp = new StringBuffer();

        // scan, allowing entity push/pop wherever ...
        // expanded entities can't terminate the literal!
        for (;;) {
            if (in != source && in.isEOF()) {
                // we don't report end of parsed entities
                // within attributes (no SAX hooks)
                in = in.pop();
                continue;
            }
            if ((c = getc()) == quote && in == source) {
                break;
            }

            //
            // Basically the "reference in attribute value"
            // row of the chart in section 4.4 of the spec
            //
            if (c == '&') {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-020", entityName);

                    // 4.4 says: bypass these here ... we'll catch
                    // forbidden refs to unparsed entities on use
                    if (isEntityValue) {
                        strTmp.append('&');
                        strTmp.append(entityName);
                        strTmp.append(';');
                        continue;
                    }
                    expandEntityInLiteral(entityName, entities, isEntityValue);

                    // character references are always included immediately
                } else if ((getc()) == '#') {
                    int tmp = parseCharNumber();

                    if (tmp > 0xffff) {
                        // beyond 0xffff: let surrogatesToCharTmp fill charTmp;
                        // its result says whether charTmp[1] is used too
                        tmp = surrogatesToCharTmp(tmp);
                        strTmp.append(charTmp[0]);
                        if (tmp == 2) {
                            strTmp.append(charTmp[1]);
                        }
                    } else {
                        strTmp.append((char) tmp);
                    }
                } else {
                    fatal("P-009");
                }
                continue;
            }

            // expand parameter entities only within entity value literals
            if (c == '%' && isEntityValue) {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-021", entityName);
                    expandEntityInLiteral(entityName, params, isEntityValue);
                    continue;
                } else {
                    fatal("P-011");
                }
            }

            // For attribute values ...
            if (!isEntityValue) {
                // 3.3.3 says whitespace normalizes to space...
                if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                    strTmp.append(' ');
                    continue;
                }

                // "<" not legal in parsed literals ...
                if (c == '<') {
                    fatal("P-012");
                }
            }

            strTmp.append(c);
        }
        // isInAttribute = false;
    }

    // does a SINGLE expansion of the entity (often reparsed later)
    //
    // name:          entity name, without the surrounding "&...;" or "%...;"
    // table:         where to look the name up (callers pass 'entities' or
    //                'params')
    // isEntityValue: whether the enclosing literal is an entity value
    private void expandEntityInLiteral(String name, SimpleHashtable table,
                                       boolean isEntityValue)
            throws IOException, SAXException {

        Object entity = table.get(name);

        if (entity instanceof InternalEntity) {
            // internal entity: push its text so the caller re-scans it
            InternalEntity value = (InternalEntity) entity;
            pushReader(value.buf, name, !value.isPE);

        } else if (entity instanceof ExternalEntity) {
            if (!isEntityValue) // must be a PE ...
            {
                fatal("P-013", new Object[]{name});
            }
            // XXX if this returns false ...
            pushReader((ExternalEntity) entity);

        } else if (entity == null) {
            //
            // Note: much confusion about whether spec requires such
            // errors to be fatal in many cases, but none about whether
            // it allows "normal" errors to be unrecoverable!
            //
            fatal((table == params) ? "V-022" : "P-014",
                    new Object[]{name});
        }
    }

    // [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
    // for PUBLIC and SYSTEM literals, also "'

    // NOTE: XML spec should explicitly say that PE ref syntax is
    // ignored in PIs, comments, SystemLiterals, and Pubid Literal
    // values ... can't process the XML spec's own DTD without doing
    // that for comments.

    // Reads a quoted string and returns its contents without the quotes.
    // type:  message id used to build the P-015 diagnostic when no quote
    //        is found
    // extra: argument substituted into that message
    private String getQuotedString(String type, String extra)
            throws IOException, SAXException {

        // use in.getc to bypass PE processing
        char quote = in.getc();

        if (quote != '\'' && quote != '"') {
            fatal("P-015", new Object[]{
                messages.getMessage(locale, type, new Object[]{extra})
            });
        }

        char c;

        strTmp = new StringBuffer();
        // collect everything up to the matching closing quote
        while ((c = in.getc()) != quote) {
            strTmp.append((char) c);
        }
        return strTmp.toString();
    }

    // Reads a public identifier literal, rejects any character outside the
    // PubidChar set (fatal P-016), and returns the result of normalize(false)
    // applied to it.
    // NOTE(review): normalize() is not visible here -- presumably it
    // collapses whitespace in strTmp; confirm against its definition.
    private String parsePublicId() throws IOException, SAXException {

        // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
        // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
        String retval = getQuotedString("F-033", null);
        for (int i = 0; i < retval.length(); i++) {
            char c = retval.charAt(i);
            if (" \r\n-'()+,./:=?;!*#@$_%0123456789".indexOf(c) == -1
                    && !(c >= 'A' && c <= 'Z')
                    && !(c >= 'a' && c <= 'z')) {
                fatal("P-016", new Object[]{Character.valueOf(c)});
            }
        }
        // normalize() works on strTmp, so load the validated text into it
        strTmp = new StringBuffer();
        strTmp.append(retval);
        return normalize(false);
    }

    // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
    // handled by: InputEntity.parsedContent()

    private boolean maybeComment(boolean skipStart)
            throws IOException, SAXException {

        // [15] Comment ::= ''
        if (!in.peek(skipStart ? "!--" : "