com.sun.xml.dtdparser.DTDParser Maven / Gradle / Ivy
/*
* Copyright (c) 1998, 2023 Oracle and/or its affiliates. All rights reserved.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0, which is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package com.sun.xml.dtdparser;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* This implements parsing of XML 1.0 DTDs.
*
* This conforms to the portion of the XML 1.0 specification related to the
* external DTD subset.
*
* For multi-language applications (such as web servers using XML processing to
* create dynamic content), a method supports choosing a locale for parser
* diagnostics which is both understood by the message recipient and supported
* by the parser.
*
* This parser produces a stream of parse events. It supports some features
* (exposing comments, CDATA sections, and entity references) which are not
* required to be reported by conformant XML processors.
*
* @author David Brownell
* @author Janet Koenig
* @author Kohsuke KAWAGUCHI
*/
public class DTDParser {
// Attribute type names. The first nine are the attribute type keywords of
// XML 1.0; ENUMERATION stands for enumerated (name-token list) types.
// NOTE(review): presumably these are the type strings handed to the
// DTDEventListener when attribute declarations are reported -- the
// attribute-list parsing code is outside this view, so confirm there.
public static final String TYPE_CDATA = "CDATA";
public static final String TYPE_ID = "ID";
public static final String TYPE_IDREF = "IDREF";
public static final String TYPE_IDREFS = "IDREFS";
public static final String TYPE_ENTITY = "ENTITY";
public static final String TYPE_ENTITIES = "ENTITIES";
public static final String TYPE_NMTOKEN = "NMTOKEN";
public static final String TYPE_NMTOKENS = "NMTOKENS";
public static final String TYPE_NOTATION = "NOTATION";
public static final String TYPE_ENUMERATION = "ENUMERATION";
// stack of input entities being merged; 'in' is the current (top) entity
// and enclosing entities are reached through in.pop()
private InputEntity in;
// temporaries reused during parsing; allocated in init() and released
// again in parseInternal()'s finally block
// strTmp: receives the text of parsed literals and quoted strings
private StringBuffer strTmp;
// nameTmp: scratch buffer for name characters, grown by nameCharString()
private char[] nameTmp;
// nameCache: nameCharString() resolves the collected characters through it
private NameCache nameCache;
// charTmp: read after surrogatesToCharTmp() when a character reference is
// above 0xffff (presumably holds the resulting surrogate pair -- confirm)
private final char[] charTmp = new char[2];
// temporary DTD parsing state; while false, maybeWhitespace() delegates
// directly to the current input entity
private boolean doLexicalPE;
// DTD state, used during parsing
// private SimpleHashtable elements = new SimpleHashtable (47);
protected final Set declaredElements = new HashSet<>();
// parameter entities, consulted when "%name;" is expanded in a literal
private final SimpleHashtable params = new SimpleHashtable<>(7);
// exposed to package-private subclass
Map notations = new HashMap<>(7);
// general entities; init() preloads amp, lt, gt, quot and apos
SimpleHashtable entities = new SimpleHashtable<>(17);
// ID bookkeeping: afterRoot() reports V-024 for every key mapped to FALSE
private final SimpleHashtable ids = new SimpleHashtable<>();
// listeners for DTD parsing events; init() installs defaults for any of
// these three that are still null when a parse starts
private DTDEventListener dtdHandler;
private EntityResolver resolver;
private Locale locale;
// string constants -- use these copies so "==" works
// package private
static final String strANY = "ANY";
static final String strEMPTY = "EMPTY";
// receives the SEVERE record logged when parseInternal() catches an
// unexpected RuntimeException
private static final Logger LOGGER = Logger.getLogger(DTDParser.class.getName());
/**
 * Creates a parser with no locale, entity resolver or event handler
 * configured. Whatever is still unset when a parse starts is replaced by a
 * default at that point.
 */
public DTDParser() {
    // nothing to set up here; per-parse state is allocated by init()
}
/**
 * Selects the locale in which parser diagnostics are produced.
 *
 * @param l the locale to use, or {@code null} to use system defaults (which
 *          may include only message IDs)
 * @throws SAXException if {@code l} is non-null and this parser's messages
 *                      do not support it
 */
public void setLocale(Locale l) throws SAXException {
    boolean unsupported = l != null && !messages.isLocaleSupported(l.toString());
    if (unsupported) {
        // the complaint is rendered in the locale that is still in effect
        String complaint = messages.getMessage(locale,
                "P-078", new Object[]{l});
        throw new SAXException(complaint);
    }
    this.locale = l;
}
/**
 * Reports the locale currently used for diagnostics.
 *
 * @return the diagnostic locale; may be {@code null} if none has been set
 *         and no parse has started yet
 */
public Locale getLocale() {
    return this.locale;
}
/**
 * Picks the first language in the given preference list that this parser
 * supports and, if one is found, installs it with {@link #setLocale}.
 * The list may come from any user preference mechanism, for example the
 * HTTP Accept-Language header field.
 *
 * @param languages language specifiers, most preferred first; for example
 *                  "en-ca" then "fr-ca", followed by "zh_CN". Both RFC 1766
 *                  and Java styles are supported.
 * @return the chosen locale, or {@code null} if none was chosen
 * @throws SAXException for errors
 * @see MessageCatalog
 */
public Locale chooseLocale(String[] languages)
        throws SAXException {
    Locale chosen = messages.chooseLocale(languages);
    if (chosen == null) {
        return null;
    }
    setLocale(chosen);
    return chosen;
}
/**
 * Installs the resolver that applications use to control entity resolution.
 *
 * @param r the resolver to use; if {@code null}, a default resolver is
 *          created when the next parse starts
 */
public void setEntityResolver(EntityResolver r) {
    this.resolver = r;
}
/**
 * Reports the resolver currently used to resolve entities.
 *
 * @return the entity resolver
 */
public EntityResolver getEntityResolver() {
    return this.resolver;
}
/**
 * Installs the listener that receives DTD parsing events. A non-null
 * listener is immediately given a {@link Locator} that reports this
 * parser's current public ID, system ID, line and column.
 *
 * @param handler the listener to notify; if {@code null}, a default
 *                handler is created when the next parse starts
 */
public void setDtdHandler(DTDEventListener handler) {
    this.dtdHandler = handler;
    if (handler == null) {
        return;
    }
    // live view onto the parser's position; every call is forwarded
    Locator parserPosition = new Locator() {
        @Override
        public String getPublicId() {
            return DTDParser.this.getPublicId();
        }

        @Override
        public String getSystemId() {
            return DTDParser.this.getSystemId();
        }

        @Override
        public int getLineNumber() {
            return DTDParser.this.getLineNumber();
        }

        @Override
        public int getColumnNumber() {
            return DTDParser.this.getColumnNumber();
        }
    };
    handler.setDocumentLocator(parserPosition);
}
/**
 * Reports the listener that receives DTD parsing events.
 *
 * @return the handler
 */
public DTDEventListener getDtdHandler() {
    return this.dtdHandler;
}
/**
 * Parses a DTD read from an already constructed input source. The parser
 * is reset before parsing begins.
 *
 * @param in the input source to read the DTD from
 * @throws IOException for errors
 * @throws SAXException for errors
 */
public void parse(InputSource in)
        throws IOException, SAXException {
    // reset first so that state from an earlier run cannot leak in
    init();
    parseInternal(in);
}
/**
 * Parses the DTD identified by a URI. The URI is first offered to the
 * entity resolver; the parser opens it itself only if the resolver
 * declines.
 *
 * @param uri the URI of the DTD
 * @throws IOException for errors
 * @throws SAXException for errors
 */
public void parse(String uri)
        throws IOException, SAXException {
    init();
    InputSource source = resolver.resolveEntity(null, uri);
    if (source == null) {
        // the custom resolver punted resolution to the parser
        source = Resolver.createInputSource(new java.net.URL(uri), false);
    } else if (source.getSystemId() == null) {
        // the custom resolver built an incomplete input entity: patch it
        // up enough so relative URIs work, and issue a warning to
        // minimize later confusion
        warning("P-065", null);
        source.setSystemId(uri);
    }
    parseInternal(source);
}
// Puts the parser back into its "before a document" state: drops the
// current input, allocates fresh scratch buffers, empties the per-DTD
// tables, re-registers the five predefined entities and fills in default
// collaborators for anything the application left unset.
private void init() {
    this.in = null;

    // scratch data used while parsing
    this.strTmp = new StringBuffer();
    this.nameTmp = new char[20];
    this.nameCache = new NameCache();

    // per-document state
    this.doLexicalPE = false;
    entities.clear();
    notations.clear();
    params.clear();
    declaredElements.clear();

    // predefined references ... re-interpreted later
    builtin("amp", "&");
    builtin("lt", "<");
    builtin("gt", ">");
    builtin("quot", "\"");
    builtin("apos", "'");

    // defaults for whatever was not configured explicitly
    if (this.locale == null) {
        this.locale = Locale.getDefault();
    }
    if (this.resolver == null) {
        this.resolver = new Resolver();
    }
    if (this.dtdHandler == null) {
        this.dtdHandler = new DTDHandlerBase();
    }
}
// Registers one predefined general entity under the given name, with the
// given replacement text.
private void builtin(String entityName, String entityValue) {
    entities.put(entityName,
            new InternalEntity(entityName, entityValue.toCharArray()));
}
////////////////////////////////////////////////////////////////
//
// parsing is by recursive descent, code roughly
// following the BNF rules except tweaked for simple
// lookahead. rules are more or less in numeric order,
// except where code sharing suggests other structures.
//
// a classic benefit of recursive descent parsers: it's
// relatively easy to get diagnostics that make sense.
//
////////////////////////////////////////////////////////////////
/**
 * Parses the external DTD subset supplied by {@code input}, reporting
 * events to the configured DTD handler, and releases all per-parse state
 * afterwards whether or not parsing succeeded.
 *
 * @param input the source of the DTD; {@code null} is reported as fatal
 *              error P-000
 * @throws IOException for I/O errors while reading the input
 * @throws SAXException for parse errors; an unexpected
 *         {@link RuntimeException} is logged and rethrown as a
 *         {@link SAXParseException} that carries the current position and
 *         the original exception as its cause
 */
private void parseInternal(InputSource input)
        throws IOException, SAXException {
    if (input == null) {
        fatal("P-000");
    }
    try {
        in = InputEntity.getInputEntity(dtdHandler, locale);
        in.init(input, null, null, false);
        dtdHandler.startDTD(in);
        // [30] extSubset ::= TextDecl? extSubsetDecl
        // [31] extSubsetDecl ::= ( markupdecl | conditionalSect
        //                        | PEReference | S )*
        // ... same as [79] extPE, which is where the code is
        ExternalEntity externalSubset = new ExternalEntity(in);
        externalParameterEntity(externalSubset);
        if (!in.isEOF()) {
            // something is left over that no grammar rule consumed
            fatal("P-001", new Object[]{Integer.toHexString(getc())});
        }
        afterRoot();
        dtdHandler.endDTD();
    } catch (EndOfInputException e) {
        if (!in.isDocument()) {
            String name = in.getName();
            do { // force a relevant URI and line number
                in = in.pop();
            } while (in.isInternal());
            fatal("P-002", new Object[]{name});
        } else {
            fatal("P-003", null);
        }
    } catch (RuntimeException e) {
        LOGGER.log(Level.SEVERE, "Internal DTD parser error.", e);
        String message = e.getMessage() != null
                ? e.getMessage() : e.getClass().getName();
        // Chain the original exception so callers keep its type and stack
        // trace instead of only a copy of its message.
        throw new SAXParseException(message,
                getPublicId(), getSystemId(),
                getLineNumber(), getColumnNumber(), e);
    } finally {
        // recycle temporary data used during parsing
        strTmp = null;
        nameTmp = null;
        nameCache = null;
        // ditto input sources etc
        if (in != null) {
            in.close();
            in = null;
        }
        // get rid of all DTD info ... some of it would be
        // useful for editors etc., investigate later.
        params.clear();
        entities.clear();
        notations.clear();
        declaredElements.clear();
        ids.clear();
    }
}
// Checks that every IDREF matches a declared ID attribute. This has to run
// after the whole input has been read: forward references are legal, so
// only now is it known whether they were all resolved. Each key of 'ids'
// that is still mapped to FALSE is reported as error V-024.
void afterRoot() throws SAXException {
    Enumeration e = ids.keys();
    while (e.hasMoreElements()) {
        String id = e.nextElement();
        Boolean value = ids.get(id);
        if (!Boolean.FALSE.equals(value)) {
            continue;
        }
        error("V-024", new Object[]{id});
    }
}
// Requires whitespace at the current position and reports fatal error
// P-004 if there is none. 'roleId' is a message ID used only to describe,
// in the diagnostic, what the whitespace was expected for.
private void whitespace(String roleId)
        throws IOException, SAXException {
    // [3] S ::= (#x20 | #x9 | #xd | #xa)+
    if (maybeWhitespace()) {
        return;
    }
    fatal("P-004", new Object[]{messages.getMessage(locale, roleId)});
}
// S?
// Skips optional whitespace and reports whether any was seen.
private boolean maybeWhitespace()
        throws IOException, SAXException {
    if (!doLexicalPE) {
        return in.maybeWhitespace();
    }

    // see getc() for the PE logic -- this lets us splice
    // expansions of PEs in "anywhere". getc() has smarts,
    // so for external PEs we don't bypass it.

    // XXX we can marginally speed PE handling, and certainly
    // be cleaner (hence potentially more correct), by using
    // the observations that expanded PEs only start and stop
    // where whitespace is allowed. getc wouldn't need any
    // "lexical" PE expansion logic, and no other method needs
    // to handle termination of PEs. (parsing of literals would
    // still need to pop entities, but not parsing of references
    // in content.)

    boolean sawWhitespace = false;
    for (char ch = getc();
            ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
            ch = getc()) {
        sawWhitespace = true;
        // this gracefully ends things when we stop playing
        // with internal parameters. caller should have a
        // grammar rule allowing whitespace at end of entity.
        if (in.isEOF() && !in.isInternal()) {
            return true;
        }
    }
    // the character that ended the run is not ours to consume
    ungetc();
    return sawWhitespace;
}
// Reads a Name if one starts at the current position; returns its text,
// or null when no name starts here.
private String maybeGetName()
        throws IOException, SAXException {
    NameCacheEntry found = maybeGetNameCacheEntry();
    if (found == null) {
        return null;
    }
    return found.name;
}
// Reads a Name if one starts at the current position and returns its cache
// entry. If the next character cannot start a name it is pushed back and
// null is returned.
private NameCacheEntry maybeGetNameCacheEntry()
        throws IOException, SAXException {
    // [5] Name ::= (Letter|'_'|':') (Namechar)*
    char first = getc();
    boolean startsName = XmlChars.isLetter(first) || first == ':' || first == '_';
    if (startsName) {
        return nameCharString(first);
    }
    ungetc();
    return null;
}
// Reads a mandatory Nmtoken (used when parsing enumerations). Reports
// fatal error P-006 if the next character is not a name character.
private String getNmtoken()
        throws IOException, SAXException {
    // [7] Nmtoken ::= (Namechar)+
    char first = getc();
    if (!XmlChars.isNameChar(first)) {
        fatal("P-006", new Object[]{first});
    }
    NameCacheEntry token = nameCharString(first);
    return token.name;
}
// Collects the remaining name characters that follow the already-read
// first character 'c' and looks the resulting name up in the name cache.
//
// n.b. this gets used when parsing attribute values (for internal
// references) so it can't use strTmp; it's also a hotspot for CPU and
// memory in the parser (called at least once for each element), hence the
// reusable 'nameTmp' buffer.
private NameCacheEntry nameCharString(char c)
        throws IOException, SAXException {
    nameTmp[0] = c;
    int length = 1;
    char next;
    // getNameChar() yields 0 once the name has ended
    while ((next = in.getNameChar()) != 0) {
        if (length >= nameTmp.length) {
            // buffer is full: grow it by ten slots, keeping the contents
            char[] grown = new char[nameTmp.length + 10];
            System.arraycopy(nameTmp, 0, grown, 0, nameTmp.length);
            nameTmp = grown;
        }
        nameTmp[length++] = next;
    }
    return nameCache.lookupEntry(nameTmp, length);
}
//
// Parses a quoted literal starting at its opening quote: an EntityValue
// when isEntityValue is true, otherwise an AttValue.
//
// much similarity between parsing entity values in DTD
// and attribute values (in DTD or content) ... both follow
// literal parsing rules, newline canonicalization, etc.
//
// leaves value in 'strTmp' ... either a "replacement text" (4.5),
// or else partially normalized attribute value (the first bit
// of 3.3.3's spec, without the "if not CDATA" bits).
//
// Summary of the handling below:
// - "&name;" is copied through unexpanded in entity values and
// expanded (one level) in attribute values
// - "&#...;" character references are always replaced immediately
// - "%name;" is expanded only in entity values
// - in attribute values, whitespace characters become a single
// space each and '<' is a fatal error (P-012)
//
private void parseLiteral(boolean isEntityValue)
throws IOException, SAXException {
// [9] EntityValue ::=
// '"' ([^"&%] | Reference | PEReference)* '"'
// | "'" ([^'&%] | Reference | PEReference)* "'"
// [10] AttValue ::=
// '"' ([^"&] | Reference )* '"'
// | "'" ([^'&] | Reference )* "'"
char quote = getc();
char c;
// remember the entity the literal started in: only a quote read from
// this same entity may close the literal
InputEntity source = in;
if (quote != '\'' && quote != '"') {
fatal("P-007");
}
// don't report entity expansions within attributes,
// they're reported "fully expanded" via SAX
// isInAttribute = !isEntityValue;
// get value into strTmp
strTmp = new StringBuffer();
// scan, allowing entity push/pop wherever ...
// expanded entities can't terminate the literal!
for (;;) {
if (in != source && in.isEOF()) {
// we don't report end of parsed entities
// within attributes (no SAX hooks)
// an expanded entity is exhausted: resume reading its parent
in = in.pop();
continue;
}
// a matching quote coming from an expanded entity is ordinary text
if ((c = getc()) == quote && in == source) {
break;
}
//
// Basically the "reference in attribute value"
// row of the chart in section 4.4 of the spec
//
if (c == '&') {
String entityName = maybeGetName();
if (entityName != null) {
nextChar(';', "F-020", entityName);
// 4.4 says: bypass these here ... we'll catch
// forbidden refs to unparsed entities on use
if (isEntityValue) {
strTmp.append('&');
strTmp.append(entityName);
strTmp.append(';');
continue;
}
expandEntityInLiteral(entityName, entities, isEntityValue);
// character references are always included immediately
} else if ((getc()) == '#') {
int tmp = parseCharNumber();
// values above 0xffff do not fit in one char; they are appended
// from charTmp instead. NOTE(review): surrogatesToCharTmp()
// presumably fills charTmp and returns how many chars it wrote --
// confirm against its definition.
if (tmp > 0xffff) {
tmp = surrogatesToCharTmp(tmp);
strTmp.append(charTmp[0]);
if (tmp == 2) {
strTmp.append(charTmp[1]);
}
} else {
strTmp.append((char) tmp);
}
} else {
// '&' followed by neither a name nor '#'
fatal("P-009");
}
continue;
}
// expand parameter entities only within entity value literals
if (c == '%' && isEntityValue) {
String entityName = maybeGetName();
if (entityName != null) {
nextChar(';', "F-021", entityName);
expandEntityInLiteral(entityName, params, isEntityValue);
continue;
} else {
// '%' not followed by a name
fatal("P-011");
}
}
// For attribute values ...
if (!isEntityValue) {
// 3.3.3 says whitespace normalizes to space...
if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
strTmp.append(' ');
continue;
}
// "<" not legal in parsed literals ...
if (c == '<') {
fatal("P-012");
}
}
// ordinary character: copy it through unchanged
strTmp.append(c);
}
// isInAttribute = false;
}
// Does a SINGLE expansion of the named entity (often reparsed later) by
// pushing its replacement text, or its external source, onto the input
// stack. 'table' is the entity table the name is looked up in;
// 'isEntityValue' says whether the surrounding literal is an entity value.
private void expandEntityInLiteral(String name, SimpleHashtable table,
        boolean isEntityValue)
        throws IOException, SAXException {
    EntityDecl declared = table.get(name);

    if (declared == null) {
        //
        // Note: much confusion about whether spec requires such
        // errors to be fatal in many cases, but none about whether
        // it allows "normal" errors to be unrecoverable!
        //
        fatal((table == params) ? "V-022" : "P-014",
                new Object[]{name});
        return;
    }

    if (declared instanceof InternalEntity) {
        InternalEntity internal = (InternalEntity) declared;
        pushReader(internal.buf, name, !internal.isPE);
        return;
    }

    if (declared instanceof ExternalEntity) {
        // outside an entity value an external entity is not allowed here
        // (must be a PE ...)
        if (!isEntityValue) {
            fatal("P-013", new Object[]{name});
        }
        // XXX if this returns false ...
        pushReader((ExternalEntity) declared);
    }
}
// [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
//
// Reads a string delimited by single or double quotes, as used for PUBLIC
// and SYSTEM literals, and returns the text between the quotes. 'type' is
// a message ID and 'extra' its argument; both are used only to build the
// P-015 diagnostic when no opening quote is found. The text is also left
// in 'strTmp'.
//
// NOTE: XML spec should explicitly say that PE ref syntax is
// ignored in PIs, comments, SystemLiterals, and Pubid Literal
// values ... can't process the XML spec's own DTD without doing
// that for comments.
private String getQuotedString(String type, String extra)
        throws IOException, SAXException {
    // use in.getc to bypass PE processing
    char opening = in.getc();
    if (opening != '\'' && opening != '"') {
        fatal("P-015", new Object[]{
            messages.getMessage(locale, type, new Object[]{extra})
        });
    }

    strTmp = new StringBuffer();
    // copy everything up to, but not including, the matching quote
    for (char ch = in.getc(); ch != opening; ch = in.getc()) {
        strTmp.append(ch);
    }
    return strTmp.toString();
}
// Reads a public identifier literal, reports fatal error P-016 for each
// character that is not a legal PubidChar, and returns the result of
// normalize(false) applied to the literal's text.
private String parsePublicId() throws IOException, SAXException {
    // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
    // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
    String literal = getQuotedString("F-033", null);

    // every PubidChar other than the ASCII letters
    final String nonLetterPubidChars = " \r\n-'()+,./:=?;!*#@$_%0123456789";
    for (char ch : literal.toCharArray()) {
        boolean upper = ch >= 'A' && ch <= 'Z';
        boolean lower = ch >= 'a' && ch <= 'z';
        if (!upper && !lower && nonLetterPubidChars.indexOf(ch) == -1) {
            fatal("P-016", new Object[]{ch});
        }
    }

    // normalize() works on strTmp, so hand the literal over through it
    strTmp = new StringBuffer();
    strTmp.append(literal);
    return normalize(false);
}
// [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
// handled by: InputEntity.parsedContent()
private boolean maybeComment(boolean skipStart)
throws IOException, SAXException {
// [15] Comment ::= ''
if (!in.peek(skipStart ? "!--" : "