// com.sun.xml.dtdparser.DTDParser (source listing; Maven / Gradle / Ivy)
//
// (download link omitted)
/*
 * Copyright (c) 1998, 2021 Oracle and/or its affiliates. All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0, which is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package com.sun.xml.dtdparser;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This implements parsing of XML 1.0 DTDs.
 * <p>
 * This conforms to the portion of the XML 1.0 specification related to the
 * external DTD subset.
 * <p>
 * For multi-language applications (such as web servers using XML processing to
 * create dynamic content), a method supports choosing a locale for parser
 * diagnostics which is both understood by the message recipient and supported
 * by the parser.
 * <p>
 * This parser produces a stream of parse events. It supports some features
 * (exposing comments, CDATA sections, and entity references) which are not
 * required to be reported by conformant XML processors.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @author Kohsuke KAWAGUCHI
 * @version $Id: DTDParser.java,v 1.2 2009-04-16 15:25:49 snajper Exp $
 */
public class DTDParser {

    // Attribute type names exposed as public constants. NOTE(review): how
    // these are reported to listeners is not visible in this part of the
    // file -- confirm against the attribute-declaration parsing code.
    public final static String TYPE_CDATA = "CDATA";
    public final static String TYPE_ID = "ID";
    public final static String TYPE_IDREF = "IDREF";
    public final static String TYPE_IDREFS = "IDREFS";
    public final static String TYPE_ENTITY = "ENTITY";
    public final static String TYPE_ENTITIES = "ENTITIES";
    public final static String TYPE_NMTOKEN = "NMTOKEN";
    public final static String TYPE_NMTOKENS = "NMTOKENS";
    public final static String TYPE_NOTATION = "NOTATION";
    public final static String TYPE_ENUMERATION = "ENUMERATION";
    // stack of input entities being merged; null between parses
    // (init() resets it and parseInternal() closes and clears it)
    private InputEntity in;
    // temporaries reused during parsing; allocated by init() and released
    // in parseInternal()'s finally block
    private StringBuffer strTmp;
    private char nameTmp[];
    private NameCache nameCache;
    // scratch pair filled by surrogatesToCharTmp() when parseLiteral()
    // expands a character reference above 0xffff
    private char charTmp[] = new char[2];
    // temporary DTD parsing state: when false, maybeWhitespace() delegates
    // straight to the current input entity instead of going through getc()
    private boolean doLexicalPE;
    // DTD state, used during parsing
//    private SimpleHashtable    elements = new SimpleHashtable (47);
    protected final Set declaredElements = new HashSet<>();
    // parameter entities; consulted for '%' references in entity values
    private final SimpleHashtable params = new SimpleHashtable<>(7);
    // exposed to package-private subclass
    Map notations = new HashMap<>(7);
    // general entities; init() seeds this with amp, lt, gt, quot and apos
    SimpleHashtable entities = new SimpleHashtable<>(17);
    // afterRoot() reports every key here whose value is Boolean.FALSE
    private final SimpleHashtable ids = new SimpleHashtable<>();
    // listeners for DTD parsing events; init() installs defaults for any
    // of these that are still null when a parse starts
    private DTDEventListener dtdHandler;
    private EntityResolver resolver;
    private Locale locale;
    // string constants -- use these copies so "==" works
    // package private
    static final String strANY = "ANY";
    static final String strEMPTY = "EMPTY";

    // receives the SEVERE record logged when an unexpected RuntimeException
    // escapes the parser (see parseInternal())
    private static final Logger LOGGER = Logger.getLogger(DTDParser.class.getName());

    /**
     * Creates a parser with no entity resolver, DTD handler or locale
     * configured. Defaults for whichever of those are still unset are
     * installed when a parse begins.
     */
    public DTDParser() {
        super();
    }

    /**
     * Sets the locale used for parser diagnostics.
     *
     * @param l the requested locale, or {@code null} to use system defaults
     * (which may include only message IDs)
     * @throws SAXException if {@code l} is non-null and the message catalog
     * does not support it; the error text is produced with the locale that
     * was in effect before this call
     */
    public void setLocale(Locale l) throws SAXException {
        boolean unsupported = (l != null) && !messages.isLocaleSupported(l.toString());

        if (unsupported) {
            String text = messages.getMessage(locale, "P-078", new Object[]{l});
            throw new SAXException(text);
        }
        this.locale = l;
    }

    /**
     * Reports the locale currently used for diagnostics.
     *
     * @return the diagnostic locale; may be {@code null} if none has been
     * set and no parse has started yet
     */
    public Locale getLocale() {
        return this.locale;
    }

    /**
     * Picks a diagnostic locale from a preference list and installs it with
     * {@link #setLocale(Locale)}. The message catalog makes the choice; such
     * a list could come from any user preference mechanism, including the
     * HTTP Accept-Language header field.
     *
     * @param languages language specifiers, most preferred first. For
     * example, "en-ca" then "fr-ca", followed by "zh_CN". Both RFC 1766 and
     * Java styles are supported.
     * @return the chosen locale, or {@code null} if the catalog selected
     * none (in which case the current locale is left unchanged)
     * @throws SAXException if installing the chosen locale fails
     * @see MessageCatalog
     */
    public Locale chooseLocale(String languages[])
            throws SAXException {

        Locale chosen = messages.chooseLocale(languages);

        if (chosen == null) {
            return null;
        }
        setLocale(chosen);
        return chosen;
    }

    /**
     * Installs the resolver applications use to control entity resolution.
     *
     * @param r the resolver to use; if {@code null}, a default
     * {@code Resolver} is created when the next parse starts
     */
    public void setEntityResolver(EntityResolver r) {
        this.resolver = r;
    }

    /**
     * Reports the object used to resolve entities.
     *
     * @return the current entity resolver, possibly {@code null} before the
     * first parse
     */
    public EntityResolver getEntityResolver() {
        return this.resolver;
    }

    /**
     * Installs the listener that receives DTD parsing events. A non-null
     * listener is immediately handed a {@link Locator} whose answers are
     * delegated to this parser, so it always reflects the current position.
     *
     * @param handler the listener to notify, or {@code null}; with
     * {@code null} a default handler is created when the next parse starts
     */
    public void setDtdHandler(DTDEventListener handler) {
        dtdHandler = handler;
        if (handler == null) {
            return;
        }

        // Live view onto the parser's position; nothing is snapshotted here.
        Locator position = new Locator() {
            @Override
            public String getPublicId() {
                return DTDParser.this.getPublicId();
            }

            @Override
            public String getSystemId() {
                return DTDParser.this.getSystemId();
            }

            @Override
            public int getLineNumber() {
                return DTDParser.this.getLineNumber();
            }

            @Override
            public int getColumnNumber() {
                return DTDParser.this.getColumnNumber();
            }
        };
        handler.setDocumentLocator(position);
    }

    /**
     * Reports the listener used for DTD parsing events.
     *
     * @return the current handler, possibly {@code null} before the first
     * parse
     */
    public DTDEventListener getDtdHandler() {
        return this.dtdHandler;
    }

    /**
     * Parses a DTD read from the given input source. Parser state is reset
     * first, so an instance can be reused for successive parses.
     *
     * @param in the source of the DTD text
     * @throws IOException for errors reading the input
     * @throws SAXException for errors reported while parsing
     */
    public void parse(InputSource in) throws IOException, SAXException {
        // reset to the "before a document" state, then run the parse proper
        this.init();
        this.parseInternal(in);
    }

    /**
     * Parses the DTD found at the given URI. The configured entity resolver
     * is asked for the input first; the parser opens the URI itself only
     * when the resolver declines.
     *
     * @param uri the system identifier of the DTD
     * @throws IOException for errors reading the input
     * @throws SAXException for errors reported while parsing
     */
    public void parse(String uri) throws IOException, SAXException {
        init();

        InputSource source = resolver.resolveEntity(null, uri);

        if (source == null) {
            // the resolver punted, so the parser builds the source itself
            source = Resolver.createInputSource(new java.net.URL(uri), false);
        } else if (source.getSystemId() == null) {
            // The resolver built an incomplete source: warn to minimize
            // later confusion, then patch in the URI so relative URIs work.
            warning("P-065", null);
            source.setSystemId(uri);
        }

        parseInternal(source);
    }

    // Puts the parser back into its "before a document" state: drops the
    // current input, allocates fresh scratch buffers, empties the DTD tables,
    // re-registers the five predefined entities, and fills in defaults for
    // any locale, resolver or handler the application has not supplied.
    // (The ids table is not touched here; parseInternal() clears it.)
    private void init() {
        in = null;
        doLexicalPE = false;

        // scratch data used while parsing
        strTmp = new StringBuffer();
        nameTmp = new char[20];
        nameCache = new NameCache();

        // forget everything learned from a previous DTD
        entities.clear();
        notations.clear();
        params.clear();
        declaredElements.clear();

        // predefined references ... re-interpreted later
        String[][] predefined = {
            {"amp", "&"},
            {"lt", "<"},
            {"gt", ">"},
            {"quot", "\""},
            {"apos", "'"},
        };
        for (String[] pair : predefined) {
            builtin(pair[0], pair[1]);
        }

        // defaults for anything the application left unset
        if (locale == null) {
            locale = Locale.getDefault();
        }
        if (resolver == null) {
            resolver = new Resolver();
        }
        if (dtdHandler == null) {
            dtdHandler = new DTDHandlerBase();
        }
    }

    // Registers one predefined general entity under the given name, with
    // the given text as its replacement value.
    private void builtin(String entityName, String entityValue) {
        char[] replacement = entityValue.toCharArray();
        entities.put(entityName, new InternalEntity(entityName, replacement));
    }

    ////////////////////////////////////////////////////////////////
    //
    // parsing is by recursive descent, code roughly
    // following the BNF rules except tweaked for simple
    // lookahead.  rules are more or less in numeric order,
    // except where code sharing suggests other structures.
    //
    // a classic benefit of recursive descent parsers:  it's
    // relatively easy to get diagnostics that make sense.
    //
    ////////////////////////////////////////////////////////////////
    /**
     * Runs one complete parse of an external DTD subset and always releases
     * the per-parse state afterwards, whether or not the parse succeeded.
     *
     * @param input the source to read; {@code null} is reported through
     * {@code fatal("P-000")}
     * @throws IOException for errors reading the input
     * @throws SAXException for errors reported while parsing; an unexpected
     * {@link RuntimeException} is logged and rethrown as a
     * {@link SAXParseException} that carries the original exception as its
     * embedded exception
     */
    private void parseInternal(InputSource input)
            throws IOException, SAXException {

        if (input == null) {
            fatal("P-000");
        }

        try {
            in = InputEntity.getInputEntity(dtdHandler, locale);
            in.init(input, null, null, false);

            dtdHandler.startDTD(in);

            // [30] extSubset ::= TextDecl? extSubsetDecl
            // [31] extSubsetDecl ::= ( markupdecl | conditionalSect
            //        | PEReference | S )*
            //    ... same as [79] extPE, which is where the code is

            ExternalEntity externalSubset = new ExternalEntity(in);
            externalParameterEntity(externalSubset);

            // anything left over after the subset is a syntax error
            if (!in.isEOF()) {
                fatal("P-001", new Object[]{Integer.toHexString(((int) getc()))});
            }
            afterRoot();
            dtdHandler.endDTD();

        } catch (EndOfInputException e) {
            if (!in.isDocument()) {
                String name = in.getName();
                do {    // force a relevant URI and line number
                    in = in.pop();
                } while (in.isInternal());
                fatal("P-002", new Object[]{name});
            } else {
                fatal("P-003", null);
            }
        } catch (RuntimeException e) {
            LOGGER.log(Level.SEVERE, "Internal DTD parser error.", e);
            String message = e.getMessage() != null
                    ? e.getMessage() : e.getClass().getName();
            // Pass the original exception along so callers can still reach
            // its type and stack trace through the SAX exception.
            throw new SAXParseException(message,
                    getPublicId(), getSystemId(),
                    getLineNumber(), getColumnNumber(), e);

        } finally {
            // recycle temporary data used during parsing
            strTmp = null;
            nameTmp = null;
            nameCache = null;

            // ditto input sources etc
            if (in != null) {
                in.close();
                in = null;
            }

            // get rid of all DTD info ... some of it would be
            // useful for editors etc, investigate later.

            params.clear();
            entities.clear();
            notations.clear();
            declaredElements.clear();
            ids.clear();
        }
    }

    /**
     * Reports, through {@code error("V-024", ...)}, every entry of the
     * {@code ids} table whose value is {@code Boolean.FALSE}. The scan runs
     * once parsing of the subset is finished, because forward references are
     * allowed and only then is it known whether they were all resolved.
     *
     * @throws SAXException if reporting an error raises one
     */
    void afterRoot() throws SAXException {
        // The wildcard enumeration and Object-typed value keep this loop
        // type-correct whether or not the table is declared with type
        // arguments; the keys themselves are always strings.
        for (Enumeration<?> keys = ids.keys(); keys.hasMoreElements();) {
            String id = (String) keys.nextElement();
            Object value = ids.get(id);
            if (Boolean.FALSE.equals(value)) {
                error("V-024", new Object[]{id});
            }
        }
    }

    // Requires whitespace at the current position. roleId is a message
    // catalog key naming what the whitespace separates; it is used only to
    // build the diagnostic.
    private void whitespace(String roleId)
            throws IOException, SAXException {

        // [3] S ::= (#x20 | #x9 | #xd | #xa)+
        if (maybeWhitespace()) {
            return;
        }
        String role = messages.getMessage(locale, roleId);
        fatal("P-004", new Object[]{role});
    }

    // S? -- consumes optional whitespace and reports whether at least one
    // whitespace character was seen.
    private boolean maybeWhitespace()
            throws IOException, SAXException {

        // Outside lexical PE mode the input entity scans whitespace itself.
        if (!doLexicalPE) {
            return in.maybeWhitespace();
        }

        // In lexical PE mode characters are pulled through getc() (see
        // getc() for the PE logic) so that PE expansions can be spliced in
        // "anywhere"; getc() has smarts, so external PEs are not bypassed.
        //
        // XXX PE handling could be marginally faster, and certainly cleaner
        // (hence potentially more correct), by using the observation that
        // expanded PEs only start and stop where whitespace is allowed.
        // getc() would then need no "lexical" PE expansion logic and no
        // other method would need to handle termination of PEs.  (Parsing
        // of literals would still need to pop entities, but not parsing of
        // references in content.)
        boolean consumedAny = false;

        for (char ch = getc();
                ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
                ch = getc()) {
            consumedAny = true;

            // Stop gracefully at the end of a non-internal entity; the
            // caller should have a grammar rule allowing whitespace there.
            if (in.isEOF() && !in.isInternal()) {
                return true;
            }
        }

        // the character that ended the run is not whitespace: give it back
        ungetc();
        return consumedAny;
    }

    // Reads a Name if one starts at the current position; returns its text,
    // or null when no name is present.
    private String maybeGetName()
            throws IOException, SAXException {

        NameCacheEntry cached = maybeGetNameCacheEntry();
        if (cached != null) {
            return cached.name;
        }
        return null;
    }

    // Reads a Name if one starts at the current position and returns its
    // name-cache entry; otherwise pushes the character back and returns null.
    private NameCacheEntry maybeGetNameCacheEntry()
            throws IOException, SAXException {

        // [5] Name ::= (Letter|'_'|':') (Namechar)*
        char first = getc();

        if (XmlChars.isLetter(first) || first == ':' || first == '_') {
            return nameCharString(first);
        }
        ungetc();
        return null;
    }

    // Reads a mandatory Nmtoken (used when parsing enumerations). A first
    // character that is not a name character is reported as "P-006".
    private String getNmtoken()
            throws IOException, SAXException {

        // [7] Nmtoken ::= (Namechar)+
        char first = getc();
        boolean validStart = XmlChars.isNameChar(first);

        if (!validStart) {
            fatal("P-006", new Object[]{first});
        }
        NameCacheEntry token = nameCharString(first);
        return token.name;
    }

    // Collects the rest of a name whose first character, c, has already
    // been read, and returns the matching name-cache entry.
    //
    // n.b. this is also used while parsing attribute values (for internal
    // references), so it must not use strTmp. It is a CPU and memory hotspot
    // in the parser, hence the reusable nameTmp buffer, grown ten
    // characters at a time.
    private NameCacheEntry nameCharString(char c)
            throws IOException, SAXException {

        nameTmp[0] = c;
        int length = 1;
        char next;

        // getNameChar() yields 0 once the next character is not a name char
        while ((next = in.getNameChar()) != 0) {
            if (length >= nameTmp.length) {
                char grown[] = new char[nameTmp.length + 10];
                System.arraycopy(nameTmp, 0, grown, 0, nameTmp.length);
                nameTmp = grown;
            }
            nameTmp[length] = next;
            length++;
        }
        return nameCache.lookupEntry(nameTmp, length);
    }

    //
    // Parses a quoted literal: either an entity value in the DTD
    // (isEntityValue == true) or an attribute value (isEntityValue == false).
    // The two have much in common ... both follow literal parsing rules,
    // newline canonicalization, etc.
    //
    // The result is left in 'strTmp' ... either a "replacement text" (4.5),
    // or else a partially normalized attribute value (the first bit
    // of 3.3.3's spec, without the "if not CDATA" bits).
    //
    // Differences visible in the code below:
    //  - general entity references ("&name;") are copied through unexpanded
    //    in entity values, but expanded in attribute values
    //  - parameter entity references ("%name;") are expanded only in
    //    entity values
    //  - whitespace is normalized to a space, and '<' is rejected, only in
    //    attribute values
    //  - character references ("&#...;") are always expanded immediately
    //
    @SuppressWarnings("UnusedAssignment")
    private void parseLiteral(boolean isEntityValue)
            throws IOException, SAXException {

        // [9] EntityValue ::=
        //    '"' ([^"&%] | Reference | PEReference)* '"'
        //    |    "'" ([^'&%] | Reference | PEReference)* "'"
        // [10] AttValue ::=
        //    '"' ([^"&]  | Reference             )* '"'
        //    |    "'" ([^'&]  | Reference             )* "'"
        char quote = getc();
        char c;
        // the entity the opening quote came from; only a matching quote
        // read from this same entity may close the literal
        InputEntity source = in;

        if (quote != '\'' && quote != '"') {
            fatal("P-007");
        }

        // don't report entity expansions within attributes,
        // they're reported "fully expanded" via SAX
//    isInAttribute = !isEntityValue;

        // get value into strTmp
        strTmp = new StringBuffer();

        // scan, allowing entity push/pop wherever ...
        // expanded entities can't terminate the literal!
        for (;;) {
            if (in != source && in.isEOF()) {
                // an expanded entity ran out: drop back to the enclosing one.
                // we don't report end of parsed entities
                // within attributes (no SAX hooks)
                in = in.pop();
                continue;
            }
            if ((c = getc()) == quote && in == source) {
                break;
            }

            //
            // Basically the "reference in attribute value"
            // row of the chart in section 4.4 of the spec
            //
            if (c == '&') {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-020", entityName);

                    // 4.4 says:  bypass these here ... we'll catch
                    // forbidden refs to unparsed entities on use
                    if (isEntityValue) {
                        strTmp.append('&');
                        strTmp.append(entityName);
                        strTmp.append(';');
                        continue;
                    }
                    // only reached for attribute values
                    expandEntityInLiteral(entityName, entities, isEntityValue);

                    // character references are always included immediately
                } else if ((getc()) == '#') {
                    int tmp = parseCharNumber();

                    if (tmp > 0xffff) {
                        // too big for one char: surrogatesToCharTmp() fills
                        // charTmp and its result is reused as a count, with
                        // 2 meaning that charTmp[1] must be appended as well
                        tmp = surrogatesToCharTmp(tmp);
                        strTmp.append(charTmp[0]);
                        if (tmp == 2) {
                            strTmp.append(charTmp[1]);
                        }
                    } else {
                        strTmp.append((char) tmp);
                    }
                } else {
                    // '&' followed by neither a name nor '#'
                    fatal("P-009");
                }
                continue;

            }

            // expand parameter entities only within entity value literals
            if (c == '%' && isEntityValue) {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-021", entityName);
                    expandEntityInLiteral(entityName, params, isEntityValue);
                    continue;
                } else {
                    // '%' not followed by a name
                    fatal("P-011");
                }
            }

            // For attribute values ...
            if (!isEntityValue) {
                // 3.3.3 says whitespace normalizes to space...
                if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                    strTmp.append(' ');
                    continue;
                }

                // "<" not legal in parsed literals ...
                if (c == '<') {
                    fatal("P-012");
                }
            }

            // ordinary character: copy it through unchanged
            strTmp.append(c);
        }
//    isInAttribute = false;
    }

    // Does a SINGLE expansion of the named entity (often reparsed later) by
    // pushing its text onto the input stack.
    //
    //   name           entity name as written in the reference
    //   table          table to look the name up in (general or parameter
    //                  entities)
    //   isEntityValue  true while parsing an entity value; an external
    //                  entity referenced anywhere else is reported as "P-013"
    //
    // An undeclared name is reported as "V-022" when the lookup table is
    // the parameter-entity table, and as "P-014" otherwise.
    private void expandEntityInLiteral(String name, SimpleHashtable table,
            boolean isEntityValue)
            throws IOException, SAXException {

        // Held as Object so the lookup type-checks whether or not the table
        // is declared with type arguments; the instanceof tests below are
        // all the typing this method needs.
        Object entity = table.get(name);

        if (entity instanceof InternalEntity) {
            InternalEntity value = (InternalEntity) entity;
            pushReader(value.buf, name, !value.isPE);

        } else if (entity instanceof ExternalEntity) {
            if (!isEntityValue) // must be a PE ...
            {
                fatal("P-013", new Object[]{name});
            }
            // XXX if this returns false ...
            pushReader((ExternalEntity) entity);

        } else if (entity == null) {
            //
            // Note:  much confusion about whether spec requires such
            // errors to be fatal in many cases, but none about whether
            // it allows "normal" errors to be unrecoverable!
            //
            fatal((table == params) ? "V-022" : "P-014",
                    new Object[]{name});
        }
    }

    // [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
    // Reads a quoted string delimited by either quote character; used for
    // PUBLIC and SYSTEM literals.
    //
    // 'type' is a message catalog key and 'extra' its single argument; they
    // are used only to build the "P-015" diagnostic raised when no opening
    // quote is found.
    //
    // NOTE:  XML spec should explicitly say that PE ref syntax is
    // ignored in PIs, comments, SystemLiterals, and Pubid Literal
    // values ... can't process the XML spec's own DTD without doing
    // that for comments.
    private String getQuotedString(String type, String extra)
            throws IOException, SAXException {

        // read with in.getc() throughout, to bypass PE processing
        char quote = in.getc();

        boolean isQuote = (quote == '\'') || (quote == '"');
        if (!isQuote) {
            String what = messages.getMessage(locale, type, new Object[]{extra});
            fatal("P-015", new Object[]{what});
        }

        // accumulate everything up to the matching closing quote
        strTmp = new StringBuffer();
        for (char ch = in.getc(); ch != quote; ch = in.getc()) {
            strTmp.append(ch);
        }
        return strTmp.toString();
    }

    // Reads a quoted public identifier, reports any character outside the
    // PubidChar set as "P-016", and returns the result of normalize(false)
    // applied to the literal.
    private String parsePublicId() throws IOException, SAXException {

        // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
        // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
        String literal = getQuotedString("F-033", null);

        for (char ch : literal.toCharArray()) {
            boolean punctuationOrDigit =
                    " \r\n-'()+,./:=?;!*#@$_%0123456789".indexOf(ch) != -1;
            boolean upperCase = ch >= 'A' && ch <= 'Z';
            boolean lowerCase = ch >= 'a' && ch <= 'z';

            if (!(punctuationOrDigit || upperCase || lowerCase)) {
                fatal("P-016", new Object[]{ch});
            }
        }

        // normalize() works on strTmp, so load the literal into it first
        strTmp = new StringBuffer();
        strTmp.append(literal);
        return normalize(false);
    }

    // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
    // handled by:  InputEntity.parsedContent()
    private boolean maybeComment(boolean skipStart)
            throws IOException, SAXException {

        // [15] Comment ::= ''
        if (!in.peek(skipStart ? "!--" : "