com.sun.xml.dtdparser.DTDParser Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of jaxb-osgi Show documentation
JAXB (JSR 222) reference implementation This module is exclusively meant for use in GlassFish V3 development. It is highly unlikely to work in any other environment.
There is a newer version: 4.0.5
Show newest version
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 1998-2012 Oracle and/or its affiliates. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License.  You can
 * obtain a copy of the License at
 * http://glassfish.java.net/public/CDDL+GPL_1_1.html
 * or packager/legal/LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at packager/legal/LICENSE.txt.
 *
 * GPL Classpath Exception:
 * Oracle designates this particular file as subject to the "Classpath"
 * exception as provided by Oracle in the GPL Version 2 section of the License
 * file that accompanied this code.
 *
 * Modifications:
 * If applicable, add the following below the License Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyright [year] [name of copyright owner]"
 *
 * Contributor(s):
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 */

package com.sun.xml.dtdparser;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This implements parsing of XML 1.0 DTDs.
 * 
 * This conforms to the portion of the XML 1.0 specification related to the
 * external DTD subset.
 * <p>
 * For multi-language applications (such as web servers using XML processing to
 * create dynamic content), a method supports choosing a locale for parser
 * diagnostics which is both understood by the message recipient and supported
 * by the parser.
 * <p>
 * This parser produces a stream of parse events. It supports some features
 * (exposing comments, CDATA sections, and entity references) which are not
 * required to be reported by conformant XML processors.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @author Kohsuke KAWAGUCHI
 * @version $Id: DTDParser.java,v 1.2 2009-04-16 15:25:49 snajper Exp $
 */
public class DTDParser {

    // String constants naming attribute types; the values mirror the XML 1.0
    // attribute type keywords, plus "ENUMERATION".
    // NOTE(review): presumably these are the type strings reported to the
    // DTDEventListener for attribute declarations -- confirm against callers.
    public final static String TYPE_CDATA = "CDATA";
    public final static String TYPE_ID = "ID";
    public final static String TYPE_IDREF = "IDREF";
    public final static String TYPE_IDREFS = "IDREFS";
    public final static String TYPE_ENTITY = "ENTITY";
    public final static String TYPE_ENTITIES = "ENTITIES";
    public final static String TYPE_NMTOKEN = "NMTOKEN";
    public final static String TYPE_NMTOKENS = "NMTOKENS";
    public final static String TYPE_NOTATION = "NOTATION";
    public final static String TYPE_ENUMERATION = "ENUMERATION";
    // stack of input entities being merged; this field is the current top,
    // finished entities are removed with in.pop()
    private InputEntity in;
    // temporaries reused during parsing
    // strTmp receives literal text (see parseLiteral, getQuotedString)
    private StringBuffer strTmp;
    // scratch buffer for names; allocated in init() and enlarged on demand
    // by nameCharString()
    private char nameTmp[];
    private NameCache nameCache;
    // two-char scratch used by parseLiteral for character references above
    // 0xffff (presumably filled by surrogatesToCharTmp -- confirm there)
    private char charTmp[] = new char[2];
    // temporary DTD parsing state
    // when false, maybeWhitespace() defers directly to in.maybeWhitespace()
    private boolean doLexicalPE;
    // DTD state, used during parsing
//    private SimpleHashtable    elements = new SimpleHashtable (47);
    protected final Set declaredElements = new java.util.HashSet();
    // parameter entities, looked up for '%' references in parseLiteral
    private SimpleHashtable params = new SimpleHashtable(7);
    // exposed to package-private subclass
    Hashtable notations = new Hashtable(7);
    // general entities, including the five builtins installed by init()
    SimpleHashtable entities = new SimpleHashtable(17);
    // afterRoot() reports V-024 for every key still mapped to Boolean.FALSE
    private SimpleHashtable ids = new SimpleHashtable();
    // listeners for DTD parsing events
    // (init() installs defaults for any of these that are still null)
    private DTDEventListener dtdHandler;
    private EntityResolver resolver;
    private Locale locale;
    // string constants -- use these copies so "==" works
    // package private
    static final String strANY = "ANY";
    static final String strEMPTY = "EMPTY";

    // used by parseInternal() to log unexpected runtime failures
    private static final Logger LOGGER = Logger.getLogger(DTDParser.class.getName());
    
    /**
     * Selects the locale in which parser diagnostics are produced.
     *
     * @param l The requested locale. A null value is stored without any
     * support check; a locale that is still null when parsing starts is
     * replaced by the platform default.
     * @throws SAXException if {@code l} is non-null and the message catalog
     * reports that it is not supported
     */
    public void setLocale(Locale l) throws SAXException {
        if (l != null) {
            boolean supported = messages.isLocaleSupported(l.toString());
            if (!supported) {
                // the complaint itself is rendered in the locale still in effect
                throw new SAXException(
                        messages.getMessage(locale, "P-078", new Object[]{l}));
            }
        }
        this.locale = l;
    }

    /**
     * Reports the locale currently configured for diagnostics.
     *
     * @return the diagnostic locale; may be {@code null} when none has been
     * chosen yet
     */
    public Locale getLocale() {
        return this.locale;
    }

    /**
     * Picks a diagnostic locale from a client-supplied preference list and,
     * when one is found, installs it through {@link #setLocale(Locale)}.
     * The selection itself is delegated to the message catalog. Preference
     * lists of this kind typically come from user preference mechanisms such
     * as the HTTP Accept-Language header field.
     *
     * @param languages Array of language specifiers, ordered with the most
     * preferable one at the front. For example, "en-ca" then "fr-ca", followed
     * by "zh_CN". Both RFC 1766 and Java styles are supported.
     * @return The chosen locale, or null if the catalog selected none (in
     * which case the current locale is left unchanged).
     * @throws SAXException propagated from {@link #setLocale(Locale)}
     * @see MessageCatalog
     */
    public Locale chooseLocale(String languages[])
            throws SAXException {

        final Locale chosen = messages.chooseLocale(languages);
        if (chosen == null) {
            return null;
        }
        setLocale(chosen);
        return chosen;
    }

    /**
     * Installs the resolver consulted when entities have to be located.
     *
     * @param r the resolver to use; if it is still null when a parse starts,
     * a default {@code Resolver} is created
     */
    public void setEntityResolver(EntityResolver r) {
        this.resolver = r;
    }

    /**
     * Returns the resolver currently used to locate entities.
     *
     * @return the configured resolver; may be {@code null} if none has been
     * set and no parse has been started
     */
    public EntityResolver getEntityResolver() {
        return this.resolver;
    }

    /**
     * Registers the listener that receives DTD parsing events.
     * A non-null listener is immediately handed a {@link Locator} whose
     * answers are forwarded to this parser's own position accessors.
     *
     * @param handler the listener to notify; null is stored as-is and no
     * locator is supplied
     */
    public void setDtdHandler(DTDEventListener handler) {
        this.dtdHandler = handler;
        if (handler == null) {
            return;
        }

        // live view onto the parser's position; every call delegates
        final Locator position = new Locator() {
            @Override
            public int getLineNumber() {
                return DTDParser.this.getLineNumber();
            }

            @Override
            public int getColumnNumber() {
                return DTDParser.this.getColumnNumber();
            }

            @Override
            public String getSystemId() {
                return DTDParser.this.getSystemId();
            }

            @Override
            public String getPublicId() {
                return DTDParser.this.getPublicId();
            }
        };
        handler.setDocumentLocator(position);
    }

    /**
     * Returns the listener that receives DTD parsing events.
     *
     * @return the registered listener; may be {@code null} if none has been
     * set and no parse has been started
     */
    public DTDEventListener getDtdHandler() {
        return this.dtdHandler;
    }

    /**
     * Parses the DTD supplied by the given input source.
     * Parser state is reset first, then the source is parsed.
     *
     * @param in the source to read the DTD from
     * @throws IOException on failures reading the input
     * @throws SAXException on parse errors
     */
    public void parse(InputSource in)
            throws IOException, SAXException {
        this.init();
        this.parseInternal(in);
    }

    /**
     * Parses the DTD identified by a URI.
     * The configured entity resolver gets the first chance to supply the
     * input; if it declines, the URI is opened directly.
     *
     * @param uri location of the DTD; also used as the system ID when the
     * resolver returns a source without one
     * @throws IOException on failures locating or reading the input
     * @throws SAXException on parse errors
     */
    public void parse(String uri)
            throws IOException, SAXException {

        init();

        InputSource resolved = resolver.resolveEntity(null, uri);
        if (resolved != null) {
            // A custom resolver that forgot the system ID would break
            // relative URIs: patch it up and warn to limit later confusion.
            if (resolved.getSystemId() == null) {
                warning("P-065", null);
                resolved.setSystemId(uri);
            }
        } else {
            // resolver punted resolution back to the parser
            resolved = Resolver.createInputSource(new java.net.URL(uri), false);
        }

        parseInternal(resolved);
    }

    // Puts the parser back into its "before a document" state: drops the
    // current input, allocates fresh scratch buffers, empties the DTD tables,
    // re-registers the predefined entities and fills in default
    // locale / resolver / handler where the application supplied none.
    private void init() {
        in = null;
        doLexicalPE = false;

        // scratch data used while parsing
        strTmp = new StringBuffer();
        nameTmp = new char[20];
        nameCache = new NameCache();

        // forget everything learned from a previous DTD
        entities.clear();
        notations.clear();
        params.clear();
        declaredElements.clear();

        // predefined references ... re-interpreted later
        final String[][] predefined = {
            {"amp", "&"},
            {"lt", "<"},
            {"gt", ">"},
            {"quot", "\""},
            {"apos", "'"}
        };
        for (int i = 0; i < predefined.length; i++) {
            builtin(predefined[i][0], predefined[i][1]);
        }

        // defaults for anything the application left unset
        locale = (locale != null) ? locale : Locale.getDefault();
        if (resolver == null) {
            resolver = new Resolver();
        }
        if (dtdHandler == null) {
            dtdHandler = new DTDHandlerBase();
        }
    }

    // Registers one predefined general entity (name -> replacement text)
    // in the 'entities' table.
    private void builtin(String entityName, String entityValue) {
        char[] replacement = entityValue.toCharArray();
        entities.put(entityName, new InternalEntity(entityName, replacement));
    }

    ////////////////////////////////////////////////////////////////
    //
    // parsing is by recursive descent, code roughly
    // following the BNF rules except tweaked for simple
    // lookahead.  rules are more or less in numeric order,
    // except where code sharing suggests other structures.
    //
    // a classic benefit of recursive descent parsers:  it's
    // relatively easy to get diagnostics that make sense.
    //
    ////////////////////////////////////////////////////////////////
    /**
     * Parses one DTD from {@code input} as an external subset and reports
     * the events to the configured DTD handler, bracketed by
     * {@code startDTD} / {@code endDTD}.
     * All per-parse state (scratch buffers, the input entity and the DTD
     * tables) is released in the {@code finally} block, whether or not
     * parsing succeeds.
     *
     * @param input the source to read; null is reported through
     * {@code fatal("P-000")}
     * @throws IOException on failures reading the input
     * @throws SAXException on parse errors. An unexpected
     * {@link RuntimeException} is logged and rethrown as a
     * {@link SAXParseException} that carries the current position and the
     * original exception as its cause.
     */
    @SuppressWarnings("CallToThreadDumpStack")
    private void parseInternal(InputSource input)
            throws IOException, SAXException {

        if (input == null) {
            fatal("P-000");
        }

        try {
            in = InputEntity.getInputEntity(dtdHandler, locale);
            in.init(input, null, null, false);

            dtdHandler.startDTD(in);

            // [30] extSubset ::= TextDecl? extSubsetDecl
            // [31] extSubsetDecl ::= ( markupdecl | conditionalSect
            //        | PEReference | S )*
            //    ... same as [79] extPE, which is where the code is

            ExternalEntity externalSubset = new ExternalEntity(in);
            externalParameterEntity(externalSubset);

            // anything left over after the subset is unparseable junk
            if (!in.isEOF()) {
                fatal("P-001", new Object[]{Integer.toHexString(((int) getc()))});
            }
            afterRoot();
            dtdHandler.endDTD();

        } catch (EndOfInputException e) {
            if (!in.isDocument()) {
                String name = in.getName();
                do {    // force a relevant URI and line number
                    in = in.pop();
                } while (in.isInternal());
                fatal("P-002", new Object[]{name});
            } else {
                fatal("P-003", null);
            }
        } catch (RuntimeException e) {
            LOGGER.log(Level.SEVERE, "Internal DTD parser error.", e);
            // Pass the original exception along as the embedded cause so
            // callers keep its type and stack trace, not just its message.
            throw new SAXParseException(e.getMessage() != null
                    ? e.getMessage() : e.getClass().getName(),
                    getPublicId(), getSystemId(),
                    getLineNumber(), getColumnNumber(), e);

        } finally {
            // recycle temporary data used during parsing
            strTmp = null;
            nameTmp = null;
            nameCache = null;

            // ditto input sources etc
            if (in != null) {
                in.close();
                in = null;
            }

            // get rid of all DTD info ... some of it would be
            // useful for editors etc, investigate later.

            params.clear();
            entities.clear();
            notations.clear();
            declaredElements.clear();
//        elements.clear();
            ids.clear();
        }
    }

    // Reports (error V-024) every entry of 'ids' whose recorded value is
    // Boolean.FALSE.
    void afterRoot() throws SAXException {
        // XML allows forward references, so IDREFs can only be checked
        // against declared ID attributes once everything has been parsed:
        // anything still marked FALSE at this point never got resolved.
        Enumeration keys = ids.keys();
        while (keys.hasMoreElements()) {
            String id = (String) keys.nextElement();
            Boolean resolved = (Boolean) ids.get(id);
            if (resolved != null && !resolved.booleanValue()) {
                error("V-024", new Object[]{id});
            }
        }
    }

    // Demands whitespace at the current position; roleId is a message id
    // that only serves to build the diagnostic when none is found.
    private void whitespace(String roleId)
            throws IOException, SAXException {

        // [3] S ::= (#x20 | #x9 | #xd | #xa)+
        if (maybeWhitespace()) {
            return;
        }
        Object[] args = new Object[]{messages.getMessage(locale, roleId)};
        fatal("P-004", args);
    }

    // S?  -- skips optional whitespace and reports whether any was seen
    private boolean maybeWhitespace()
            throws IOException, SAXException {

        // without lexical PE handling the input entity can do this itself
        if (!doLexicalPE) {
            return in.maybeWhitespace();
        }

        // Going through getc() (see there for the PE logic) lets PE
        // expansions be spliced in "anywhere"; getc() has smarts, so it is
        // not bypassed for external PEs.

        // XXX PE handling could be marginally faster, and certainly
        // cleaner (hence potentially more correct), by exploiting the
        // observation that expanded PEs only start and stop where
        // whitespace is allowed.  getc wouldn't need any "lexical" PE
        // expansion logic, and no other method would need to handle
        // termination of PEs.  (parsing of literals would still need to
        // pop entities, but not parsing of references in content.)

        boolean found = false;
        for (char ch = getc();
                ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
                ch = getc()) {
            found = true;

            // gracefully stop once we are done playing with internal
            // parameters; the caller should have a grammar rule allowing
            // whitespace at end of entity.
            if (in.isEOF() && !in.isInternal()) {
                return true;
            }
        }

        // the character that ended the run is not ours to consume
        ungetc();
        return found;
    }

    // Reads a Name if one starts at the current position; returns its text,
    // or null when no name is present.
    private String maybeGetName()
            throws IOException, SAXException {

        final NameCacheEntry cached = maybeGetNameCacheEntry();
        if (cached == null) {
            return null;
        }
        return cached.name;
    }

    // Reads a Name if one starts at the current position and returns its
    // cache entry; otherwise pushes the character back and returns null.
    private NameCacheEntry maybeGetNameCacheEntry()
            throws IOException, SAXException {

        // [5] Name ::= (Letter|'_'|':') (Namechar)*
        final char first = getc();

        if (XmlChars.isLetter(first) || first == ':' || first == '_') {
            return nameCharString(first);
        }
        ungetc();
        return null;
    }

    // Used when parsing enumerations: reads a mandatory Nmtoken and returns
    // its text, reporting P-006 if the first character is not a name char.
    private String getNmtoken()
            throws IOException, SAXException {

        // [7] Nmtoken ::= (Namechar)+
        final char first = getc();
        if (!XmlChars.isNameChar(first)) {
            fatal("P-006", new Object[]{Character.valueOf(first)});
        }
        NameCacheEntry token = nameCharString(first);
        return token.name;
    }

    // n.b. this gets used when parsing attribute values (for
    // internal references) so we can't use strTmp; it's also
    // a hotspot for CPU and memory in the parser (called at least
    // once for each element) so this has been optimized a bit.
    //
    // Collects the remaining name characters after the already-consumed
    // first character 'c' into the shared 'nameTmp' scratch buffer and
    // returns the matching entry from the name cache.  Reading stops when
    // in.getNameChar() returns 0.
    private NameCacheEntry nameCharString(char c)
            throws IOException, SAXException {

        int i = 1;

        nameTmp[0] = c;
        for (;;) {
            if ((c = in.getNameChar()) == 0) {
                break;
            }
            if (i >= nameTmp.length) {
                // Grow geometrically: a fixed +10 increment makes the total
                // copying quadratic in the name length, which a hostile DTD
                // with a very long name could exploit.
                char tmp[] = new char[nameTmp.length * 2];
                System.arraycopy(nameTmp, 0, tmp, 0, nameTmp.length);
                nameTmp = tmp;
            }
            nameTmp[i++] = c;
        }
        return nameCache.lookupEntry(nameTmp, i);
    }

    //
    // much similarity between parsing entity values in DTD
    // and attribute values (in DTD or content) ... both follow
    // literal parsing rules, newline canonicalization, etc
    //
    // leaves value in 'strTmp' ... either a "replacement text" (4.5),
    // or else partially normalized attribute value (the first bit
    // of 3.3.3's spec, without the "if not CDATA" bits).
    //
    // isEntityValue selects the grammar:
    //   true  - EntityValue [9]: general entity references are copied
    //           through as "&name;" without expansion, parameter entity
    //           references are expanded;
    //   false - AttValue [10]: general entity references are expanded,
    //           each whitespace character becomes a single space and '<'
    //           is reported as P-012.
    // Character references are resolved in both modes.
    //
    @SuppressWarnings("UnusedAssignment")
    private void parseLiteral(boolean isEntityValue)
            throws IOException, SAXException {

        // [9] EntityValue ::=
        //    '"' ([^"&%] | Reference | PEReference)* '"'
        //    |    "'" ([^'&%] | Reference | PEReference)* "'"
        // [10] AttValue ::=
        //    '"' ([^"&]  | Reference             )* '"'
        //    |    "'" ([^'&]  | Reference             )* "'"
        char quote = getc();
        char c;
        // remember the entity the literal started in: only a quote read
        // from this same entity may close the literal
        InputEntity source = in;

        if (quote != '\'' && quote != '"') {
            fatal("P-007");
        }

        // don't report entity expansions within attributes,
        // they're reported "fully expanded" via SAX
//    isInAttribute = !isEntityValue;

        // get value into strTmp
        strTmp = new StringBuffer();

        // scan, allowing entity push/pop wherever ...
        // expanded entities can't terminate the literal!
        for (;;) {
            if (in != source && in.isEOF()) {
                // we don't report end of parsed entities
                // within attributes (no SAX hooks)
                in = in.pop();
                continue;
            }
            // a matching quote inside an expanded entity is ordinary text
            if ((c = getc()) == quote && in == source) {
                break;
            }

            //
            // Basically the "reference in attribute value"
            // row of the chart in section 4.4 of the spec
            //
            if (c == '&') {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-020", entityName);

                    // 4.4 says:  bypass these here ... we'll catch
                    // forbidden refs to unparsed entities on use
                    if (isEntityValue) {
                        strTmp.append('&');
                        strTmp.append(entityName);
                        strTmp.append(';');
                        continue;
                    }
                    // attribute value: push the entity's text as input
                    expandEntityInLiteral(entityName, entities, isEntityValue);

                    // character references are always included immediately
                } else if ((getc()) == '#') {
                    int tmp = parseCharNumber();

                    if (tmp > 0xffff) {
                        // too big for one char: the helper fills charTmp and
                        // its result tells whether charTmp[1] is used too
                        // NOTE(review): presumably a UTF-16 surrogate pair --
                        // confirm in surrogatesToCharTmp
                        tmp = surrogatesToCharTmp(tmp);
                        strTmp.append(charTmp[0]);
                        if (tmp == 2) {
                            strTmp.append(charTmp[1]);
                        }
                    } else {
                        strTmp.append((char) tmp);
                    }
                } else {
                    // '&' followed by neither a name nor '#'
                    fatal("P-009");
                }
                continue;

            }

            // expand parameter entities only within entity value literals
            if (c == '%' && isEntityValue) {
                String entityName = maybeGetName();

                if (entityName != null) {
                    nextChar(';', "F-021", entityName);
                    expandEntityInLiteral(entityName, params, isEntityValue);
                    continue;
                } else {
                    // '%' not followed by a name
                    fatal("P-011");
                }
            }

            // For attribute values ...
            if (!isEntityValue) {
                // 3.3.3 says whitespace normalizes to space...
                if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                    strTmp.append(' ');
                    continue;
                }

                // "<" not legal in parsed literals ...
                if (c == '<') {
                    fatal("P-012");
                }
            }

            // anything else is literal text
            strTmp.append(c);
        }
//    isInAttribute = false;
    }

    // does a SINGLE expansion of the entity (often reparsed later):
    // looks 'name' up in 'table' and pushes the entity's content as new
    // input; an undeclared name is reported through fatal().
    private void expandEntityInLiteral(String name, SimpleHashtable table,
            boolean isEntityValue)
            throws IOException, SAXException {

        final Object found = table.get(name);

        if (found == null) {
            //
            // Note:  much confusion about whether spec requires such
            // errors to be fatal in many cases, but none about whether
            // it allows "normal" errors to be unrecoverable!
            //
            String code = (table == params) ? "V-022" : "P-014";
            fatal(code, new Object[]{name});
            return;
        }

        if (found instanceof InternalEntity) {
            InternalEntity internal = (InternalEntity) found;
            pushReader(internal.buf, name, !internal.isPE);
            return;
        }

        if (found instanceof ExternalEntity) {
            // outside an entity value this must be a PE ...
            if (!isEntityValue) {
                fatal("P-013", new Object[]{name});
            }
            // XXX if this returns false ...
            pushReader((ExternalEntity) found);
        }
    }

    // [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
    // for PUBLIC and SYSTEM literals, also "'
    // NOTE:  XML spec should explicitly say that PE ref syntax is
    // ignored in PIs, comments, SystemLiterals, and Pubid Literal
    // values ... can't process the XML spec's own DTD without doing
    // that for comments.
    //
    // Reads a quoted string verbatim and returns its content without the
    // quotes (the text is also left in 'strTmp').  'type' is a message id
    // and 'extra' its argument; both are used only for the P-015
    // diagnostic issued when no opening quote is found.
    private String getQuotedString(String type, String extra)
            throws IOException, SAXException {

        // read through in.getc so PE processing is bypassed
        final char delimiter = in.getc();

        boolean quoted = delimiter == '"' || delimiter == '\'';
        if (!quoted) {
            fatal("P-015", new Object[]{
                        messages.getMessage(locale, type, new Object[]{extra})
                    });
        }

        strTmp = new StringBuffer();
        for (char ch = in.getc(); ch != delimiter; ch = in.getc()) {
            strTmp.append(ch);
        }
        return strTmp.toString();
    }

    // Reads a quoted public identifier, reports P-016 for every character
    // outside the PubidChar set, and returns the result of normalize(false)
    // applied to the literal.
    private String parsePublicId() throws IOException, SAXException {

        // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
        // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
        final String literal = getQuotedString("F-033", null);
        final String nonLetters = " \r\n-'()+,./:=?;!*#@$_%0123456789";

        for (int pos = 0; pos < literal.length(); pos++) {
            char ch = literal.charAt(pos);
            boolean allowed = nonLetters.indexOf(ch) != -1
                    || (ch >= 'A' && ch <= 'Z')
                    || (ch >= 'a' && ch <= 'z');
            if (!allowed) {
                fatal("P-016", new Object[]{Character.valueOf(ch)});
            }
        }

        // normalize() works on strTmp, so hand it the literal that way
        strTmp = new StringBuffer().append(literal);
        return normalize(false);
    }

    // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
    // handled by:  InputEntity.parsedContent()
    private boolean maybeComment(boolean skipStart)
            throws IOException, SAXException {

        // [15] Comment ::= ''
        if (!in.peek(skipStart ? "!--" : "