// All downloads are free. Search and download functionality uses the official Maven repository.
//
// com.sun.msv.scanner.dtd.DTDParser Maven / Gradle / Ivy
//
// There is a newer version: 2.3.0
// Show newest version
/*
 * Copyright (c) 1998-2013 Oracle and/or its affiliates. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 *   - Neither the name of Oracle nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package com.sun.msv.scanner.dtd;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * This implements parsing of XML 1.0 DTDs.
 *
 * This conforms to the portion of the XML 1.0 specification related 
 * to the external DTD subset.
 *
 * For multi-language applications (such as web servers using XML 
 * processing to create dynamic content), a method supports choosing 
 * a locale for parser diagnostics which is both understood by the 
 * message recipient and supported by the parser.
 *
 * This parser produces a stream of parse events.  It supports some
 * features (exposing comments, CDATA sections, and entity references) 
 * which are not required to be reported by conformant XML processors.  
 *
 * @author David Brownell
 * @author Janet Koenig
 * @author Kohsuke KAWAGUCHI
 * @version $Id: DTDParser.java 1793 2013-02-18 12:52:53Z snajper $
 */
public class DTDParser {
    // Attribute type names, exposed as public string constants.  Each
    // constant's value is the text after the "TYPE_" prefix.
    // NOTE(review): presumably these are the type strings handed to the
    // DTDEventListener for attribute declarations -- confirm against the
    // attribute-declaration parsing code, which is outside this section.
    public final static String TYPE_CDATA = "CDATA";
    public final static String TYPE_ID = "ID";
    public final static String TYPE_IDREF = "IDREF";
    public final static String TYPE_IDREFS = "IDREFS";
    public final static String TYPE_ENTITY = "ENTITY";
    public final static String TYPE_ENTITIES = "ENTITIES";
    public final static String TYPE_NMTOKEN = "NMTOKEN";
    public final static String TYPE_NMTOKENS = "NMTOKENS";
    public final static String TYPE_NOTATION = "NOTATION";
    public final static String TYPE_ENUMERATION = "ENUMERATION";

    
    // stack of input entities being merged; reset to null by init() and
    // again once parseInternal() has closed it
    private InputEntity        in;

    // temporaries reused during parsing: allocated by init() and dropped
    // in the finally block of parseInternal()
    private StringBuffer    strTmp;
    private char        nameTmp [];
    private NameCache        nameCache;
    // two-char scratch read by parseLiteral() after surrogatesToCharTmp()
    // for character references above 0xffff
    private char        charTmp [] = new char [2];

    // temporary DTD parsing state: when true, maybeWhitespace() reads
    // through getc() (which handles parameter entities) instead of
    // delegating to the current input entity
    private boolean        doLexicalPE;

    // DTD state, used during parsing
//    private SimpleHashtable    elements = new SimpleHashtable (47);
    // NOTE(review): presumably the names of elements declared so far --
    // confirm against the element-declaration code
    protected final Set declaredElements = new java.util.HashSet();
    // table consulted by parseLiteral() for '%' (parameter entity) references
    private SimpleHashtable    params = new SimpleHashtable (7);

    // exposed to package-private subclass
    Hashtable            notations = new Hashtable (7);
    // table consulted for '&' references; builtin() stores the predefined
    // entities here, keyed by name
    SimpleHashtable        entities = new SimpleHashtable (17);

    // keyed by ID string with Boolean values; afterRoot() reports every
    // key whose value is FALSE
    private SimpleHashtable     ids = new SimpleHashtable ();

    // listeners for DTD parsing events; init() installs a DTDHandlerBase
    // when none has been set
    private DTDEventListener    dtdHandler;

    // init() substitutes a Resolver / Locale.getDefault() when these are null
    private EntityResolver      resolver;
    private Locale              locale;

    // string constants -- use these copies so "==" works
    // package private
    static final String        strANY = "ANY";
    static final String        strEMPTY = "EMPTY";
    
    /**
     * Selects the locale in which parser diagnostics are produced.
     *
     * @param l the locale to use; <code>null</code> requests the system
     *          defaults (which may include only message IDs)
     * @throws SAXException if <code>l</code> is non-null and the message
     *          catalog does not support it; the previous locale is kept
     */
    public void setLocale (Locale l) throws SAXException {
        boolean unsupported =
            (l != null) && !messages.isLocaleSupported (l.toString ());

        if (unsupported) {
            // the complaint itself is rendered in the locale still in effect
            Object [] args = { l };
            throw new SAXException (
                messages.getMessage (locale, "P-078", args));
        }
        locale = l;
    }

    /**
     * Reports the locale currently used for diagnostics.
     *
     * @return the diagnostic locale; may be <code>null</code> until a
     *         locale has been set or a parse has been started
     */
    public Locale getLocale () {
        return this.locale;
    }
    
    /**
     * Picks a diagnostics locale from a list of client preferences and
     * installs it with {@link #setLocale}.  The message catalog selects
     * the entry; such a list could come from a variety of user preference
     * mechanisms, including the HTTP Accept-Language header field.
     *
     * @see MessageCatalog
     *
     * @param languages Array of language specifiers, ordered with the most
     *    preferable one at the front.  For example, "en-ca" then "fr-ca",
     *    followed by "zh_CN".  Both RFC 1766 and Java styles are supported.
     * @return The chosen locale, or null (in which case the current
     *    locale is left untouched).
     */
    public Locale chooseLocale (String languages [])
                  throws SAXException {

        Locale chosen = messages.chooseLocale (languages);

        if (chosen == null) {
            return null;
        }
        setLocale (chosen);
        return chosen;
    }

    /**
     * Installs the resolver used to locate entities, giving applications
     * control over entity resolution.
     *
     * @param r the resolver to use; if <code>null</code>, a default
     *          <code>Resolver</code> is installed when a parse starts
     */
    public void setEntityResolver (EntityResolver r) {
        this.resolver = r;
    }
 
    /**
     * Reports the object currently used to resolve entities.
     *
     * @return the entity resolver, or <code>null</code> if none has been
     *         set and no parse has been started yet
     */
    public EntityResolver getEntityResolver () {
        return this.resolver;
    }

    /**
     * Registers the listener that receives DTD parsing events.  A non-null
     * listener is immediately handed a locator that reports this parser's
     * current public ID, system ID, line and column.
     *
     * @param handler the listener to notify; may be <code>null</code>
     */
    public void setDtdHandler (DTDEventListener handler) {
        dtdHandler = handler;
        if (handler == null) {
            return;
        }

        // the locator is a live view: every query is forwarded to the parser
        Locator parserLocator = new Locator () {
            public String getPublicId () {
                return DTDParser.this.getPublicId ();
            }
            public String getSystemId () {
                return DTDParser.this.getSystemId ();
            }
            public int getLineNumber () {
                return DTDParser.this.getLineNumber ();
            }
            public int getColumnNumber () {
                return DTDParser.this.getColumnNumber ();
            }
        };
        handler.setDocumentLocator (parserLocator);
    }

    /**
     * Returns the handler used for DTD parsing events.
     *
     * @return the registered listener; once a parse has been started this
     *         may be the default handler installed by the parser
     */
    public DTDEventListener getDtdHandler () {
        return this.dtdHandler;
    }

    /** 
     * Parse a DTD read from the given input source.  The parser state is
     * reset first, so the same parser can be used for several DTDs.
     *
     * @param in the source to read the DTD from (within this method the
     *           parameter hides the parser's field of the same name)
     * @throws IOException declared by the underlying parsing routine
     * @throws SAXException declared by the underlying parsing routine
     */
    public void parse (InputSource in) 
                throws IOException, SAXException {
        // reset to the "before a document" state, then do the real work
        init ();
        parseInternal (in);
    }

    /**
     * Parse the DTD identified by a URI.  The entity resolver gets the
     * first chance to open it; if the resolver returns nothing, the URI
     * is opened directly.
     *
     * @param uri the system identifier of the DTD
     * @throws IOException declared by the resolver and parsing routines
     * @throws SAXException declared by the resolver and parsing routines
     */
    public void parse (String uri)
        throws IOException, SAXException
    {
        init ();

        InputSource source = resolver.resolveEntity (null, uri);

        if (source == null) {
            // custom resolver punted resolution to the parser: handle it here
            source = Resolver.createInputSource (
                new java.net.URL (uri), false);
        } else if (source.getSystemId () == null) {
            // resolver built an incomplete input entity: patch it up enough
            // that relative URIs work, and warn to minimize later confusion
            warning ("P-065", null);
            source.setSystemId (uri);
        }

        parseInternal (source);
    }

    // Puts the parser back into its "before a document" state: fresh
    // scratch buffers, empty DTD tables holding only the predefined
    // entities, and defaults for any collaborator left unset.
    private void init ()
    {
        in = null;
        doLexicalPE = false;

        // scratch data used while parsing
        strTmp = new StringBuffer ();
        nameTmp = new char [20];
        nameCache = new NameCache ();

        // forget everything a previous parse may have left behind
        entities.clear ();
        notations.clear ();
        params.clear ();
        declaredElements.clear ();

        // predefined references ... re-interpreted later
        String [][] predefined = {
            { "amp", "&" },
            { "lt", "<" },
            { "gt", ">" },
            { "quot", "\"" },
            { "apos", "'" }
        };
        for (int i = 0; i < predefined.length; i++) {
            builtin (predefined [i][0], predefined [i][1]);
        }

        // defaults for whatever the application did not configure
        if (locale == null) {
            locale = Locale.getDefault ();
        }
        if (resolver == null) {
            resolver = new Resolver ();
        }
        if (dtdHandler == null) {
            dtdHandler = new DTDHandlerBase ();
        }
    }

    // Registers one predefined entity: an internal entity whose
    // replacement text is the given value, stored under its own name.
    private void builtin (String entityName, String entityValue) {
        char [] replacementText = entityValue.toCharArray ();
        entities.put (entityName,
            new InternalEntity (entityName, replacementText));
    }


    ////////////////////////////////////////////////////////////////
    //
    // parsing is by recursive descent, code roughly
    // following the BNF rules except tweaked for simple
    // lookahead.  rules are more or less in numeric order,
    // except where code sharing suggests other structures.
    //
    // a classic benefit of recursive descent parsers:  it's
    // relatively easy to get diagnostics that make sense.
    //
    ////////////////////////////////////////////////////////////////


    /**
     * Parses one external DTD subset from <code>input</code>, bracketing
     * the work with <code>startDTD</code> / <code>endDTD</code> calls on
     * the DTD handler.  Whatever the outcome, the scratch buffers, the
     * input entity and all collected DTD tables are released afterwards.
     *
     * @param input the source to read; <code>null</code> is reported as
     *              fatal error "P-000"
     * @throws IOException declared by the input and parsing routines
     * @throws SAXException for errors reported while parsing; an
     *         unexpected <code>RuntimeException</code> is rethrown as a
     *         <code>SAXParseException</code> that carries the current
     *         location and the original exception as its cause
     */
    private void parseInternal (InputSource input)
                 throws IOException, SAXException {

        if (input == null)
            fatal ("P-000");

        try {
            in = InputEntity.getInputEntity (dtdHandler, locale);
            in.init (input, null, null, false);

            dtdHandler.startDTD (in);

            // [30] extSubset ::= TextDecl? extSubsetDecl
            // [31] extSubsetDecl ::= ( markupdecl | conditionalSect
            //        | PEReference | S )*
            //    ... same as [79] extPE, which is where the code is

            ExternalEntity externalSubset = new ExternalEntity (in);
            externalParameterEntity (externalSubset);

            // anything left over after the subset is a syntax error
            if (!in.isEOF ()) {
                fatal ("P-001", new Object []
                    { Integer.toHexString (((int) getc ())) });
            }
            afterRoot ();
            dtdHandler.endDTD ();

        } catch (EndOfInputException e) {
            if (!in.isDocument ()) {
                String name = in.getName ();
                do {    // force a relevant URI and line number
                    in = in.pop ();
                } while (in.isInternal ());
                fatal ("P-002", new Object [] { name });
            } else {
                fatal ("P-003", null);
            }
        } catch (RuntimeException e) {
            // Don't discard the location that triggered the exception, and
            // chain the original exception so its type and stack trace reach
            // the caller instead of being dumped on System.err.
            String message = e.getMessage () != null
                ? e.getMessage ()
                : e.getClass ().getName ();
            throw new SAXParseException (message,
                getPublicId (), getSystemId (),
                getLineNumber (), getColumnNumber (),
                e);

        } finally {
            // recycle temporary data used during parsing
            strTmp = null;
            nameTmp = null;
            nameCache = null;

            // ditto input sources etc
            if (in != null) {
                in.close ();
                in = null;
            }

            // get rid of all DTD info ... some of it would be
            // useful for editors etc, investigate later.

            params.clear ();
            entities.clear ();
            notations.clear ();
            declaredElements.clear ();
            ids.clear ();
        }
    }

    /**
     * Reports error "V-024" for every entry of the ID table whose value
     * is a false <code>Boolean</code>.
     *
     * @throws SAXException declared by the error-reporting routine
     */
    void afterRoot () throws SAXException
    {
        // Make sure all IDREFs match declared ID attributes.  We scan
        // after the document element is parsed, since XML allows forward
        // references, and only now can we know if they're all resolved.

        for (Enumeration e = ids.keys (); e.hasMoreElements (); ) {
            String id = (String) e.nextElement ();
            Boolean value = (Boolean) ids.get (id);

            // compare by value: a false Boolean that is not the
            // Boolean.FALSE singleton must be reported as well
            if (Boolean.FALSE.equals (value))
                error ("V-024", new Object [] { id });
        }
    }


    // Requires whitespace at the current position; fatal error "P-004"
    // otherwise.  roleId is a message ID whose text (looked up in the
    // current locale) names what was being parsed, for the diagnostic.
    private void whitespace (String roleId)
                 throws IOException, SAXException {

        // [3] S ::= (#x20 | #x9 | #xd | #xa)+
        if (maybeWhitespace ()) {
            return;
        }

        String role = messages.getMessage (locale, roleId);
        fatal ("P-004", new Object [] { role });
    }

    // S? -- skips optional whitespace and reports whether any was seen.
    private boolean maybeWhitespace ()
                    throws IOException, SAXException {

        // without lexical PE handling the input entity can do it all
        if (!doLexicalPE) {
            return in.maybeWhitespace ();
        }

        // see getc() for the PE logic -- going through it lets us splice
        // expansions of PEs in "anywhere".  getc() has smarts, so for
        // external PEs we don't bypass it.

        // XXX PE handling could be marginally faster, and certainly
        // cleaner (hence potentially more correct), by using the
        // observation that expanded PEs only start and stop where
        // whitespace is allowed.  getc wouldn't need any "lexical" PE
        // expansion logic, and no other method would need to handle
        // termination of PEs.  (parsing of literals would still need to
        // pop entities, but not parsing of references in content.)

        boolean sawWhitespace = false;

        for (char ch = getc (); ; ch = getc ()) {
            if (ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r') {
                break;
            }
            sawWhitespace = true;

            // this gracefully ends things when we stop playing with
            // internal parameters.  caller should have a grammar rule
            // allowing whitespace at end of entity.
            if (in.isEOF () && !in.isInternal ()) {
                return true;
            }
        }

        // the character that ended the run is not ours to consume
        ungetc ();
        return sawWhitespace;
    }

    // Reads a Name if one starts at the current position; returns its
    // string form, or null when no name starts here.
    private String maybeGetName ()
                   throws IOException, SAXException {

        NameCacheEntry cached = maybeGetNameCacheEntry ();

        if (cached == null) {
            return null;
        }
        return cached.name;
    }

    // Reads a Name if one starts at the current position and returns its
    // name-cache entry; otherwise pushes the character back and returns null.
    private NameCacheEntry maybeGetNameCacheEntry ()
                           throws IOException, SAXException {

        // [5] Name ::= (Letter|'_'|':') (Namechar)*
        char first = getc ();
        boolean startsName =
            XmlChars.isLetter (first) || first == ':' || first == '_';

        if (startsName) {
            return nameCharString (first);
        }
        ungetc ();
        return null;
    }

    // Used when parsing enumerations: reads a name token, which must
    // start with a name character (fatal error "P-006" otherwise).
    private String getNmtoken ()
                   throws IOException, SAXException {

        // [7] Nmtoken ::= (Namechar)+
        char first = getc ();

        if (!XmlChars.isNameChar (first)) {
            Object [] args = { new Character (first) };
            fatal ("P-006", args);
        }

        NameCacheEntry token = nameCharString (first);
        return token.name;
    }

    // Collects the name characters that follow an already-read first
    // character 'c' and returns the name-cache entry for the result.
    //
    // n.b. this gets used when parsing attribute values (for internal
    // references) so it can't use strTmp; it's also a hotspot for CPU
    // and memory in the parser (called at least once for each element),
    // which is why it works on a reusable char array.

    private NameCacheEntry nameCharString (char c)
                           throws IOException, SAXException {

        nameTmp [0] = c;
        int length = 1;

        // a zero from getNameChar() ends the name
        char next;
        while ((next = in.getNameChar ()) != 0) {
            if (length >= nameTmp.length) {
                // buffer full: grow it by ten characters
                char grown [] = new char [nameTmp.length + 10];
                System.arraycopy (nameTmp, 0, grown, 0, nameTmp.length);
                nameTmp = grown;
            }
            nameTmp [length++] = next;
        }
        return nameCache.lookupEntry (nameTmp, length);
    }

    //
    // Parses one quoted literal starting at the opening quote.
    //
    // much similarity between parsing entity values in DTD
    // and attribute values (in DTD or content) ... both follow
    // literal parsing rules, newline canonicalization, etc
    //
    // leaves value in 'strTmp' ... either a "replacement text" (4.5),
    // or else partially normalized attribute value (the first bit
    // of 3.3.3's spec, without the "if not CDATA" bits).
    //
    // isEntityValue selects the rule set: true for [9] EntityValue
    // (general entity references are copied through, parameter entity
    // references are expanded), false for [10] AttValue (general entity
    // references are expanded, whitespace becomes a space, '<' is fatal).
    //
    private void parseLiteral (boolean isEntityValue)
                 throws IOException, SAXException {

    // [9] EntityValue ::=
    //    '"' ([^"&%] | Reference | PEReference)* '"'
    //    |    "'" ([^'&%] | Reference | PEReference)* "'"
    // [10] AttValue ::=
    //    '"' ([^"&]  | Reference             )* '"'
    //    |    "'" ([^'&]  | Reference             )* "'"
    char        quote = getc ();
    char        c;
    // the entity the literal started in; only a quote read while this
    // is the current entity can close the literal
    InputEntity    source = in;

    // the literal must open with a single or a double quote
    if (quote != '\'' && quote != '"') {
        fatal ("P-007");
        }

    // don't report entity expansions within attributes,
    // they're reported "fully expanded" via SAX
//    isInAttribute = !isEntityValue;

    // get value into strTmp
    strTmp = new StringBuffer ();

    // scan, allowing entity push/pop wherever ...
    // expanded entities can't terminate the literal!
    for (;;) {
        if (in != source && in.isEOF ()) {
        // we don't report end of parsed entities
        // within attributes (no SAX hooks)
        in = in.pop ();
        continue;
        }
        // the matching quote ends the literal only at the original
        // entity level; inside an expansion it is ordinary text
        if ((c = getc ()) == quote && in == source) {
        break;
            }

        //
        // Basically the "reference in attribute value"
        // row of the chart in section 4.4 of the spec
        //
        if (c == '&') {
        String    entityName = maybeGetName ();

        if (entityName != null) {
            // a named reference must be closed by ';'
            nextChar (';', "F-020", entityName);

            // 4.4 says:  bypass these here ... we'll catch
            // forbidden refs to unparsed entities on use
            if (isEntityValue) {
            strTmp.append ('&');
            strTmp.append (entityName);
            strTmp.append (';');
            continue;
            }
            expandEntityInLiteral (entityName, entities, isEntityValue);


        // character references are always included immediately
        } else if ((c = getc ()) == '#') {
            int tmp = parseCharNumber ();

            // values above 0xffff do not fit in one char: the chars to
            // append are taken from charTmp, and a result of 2 from
            // surrogatesToCharTmp() means the second one is needed too
            if (tmp > 0xffff) {
            tmp = surrogatesToCharTmp (tmp);
            strTmp.append (charTmp [0]);
            if (tmp == 2)
                strTmp.append (charTmp [1]);
            } else
            strTmp.append ((char) tmp);
        } else
            // '&' followed by neither a name nor '#'
            fatal ("P-009");
        continue;

        }

        // expand parameter entities only within entity value literals
        if (c == '%' && isEntityValue) {
        String    entityName = maybeGetName ();

        if (entityName != null) {
            nextChar (';', "F-021", entityName);
                    expandEntityInLiteral (entityName, params, isEntityValue);
            continue;
        } else
            // '%' not followed by a name
            fatal ("P-011");
        }

        // For attribute values ...
        if (!isEntityValue) {
        // 3.3.3 says whitespace normalizes to space...
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
            strTmp.append (' ');
            continue;
        }

        // "<" not legal in parsed literals ...
        if (c == '<')
            fatal ("P-012");
        }

        // ordinary character: becomes part of the value as-is
        strTmp.append (c);
    }
//    isInAttribute = false;
    }

    // does a SINGLE expansion of the entity (often reparsed later):
    // looks 'name' up in 'table' and pushes the entity's content as the
    // new current input.  An undeclared name is a fatal error.
    private void expandEntityInLiteral( String name, SimpleHashtable table,
                                    boolean isEntityValue)
                 throws IOException, SAXException {

        Object entity = table.get (name);

        if (entity == null) {
            //
            // Note:  much confusion about whether spec requires such
            // errors to be fatal in many cases, but none about whether
            // it allows "normal" errors to be unrecoverable!
            //
            String messageId = (table == params) ? "V-022" : "P-014";
            fatal (messageId, new Object [] { name });

        } else if (entity instanceof InternalEntity) {
            InternalEntity internal = (InternalEntity) entity;
            pushReader (internal.buf, name, !internal.isPE);

        } else if (entity instanceof ExternalEntity) {
            // outside entity values an external entity is not allowed
            // here ... must be a PE
            if (!isEntityValue) {
                fatal ("P-013", new Object [] { name });
            }
            // XXX if this returns false ...
            pushReader ((ExternalEntity) entity);
        }
    }

    // [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
    // Reads a quoted string (used for PUBLIC and SYSTEM literals) and
    // returns the text between the quotes; either quote character may
    // delimit it.  'type' is a message ID and 'extra' its argument,
    // used only to describe what was expected if no quote is found.

    // NOTE:  XML spec should explicitly say that PE ref syntax is
    // ignored in PIs, comments, SystemLiterals, and Pubid Literal
    // values ... can't process the XML spec's own DTD without doing
    // that for comments.

    private String getQuotedString (String type, String extra)
                   throws IOException, SAXException {

        // use in.getc to bypass PE processing
        char quote = in.getc ();

        if (quote != '\'' && quote != '"') {
            String expected =
                messages.getMessage (locale, type, new Object [] { extra });
            fatal ("P-015", new Object [] { expected });
        }

        // collect everything up to the matching quote
        strTmp = new StringBuffer ();
        for (char ch = in.getc (); ch != quote; ch = in.getc ()) {
            strTmp.append (ch);
        }
        return strTmp.toString ();
    }


    // Reads a quoted public identifier, rejects any character outside
    // the PubidChar set (fatal error "P-016"), and returns the result of
    // normalize(false) applied to the literal.
    private String parsePublicId () throws IOException, SAXException {

        // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'")
        // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%]
        String literal = getQuotedString ("F-033", null);
        String allowedNonLetters = " \r\n-'()+,./:=?;!*#@$_%0123456789";

        for (int pos = 0; pos < literal.length (); pos++) {
            char ch = literal.charAt (pos);
            boolean upperCase = ch >= 'A' && ch <= 'Z';
            boolean lowerCase = ch >= 'a' && ch <= 'z';

            if (!upperCase && !lowerCase
                    && allowedNonLetters.indexOf (ch) == -1) {
                fatal ("P-016", new Object [] { new Character (ch) });
            }
        }

        // normalize() works on strTmp, so load the literal into it
        strTmp = new StringBuffer (literal);
        return normalize (false);
    }

    // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
    // handled by:  InputEntity.parsedContent()

    private boolean maybeComment (boolean skipStart)
                    throws IOException, SAXException {

    // [15] Comment ::= ''
    if (!in.peek (skipStart ? "!--" : "