// Source listing of com.sun.msv.scanner.dtd.Resolver, taken from the
// jaxb-extra-osgi artifact ("JAXB extra OSGI bundle").  The hosting page
// noted that a newer version of the artifact (2.3.0) is available.
/*
 * Copyright (c) 1998-2013 Oracle and/or its affiliates. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 *   - Neither the name of Oracle nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package com.sun.msv.scanner.dtd;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.util.Locale;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;

/**
 * This entity resolver class provides a number of utilities which can help
 * management of external parsed entities in XML.  These are commonly used
 * to hold markup declarations that are to be used as part of a Document
 * Type Declaration (DTD), or to hold text marked up with XML.
 *
 * <p>Features include:</p>
 *
 * <ul>
 * <li>Static factory methods are provided for constructing SAX InputSource
 * objects from Files, URLs, or MIME objects.  This eliminates a class of
 * error-prone coding in applications.</li>
 *
 * <li>Character encodings for XML documents are correctly supported:
 *
 *   <ul>
 *   <li>The encodings defined in the RFCs for MIME content types
 *   (2046 for general MIME, and 2376 for XML in particular), are
 *   supported, handling <em>charset=...</em> attributes and accepting
 *   content types which are known to be safe for use with XML;</li>
 *
 *   <li>The character encoding autodetection algorithm identified
 *   in the XML specification is used, and leverages all of
 *   the JDK 1.1 (and later) character encoding support.</li>
 *
 *   <li>The use of MIME typing may optionally be disabled, forcing the
 *   use of autodetection, to support web servers which don't correctly
 *   report MIME types for XML.  For example, they may report text that
 *   is encoded in EUC-JP as being US-ASCII text, leading to fatal
 *   errors during parsing.</li>
 *
 *   <li>The InputSource objects returned by this class always
 *   have a <code>java.io.Reader</code> available as the "character
 *   stream" property.</li>
 *   </ul>
 * </li>
 *
 * <li>Catalog entries can map public identifiers to Java resources or
 * to local URLs.  These are used to reduce network dependencies and loads,
 * and will often be used for external DTD components.  For example, packages
 * shipping DTD files as resources in JAR files can eliminate network traffic
 * when accessing them, and sites may provide local caches of common DTDs.
 * Note that no particular catalog syntax is supported by this class, only
 * the notion of a set of entries.</li>
 * </ul>
 *
 * <p>Subclasses can perform tasks such as supporting new URI schemes for
 * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
 * MIME entities which are part of a <em>multipart/related</em> group
 * (see RFC 2387).  They may also be used to support particular catalog
 * syntaxes, such as the SGML/Open Catalog (SOCAT) which supports the
 * SGML notion of "Formal Public Identifiers" (FPIs).</p>
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
public class Resolver implements EntityResolver
{
    private boolean        ignoringMIME;

    // table mapping public IDs to (local) URIs
    private Hashtable        id2uri;

    // tables mapping public IDs to resources and classloaders
    private Hashtable        id2resource;
    private Hashtable        id2loader;

    //
    // table of MIME content types (less attributes!) known
    // to be mostly "OK" to use with XML MIME entities.  the
    // idea is to rule out obvious braindamage ("image/jpg")
    // not the subtle stuff ("text/html") that might actually
    // be (or become) safe.
    //
    private static final String types [] = {
    "application/xml",
    "text/xml",
    "text/plain",
    "text/html",            // commonly mis-inferred
    "application/x-netcdf",        // this is often illegal XML
    "content/unknown"
    };

    /**
     * Constructs a resolver.  No catalog tables are allocated and MIME
     * types are not ignored until configured otherwise.
     */
    public Resolver() {
    }
    
    /**
     * Returns an input source, using the MIME type information and URL
     * scheme to statically determine the correct character encoding if
     * possible and otherwise autodetecting it.  MIME carefully specifies
     * the character encoding defaults, and how attributes of the content
     * type can change it.  XML further specifies two mandatory encodings
     * (UTF-8 and UTF-16), and includes an XML declaration which can be
     * used to internally label most documents encoded using US-ASCII
     * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
     * more).
     *
     * <p>This method can be used to access XML documents which do not
     * have URIs (such as servlet input streams, or most JavaMail message
     * entities) and to support access methods such as HTTP POST or PUT.
     * (URLs normally return content using the GET method.)
     *
     * <p>The caller should set the system ID in order for relative URIs
     * found in this document to be interpreted correctly. In some cases,
     * a custom resolver will need to be used; for example, documents
     * may be grouped in a single MIME "multipart/related" bundle, and
     * relative URLs would refer to other documents in that bundle.
     *
     * <p>The content type is lower-cased independently of the default
     * locale, and surrounding whitespace around the type name is ignored.
     * A <code>charset</code> attribute with an empty value is treated as
     * if it were absent.  The stream is owned by the caller and is never
     * closed by this method.
     *
     * @param contentType The MIME content type for the source for which
     *    an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
     *    May be null, in which case no type check is made and no
     *    encoding is selected here.
     * @param stream The input byte stream for the input source.
     * @param checkType If true, this verifies that the content type is known
     *    to support XML documents, such as <em>application/xml</em>.
     * @param scheme Unless this is "file", unspecified MIME types
     *    default to US-ASCII.  Files are always autodetected since most
     *    file systems discard character encoding information.
     * @return an input source whose character stream wraps
     *    <code>stream</code>, with its byte stream and encoding
     *    properties also set.
     * @throws IOException if <code>checkType</code> is true and the content
     *    type is not one of the accepted types, or if a reader cannot be
     *    created for the stream.
     */
    public static InputSource createInputSource(
            String contentType,
            InputStream stream,
            boolean checkType,
            String scheme) throws IOException {
        String charset = null;

        if (contentType != null) {
            // MIME type and attribute names are ASCII and case-insensitive;
            // fold case with a fixed locale so that e.g. a Turkish default
            // locale cannot turn "TEXT/XML" into an unrecognized string.
            contentType = contentType.toLowerCase(Locale.ENGLISH);

            int semicolon = contentType.indexOf(';');
            if (semicolon != -1) {
                // use "charset=..." if it's available
                charset = extractCharsetParameter(
                        contentType.substring(semicolon + 1));
                contentType = contentType.substring(0, semicolon);
            }
            // Trim before comparing, so "text/xml ; charset=..." is accepted.
            contentType = contentType.trim();

            //
            // Check MIME type.
            //
            if (checkType) {
                boolean isOK = false;
                for (int i = 0; i < types.length; i++) {
                    if (types[i].equals(contentType)) {
                        isOK = true;
                        break;
                    }
                }
                if (!isOK) {
                    throw new IOException("Not XML: " + contentType);
                }
            }

            //
            // "text/*" MIME types have hard-wired character set
            // defaults, as specified in the RFCs.  For XML, we
            // ignore the system "file.encoding" property since
            // autodetection is more correct.  "application/*" has
            // no default.
            //
            if (charset == null
                    && contentType.startsWith("text/")
                    && !"file".equalsIgnoreCase(scheme)) {
                charset = "US-ASCII";
            }
        }

        InputSource retval =
                new InputSource(XmlReader.createReader(stream, charset));
        retval.setByteStream(stream);
        retval.setEncoding(charset);
        return retval;
    }

    /**
     * Extracts the value of the <code>charset</code> attribute from the
     * attribute portion of an already lower-cased MIME content type.
     * Escaped characters such as "\;" and "\)" are not handled.
     *
     * @param attributes the text following the first ';' of the content type
     * @return the trimmed charset value, or null if there is no
     *    <code>charset=</code> attribute or its value is empty
     */
    private static String extractCharsetParameter(String attributes) {
        int index = attributes.indexOf("charset");
        if (index == -1) {
            return null;
        }

        String value = attributes.substring(index + 7);
        // strip out subsequent attributes
        if ((index = value.indexOf(';')) != -1) {
            value = value.substring(0, index);
        }
        // find start of value
        if ((index = value.indexOf('=')) == -1) {
            return null;
        }
        value = value.substring(index + 1);
        // strip out rfc822 comments
        if ((index = value.indexOf('(')) != -1) {
            value = value.substring(0, index);
        }
        // double quotes are optional
        if ((index = value.indexOf('"')) != -1) {
            value = value.substring(index + 1);
            // A missing closing quote is tolerated: keep the remainder
            // instead of failing with a StringIndexOutOfBoundsException.
            if ((index = value.indexOf('"')) != -1) {
                value = value.substring(0, index);
            }
        }
        value = value.trim();
        return value.length() == 0 ? null : value;
    }


    /**
     * Creates an input source from a given URI by opening a connection
     * to it.  The system ID of the result is the URL reported by the
     * connection after its input stream has been obtained.
     *
     * <p>If creating the input source fails after the connection's input
     * stream has been opened, that stream is closed before the failure
     * is propagated.
     *
     * @param uri the URI (system ID) for the entity
     * @param checkType if true, the MIME content type reported by the
     *    connection is used to determine the character set encoding;
     *    if false, the content type is not consulted at all.
     *    NOTE(review): the content type is not validated against the list
     *    of XML-safe types here (validation is requested with
     *    <code>false</code>) -- confirm whether that is intended.
     * @return an input source for the entity, with its system ID set
     * @throws IOException if the connection cannot be opened or read, or
     *    if a reader cannot be created for the content
     */
    public static InputSource createInputSource(URL uri, boolean checkType)
            throws IOException {
        URLConnection conn = uri.openConnection();

        // Query the content type before the stream, as callers may rely
        // on the connection being driven in this order.
        String contentType = null;
        if (checkType) {
            contentType = conn.getContentType();
        }
        InputStream in = conn.getInputStream();

        boolean succeeded = false;
        try {
            InputSource retval;
            if (checkType) {
                retval = createInputSource(contentType, in,
                        false, uri.getProtocol());
            } else {
                retval = new InputSource(XmlReader.createReader(in));
            }
            retval.setSystemId(conn.getURL().toString());
            succeeded = true;
            return retval;
        } finally {
            if (!succeeded) {
                // Nobody else holds the stream yet; release it so a failed
                // reader setup does not leak the connection.
                try {
                    in.close();
                } catch (IOException ignored) {
                    // keep the original failure
                }
            }
        }
    }


    /**
     * Creates an input source from a given file, autodetecting
     * the character encoding.  The system ID of the result is a
     * "file:" URI built from the file's absolute path, using '/' as
     * the separator.
     *
     * <p>If the reader cannot be created, the file stream opened by this
     * method is closed before the failure is propagated.
     *
     * @param file the file to read the entity from
     * @return an input source for the file, with its system ID set
     * @throws IOException if the file cannot be opened, or if a reader
     *    cannot be created for its content
     */
    public static InputSource createInputSource(File file)
            throws IOException {
        InputStream in = new FileInputStream(file);

        boolean succeeded = false;
        try {
            InputSource retval = new InputSource(XmlReader.createReader(in));

            // NOTE(review): the path is not percent-escaped, so characters
            // such as spaces appear literally in the system ID.
            String path = file.getAbsolutePath();
            if (File.separatorChar != '/') {
                path = path.replace(File.separatorChar, '/');
            }
            if (!path.startsWith("/")) {
                path = "/" + path;
            }
            if (!path.endsWith("/") && file.isDirectory()) {
                path = path + "/";
            }

            retval.setSystemId("file:" + path);
            succeeded = true;
            return retval;
        } finally {
            if (!succeeded) {
                // The stream was opened here and has not been handed to
                // the caller, so it must not be left open on failure.
                try {
                    in.close();
                } catch (IOException ignored) {
                    // keep the original failure
                }
            }
        }
    }


    /**
     * <b>SAX:</b>
     * Resolve the given entity into an input source.  If the name can't
     * be mapped to a preferred form of the entity, the URI is used.  To
     * resolve the entity, first a local catalog mapping names to URIs is
     * consulted.  If no mapping is found there, a catalog mapping names
     * to java resources is consulted.  Finally, if neither mapping found
     * a copy of the entity, the specified URI is used.
     *
     * <p>When a URI is used, <code>createInputSource</code> is used to
     * deduce the character encoding from the MIME content type, unless
     * this resolver is ignoring MIME types.  No MIME type checking is
     * done.
     *
     * <p>If creating the input source fails after a resource or
     * connection stream has been opened, that stream is closed before
     * the failure is propagated.
     *
     * @param name Used to find alternate copies of the entity, when
     *    this value is non-null; this is the XML "public ID".
     * @param uri Used when no alternate copy of the entity is found;
     *    this is the XML "system ID", normally a URI.
     * @return an input source with its system and public IDs set, or null
     *    if no catalog entry matches and <code>uri</code> is null
     * @throws IOException if the URI is malformed, the entity cannot be
     *    opened, or a reader cannot be created for its content
     */
    public InputSource resolveEntity(String name, String uri)
            throws IOException {
        String mappedURI = name2uri(name);

        // prefer explicit URI mappings, then bundled resources...
        InputStream stream = null;
        if (mappedURI == null) {
            stream = mapResource(name);
        }

        boolean useContentType = false;
        String contentType = null;
        String protocol = null;

        if (stream != null) {
            // mapResource only succeeds when the resource table exists
            uri = "java:resource:" + (String) id2resource.get(name);
        } else {
            // ...and treat all URIs the same (as URLs for now).
            if (mappedURI != null) {
                uri = mappedURI;
            } else if (uri == null) {
                return null;
            }

            URL url = new URL(uri);
            URLConnection conn = url.openConnection();
            // The system ID is read from the connection before any
            // content is requested from it.
            uri = conn.getURL().toString();
            if (!ignoringMIME) {
                useContentType = true;
                contentType = conn.getContentType();
                protocol = url.getProtocol();
            }
            stream = conn.getInputStream();
        }

        boolean succeeded = false;
        try {
            InputSource retval;
            if (useContentType) {
                retval = createInputSource(contentType, stream,
                        false, protocol);
            } else {
                retval = new InputSource(XmlReader.createReader(stream));
            }
            retval.setSystemId(uri);
            retval.setPublicId(name);
            succeeded = true;
            return retval;
        } finally {
            if (!succeeded) {
                // The stream was opened by this method and has not reached
                // the caller; close it rather than leak it.
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // keep the original failure
                }
            }
        }
    }


    /**
     * Reports whether this resolver ignores the MIME types of the
     * documents it returns, which works around servers that report
     * documents' MIME types incorrectly.
     *
     * @return true if MIME types are being ignored
     */
    public boolean isIgnoringMIME() {
        return ignoringMIME;
    }

    /**
     * Tells the resolver whether to ignore MIME types in the documents it
     * retrieves.  Many web servers incorrectly assign text documents a
     * default character encoding, even when that is incorrect.  For example,
     * all HTTP text documents default to use ISO-8859-1 (used for Western
     * European languages), and other MIME sources default text documents
     * to use US-ASCII (a seven bit encoding).  For XML documents which
     * include text encoding declarations (as most should do), these server
     * bugs can be worked around by ignoring the MIME type entirely.
     *
     * @param value true to ignore MIME types, false to use them
     */
    public void setIgnoringMIME(boolean value) {
        this.ignoringMIME = value;
    }


    // Looks up the alternate URI registered for a public ID.  Yields null
    // when the ID is null, when no URI catalog exists yet, or when the ID
    // has no entry.
    private String name2uri(String publicId) {
        if (publicId != null && id2uri != null) {
            return (String) id2uri.get(publicId);
        }
        return null;
    }


    /**
     * Registers the given public ID as corresponding to a particular
     * URI, typically a local copy.  This URI will be used in preference
     * to ones provided as system IDs in XML entity declarations.  This
     * mechanism would most typically be used for Document Type Definitions
     * (DTDs), where the public IDs are formally managed and versioned.
     *
     * <p>The URI catalog table is allocated on the first registration.
     *
     * @param publicId The managed public ID being mapped
     * @param uri The URI of the preferred copy of that entity
     */
    public void registerCatalogEntry(String publicId, String uri) {
        Hashtable catalog = id2uri;
        if (catalog == null) {
            catalog = new Hashtable(17);
            id2uri = catalog;
        }
        catalog.put(publicId, uri);
    }


    // Opens the Java resource registered for a public ID as a stream.
    // Yields null when the ID is null, nothing is registered for it, or
    // the resource lookup itself finds nothing.  The class loader that
    // was registered with the entry is used when there is one; otherwise
    // the resource is looked up as a system resource.
    private InputStream mapResource(String publicId) {
        if (publicId == null || id2resource == null) {
            return null;
        }

        String resourceName = (String) id2resource.get(publicId);
        if (resourceName == null) {
            return null;
        }

        ClassLoader loader = (id2loader == null)
                ? null
                : (ClassLoader) id2loader.get(publicId);

        return (loader != null)
                ? loader.getResourceAsStream(resourceName)
                : ClassLoader.getSystemResourceAsStream(resourceName);
    }

    /**
     * Registers a given public ID as corresponding to a particular Java
     * resource in a given class loader, typically distributed with a
     * software package.  This resource will be preferred over system IDs
     * included in XML documents.  This mechanism should most typically be
     * used for Document Type Definitions (DTDs), where the public IDs are
     * formally managed and versioned.
     *
     * <p>If a mapping to a URI has been provided, that mapping takes
     * precedence over this one.
     *
     * <p>Registering the same public ID again replaces the earlier entry,
     * including its class loader: passing a null loader discards any
     * loader recorded by a previous registration.
     *
     * @param publicId The managed public ID being mapped
     * @param resourceName The name of the Java resource
     * @param loader The class loader holding the resource, or null if
     *    it is a system resource.
     */
    public void registerCatalogEntry(
            String publicId,
            String resourceName,
            ClassLoader loader) {
        if (id2resource == null) {
            id2resource = new Hashtable(17);
        }
        id2resource.put(publicId, resourceName);

        if (loader != null) {
            if (id2loader == null) {
                id2loader = new Hashtable(17);
            }
            id2loader.put(publicId, loader);
        } else if (id2loader != null) {
            // Without this, a loader left over from an earlier registration
            // of the same public ID would still be used for the lookup
            // instead of the system resource the caller just asked for.
            id2loader.remove(publicId);
        }
    }
}