// com.sun.xml.dtdparser.Resolver (Maven / Gradle / Ivy)
/*
* Copyright (c) 1998, 2023 Oracle and/or its affiliates. All rights reserved.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0, which is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package com.sun.xml.dtdparser;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
* This entity resolver class provides a number of utilities which can help
* management of external parsed entities in XML. These are commonly used
* to hold markup declarations that are to be used as part of a Document
* Type Declaration (DTD), or to hold text marked up with XML.
*
* Features include:
*
* - Static factory methods are provided for constructing SAX InputSource
* objects from Files, URLs, or MIME objects. This eliminates a class of
* error-prone coding in applications.
*
* - Character encodings for XML documents are correctly supported:
*
* - The encodings defined in the RFCs for MIME content types
* (2046 for general MIME, and 2376 for XML in particular), are
* supported, handling charset=... attributes and accepting
* content types which are known to be safe for use with XML;
*
* - The character encoding auto-detection algorithm identified
* in the XML specification is used, and leverages all of
* the JDK 1.1 (and later) character encoding support.
*
* - The use of MIME typing may optionally be disabled, forcing the
* use of autodetection, to support web servers which don't correctly
* report MIME types for XML. For example, they may report text that
* is encoded in EUC-JP as being US-ASCII text, leading to fatal
* errors during parsing.
*
* - The InputSource objects returned by this class always
* have a {@code java.io.Reader} available as the "character
* stream" property.
*
*
*
* - Catalog entries can map public identifiers to Java resources or
* to local URLs. These are used to reduce network dependencies and loads,
* and will often be used for external DTD components. For example, packages
* shipping DTD files as resources in JAR files can eliminate network traffic
* when accessing them, and sites may provide local caches of common DTDs.
* Note that no particular catalog syntax is supported by this class, only
* the notion of a set of entries.
*
*
*
* Subclasses can perform tasks such as supporting new URI schemes for
* URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
* MIME entities which are part of a multipart/related group
* (see RFC 2387). They may also be used to support particular catalog
* syntaxes, such as the
* SGML/Open Catalog (SOCAT) which supports the SGML notion of "Formal
* Public Identifiers (FPIs)".
*
* @author David Brownell
* @author Janet Koenig
*/
public class Resolver implements EntityResolver {

    /** When true, {@link #resolveEntity} skips the reported MIME type and always auto-detects. */
    private boolean ignoringMIME;

    // table mapping public IDs to (local) URIs; created lazily on first registration
    private Map<String, String> id2uri;

    // tables mapping public IDs to resource names and to the class loaders
    // holding them; both are created lazily on first registration
    private Map<String, String> id2resource;
    private Map<String, ClassLoader> id2loader;

    //
    // table of MIME content types (fewer attributes!) known
    // to be mostly "OK" to use with XML MIME entities.  the
    // idea is to rule out obvious brain-damage ("image/jpg")
    // not the subtle stuff ("text/html") that might actually
    // be (or become) safe.
    //
    private static final String[] types = {
        "application/xml",
        "text/xml",
        "text/plain",
        "text/html",                // commonly mis-inferred
        "application/x-netcdf",     // this is often illegal XML
        "content/unknown"
    };

    /**
     * Constructs a resolver.
     */
    public Resolver() {
    }

    /**
     * Returns an input source, using the MIME type information and URL
     * scheme to statically determine the correct character encoding if
     * possible and otherwise auto-detecting it.
     *
     * <p>The content type is lower-cased, split at the first {@code ';'}
     * and trimmed. A {@code charset=...} attribute, when present and
     * non-empty, selects the encoding. Otherwise {@code text/*} types
     * default to US-ASCII unless the scheme is "file". In every other case
     * the encoding is left {@code null} and handed to
     * {@code XmlReader.createReader} that way.
     *
     * <p>This method can be used to access XML documents which do not
     * have URIs (such as servlet input streams) and to support access
     * methods such as HTTP POST or PUT. The caller should set the system
     * ID on the result so relative URIs in the document can be resolved.
     *
     * @param contentType The MIME content type for the source for which
     *                    an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
     *                    May be null, in which case no type check is
     *                    performed and no charset is assumed.
     * @param stream      The input byte stream for the input source.
     * @param checkType   If true, this verifies that the content type is known
     *                    to support XML documents, such as <em>application/xml</em>.
     * @param scheme      Unless this is "file", <em>text/*</em> MIME types without
     *                    a charset attribute default to US-ASCII.
     * @return an input source with both character stream and byte stream set
     * @throws IOException if {@code checkType} is true and the content type
     *                     is not in the list of accepted types, or if the reader cannot be
     *                     created
     */
    public static InputSource createInputSource(String contentType,
                                                InputStream stream,
                                                boolean checkType,
                                                String scheme) throws IOException {
        String charset = null;

        if (contentType != null) {
            String mimeType = contentType.toLowerCase(Locale.ENGLISH);
            int semicolon = mimeType.indexOf(';');

            if (semicolon != -1) {
                // use "charset=..." if it's available
                charset = parseCharset(mimeType.substring(semicolon + 1));
                mimeType = mimeType.substring(0, semicolon);
            }

            // Trim before any comparison so that "text/xml ; charset=..."
            // is recognized by the type check below.
            mimeType = mimeType.trim();

            if (checkType && !isKnownType(mimeType)) {
                throw new IOException("Not XML: " + mimeType);
            }

            //
            // "text/*" MIME types have hard-wired character set
            // defaults, as specified in the RFCs.  For XML, we
            // ignore the system "file.encoding" property since
            // auto-detection is more correct.
            // "application/*" has no default.
            //
            if (charset == null
                    && mimeType.startsWith("text/")
                    && !"file".equalsIgnoreCase(scheme)) {
                charset = "US-ASCII";
            }
        }

        InputSource retval = new InputSource(XmlReader.createReader(stream, charset));
        retval.setByteStream(stream);
        retval.setEncoding(charset);
        return retval;
    }

    // Extracts the value of the "charset" attribute from the (lower-cased)
    // attribute portion of a content type; null if absent or empty.
    private static String parseCharset(String attributes) {
        int index = attributes.indexOf("charset");
        if (index == -1) {
            return null;
        }
        String value = attributes.substring(index + "charset".length());

        // strip out subsequent attributes
        index = value.indexOf(';');
        if (index != -1) {
            value = value.substring(0, index);
        }

        // find start of value
        index = value.indexOf('=');
        if (index == -1) {
            return null;
        }
        value = value.substring(index + 1);

        // strip out rfc822 comments
        index = value.indexOf('(');
        if (index != -1) {
            value = value.substring(0, index);
        }

        // double quotes are optional; tolerate a missing closing quote
        // instead of failing with StringIndexOutOfBoundsException
        index = value.indexOf('"');
        if (index != -1) {
            value = value.substring(index + 1);
            int closing = value.indexOf('"');
            if (closing != -1) {
                value = value.substring(0, closing);
            }
        }

        // XXX "\;", "\)" etc. are still mishandled above
        value = value.trim();

        // an empty value is treated as absent so the defaults apply
        return value.isEmpty() ? null : value;
    }

    // Tells whether the given (lower-cased, trimmed) MIME type is in the accepted list.
    private static boolean isKnownType(String mimeType) {
        for (String type : types) {
            if (type.equals(mimeType)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates an input source from a given URI.
     *
     * @param uri       the URI (system ID) for the entity
     * @param checkType if true, the content type reported by the connection
     *                  is used to determine the character encoding (the type itself is
     *                  not verified against the accepted list); if false, the encoding
     *                  is auto-detected from the stream.
     * @return an input source from given URI, with its system ID set to the
     *         URL reported by the connection
     * @throws IOException for errors
     */
    public static InputSource createInputSource(URL uri, boolean checkType)
            throws IOException {
        URLConnection conn = uri.openConnection();
        InputSource retval;

        if (checkType) {
            String contentType = conn.getContentType();
            retval = createInputSource(contentType, conn.getInputStream(),
                    false, uri.getProtocol());
        } else {
            retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
        }
        retval.setSystemId(conn.getURL().toString());
        return retval;
    }

    /**
     * Creates an input source from a given file, auto-detecting
     * the character encoding.
     *
     * @param file file to create an input source from
     * @return an input source from given file, with its system ID set to
     *         the file's URL
     * @throws IOException for errors; the file stream opened by this method
     *                     is closed before the exception propagates
     */
    public static InputSource createInputSource(File file)
            throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            InputSource retval = new InputSource(XmlReader.createReader(in));
            retval.setSystemId(file.toURI().toURL().toString());
            return retval;
        } catch (IOException | RuntimeException e) {
            // do not leak the file handle when the source cannot be built
            try {
                in.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * <b>SAX:</b>
     * Resolve the given entity into an input source.  If the name can't
     * be mapped to a preferred form of the entity, the URI is used.  To
     * resolve the entity, first a local catalog mapping names to URIs is
     * consulted.  If no mapping is found there, a catalog mapping names
     * to java resources is consulted.  Finally, if neither mapping found
     * a copy of the entity, the specified URI is used.
     *
     * <p>When a URI is used, the content type reported by the connection
     * determines the character encoding unless this resolver is ignoring
     * MIME types, in which case the encoding is auto-detected.  No MIME
     * type checking is done.
     *
     * @param name Used to find alternate copies of the entity, when
     *             this value is non-null; this is the XML "public ID".
     * @param uri  Used when no alternate copy of the entity is found;
     *             this is the XML "system ID", normally a URI.
     * @return resolved entity, or null when no catalog entry matches and
     *         {@code uri} is null
     * @throws IOException for errors
     */
    @Override
    public InputSource resolveEntity(String name, String uri)
            throws IOException {
        String mappedURI = name2uri(name);
        String systemId;
        InputSource retval;

        // prefer explicit URI mappings, then bundled resources...
        InputStream stream = (mappedURI == null) ? mapResource(name) : null;

        if (stream != null) {
            // a non-null stream means a resource is registered for this name
            systemId = "java:resource:" + id2resource.get(name);
            retval = new InputSource(XmlReader.createReader(stream));
        } else {
            // ...and treat all URIs the same (as URLs for now).
            if (mappedURI != null) {
                systemId = mappedURI;
            } else if (uri != null) {
                systemId = uri;
            } else {
                return null;
            }

            URL url = new URL(systemId);
            URLConnection conn = url.openConnection();
            systemId = conn.getURL().toString();

            if (ignoringMIME) {
                retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
            } else {
                String contentType = conn.getContentType();
                retval = createInputSource(contentType,
                        conn.getInputStream(),
                        false, url.getProtocol());
            }
        }

        retval.setSystemId(systemId);
        retval.setPublicId(name);
        return retval;
    }

    /**
     * Returns true if this resolver is ignoring MIME types in the documents
     * it returns, to work around bugs in how servers have reported the
     * documents' MIME types.
     *
     * @return true if this resolver is ignoring MIME types
     */
    public boolean isIgnoringMIME() {
        return ignoringMIME;
    }

    /**
     * Tells the resolver whether to ignore MIME types in the documents it
     * retrieves.  Many web servers incorrectly assign text documents a
     * default character encoding, even when that is incorrect.  For example,
     * all HTTP text documents default to use ISO-8859-1 (used for Western
     * European languages), and other MIME sources default text documents
     * to use US-ASCII (a seven bit encoding).  For XML documents which
     * include text encoding declarations (as most should do), these server
     * bugs can be worked around by ignoring the MIME type entirely.
     *
     * @param value whether to ignore MIME types in the documents
     */
    public void setIgnoringMIME(boolean value) {
        ignoringMIME = value;
    }

    // maps the public ID to an alternate URI, if one is registered
    private String name2uri(String publicId) {
        if (publicId == null || id2uri == null) {
            return null;
        }
        return id2uri.get(publicId);
    }

    /**
     * Registers the given public ID as corresponding to a particular
     * URI, typically a local copy.  This URI will be used in preference
     * to ones provided as system IDs in XML entity declarations.  This
     * mechanism would most typically be used for Document Type Definitions
     * (DTDs), where the public IDs are formally managed and versioned.
     *
     * @param publicId The managed public ID being mapped
     * @param uri      The URI of the preferred copy of that entity
     */
    public void registerCatalogEntry(String publicId,
                                     String uri) {
        if (id2uri == null) {
            id2uri = new HashMap<>(17);
        }
        id2uri.put(publicId, uri);
    }

    // Returns the resource registered for the public ID as a stream, or
    // null when nothing is registered (or the loader cannot find it).
    private InputStream mapResource(String publicId) {
        if (publicId == null || id2resource == null) {
            return null;
        }

        String resourceName = id2resource.get(publicId);
        if (resourceName == null) {
            return null;
        }

        ClassLoader loader = (id2loader != null) ? id2loader.get(publicId) : null;
        if (loader == null) {
            // no loader registered: the entry names a system resource
            return ClassLoader.getSystemResourceAsStream(resourceName);
        }
        return loader.getResourceAsStream(resourceName);
    }

    /**
     * Registers a given public ID as corresponding to a particular Java
     * resource in a given class loader, typically distributed with a
     * software package.  This resource will be preferred over system IDs
     * included in XML documents.  This mechanism should most typically be
     * used for Document Type Definitions (DTDs), where the public IDs are
     * formally managed and versioned.
     *
     * <p>If a mapping to a URI has been provided, that mapping takes
     * precedence over this one.
     *
     * <p>Registering the same public ID again replaces the earlier entry,
     * including its class loader.
     *
     * @param publicId     The managed public ID being mapped
     * @param resourceName The name of the Java resource
     * @param loader       The class loader holding the resource, or null if
     *                     it is a system resource.
     */
    public void registerCatalogEntry(String publicId,
                                     String resourceName,
                                     ClassLoader loader) {
        if (id2resource == null) {
            id2resource = new HashMap<>(17);
        }
        id2resource.put(publicId, resourceName);

        if (loader != null) {
            if (id2loader == null) {
                id2loader = new HashMap<>(17);
            }
            id2loader.put(publicId, loader);
        } else if (id2loader != null) {
            // drop any loader left over from an earlier registration so the
            // new entry really is resolved as a system resource
            id2loader.remove(publicId);
        }
    }
}