All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.xwiki.xml.internal.LocalEntityResolver Maven / Gradle / Ivy

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
/*
 This class was copied from https://github.com/css4j/xml-dtd
 See https://github.com/css4j/xml-dtd/issues/7 for the reason why we copied it instead of having a dependency on it.

 Copyright (c) 1998-2022, Carlos Amengual.
 Originally Licensed under a BSD-style License but relicensed under LGPL for XWiki by Carlos Amengual.
 You can find the original license here:
 https://css4j.github.io/LICENSE.txt
 */
package org.xwiki.xml.internal;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.reflect.Constructor;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map.Entry;

import org.w3c.dom.DocumentType;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.ext.EntityResolver2;
import org.xml.sax.ext.LexicalHandler;

/**
 * Implements EntityResolver2.
 * 

* Has common W3C DTDs/entities built-in and loads others via the supplied * SYSTEM URL, provided that certain conditions are met: *

*
    *
  • URL protocol is http/https.
  • *
  • Either the mime type is valid for a DTD or entity, or the filename ends * with .dtd, .ent or .mod.
  • *
  • The whitelist is either disabled (no host added to it) or contains the * host from the URL.
  • *
*

* If the whitelist was enabled (e.g. default constructor), any attempt to * download data from a remote URL not present in the whitelist is going to * produce an exception. You can use that to determine whether your documents * are referencing a DTD resource that is not bundled with this resolver. *

*

* If the constructor with a false argument was used, the whitelist * can still be enabled by adding a hostname via * {@link #addHostToWhiteList(String)}. *

*

* Although this resolver should protect you from most information leaks (see * SSRF * attacks) and also from jar: * decompression bombs, DoS * attacks based on entity expansion/recursion like the * 'billion laughs * attack' may still be possible and should be prevented at the XML parser. * Be sure to use a properly configured, recent version of your parser. *

* * @author Carlos Amengual * @version $Id: e7be22562000682b9ac9e9cede40af0d81b22792 $ */ public class LocalEntityResolver implements EntityResolver2 { private static final DTDLoader dtdLoader = createDTDLoader(); private static final String XHTML1_TRA_PUBLICID = "-//W3C//DTD XHTML 1.0 Transitional//EN"; private static final String XHTML1_TRA_SYSTEMID = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; // The map is sized to have room for one additional mapping // via registerSystemIdFilename private final HashMap systemIdToFilename = new HashMap(69, 0.4f); private final HashMap systemIdToPublicId = new HashMap(14); private ClassLoader loader = null; private HashSet whitelist = null; /** * Construct a resolver with the whitelist enabled. */ public LocalEntityResolver() { this(true); } /** * Construct a resolver with the whitelist enabled or disabled according to * enableWhitelist. * * @param enableWhitelist can be false to allow connecting to any * host to retrieve DTDs or entities, or * true to enable the (empty) whitelist so * no network connections are to be allowed until a host * is added to it. */ public LocalEntityResolver(boolean enableWhitelist) { super(); systemIdToFilename.put("https://www.w3.org/TR/html5/entities.dtd", "/xhtml5.ent"); systemIdToFilename.put("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", "/xhtml1-strict.dtd"); systemIdToFilename.put("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", "/xhtml1-transitional.dtd"); systemIdToFilename.put("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", "/xhtml11.dtd"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml11.dtd", "/xhtml11.dtd"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent", "/xhtml5.ent"); systemIdToFilename.put("http://www.w3.org/TR/xhtml11/DTD/xhtml-lat1.ent", "/xhtml5.ent"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent", "/xhtml-symbol.ent"); systemIdToFilename.put("http://www.w3.org/TR/xhtml11/DTD/xhtml-symbol.ent", "/xhtml-symbol.ent"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-special.ent", "/xhtml-special.ent"); systemIdToFilename.put("http://www.w3.org/TR/xhtml11/DTD/xhtml-special.ent", "/xhtml-special.ent"); // XHTML 1.1 modules systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-inlstyle-1.mod", "/xhtml-inlstyle-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml11-model-1.mod", "/xhtml11-model-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-datatypes-1.mod", "/xhtml-datatypes-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-framework-1.mod", "/xhtml-framework-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-text-1.mod", "/xhtml-text-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-hypertext-1.mod", "/xhtml-hypertext-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-list-1.mod", "/xhtml-list-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-edit-1.mod", "/xhtml-edit-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-bdo-1.mod", "/xhtml-bdo-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-ruby-1.mod", "/xhtml-ruby-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-pres-1.mod", "/xhtml-pres-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-link-1.mod", "/xhtml-link-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-meta-1.mod", "/xhtml-meta-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-base-1.mod", "/xhtml-base-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-script-1.mod", "/xhtml-script-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-style-1.mod", "/xhtml-style-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-image-1.mod", "/xhtml-image-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-csismap-1.mod", "/xhtml-csismap-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-ssismap-1.mod", "/xhtml-ssismap-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-param-1.mod", "/xhtml-param-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-object-1.mod", "/xhtml-object-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-table-1.mod", "/xhtml-table-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-form-1.mod", "/xhtml-form-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-legacy-1.mod", "/xhtml-legacy-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-struct-1.mod", "/xhtml-struct-1.mod"); // Other common DTDs systemIdToFilename.put("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd", "/xhtml1-frameset.dtd"); systemIdToFilename.put("http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd", "/xhtml-basic11.dtd"); systemIdToFilename.put("http://www.w3.org/TR/html4/strict.dtd", "/html4-strict.dtd"); systemIdToFilename.put("http://www.w3.org/TR/html4/loose.dtd", "/html4-loose.dtd"); systemIdToFilename.put("http://www.w3.org/TR/html4/frameset.dtd", "/html4-frameset.dtd"); systemIdToFilename.put("http://www.w3.org/Math/DTD/mathml2/mathml2.dtd", "/mathml2.dtd"); systemIdToFilename.put("http://www.w3.org/Math/DTD/mathml1/mathml.dtd", "/mathml.dtd"); systemIdToFilename.put("http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd", "/xhtml-math-svg.dtd"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-inlstruct-1.mod", "/xhtml-inlstruct-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-inlphras-1.mod", "/xhtml-inlphras-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-blkstruct-1.mod", "/xhtml-blkstruct-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-blkphras-1.mod", "/xhtml-blkphras-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-applet-1.mod", "/xhtml-applet-1.dtd"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-blkpres-1.mod", "/xhtml-blkpres-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-basic-form-1.mod", "/xhtml-basic-form-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-basic-table-1.mod", "/xhtml-basic-table-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-frames-1.mod", "/xhtml-frames-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-target-1.mod", "/xhtml-target-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-iframe-1.mod", "/xhtml-iframe-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-events-1.mod", "/xhtml-events-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-nameident-1.mod", "/xhtml-nameident-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-legacy-redecl-1.mod", "/xhtml-legacy-redecl-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-inlpres-1.mod", "/xhtml-inlpres-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-arch-1.mod", "/xhtml-arch-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-notations-1.mod", "/xhtml-notations-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-qname-1.mod", "/xhtml-qname-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-attribs-1.mod", "/xhtml-attribs-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-charent-1.mod", "/xhtml-charent-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-basic11-model-1.mod", "/xhtml-basic11-model-1.mod"); systemIdToFilename.put("http://www.w3.org/MarkUp/DTD/xhtml-inputmode-1.mod", "/xhtml-inputmode-1.mod"); systemIdToFilename.put("http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd", "/svg11.dtd"); systemIdToFilename.put("http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd", "/svg10.dtd"); // systemIdToPublicId.put("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", "-//W3C//DTD XHTML 1.0 Strict//EN"); systemIdToPublicId.put("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", "-//W3C//DTD XHTML 1.0 Transitional//EN"); systemIdToPublicId.put("http://www.w3.org/MarkUp/DTD/xhtml11.dtd", "-//W3C//DTD XHTML 1.1//EN"); systemIdToPublicId.put("http://www.w3.org/TR/xhtml11/DTD/xhtml-lat1.ent", "-//W3C//ENTITIES Latin 1 for XHTML//EN"); systemIdToPublicId.put("http://www.w3.org/TR/xhtml11/DTD/xhtml-symbol.ent", "-//W3C//ENTITIES Symbols for XHTML//EN"); systemIdToPublicId.put("http://www.w3.org/TR/xhtml11/DTD/xhtml-special.ent", "-//W3C//ENTITIES Special for XHTML//EN"); systemIdToPublicId.put("http://www.w3.org/TR/html4/strict.dtd", "-//W3C//DTD HTML 4.01//EN"); systemIdToPublicId.put("http://www.w3.org/TR/html4/loose.dtd", "-//W3C//DTD HTML 4.01 Transitional//EN"); systemIdToPublicId.put("http://www.w3.org/TR/html4/frameset.dtd", "-//W3C//DTD HTML 4.01 Frameset//EN"); systemIdToPublicId.put("http://www.w3.org/Math/DTD/mathml2/mathml2.dtd", "-//W3C//DTD MathML 2.0//EN"); systemIdToPublicId.put("http://www.w3.org/Math/DTD/mathml1/mathml.dtd", "math"); systemIdToPublicId.put("http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd", "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"); systemIdToPublicId.put("http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd", "-//W3C//DTD SVG 1.1//EN"); systemIdToPublicId.put("http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd", "-//W3C//DTD SVG 1.0//EN"); if (enableWhitelist) { whitelist = new HashSet<>(1); } } /** * Add the given host to a whitelist for remote DTD fetching. *

* If the whitelist is enabled, only http or https URLs will be allowed. *

* * @param fqdn * the fully qualified domain name to add to the whitelist. */ public void addHostToWhiteList(String fqdn) { if (fqdn != null) { if (whitelist == null) { whitelist = new HashSet(4); } whitelist.add(fqdn.toLowerCase(Locale.ROOT)); } } /** * Allows applications to provide an external subset for documents that don't * explicitly define one. *

* Documents with {@code DOCTYPE} declarations that omit an external subset can * thus augment the declarations available for validation, entity processing, * and attribute processing (normalization, defaulting, and reporting types * including {@code ID}). This augmentation is reported through the * {@link LexicalHandler#startDTD startDTD()} method as if the document text had * originally included the external subset; this callback is made before any * internal subset data or errors are reported. *

*

* This method can also be used with documents that have no {@code DOCTYPE} * declaration. When the root element is encountered but no {@code DOCTYPE} * declaration has been seen, this method is invoked. If it returns a value for * the external subset, that root element is declared to be the root element, * giving the effect of splicing a {@code DOCTYPE} declaration at the end the * prolog of a document that could not otherwise be valid. The sequence of * parser callbacks in that case logically resembles this: *

* *
     * ... comments and PIs from the prolog (as usual)
     * startDTD ("rootName", source.getPublicId (), source.getSystemId ());
     * startEntity ("[dtd]");
     * ... declarations, comments, and PIs from the external subset
     * endEntity ("[dtd]");
     * endDTD ();
     * ... then the rest of the document (as usual)
     * startElement (..., "rootName", ...);
     * 
* *

* Note that the {@code InputSource} gets no further resolution. Also, this * method will never be used by a (non-validating) processor that is not * including external parameter entities. *

*

* Uses for this method include facilitating data validation when interoperating * with XML processors that would always require undesirable network accesses * for external entities, or which for other reasons adopt a "no DTDs" policy. *

*

* Warning: returning an external subset modifies the input * document. By providing definitions for general entities, it can make a * malformed document appear to be well formed. *

* * @param name Identifies the document root element. This name comes from a * {@code DOCTYPE} declaration (where available) or from the * actual root element. * @param baseURI The document's base URI, serving as an additional hint for * selecting the external subset. This is always an absolute URI, * unless it is {@code null} because the {@code XMLReader} was * given an {@code InputSource} without one. * * @return an {@code InputSource} object describing the new external subset to * be used by the parser. If no specific subset could be determined, an * input source describing the HTML5 entities is returned. * * @throws SAXException if either the provided arguments or the input * source were invalid or not allowed. * @throws java.io.IOException if an I/O problem was found while loading the * input source. */ @Override public InputSource getExternalSubset(String name, String baseURI) throws SAXException, IOException { InputSource is = findExternalSubset(name, baseURI); if (is == null) { // Give the HTML5 entities as a fallback String fname = systemIdToFilename.get("https://www.w3.org/TR/html5/entities.dtd"); Reader re = dtdLoader.loadDTDfromClasspath(loader, fname); if (re != null) { is = new InputSource(re); } else { throw new IOException("Could not find resource: " + fname); } } return is; } private InputSource findExternalSubset(String name, String baseURI) throws SAXException, IOException { InputSource is; if ("html".equalsIgnoreCase(name)) { is = resolveEntity("[dtd]", XHTML1_TRA_PUBLICID, baseURI, XHTML1_TRA_SYSTEMID); is.setPublicId(null); is.setSystemId(null); } else { is = null; } return is; } /** * Register an internal classpath filename to retrieve a DTD {@code SystemId}. * * @param systemId the {@code SystemId}. * @param filename the internal filename. Must point to a resource with * {@code UTF-8} encoding. * @return {@code true} if the new {@code SystemId} was successfully registered, * {@code false} if it was already registered. * @throws IllegalArgumentException if the {@code filename} is considered * invalid by {@link #isInvalidPath(String)}. */ protected boolean registerSystemIdFilename(String systemId, String filename) { if (filename == null || systemId == null) { throw new NullPointerException("Null SystemId or filename."); } if (isInvalidPath(filename)) { throw new IllegalArgumentException("Bad DTD filename."); } String ret; synchronized (systemIdToFilename) { ret = systemIdToFilename.putIfAbsent(systemId, filename); } return ret == null; } /** * Allows applications to map references to external entities into input * sources. *

* This method is only called for external entities which have been properly * declared. It provides more flexibility than the * {@link org.xml.sax.EntityResolver EntityResolver} interface, supporting * implementations of more complex catalogue schemes such as the one defined by * the * OASIS XML Catalogs specification. *

*

* Parsers configured to use this resolver method will call it to determine the * input source to use for any external entity being included because of a * reference in the XML text. That excludes the document entity, and any * external entity returned by {@link #getExternalSubset getExternalSubset()}. * When a (non-validating) processor is configured not to include a class of * entities (parameter or general) through use of feature flags, this method is * not invoked for such entities. *

*

* If no valid input source could be determined, this method will throw a * {@code SAXException} instead of returning {@code null} as other * implementations would do. If you have to retrieve a DTD which is not directly * provided by this resolver, you need to whitelist the host using * {@link #addHostToWhiteList(String)} first. Make sure that either the systemId * URL ends with a valid extension, or that the retrieved URL was served with a * valid DTD media type. *

*

* Note that the entity naming scheme used here is the same one used in the * {@link org.xml.sax.ext.LexicalHandler LexicalHandler}, or in the * {@link org.xml.sax.ContentHandler#skippedEntity * ContentHandler.skippedEntity()} method. *

* * @param name Identifies the external entity being resolved. Either * "{@code [dtd]}" for the external subset, or a name starting * with "{@code %}" to indicate a parameter entity, or else the * name of a general entity. This is never {@code null} when * invoked by a SAX2 parser. * @param publicId The public identifier of the external entity being referenced * (normalized as required by the XML specification), or * {@code null} if none was supplied. * @param baseURI The URI with respect to which relative systemIDs are * interpreted. This is always an absolute URI, unless it is * {@code null} (likely because the {@code XMLReader} was given * an {@code InputSource} without one). This URI is defined by * the XML specification to be the one associated with the * "{@literal <}" starting the relevant declaration. * @param systemId The system identifier of the external entity being * referenced; either a relative or absolute URI. * * @return an {@code InputSource} object describing the new input source to be * used by the parser. This implementation never returns {@code null} if * {@code systemId} is non-{@code null}. * * @throws SAXException if either the provided arguments or the input * source were invalid or not allowed. * @throws java.io.IOException if an I/O problem was found while forming the URL * to the input source, or when connecting to it. */ @Override public InputSource resolveEntity(String name, String publicId, String baseURI, String systemId) throws SAXException, IOException { if (publicId == null) { publicId = systemIdToPublicId.get(systemId); } else if (systemId == null) { systemId = getSystemIdFromPublicId(publicId); } String fname = systemIdToFilename.get(systemId); InputSource isrc = null; if (fname != null) { Reader re = dtdLoader.loadDTDfromClasspath(loader, fname); if (re != null) { isrc = new InputSource(re); isrc.setPublicId(publicId); if (systemId != null) { isrc.setSystemId(systemId); } } else { throw new SAXException("Could not find resource: " + fname); } } else if (systemId != null) { URL enturl; if (baseURI != null) { URL base = new URL(baseURI); enturl = new URL(base, systemId); } else { enturl = new URL(systemId); } if (isInvalidProtocol(enturl.getProtocol())) { throw new SAXException("Invalid url protocol: " + enturl.getProtocol()); } if (isWhitelistEnabled() && !isWhitelistedHost(enturl.getHost())) { throw new SAXException( "Whitelist is enabled, and attempted to retrieve data from " + enturl.toExternalForm()); } boolean invalidPath = isInvalidPath(enturl.getPath()); String charset = "UTF-8"; URLConnection con = openConnection(enturl); connect(con); String conType = con.getContentType(); if (conType != null) { int sepidx = conType.indexOf(';'); if (sepidx != -1 && sepidx < conType.length()) { conType = conType.substring(0, sepidx); charset = AgentUtil.findCharset(conType, sepidx + 1); } } if (invalidPath && !isValidContentType(conType)) { // Disconnect if (con instanceof HttpURLConnection) { ((HttpURLConnection) con).disconnect(); } String msg = enturl.toExternalForm(); if (conType != null) { // Sanitize untrusted content-type by removing control characters // ('Other, Control' unicode category). conType = conType.replaceAll("\\p{Cc}", "*CTRL*"); msg = "URL served with invalid type (" + conType + "): " + msg; } else { msg = "URL served with invalid type: " + msg; } throw new SAXException(msg); } isrc = new InputSource(); isrc.setSystemId(enturl.toExternalForm()); if (publicId != null) { isrc.setPublicId(publicId); } isrc.setEncoding(charset); InputStream is = con.getInputStream(); isrc.setCharacterStream(new InputStreamReader(is, charset)); } else { isrc = findExternalSubset(name, baseURI); // 'isrc' can be null safely: there is no SystemId URL to connect to } return isrc; } private String getSystemIdFromPublicId(String publicId) { Iterator> it = systemIdToPublicId.entrySet().iterator(); while (it.hasNext()) { Entry entry = it.next(); if (publicId.equals(entry.getValue())) { return entry.getKey(); } } return null; } /** * Determine if the given path is considered invalid for a DTD. *

* To be valid, must end with {@code .dtd}, {@code .ent} or {@code .mod}. *

* * @param path the path to check. * @return {@code true} if the path is invalid for a DTD, {@code false} otherwise. */ protected boolean isInvalidPath(String path) { int len = path.length(); String ext; return len < 5 || (!(ext = path.substring(len - 4)).equalsIgnoreCase(".dtd") && !ext.equalsIgnoreCase(".ent") && !ext.equalsIgnoreCase(".mod")); } /** * Is the whitelist enabled ? * * @return true if the whitelist is enabled. */ protected boolean isWhitelistEnabled() { return whitelist != null; } /** * Is the given protocol not supported by this resolver ? *

* Only {@code http} and {@code https} are valid. *

* * @param protocol the protocol. * @return true if this resolver considers the given protocol invalid. */ protected boolean isInvalidProtocol(String protocol) { return !protocol.equals("http") && !protocol.equals("https"); } /** * Is the given host whitelisted ? * * @param host * the host to test. * @return true if the given host is whitelisted. */ protected boolean isWhitelistedHost(String host) { return whitelist.contains(host.toLowerCase(Locale.ROOT)); } /** * Open a connection to the given URL. * * @param url the URL to connect to. * @return the connection. * @throws IOException if an I/O error happened opening the connection. */ protected URLConnection openConnection(URL url) throws IOException { return url.openConnection(); } /** * Connect the given URLConnection. * * @param con * the URLConnection. * @throws IOException * if a problem happened connecting. */ protected void connect(final URLConnection con) throws IOException { con.setConnectTimeout(60000); dtdLoader.connect(con); } /** * Is the given string a valid DTD/entity content-type ? * * @param conType * the content-type. * @return true if it is a valid DTD/entity content-type */ protected boolean isValidContentType(String conType) { return conType != null && (conType.equals("application/xml-dtd") || conType.equals("text/xml-external-parsed-entity") || conType.equals("application/xml-external-parsed-entity")); } /** * Allow the application to resolve external entities. * *

* The parser will call this method before opening any external entity except * the top-level document entity. Such entities include the external DTD subset * and external parameter entities referenced within the DTD (in either case, * only if the parser reads external parameter entities), and external general * entities referenced within the document element (if the parser reads external * general entities). The application may request that the parser locate the * entity itself, that it use an alternative URI, or that it use data provided * by the application (as a character or byte input stream). *

*

* If no valid input source could be determined, this method will throw a * {@code SAXException} instead of returning {@code null} as other * implementations would do. If you have to retrieve a DTD which is not directly * provided by this resolver, you need to whitelist the host using * {@link #addHostToWhiteList(String)} first. Make sure that either the systemId * URL ends with a valid extension, or that the retrieved URL was served with a * valid DTD media type. *

* * @param publicId The public identifier of the external entity being * referenced, or {@code null} if none was supplied. * @param systemId The system identifier of the external entity being * referenced. * @return an {@code InputSource} object describing the new input source. This * implementation never returns {@code null} if {@code systemId} is * non-{@code null}. * @throws SAXException if either the provided arguments or the input * source were invalid or not allowed. * @throws java.io.IOException if an I/O problem was found while forming the URL * to the input source, or when connecting to it. */ @Override public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { return resolveEntity(null, publicId, null, systemId); } /** * Resolve external entities according to the given {@code DocumentType}. *

* If no valid input source could be determined, this method will throw a * {@code SAXException} instead of returning {@code null} as other * implementations would do. If you have to retrieve a DTD which is not directly * provided by this resolver, you need to whitelist the host using * {@link #addHostToWhiteList(String)} first. Make sure that either the systemId * URL ends with a valid extension, or that the retrieved URL was served with a * valid DTD media type. *

* * @param dtDecl the {@code DocumentType}. * @return an {@code InputSource} object describing the new input source. * @throws SAXException if either the provided arguments or the input * source were invalid or not allowed. * @throws java.io.IOException if an I/O problem was found while forming the URL * to the input source, or when connecting to it. */ public InputSource resolveEntity(DocumentType dtDecl) throws SAXException, IOException { return resolveEntity(dtDecl.getName(), dtDecl.getPublicId(), dtDecl.getBaseURI(), dtDecl.getSystemId()); } /** * Set the class loader to be used to read the built-in DTDs. * * @param loader the class loader. */ public void setClassLoader(ClassLoader loader) { this.loader = loader; } private static DTDLoader createDTDLoader() { DTDLoader loader; try { Class cl = Class.forName("io.sf.carte.doc.xml.dtd.SMDTDLoader"); Constructor ctor = cl.getConstructor(); loader = (DTDLoader) ctor.newInstance(); } catch (Exception e) { loader = new SimpleDTDLoader(); } return loader; } abstract static class DTDLoader { abstract void connect(URLConnection con) throws IOException; abstract Reader loadDTDfromClasspath(ClassLoader loader, String dtdFilename); } /** * Load DTDs without a Security Manager. */ private static final class SimpleDTDLoader extends DTDLoader { @Override void connect(final URLConnection con) throws IOException { con.connect(); } @Override Reader loadDTDfromClasspath(final ClassLoader loader, final String dtdFilename) { InputStream is; if (loader != null) { is = loader.getResourceAsStream(dtdFilename); } else { is = LocalEntityResolver.class.getResourceAsStream(dtdFilename); } if (is == null) { is = ClassLoader.getSystemResourceAsStream(dtdFilename); } Reader re = null; if (is != null) { re = new InputStreamReader(is, StandardCharsets.UTF_8); } return re; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy