// nu.validator.xml.PrudentHttpEntityResolver Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of validator Show documentation
// An HTML-checking library (used by https://html5.validator.nu and the HTML5 facet of the W3C Validator)
/*
* Copyright (c) 2005 Henri Sivonen
* Copyright (c) 2007-2015 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.xml;
import java.io.IOException;
import java.io.InputStream;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.zip.GZIPInputStream;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import nu.validator.io.BoundedInputStream;
import nu.validator.io.ObservableInputStream;
import nu.validator.io.StreamBoundException;
import nu.validator.io.StreamObserver;
import nu.validator.io.SystemIdIOException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.log4j.Logger;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import io.mola.galimatias.URL;
import io.mola.galimatias.GalimatiasParseException;
/**
* @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26
* hsivonen Exp $
* @author hsivonen
*/
@SuppressWarnings("deprecation") public class PrudentHttpEntityResolver
implements EntityResolver {
/** Class-wide logger. */
private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class);
/**
 * HTTP client shared by all resolver instances. It is only assigned by
 * {@link #setParams(int, int, int)}, so it stays null until that method
 * has been called.
 */
private static HttpClient client;
/**
 * Value most recently passed to {@link #setParams(int, int, int)}. Used
 * there as the per-route connection cap and copied into
 * {@link #requestsLeft} whenever a resolver is constructed.
 */
private static int maxRequests;
/**
 * Upper bound applied to the reported content length and to the bytes
 * read from the response stream. Values below zero switch the checks off.
 */
private long sizeLimit;
/** Handler that is notified of problems found while resolving. */
private final ErrorHandler errorHandler;
/**
 * Number of further resolveEntity calls this instance will accept. A
 * negative value means the calls are not counted.
 */
private int requestsLeft;
// The following flags are handed to the ContentTypeParser at construction
// time and kept in sync with it by the corresponding setters. Their exact
// effect on content-type acceptance is defined by ContentTypeParser.
private boolean allowRnc = false;
private boolean allowHtml = false;
private boolean allowXhtml = false;
private boolean acceptAllKnownXmlTypes = false;
private boolean allowGenericXml = true;
/** Turns the response Content-Type into a typed input source. */
private final ContentTypeParser contentTypeParser;
/** Value sent in the User-Agent request header; null until set. */
private String userAgent;
/**
 * Builds the HTTP client shared by all resolver instances and records the
 * request limit used by resolvers constructed afterwards.
 *
 * <p>If the system property {@code nu.validator.xml.promiscuous-ssl} is
 * {@code "true"}, the client is set up to trust every certificate and to
 * accept every host name. Should that setup fail, the failure is logged
 * and the client keeps the default HTTPS socket factory.
 *
 * @param connectionTimeout
 *            timeout until connection established in milliseconds. Zero
 *            means no timeout.
 * @param socketTimeout
 *            timeout for waiting for data in milliseconds. Zero means no
 *            timeout.
 * @param maxRequests
 *            maximum number of connections to a particular host; also
 *            stored as the initial request budget of resolvers created
 *            after this call
 */
public static void setParams(int connectionTimeout, int socketTimeout,
        int maxRequests) {
    PrudentHttpEntityResolver.maxRequests = maxRequests;

    // Default scheme registry: plain sockets for http, stock SSL for https.
    Registry<ConnectionSocketFactory> registry = //
    RegistryBuilder.<ConnectionSocketFactory>create() //
            .register("http", PlainConnectionSocketFactory.getSocketFactory()) //
            .register("https", SSLConnectionSocketFactory.getSocketFactory()) //
            .build();

    HttpClientBuilder builder = HttpClients.custom();
    builder.setRedirectStrategy(new LaxRedirectStrategy());
    builder.setMaxConnPerRoute(maxRequests);
    builder.setMaxConnTotal(200);

    if ("true".equals(System.getProperty(
            "nu.validator.xml.promiscuous-ssl", "false"))) {
        final String failure = "Could not set up promiscuous SSL; "
                + "keeping the default HTTPS socket factory.";
        try {
            // Trust strategy that accepts any certificate chain.
            SSLContext promiscuousSSLContext = new SSLContextBuilder() //
                    .loadTrustMaterial(null, new TrustStrategy() {
                        public boolean isTrusted(X509Certificate[] chain,
                                String authType)
                                throws CertificateException {
                            return true;
                        }
                    }).build();
            builder.setSslcontext(promiscuousSSLContext);
            HostnameVerifier verifier = //
            SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER;
            SSLConnectionSocketFactory promiscuousSSLConnSocketFactory = //
            new SSLConnectionSocketFactory(promiscuousSSLContext, verifier);
            registry = RegistryBuilder.<ConnectionSocketFactory>create() //
                    .register("https", promiscuousSSLConnSocketFactory) //
                    .register("http",
                            PlainConnectionSocketFactory.getSocketFactory()) //
                    .build();
        } catch (KeyManagementException e) {
            log4j.error(failure, e);
        } catch (NumberFormatException e) {
            log4j.error(failure, e);
        } catch (NoSuchAlgorithmException e) {
            log4j.error(failure, e);
        } catch (KeyStoreException e) {
            log4j.error(failure, e);
        }
    }

    PoolingHttpClientConnectionManager phcConnMgr = //
    new PoolingHttpClientConnectionManager(registry);
    phcConnMgr.setDefaultMaxPerRoute(maxRequests);
    phcConnMgr.setMaxTotal(200);
    builder.setConnectionManager(phcConnMgr);

    RequestConfig.Builder config = RequestConfig.custom();
    config.setCircularRedirectsAllowed(true);
    config.setMaxRedirects(20); // Gecko default
    config.setConnectTimeout(connectionTimeout);
    config.setCookieSpec(CookieSpecs.BEST_MATCH);
    config.setSocketTimeout(socketTimeout);
    client = builder.setDefaultRequestConfig(config.build()).build();
}
/**
 * Sets the value sent in the {@code User-Agent} header of requests made
 * by this resolver.
 *
 * @param ua
 *            the header value to use
 */
public void setUserAgent(String ua) {
    this.userAgent = ua;
}
/**
 * Creates a resolver whose request budget is the limit most recently
 * configured through {@code setParams} and whose content-type parser
 * starts from the default allow-flags declared on this class.
 *
 * @param sizeLimit
 *            largest resource size to accept; a negative value disables
 *            the size checks
 * @param laxContentType
 *            passed through to the content-type parser
 * @param errorHandler
 *            handler to notify of problems; also given to the
 *            content-type parser
 */
public PrudentHttpEntityResolver(long sizeLimit, boolean laxContentType,
        ErrorHandler errorHandler) {
    this.errorHandler = errorHandler;
    this.sizeLimit = sizeLimit;
    requestsLeft = maxRequests;
    contentTypeParser = new ContentTypeParser(errorHandler, laxContentType,
            allowRnc, allowHtml, allowXhtml, acceptAllKnownXmlTypes,
            allowGenericXml);
}
/**
 * Fetches the entity named by {@code systemId} with an HTTP GET and
 * returns it as an input source whose byte stream reads the response
 * body.
 *
 * <p>Unless the request budget is negative, each call uses up one
 * permitted request and the call fails once the budget is exhausted. The
 * system identifier must parse as a URL with the {@code http} or
 * {@code https} scheme, the response status must be 200, and the reported
 * content length must not exceed the size limit (when one is set).
 * Violations are reported to the error handler, if there is one, as fatal
 * errors and then thrown. An {@code X-UA-Compatible} header whose value
 * is not {@code IE=edge} is reported as a non-fatal error.
 *
 * <p>When this method throws, the request is aborted and its connection
 * released. On success the connection is released from the stream
 * observer callbacks attached to the returned byte stream.
 *
 * @param publicId
 *            the public identifier, used only for error reporting and
 *            for building the input source
 * @param systemId
 *            the URL to fetch
 * @return the input source for the fetched resource
 * @throws SAXException
 *             if the URL, the HTTP status or the resource size is
 *             unacceptable, or if the error handler throws
 * @throws IOException
 *             if the request budget is exhausted or the transfer fails
 * @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String,
 *      java.lang.String)
 */
public InputSource resolveEntity(String publicId, String systemId)
        throws SAXException, IOException {
    if (requestsLeft > -1) {
        if (requestsLeft == 0) {
            throw new IOException(
                    "Number of permitted HTTP requests exceeded.");
        }
        requestsLeft--;
    }
    HttpGet m = null;
    try {
        URL url;
        try {
            url = URL.parse(systemId);
        } catch (GalimatiasParseException e) {
            IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
            throw reportFatal(new SAXParseException(e.getMessage(),
                    publicId, systemId, -1, -1, ioe));
        }
        String scheme = url.scheme();
        if (!("http".equals(scheme) || "https".equals(scheme))) {
            String msg = "Unsupported URI scheme: \u201C" + scheme
                    + "\u201D.";
            throw reportFatal(new SAXParseException(msg, publicId,
                    systemId, -1, -1, new IOException(msg)));
        }
        // From here on use the serialized form of the parsed URL.
        systemId = url.toString();
        try {
            m = new HttpGet(systemId);
        } catch (IllegalArgumentException e) {
            throw reportFatal(new SAXParseException(
                    e.getMessage(),
                    publicId,
                    systemId,
                    -1,
                    -1,
                    (IOException) new IOException(e.getMessage()).initCause(e)));
        }
        m.setHeader("User-Agent", userAgent);
        m.setHeader("Accept", buildAccept());
        m.setHeader("Accept-Encoding", "gzip");
        log4j.info(systemId);
        HttpResponse response = client.execute(m);
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != 200) {
            String msg = "HTTP resource not retrievable. The HTTP status from the remote server was: "
                    + statusCode + ".";
            throw reportFatal(new SAXParseException(msg, publicId,
                    m.getURI().toString(), -1, -1, new IOException(msg)));
        }
        HttpEntity entity = response.getEntity();
        long len = entity.getContentLength();
        if (sizeLimit > -1 && len > sizeLimit) {
            throw reportFatal(new SAXParseException(
                    "Resource size exceeds limit.",
                    publicId,
                    m.getURI().toString(),
                    -1,
                    -1,
                    new StreamBoundException("Resource size exceeds limit.")));
        }
        TypedInputSource is;
        Header ct = response.getFirstHeader("Content-Type");
        String contentType = null;
        final String baseUri = m.getURI().toString();
        if (ct != null) {
            contentType = ct.getValue();
        }
        is = contentTypeParser.buildTypedInputSource(baseUri, publicId,
                contentType);
        Header cl = response.getFirstHeader("Content-Language");
        if (cl != null) {
            is.setLanguage(cl.getValue().trim());
        }
        Header xuac = response.getFirstHeader("X-UA-Compatible");
        if (xuac != null) {
            String val = xuac.getValue().trim();
            if (!"ie=edge".equalsIgnoreCase(val)) {
                SAXParseException spe = new SAXParseException(
                        "X-UA-Compatible HTTP header must have the value \u201CIE=edge\u201D,"
                                + " was \u201C" + val + "\u201D.",
                        publicId, systemId, -1, -1);
                // Guarded like every other handler call in this method;
                // without the check a null handler caused an NPE here.
                if (errorHandler != null) {
                    errorHandler.error(spe);
                }
            }
        }
        // The anonymous observer below needs a final reference.
        final HttpGet meth = m;
        InputStream stream = entity.getContent();
        if (sizeLimit > -1) {
            stream = new BoundedInputStream(stream, sizeLimit, baseUri);
        }
        Header ce = response.getFirstHeader("Content-Encoding");
        if (ce != null) {
            String val = ce.getValue().trim();
            if ("gzip".equalsIgnoreCase(val)
                    || "x-gzip".equalsIgnoreCase(val)) {
                stream = new GZIPInputStream(stream);
                // Bound the decompressed bytes as well as the raw ones.
                if (sizeLimit > -1) {
                    stream = new BoundedInputStream(stream, sizeLimit,
                            baseUri);
                }
            }
        }
        is.setByteStream(new ObservableInputStream(stream,
                new StreamObserver() {
                    private final Logger log4j = Logger.getLogger("nu.validator.xml.PrudentEntityResolver.StreamObserver");

                    // Ensures the connection is given back only once.
                    private boolean released = false;

                    public void closeCalled() {
                        log4j.debug("closeCalled");
                        if (!released) {
                            log4j.debug("closeCalled, not yet released");
                            released = true;
                            try {
                                meth.releaseConnection();
                            } catch (Exception e) {
                                log4j.debug(
                                        "closeCalled, releaseConnection", e);
                            }
                        }
                    }

                    public void exceptionOccurred(Exception ex)
                            throws IOException {
                        abortOnce("exceptionOccurred");
                        if (ex instanceof SystemIdIOException) {
                            throw (SystemIdIOException) ex;
                        } else if (ex instanceof IOException) {
                            IOException ioe = (IOException) ex;
                            throw new SystemIdIOException(baseUri,
                                    ioe.getMessage(), ioe);
                        } else if (ex instanceof RuntimeException) {
                            throw (RuntimeException) ex;
                        } else {
                            throw new RuntimeException(
                                    "API contract violation. Wrong exception type.",
                                    ex);
                        }
                    }

                    public void finalizerCalled() {
                        abortOnce("finalizerCalled");
                    }

                    // Aborts the request and releases its connection
                    // unless that has already happened; "caller" only
                    // prefixes the debug messages.
                    private void abortOnce(String caller) {
                        if (released) {
                            return;
                        }
                        released = true;
                        try {
                            meth.abort();
                        } catch (Exception e) {
                            log4j.debug(caller + ", abort", e);
                        } finally {
                            try {
                                meth.releaseConnection();
                            } catch (Exception e) {
                                log4j.debug(caller + ", releaseConnection",
                                        e);
                            }
                        }
                    }
                }));
        return is;
    } catch (IOException e) {
        abortAndRelease(m);
        throw e;
    } catch (SAXException e) {
        abortAndRelease(m);
        throw e;
    } catch (RuntimeException e) {
        abortAndRelease(m);
        throw e;
    }
}

/**
 * Passes a fatal problem to the error handler, if one is set, and hands
 * the same exception back so that the caller can throw it.
 *
 * @param spe
 *            the problem to report
 * @return {@code spe}
 * @throws SAXException
 *             if the error handler itself throws
 */
private SAXParseException reportFatal(SAXParseException spe)
        throws SAXException {
    if (errorHandler != null) {
        errorHandler.fatalError(spe);
    }
    return spe;
}

/**
 * Best-effort cleanup of a failed request: aborts it and releases its
 * connection, logging (not propagating) anything thrown on the way.
 *
 * @param m
 *            the request to clean up; null is ignored
 */
private static void abortAndRelease(HttpGet m) {
    if (m == null) {
        return;
    }
    try {
        m.abort();
    } catch (Exception ex) {
        log4j.debug("abort", ex);
    } finally {
        try {
            m.releaseConnection();
        } catch (Exception ex) {
            log4j.debug("releaseConnection", ex);
        }
    }
}
/**
 * Reports the current value of the {@code allowRnc} flag.
 *
 * @return the flag as last set, {@code false} by default
 */
public boolean isAllowRnc() {
    return this.allowRnc;
}
/**
 * Updates the {@code allowRnc} flag on this resolver and forwards the
 * same value to its content-type parser.
 *
 * @param allowRnc
 *            the new flag value
 */
public void setAllowRnc(boolean allowRnc) {
    this.allowRnc = allowRnc;
    contentTypeParser.setAllowRnc(allowRnc);
}
/**
 * Updates the {@code allowHtml} flag on this resolver and forwards the
 * same value to its content-type parser.
 *
 * @param allowHtml
 *            the new flag value
 */
public void setAllowHtml(boolean allowHtml) {
    this.allowHtml = allowHtml;
    contentTypeParser.setAllowHtml(allowHtml);
}
/**
 * Reports the current value of the {@code acceptAllKnownXmlTypes} flag.
 *
 * @return the flag as last set, {@code false} by default
 */
public boolean isAcceptAllKnownXmlTypes() {
    return this.acceptAllKnownXmlTypes;
}
/**
 * Updates the {@code acceptAllKnownXmlTypes} flag on this resolver and
 * forwards the same value to its content-type parser.
 *
 * @param acceptAllKnownXmlTypes
 *            the new flag value
 */
public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
    this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
    contentTypeParser.setAcceptAllKnownXmlTypes(acceptAllKnownXmlTypes);
}
/**
 * Reports the current value of the {@code allowGenericXml} flag.
 *
 * @return the flag as last set, {@code true} by default
 */
public boolean isAllowGenericXml() {
    return this.allowGenericXml;
}
/**
 * Updates the {@code allowGenericXml} flag on this resolver and forwards
 * the same value to its content-type parser.
 *
 * @param allowGenericXml
 *            the new flag value
 */
public void setAllowGenericXml(boolean allowGenericXml) {
    this.allowGenericXml = allowGenericXml;
    contentTypeParser.setAllowGenericXml(allowGenericXml);
}
/**
 * Reports the current value of the {@code allowXhtml} flag.
 *
 * @return the flag as last set, {@code false} by default
 */
public boolean isAllowXhtml() {
    return this.allowXhtml;
}
/**
 * Updates the {@code allowXhtml} flag on this resolver and forwards the
 * same value to its content-type parser.
 *
 * @param allowXhtml
 *            the new flag value
 */
public void setAllowXhtml(boolean allowXhtml) {
    this.allowXhtml = allowXhtml;
    contentTypeParser.setAllowXhtml(allowXhtml);
}
/**
 * Supplies the value for the {@code Accept} request header. The value is
 * fixed and does not depend on any of the allow-flags.
 *
 * @return the header value
 */
private String buildAccept() {
    final String acceptHeader = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    return acceptHeader;
}
/**
 * Reports the current value of the {@code allowHtml} flag.
 *
 * @return the flag as last set, {@code false} by default
 */
public boolean isAllowHtml() {
    return this.allowHtml;
}
/**
 * Tells whether generic XML, RNC and XHTML are all disallowed. The
 * {@code allowHtml} flag itself is not consulted.
 *
 * @return {@code true} if none of the three non-HTML flags is set
 */
public boolean isOnlyHtmlAllowed() {
    return !(isAllowGenericXml() || isAllowRnc() || isAllowXhtml());
}
}