com.jaeksoft.searchlib.crawler.web.spider.HttpAbstract Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013-2016 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.web.spider;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.crawler.web.database.CookieItem;
import com.jaeksoft.searchlib.crawler.web.database.CredentialItem;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeDateFormat;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeSimpleDateFormat;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.cifs.NTLMSchemeFactory;
import org.apache.commons.collections.CollectionUtils;
import org.apache.http.*;
import org.apache.http.auth.AuthSchemeProvider;
import org.apache.http.client.CookieStore;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.RedirectStrategy;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.auth.BasicSchemeFactory;
import org.apache.http.impl.auth.DigestSchemeFactory;
import org.apache.http.impl.auth.KerberosSchemeFactory;
import org.apache.http.impl.auth.SPNegoSchemeFactory;
import org.apache.http.impl.client.*;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.http.util.EntityUtils;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.text.ParseException;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
public abstract class HttpAbstract {
// ---- Configuration and collaborators: assigned once by the constructor ----

/** Timeout in milliseconds, used for the socket, connect and connection-request timeouts. */
private final int msTimeOut;

/** Whether the client is allowed to follow HTTP redirects. */
private final boolean followRedirect;

/** The HTTP client used for every request; closed by {@link #release()}. */
private final CloseableHttpClient httpClient;

/** Strategy used by {@link #getRedirectLocation()} to interpret redirect responses. */
private final RedirectStrategy redirectStrategy;

/** Execution context shared by all requests; it carries the cookie store. */
private final HttpClientContext httpClientContext;

/** Optional proxy configuration; may be null. */
private final ProxyHandler proxyHandler;

/** Proxy picked from {@link #proxyHandler} at construction time; null when there is no handler. */
private final HttpHost proxyHost;

/** Credentials registry handed to the client; cleared or refilled on every execution. */
private final CredentialsProvider credentialsProvider;

/** Cookie store attached to {@link #httpClientContext}. */
private final CookieStore cookieStore;

// ---- State of the last execution: null until a request has been executed, cleared by reset() ----

/** Response of the last executed request. */
private HttpResponse httpResponse;

/** The last executed request. */
private HttpRequestBase httpBaseRequest;

/** Entity of the last response; null when the response carried none. */
private HttpEntity httpEntity;

/** Status line of the last response. */
private StatusLine statusLine;
/**
 * Builds the HTTP client, its authentication scheme registry, and the context/cookie store shared by
 * all requests issued through this instance.
 *
 * @param userAgent       value of the User-Agent header; ignored when null or empty after trimming
 * @param bFollowRedirect true to let the client follow redirects, false to disable redirect handling
 * @param proxyHandler    optional proxy configuration, may be null
 * @param msTimeOut       timeout in milliseconds (socket timeout; the connection time-to-live is twice
 *                        this value)
 * @throws IOException if the SSL context cannot be created
 */
public HttpAbstract(String userAgent, boolean bFollowRedirect, ProxyHandler proxyHandler, int msTimeOut)
        throws IOException {
    this.followRedirect = bFollowRedirect;
    this.msTimeOut = msTimeOut;

    HttpClientBuilder builder = HttpClients.custom();

    // Timeout. The time-to-live is computed as a long so that a large timeout cannot overflow an int.
    builder.setDefaultSocketConfig(SocketConfig.custom().setSoTimeout(msTimeOut).build());
    builder.setConnectionTimeToLive(msTimeOut * 2L, TimeUnit.MILLISECONDS);

    // NOTE(review): every certificate is trusted and host names are not verified. Presumably this is
    // deliberate so that sites with broken TLS setups can still be crawled - confirm before reusing
    // this client for anything that is security sensitive.
    final SSLContext sslContext;
    try {
        sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy() {
            @Override
            public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                return true;
            }
        }).build();
    } catch (KeyManagementException e) {
        throw new IOException(e);
    } catch (NoSuchAlgorithmException e) {
        throw new IOException(e);
    } catch (KeyStoreException e) {
        throw new IOException(e);
    }
    builder.setSSLContext(sslContext);
    HostnameVerifier hostnameVerifier = NoopHostnameVerifier.INSTANCE;
    SSLConnectionSocketFactory sslSocketFactory = new SSLConnectionSocketFactory(sslContext, hostnameVerifier);
    builder.setSSLSocketFactory(sslSocketFactory);

    redirectStrategy = new DefaultRedirectStrategy();

    // Only a non-empty user agent overrides the client default.
    if (userAgent != null) {
        final String trimmedUserAgent = userAgent.trim();
        if (trimmedUserAgent.length() > 0) {
            builder.setUserAgent(trimmedUserAgent);
        }
    }

    if (!bFollowRedirect) {
        builder.disableRedirectHandling();
    }

    this.proxyHandler = proxyHandler;
    proxyHost = proxyHandler == null ? null : proxyHandler.getAnyProxy();

    // Authentication schemes the client may negotiate.
    final Registry<AuthSchemeProvider> authSchemeRegistry = RegistryBuilder.<AuthSchemeProvider>create()
            .register(AuthSchemes.NTLM, new NTLMSchemeFactory())
            .register(AuthSchemes.BASIC, new BasicSchemeFactory())
            .register(AuthSchemes.DIGEST, new DigestSchemeFactory())
            .register(AuthSchemes.SPNEGO, new SPNegoSchemeFactory())
            .register(AuthSchemes.KERBEROS, new KerberosSchemeFactory())
            .build();

    credentialsProvider = new BasicCredentialsProvider();
    builder.setDefaultCredentialsProvider(credentialsProvider);
    builder.setDefaultAuthSchemeRegistry(authSchemeRegistry);

    httpClient = builder.build();

    // One context (and therefore one cookie store) is reused across executions.
    httpClientContext = HttpClientContext.create();
    cookieStore = new BasicCookieStore();
    httpClientContext.setCookieStore(cookieStore);
}
/**
 * Clears the state left by the previous execution: request, response, status line and entity.
 * If an entity is still held, its content is consumed first; an I/O failure while doing so is logged
 * and otherwise ignored.
 * <p>
 * All fields are cleared under the instance monitor, the same lock the getters take, so a reader never
 * observes a half-reset state.
 */
protected void reset() {
    synchronized (this) {
        httpResponse = null;
        httpBaseRequest = null;
        if (httpEntity != null) {
            try {
                EntityUtils.consume(httpEntity);
            } catch (IOException e) {
                // Best effort: failing to drain the previous entity must not prevent the reset.
                Logging.warn(e.getMessage(), e);
            }
            httpEntity = null;
        }
        statusLine = null;
    }
}
/**
 * Executes the given request with the shared client and context, then stores the response, its status
 * line and its entity for the getters of this class.
 *
 * @param httpBaseRequest the request to execute
 * @param credentialItem  credentials to install for this request; when null the credentials provider
 *                        is cleared
 * @param cookies         configuration cookies to add to the cookie store; may be null or empty
 * @throws IOException        if the execution fails
 * @throws URISyntaxException declared for callers; nothing visible in this method throws it directly
 */
protected void execute(HttpRequestBase httpBaseRequest, CredentialItem credentialItem, List<CookieItem> cookies)
        throws IOException, URISyntaxException {

    // Filling the cookie store with configuration cookies
    if (!CollectionUtils.isEmpty(cookies)) {
        List<Cookie> cookieList = cookieStore.getCookies();
        for (CookieItem cookie : cookies) {
            Cookie newCookie = cookie.getCookie();
            if (!cookieList.contains(newCookie)) {
                cookieStore.addCookie(newCookie);
            }
        }
    }

    this.httpBaseRequest = httpBaseRequest;

    // The same configured timeout (msTimeOut, in milliseconds) bounds the socket read, the connection
    // establishment and the wait for a connection from the connection manager.
    // Cookies use the STANDARD cookie specification.
    RequestConfig.Builder configBuilder = RequestConfig.custom()
            .setSocketTimeout(msTimeOut)
            .setConnectionRequestTimeout(msTimeOut)
            .setConnectTimeout(msTimeOut)
            .setCookieSpec(CookieSpecs.STANDARD)
            .setRedirectsEnabled(followRedirect);

    if (credentialItem == null) {
        credentialsProvider.clear();
    } else {
        credentialItem.setUpCredentials(credentialsProvider, httpBaseRequest);
    }

    // Route through the proxy only when one is configured and the handler accepts this URI.
    URI uri = httpBaseRequest.getURI();
    if (proxyHandler != null && proxyHost != null && proxyHandler.isProxy(uri)) {
        proxyHandler.applyProxy(configBuilder, proxyHost, credentialsProvider);
    }

    httpBaseRequest.setConfig(configBuilder.build());

    httpResponse = httpClient.execute(httpBaseRequest, httpClientContext);
    if (httpResponse == null) {
        return;
    }
    statusLine = httpResponse.getStatusLine();
    httpEntity = httpResponse.getEntity();
}
public URI getRedirectLocation() {
synchronized (this) {
if (httpResponse == null)
return null;
try {
if (!redirectStrategy.isRedirected(httpBaseRequest, httpResponse, httpClientContext)) {
Object redirects = httpClientContext.getAttribute(HttpClientContext.REDIRECT_LOCATIONS);
if (redirects == null)
return null;
if (redirects instanceof List>) {
List> redirectCollection = (List>) redirects;
if (CollectionUtils.isEmpty(redirectCollection))
return null;
redirects = redirectCollection.get(redirectCollection.size() - 1);
}
if (redirects instanceof URI)
return ((URI) redirects);
else
return new URI(redirects.toString());
}
HttpUriRequest httpUri = redirectStrategy.getRedirect(httpBaseRequest, httpResponse, httpClientContext);
if (httpUri == null)
return null;
return httpUri.getURI();
} catch (ProtocolException e) {
Logging.info(e);
return null;
} catch (URISyntaxException e) {
Logging.info(e);
return null;
}
}
}
/**
 * Returns the content length of the last response.
 * <p>
 * The entity's own length is returned when an entity is present. Otherwise the Content-Length header
 * is parsed.
 *
 * @return the content length, or null when there is no response, no header, or the header value is not
 *         a valid number (the parse failure is logged)
 */
final public Long getContentLength() {
    synchronized (this) {
        if (httpEntity != null) {
            return httpEntity.getContentLength();
        }
        // No request executed yet, or the state has been reset.
        if (httpResponse == null) {
            return null;
        }
        Header header = httpResponse.getFirstHeader("Content-Length");
        if (header == null) {
            return null;
        }
        String value = header.getValue();
        if (value == null) {
            return null;
        }
        try {
            return Long.valueOf(value.trim());
        } catch (NumberFormatException e) {
            // The header comes from a remote server and may be malformed.
            Logging.warn(e);
            return null;
        }
    }
}
/**
 * Extracts the file name announced by the Content-Disposition header of the last response.
 * <p>
 * The value is the text following {@code filename=} up to the next {@code ;} (or the end of the header),
 * with every double quote removed.
 *
 * @return the file name, or null when there is no response, no header, no header value, or no
 *         {@code filename=} parameter
 */
public String getContentDispositionFilename() {
    synchronized (this) {
        if (httpResponse == null) {
            return null;
        }
        Header header = httpResponse.getFirstHeader("Content-Disposition");
        if (header == null) {
            return null;
        }
        String s = header.getValue();
        if (s == null) {
            return null;
        }
        int i1 = s.indexOf("filename=");
        if (i1 == -1) {
            return null;
        }
        // Skip over "filename=" itself (9 characters).
        i1 += 9;
        int i2 = s.indexOf(";", i1);
        String f = (i2 == -1) ? s.substring(i1) : s.substring(i1, i2);
        return f.replace("\"", "");
    }
}
/**
 * Returns the content type of the last response without its parameters, i.e. the part of the
 * Content-Type value that precedes the first {@code ;}.
 * <p>
 * The entity's content type is preferred; the response's Content-Type header is the fallback.
 *
 * @return the base content type, or null when no content type is available
 */
public String getContentBaseType() {
    synchronized (this) {
        Header header = null;
        if (httpEntity != null) {
            header = httpEntity.getContentType();
        }
        // Fall back to the raw response header, if a response is available at all.
        if (header == null && httpResponse != null) {
            header = httpResponse.getFirstHeader("Content-Type");
        }
        if (header == null) {
            return null;
        }
        String v = header.getValue();
        if (v == null) {
            return null;
        }
        int i = v.indexOf(';');
        if (i == -1) {
            return v;
        }
        return v.substring(0, i);
    }
}
// Date layouts accepted when parsing HTTP date headers, tried in this order:
// Sun, 06 Nov 1994 08:49:37 GMT ; RFC 822, updated by RFC 1123
// Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036
// Sun Nov 6 08:49:37 1994 ; ANSI C asctime() format
private final static String[] LastModifiedDateFormats = {
        "EEE, dd MMM yyyy HH:mm:ss zzz",
        "EEE, dd MMM yyyy HH:mm:ss z",
        "EEEE, dd-MMM-yy HH:mm:ss z",
        "EEE MMM d HH:mm:ss yyyy"
};

// Two parsers per layout: the English one first, then one built without an explicit locale
// (presumably the platform default locale - see ThreadSafeSimpleDateFormat).
private final static ThreadSafeDateFormat[] httpDatesFormats;

static {
    final int layoutCount = LastModifiedDateFormats.length;
    httpDatesFormats = new ThreadSafeDateFormat[layoutCount * 2];
    for (int k = 0; k < layoutCount; k++) {
        final String layout = LastModifiedDateFormats[k];
        httpDatesFormats[2 * k] = new ThreadSafeSimpleDateFormat(layout, Locale.ENGLISH);
        httpDatesFormats[2 * k + 1] = new ThreadSafeSimpleDateFormat(layout);
    }
}
/**
 * Parses the Last-Modified header of the last response.
 * <p>
 * Every known HTTP date format is tried in turn; the first one that parses wins. When none matches,
 * the last parse failure is logged.
 *
 * @return the modification time in milliseconds since the epoch, or null when there is no response,
 *         no header, no header value, or the value cannot be parsed
 */
public Long getLastModified() {
    synchronized (this) {
        // No request executed yet, or the state has been reset.
        if (httpResponse == null) {
            return null;
        }
        Header header = httpResponse.getFirstHeader("Last-Modified");
        if (header == null) {
            return null;
        }
        String v = header.getValue();
        if (v == null) {
            return null;
        }
        ParseException parseException = null;
        for (ThreadSafeDateFormat dateFormat : httpDatesFormats) {
            try {
                return dateFormat.parse(v).getTime();
            } catch (ParseException e) {
                // Remember the failure and try the next format.
                parseException = e;
            }
        }
        if (parseException != null) {
            Logging.warn(parseException);
        }
        return null;
    }
}
/**
 * Manual check of the date parsers: runs a sample HTTP date through every registered format and
 * prints the resulting epoch milliseconds, or the stack trace of the parse failure.
 *
 * @param argv ignored
 * @throws IOException declared by the signature; nothing visible here throws it
 */
public static void main(String[] argv) throws IOException {
    final String sample = "Thu, 21 Feb 2013 20:11:52 GMT";
    for (int k = 0; k < httpDatesFormats.length; k++) {
        try {
            System.out.println(httpDatesFormats[k].parse(sample).getTime());
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Returns the name of the charset declared by the content type of the last response entity.
 *
 * @return the charset name, or null when there is no entity, no charset, or the Content-Type value is
 *         malformed or names an unsupported charset (both failures are logged)
 */
public String getContentTypeCharset() {
    synchronized (this) {
        if (httpEntity == null) {
            return null;
        }
        try {
            ContentType ct = ContentType.getOrDefault(httpEntity);
            if (ct == null) {
                return null;
            }
            Charset charset = ct.getCharset();
            if (charset == null) {
                return null;
            }
            return charset.name();
        } catch (UnsupportedCharsetException e) {
            Logging.warn(e);
            return null;
        } catch (org.apache.http.ParseException e) {
            // Fully qualified: the simple name ParseException is java.text.ParseException in this file.
            // A remote server can send a Content-Type that does not parse; treat it as "no charset".
            Logging.warn(e);
            return null;
        }
    }
}
/**
 * Returns the content encoding reported by the entity of the last response.
 *
 * @return the encoding header value, or null when there is no entity or no encoding header
 */
public String getContentEncoding() {
    synchronized (this) {
        final Header encoding = httpEntity != null ? httpEntity.getContentEncoding() : null;
        return encoding != null ? encoding.getValue() : null;
    }
}
/**
 * Returns the value of the Content-Location header of the last response.
 *
 * @return the header value, or null when there is no response or no such header
 */
public String getContentLocation() {
    synchronized (this) {
        final Header location = httpResponse != null ? httpResponse.getFirstHeader("Content-Location") : null;
        return location != null ? location.getValue() : null;
    }
}
/**
 * Opens the content stream of the last response entity.
 *
 * @return the entity content stream, or null when the last response has no entity
 * @throws IllegalStateException propagated from the entity
 * @throws IOException           propagated from the entity
 */
protected InputStream getContent() throws IllegalStateException, IOException {
    synchronized (this) {
        return httpEntity != null ? httpEntity.getContent() : null;
    }
}
/**
 * Returns the HTTP status code of the last response.
 *
 * @return the status code, or null when no status line is available
 */
public Integer getStatusCode() {
    synchronized (this) {
        final StatusLine current = statusLine;
        return current != null ? Integer.valueOf(current.getStatusCode()) : null;
    }
}
/**
 * Returns the reason phrase of the last response's status line.
 *
 * @return the reason phrase, or null when no status line is available
 */
public String getReasonPhrase() {
    synchronized (this) {
        final StatusLine current = statusLine;
        return current != null ? current.getReasonPhrase() : null;
    }
}
/**
 * Releases this instance: clears the state of the last execution, then closes the HTTP client.
 */
public void release() {
    synchronized (this) {
        reset();
        if (httpClient == null) {
            return;
        }
        IOUtils.close(httpClient);
    }
}
/**
 * Returns all headers of the last response.
 *
 * @return the headers, or null when there is no response
 */
public Header[] getHeaders() {
    synchronized (this) {
        return httpResponse != null ? httpResponse.getAllHeaders() : null;
    }
}
}