
// com.serphacker.serposcope.scraper.http.ScrapClient Maven / Gradle / Ivy
/*
* Serposcope - SEO rank checker https://serposcope.serphacker.com/
*
* Copyright (c) 2016 SERP Hacker
* @author Pierre Nogues
* @license https://opensource.org/licenses/MIT MIT License
*/
package com.serphacker.serposcope.scraper.http;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.serphacker.serposcope.scraper.http.extensions.CloseableBasicHttpClientConnectionManager;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientPlainConnectionFactory;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientSSLConnectionFactory;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientSocksAuthenticator;
import com.serphacker.serposcope.scraper.http.proxy.BindProxy;
import com.serphacker.serposcope.scraper.http.proxy.DirectNoProxy;
import com.serphacker.serposcope.scraper.http.proxy.HttpProxy;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.routing.HttpRoutePlanner;
import org.apache.http.conn.routing.RouteInfo;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.serphacker.serposcope.scraper.http.proxy.ScrapProxy;
import com.serphacker.serposcope.scraper.http.proxy.SocksProxy;
import com.serphacker.serposcope.scraper.utils.EncodeUtils;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.entity.StringEntity;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
import org.apache.http.impl.client.RedirectLocations;
import org.apache.http.message.BasicNameValuePair;
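/**
 * HTTP client wrapper used for scraping: it issues GET/POST requests through an
 * optional proxy and keeps the outcome of the last request (status code, body,
 * exception, last redirect) on the instance.
 *
 * <p>Not thread safe.
 *
 * @author admin
 */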
public class ScrapClient implements Closeable, CredentialsProvider {
/** Body encodings that {@code post(...)} knows how to build. */
public enum PostType {
    /** Data sent as an URL-encoded form entity. */
    URL_ENCODED,
    /** Data sent as a multipart entity. */
    MULTIPART,
    /** Data serialized to a JSON string entity. */
    JSON
}
private static final Logger LOG = LoggerFactory.getLogger(ScrapClient.class);
/** Shared mapper used to serialize JSON post bodies. */
private static final ObjectMapper jsonMapper = new ObjectMapper();
public static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0";
public static final int DEFAULT_TIMEOUT_MS = 30000;
public static final int DEFAULT_MAX_RESPONSE_LENGTH = (1024 * 1024 * 4) - 1;// 4MB

// --- HTTP client plumbing ---
CloseableHttpClient client;
BasicCredentialsProvider credentialProvider = new BasicCredentialsProvider();
BasicCookieStore basicCookieStore = new BasicCookieStore();
final CloseableBasicHttpClientConnectionManager connManager;
ScrapClientPlainConnectionFactory plainConnectionFactory = new ScrapClientPlainConnectionFactory();
ScrapClientSSLConnectionFactory sslConnectionFactory = new ScrapClientSSLConnectionFactory(plainConnectionFactory);

// --- configuration ---
String useragent = DEFAULT_USER_AGENT;
Integer timeoutMS = DEFAULT_TIMEOUT_MS;
ScrapProxy proxy;
/** Size of {@link #buffer}; setMaxResponseLength stores the requested limit plus one. */
int maxResponseLength;
/** Reusable read buffer for response bodies. */
byte[] buffer;
/** Headers applied to every outgoing request. */
List<Header> requestHeaders = new ArrayList<>();
/** Host overrides consulted by the route planner: requested host -> host actually used. */
Map<HttpHost, HttpHost> routes = new HashMap<>();
/** Set by setProxy, reset after each request; disables connection reuse while true. */
boolean proxyChangedSinceLastRequest;
/** Maximum number of redirects to follow; 0 disables redirect following. */
int maxRedirect = 0;

// --- state of the last request ---
long executionTimeMS;
CloseableHttpResponse response;
byte[] content;
int statusCode;
Exception exception;
String lastRedirect;
/**
 * Connection reuse strategy: a connection may only be kept alive when the proxy has not
 * changed since the last request and the client uses no proxy or a {@link BindProxy}.
 * In that case the default strategy decides; otherwise the connection is never reused.
 */
class SCliConnectionReuseStrategy extends DefaultConnectionReuseStrategy {
    @Override
    public boolean keepAlive(HttpResponse response, HttpContext context) {
        boolean reusableProxy = proxy == null || proxy instanceof BindProxy;
        if (proxyChangedSinceLastRequest || !reusableProxy) {
            return false;
        }
        return super.keepAlive(response, context);
    }
}
/**
 * Route planner that turns the client's current proxy setting into an {@link HttpRoute}.
 *
 * <p>The requested host is first mapped through {@code routes} (falling back to itself).
 * Every branch now marks the route as secure when the requested scheme is https; the
 * direct and SOCKS branches previously used {@code new HttpRoute(target)}, which always
 * builds a non-secure route, unlike the bind and HTTP proxy branches.
 */
class SCliHttpRoutePlanner implements HttpRoutePlanner {
    /**
     * @param originaltarget host the request was addressed to
     * @param request        the request being routed (unused)
     * @param context        execution context; receives the {@code proxy.socks} attribute for SOCKS proxies
     * @return the route to use for this request
     * @throws HttpException if the bind proxy IP cannot be resolved
     * @throws UnsupportedOperationException if the proxy type is not handled
     */
    @Override
    public HttpRoute determineRoute(HttpHost originaltarget, HttpRequest request, HttpContext context) throws HttpException {
        boolean ssl = "https".equalsIgnoreCase(originaltarget.getSchemeName());
        HttpHost target = routes.getOrDefault(originaltarget, originaltarget);

        if (proxy == null) {
            // direct connection: keep the secure flag consistent with the scheme
            return new HttpRoute(target, null, ssl);
        }

        if (proxy instanceof SocksProxy) {
            SocksProxy socksProxy = (SocksProxy) proxy;
            // the SOCKS endpoint is handed over through the context rather than the route
            context.setAttribute("proxy.socks", new InetSocketAddress(socksProxy.getIp(), socksProxy.getPort()));
            return new HttpRoute(target, null, ssl);
        }

        if (proxy instanceof BindProxy) {
            BindProxy bindProxy = (BindProxy) proxy;
            try {
                // the bind proxy IP is used as the local address of the route
                return new HttpRoute(target, InetAddress.getByName(bindProxy.ip), ssl);
            } catch (UnknownHostException cause) {
                throw new HttpException("invalid bind ip", cause);
            }
        }

        if (proxy instanceof HttpProxy) {
            HttpProxy httpProxy = (HttpProxy) proxy;
            // https through an HTTP proxy is tunnelled and layered, plain http is neither
            return new HttpRoute(
                target,
                null,
                new HttpHost(httpProxy.getIp(), httpProxy.getPort()),
                ssl,
                ssl ? RouteInfo.TunnelType.TUNNELLED : RouteInfo.TunnelType.PLAIN,
                ssl ? RouteInfo.LayerType.LAYERED : RouteInfo.LayerType.PLAIN
            );
        }

        throw new UnsupportedOperationException("unsupported proxy type : " + proxy);
    }
}
/**
 * Creates a client with the default response size limit and timeout, the SSL factory's
 * insecure flag set to {@code false}, and this instance's own route planner, credentials
 * provider, cookie store and connection reuse strategy.
 */
public ScrapClient() {
    setMaxResponseLength(DEFAULT_MAX_RESPONSE_LENGTH);
    sslConnectionFactory.setInsecure(false);

    // explicit type witness: without it the registry is inferred with the wrong element type
    connManager = new CloseableBasicHttpClientConnectionManager(
        RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", plainConnectionFactory)
            .register("https", sslConnectionFactory)
            .build()
    );

    client = HttpClients
        .custom()
        .setRoutePlanner(this.new SCliHttpRoutePlanner())
        .setDefaultCredentialsProvider(this)
        .setDefaultCookieStore(basicCookieStore)
        .setConnectionReuseStrategy(this.new SCliConnectionReuseStrategy())
        .setConnectionManager(connManager)
        .build();

    // must run after connManager exists: setTimeout pushes the socket config to it
    setTimeout(timeoutMS);
}
/**
 * Adds one cookie to this client's cookie store.
 *
 * @param cookie cookie to add
 */
public void addCookie(Cookie cookie) {
    this.basicCookieStore.addCookie(cookie);
}
/**
 * Adds an array of cookies to this client's cookie store.
 *
 * @param cookies cookies to add
 */
public void addCookies(Cookie[] cookies) {
    this.basicCookieStore.addCookies(cookies);
}
/**
 * Adds every cookie of the collection to this client's cookie store.
 *
 * @param cookies cookies to add
 */
public void addCookies(Collection<? extends Cookie> cookies) {
    // typed collection: iterating a raw Collection as Cookie does not compile
    for (Cookie cookie : cookies) {
        basicCookieStore.addCookie(cookie);
    }
}
/**
 * @return the cookies currently held by this client's cookie store
 */
public List<Cookie> getCookies() {
    return basicCookieStore.getCookies();
}
/**
 * Asks the cookie store to drop the cookies that are expired at the given date.
 *
 * @param date reference date passed to the cookie store
 * @return the value returned by the cookie store's {@code clearExpired}
 */
public boolean clearExpiredCookies(Date date) {
    boolean cleared = basicCookieStore.clearExpired(date);
    return cleared;
}
/** Removes all cookies from this client's cookie store. */
public void clearCookies() {
    this.basicCookieStore.clear();
}
/**
 * @return the User-Agent value sent when a request does not define one itself
 */
public String getUseragent() {
    return this.useragent;
}
/**
 * Sets the User-Agent value applied to requests that do not define one themselves.
 *
 * @param useragent new User-Agent value
 */
public void setUseragent(String useragent) {
    this.useragent = useragent;
}
/**
 * Switches the proxy used for subsequent requests.
 *
 * <p>The current connection is closed first and the change is flagged so that the next
 * request does not reuse a connection. A {@link DirectNoProxy} is stored as {@code null}
 * (no proxy); a {@link SocksProxy} is additionally registered with the SOCKS authenticator.
 *
 * @param proxy proxy to use, or {@code null} for a direct connection
 */
public void setProxy(ScrapProxy proxy) {
    synchronized (connManager) {
        connManager.closeConnection();
    }
    proxyChangedSinceLastRequest = true;

    // instanceof is already false for null, so no separate null check is needed
    this.proxy = (proxy instanceof DirectNoProxy) ? null : proxy;

    if (proxy instanceof SocksProxy) {
        ScrapClientSocksAuthenticator.INSTANCE.addProxy((SocksProxy) proxy);
    }
}
/**
 * @return the proxy currently in use, or {@code null} when connecting directly
 */
public ScrapProxy getProxy() {
    return this.proxy;
}
/**
 * @return the configured timeout in milliseconds, or {@code null} when none is set
 */
public Integer getTimeout() {
    return this.timeoutMS;
}
/**
 * Stores the timeout and pushes a fresh socket configuration to the connection manager.
 *
 * @param timeoutMS timeout in milliseconds; when {@code null} the socket configuration
 *                  is built without an explicit socket timeout
 */
public final void setTimeout(Integer timeoutMS) {
    this.timeoutMS = timeoutMS;

    SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
    if (timeoutMS != null) {
        socketConfigBuilder.setSoTimeout(timeoutMS);
    }
    SocketConfig socketConfig = socketConfigBuilder.build();

    connManager.setSocketConfig(socketConfig);
}
/**
 * Returns the response size limit as it was passed to {@code setMaxResponseLength}.
 *
 * <p>The {@code maxResponseLength} field holds the limit plus one (it is the read buffer
 * size, one byte larger so that an oversized body can be detected), so the extra byte is
 * subtracted here to make getter and setter round-trip.
 *
 * @return the maximum accepted response body length in bytes
 */
public int getMaxResponseLength() {
    return maxResponseLength - 1;
}
/**
 * Sets the response size limit and reallocates the read buffer.
 *
 * <p>The stored value and the buffer are one byte larger than the requested limit.
 *
 * @param maxResponseLength maximum accepted response body length in bytes
 */
public final void setMaxResponseLength(int maxResponseLength) {
    int bufferSize = maxResponseLength + 1;
    this.maxResponseLength = bufferSize;
    this.buffer = new byte[bufferSize];
}
/**
 * @return the response object of the last request, or {@code null} if there is none
 */
public CloseableHttpResponse getResponse() {
    return this.response;
}
/**
 * @return the raw body of the last response, or {@code null} if none was read
 */
public byte[] getContent() {
    return this.content;
}
/**
 * Decodes the last response body using the detected charset, falling back to UTF-8.
 *
 * @return the body as a string, or {@code null} when there is no response or no content
 */
public String getContentAsString() {
    if (response == null || content == null) {
        return null;
    }
    Charset detected = getDetectedCharset();
    Charset effective = detected != null ? detected : Charset.forName("UTF-8");
    return new String(content, effective);
}
/**
 * Determines the charset of the last response on a best-effort basis.
 *
 * <p>The charset of the response entity's content type wins; when it carries none and the
 * mime type contains {@code text/html}, the body is scanned for a charset declaration.
 *
 * @return the detected charset, or {@code null} when nothing could be determined
 */
public Charset getDetectedCharset() {
    ContentType contentType;
    try {
        contentType = ContentType.get(response.getEntity());
    } catch (Exception ex) {
        // best effort: any failure (including a missing response) means "unknown"
        contentType = null;
    }
    if (contentType == null) {
        return null;
    }

    Charset declared = null;
    try {
        declared = contentType.getCharset();
    } catch (final Exception ex) {
        // best effort: fall through to the HTML sniffing below
    }
    if (declared != null) {
        return declared;
    }

    if (contentType.getMimeType().contains("text/html")) {
        return detectCharsetFromHtmlMeta();
    }
    return null;
}
/**
 * Matches a {@code charset=...} declaration and captures the charset name.
 * The name stops at quotes, whitespace, {@code ;}, {@code /} or {@code >} so that unquoted
 * forms such as {@code <meta charset=utf-8>} do not drag the tag terminator into the name;
 * matching is case-insensitive.
 */
final static Pattern pcharset = Pattern.compile("charset=['\"]?([^\"'\\s;/>]+)", Pattern.CASE_INSENSITIVE);

/**
 * Looks for a charset declaration in the first 4096 bytes of the response body.
 *
 * @return the declared charset, or {@code null} when there is no content, no declaration,
 *         or the declared name is not a charset known to this JVM
 */
protected Charset detectCharsetFromHtmlMeta() {
    if (content == null) {
        return null;
    }
    int len = Math.min(content.length, 4096);
    Matcher matcher = pcharset.matcher(new ByteCharSequence(content, 0, len));
    if (!matcher.find()) {
        return null;
    }
    try {
        return Charset.forName(matcher.group(1));
    } catch (IllegalArgumentException ignored) {
        // illegal or unsupported charset name in the page: treat as undetected
        return null;
    }
}
/**
 * Reads the first header with the given name from the last response.
 *
 * @param key header name
 * @return the header value, or {@code null} when there is no response or no such header
 */
public String getResponseHeader(String key) {
    if (response == null) {
        return null;
    }
    Header first = response.getFirstHeader(key);
    return first == null ? null : first.getValue();
}
/**
 * @return the status code of the last request; -1 is stored when the request failed and
 *         0 after the previous request state has been cleared
 */
public int getStatusCode() {
    return this.statusCode;
}
/**
 * @return the exception recorded for the last request, or {@code null} if none
 */
public Exception getException() {
    return this.exception;
}
/**
 * Performs a GET request without a referrer.
 *
 * @param url URL to fetch
 * @return the status code, or -1 when the request failed
 */
public int get(String url) {
    final String noReferrer = null;
    return get(url, noReferrer);
}
/**
 * Performs a GET request, optionally sending a {@code Referer} header.
 *
 * @param url      URL to fetch
 * @param referrer value of the Referer header, or {@code null} to send none
 * @return the status code, or -1 when the request failed
 */
public int get(String url, String referrer) {
    HttpGet httpGet = new HttpGet(url);
    if (referrer != null) {
        httpGet.addHeader("Referer", referrer);
    }
    int status = request(httpGet);
    return status;
}
/**
 * Performs a POST request with the default charset and no referrer.
 *
 * @param url      URL to post to
 * @param data     fields to send, keyed by field name
 * @param dataType how the body is encoded
 * @return the status code, or -1 when the request failed
 */
public int post(String url, Map<String, Object> data, PostType dataType) {
    return post(url, data, dataType, null);
}
/**
 * Performs a POST request without a referrer.
 *
 * @param url      URL to post to
 * @param data     fields to send, keyed by field name
 * @param dataType how the body is encoded
 * @param charset  charset name for the body, or {@code null} for utf-8
 * @return the status code, or -1 when the request failed
 */
public int post(String url, Map<String, Object> data, PostType dataType, String charset) {
    return post(url, data, dataType, charset, null);
}
/**
 * Performs a POST request whose body is built from {@code data}.
 *
 * <p>Keys and string values that cannot be encoded in the target charset are forced to
 * ASCII first. The body is then built according to {@code dataType}:
 * <ul>
 * <li>{@code JSON}: the map serialized as JSON, always declared as utf-8;</li>
 * <li>{@code URL_ENCODED}: one form parameter per entry; non-string values are sent as
 *     their {@code toString()}, {@code null} values as a parameter without a value;</li>
 * <li>{@code MULTIPART}: strings as text parts, {@code byte[]} as binary parts,
 *     {@code ContentBody} as-is; any other value type is rejected.</li>
 * </ul>
 * Failures while building the body are not thrown: the exception is stored and -1 returned.
 *
 * @param url      URL to post to
 * @param data     fields to send, keyed by field name; {@code null} is treated as empty
 * @param dataType how the body is encoded
 * @param charset  charset name for the body; {@code null} or an invalid name means utf-8
 * @param referrer value of the Referer header, or {@code null} to send none
 * @return the status code, or -1 when the request failed
 */
public int post(String url, Map<String, Object> data, PostType dataType, String charset, String referrer) {
    clearPreviousRequest();

    HttpPost request = new HttpPost(url);
    HttpEntity entity = null;

    if (charset == null) {
        charset = "utf-8";
    }

    Charset detectedCharset;
    try {
        detectedCharset = Charset.forName(charset);
    } catch (Exception ex) {
        // the placeholder previously had no argument, so the bad name was never logged
        LOG.warn("invalid charset name {}, switching to utf-8", charset);
        detectedCharset = Charset.forName("utf-8");
    }

    if (dataType == null) {
        // report like any other unsupported type instead of failing on the switch
        exception = new UnsupportedOperationException("unspported PostType " + dataType);
        return statusCode = -1;
    }

    if (data == null) {
        data = new HashMap<>();
    }
    data = handleUnsupportedEncoding(data, detectedCharset);

    switch (dataType) {
        case JSON: {
            try {
                String json = jsonMapper.writeValueAsString(data);
                entity = new StringEntity(json, ContentType.create("application/json", "utf-8"));
            } catch (Exception ex) {
                statusCode = -1;
                exception = ex;
                return statusCode;
            }
            break;
        }

        case URL_ENCODED: {
            List<NameValuePair> formparams = new ArrayList<>();
            for (Map.Entry<String, Object> entry : data.entrySet()) {
                Object value = entry.getValue();
                if (value == null || value instanceof String) {
                    formparams.add(new BasicNameValuePair(entry.getKey(), (String) value));
                } else {
                    LOG.warn("trying to url encode non string data");
                    formparams.add(new BasicNameValuePair(entry.getKey(), value.toString()));
                }
            }
            try {
                entity = new UrlEncodedFormEntity(formparams, detectedCharset);
            } catch (Exception ex) {
                statusCode = -1;
                exception = ex;
                return statusCode;
            }
            break;
        }

        case MULTIPART: {
            MultipartEntityBuilder builder = MultipartEntityBuilder.create()
                .setCharset(detectedCharset)
                .setMode(HttpMultipartMode.BROWSER_COMPATIBLE);
            ContentType formDataCT = ContentType.create("form-data", detectedCharset);

            for (Map.Entry<String, Object> entry : data.entrySet()) {
                String key = entry.getKey();
                Object value = entry.getValue();
                if (value instanceof String) {
                    builder = builder.addTextBody(key, (String) value, formDataCT);
                } else if (value instanceof byte[]) {
                    builder = builder.addBinaryBody(key, (byte[]) value);
                } else if (value instanceof ContentBody) {
                    builder = builder.addPart(key, (ContentBody) value);
                } else {
                    // null values end up here too; avoid calling getClass() on them
                    exception = new UnsupportedOperationException(
                        "unssuported body type " + (value == null ? null : value.getClass()));
                    return statusCode = -1;
                }
            }
            entity = builder.build();
            break;
        }

        default:
            exception = new UnsupportedOperationException("unspported PostType " + dataType);
            return statusCode = -1;
    }

    request.setEntity(entity);
    if (referrer != null) {
        request.addHeader("Referer", referrer);
    }
    return request(request);
}
/**
 * Returns a copy of {@code data} in which every key and every string value that cannot be
 * encoded in the given charset has been forced to ASCII. Non-string values are copied
 * untouched. A single warning is logged when at least one replacement happened.
 *
 * <p>The copy keeps the iteration order of {@code data}, so the order of the posted
 * fields is not shuffled.
 *
 * @param data            fields to check, keyed by field name
 * @param detectedCharset charset the fields will be encoded with
 * @return a new map holding the cleaned entries
 */
protected Map<String, Object> handleUnsupportedEncoding(Map<String, Object> data, Charset detectedCharset) {
    // fully qualified so that no new import is required
    Map<String, Object> cleanedData = new java.util.LinkedHashMap<>();
    String charsetName = detectedCharset.name();
    boolean hasUnsupportedEncoding = false;

    for (Map.Entry<String, Object> entry : data.entrySet()) {
        String key = entry.getKey();
        Object value = entry.getValue();

        if (!EncodeUtils.canEncode(key, charsetName)) {
            hasUnsupportedEncoding = true;
            key = EncodeUtils.forceASCII(key);
        }

        if (value instanceof String && !EncodeUtils.canEncode((String) value, charsetName)) {
            hasUnsupportedEncoding = true;
            value = EncodeUtils.forceASCII((String) value);
        }

        cleanedData.put(key, value);
    }

    if (hasUnsupportedEncoding) {
        LOG.warn("failed to encode some post data to {} forced to ascii", charsetName);
    }
    return cleanedData;
}
/**
 * Resets everything remembered about the previous request: response, content, status
 * code (back to 0), exception and last redirect.
 */
protected void clearPreviousRequest() {
    this.response = null;
    this.content = null;
    this.statusCode = 0;
    this.exception = null;
    this.lastRedirect = null;
}
/**
 * Executes the request and stores its outcome on this instance.
 *
 * <p>On success the status code, the last redirect location (if any) and the body are
 * recorded. A response without an entity now yields an empty body; previously it caused a
 * NullPointerException that turned a valid response into a failure. A body larger than the
 * configured limit, or any other error, is recorded as status -1 with the exception kept
 * for {@code getException()}. The response is always closed and the execution time
 * updated before returning.
 *
 * @param request request to execute
 * @return the HTTP status code, or -1 when the request failed
 */
public int request(HttpRequestBase request) {
    synchronized (connManager) {
        try {
            clearPreviousRequest();
            executionTimeMS = System.currentTimeMillis();

            HttpClientContext context = HttpClientContext.create();
            initializeRequest(request, context);

            response = client.execute(request, context);
            statusCode = response.getStatusLine().getStatusCode();

            RedirectLocations redirects = context.getAttribute(HttpClientContext.REDIRECT_LOCATIONS, RedirectLocations.class);
            if (redirects != null && !redirects.isEmpty()) {
                lastRedirect = redirects.get(redirects.size() - 1).toString();
            }

            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // bodiless response: nothing to read
                content = new byte[0];
            } else {
                long contentLength = entity.getContentLength();
                if (contentLength > maxResponseLength) {
                    throw new ResponseTooBigException(
                        "content length (" + contentLength + ") "
                        + "is greater than max response leength (" + maxResponseLength + ")"
                    );
                }

                InputStream stream = entity.getContent();
                int totalRead = 0;
                int read = 0;
                // the buffer is one byte larger than the limit, so filling it means "too big"
                while (totalRead < maxResponseLength
                    && (read = stream.read(buffer, totalRead, maxResponseLength - totalRead)) != -1) {
                    totalRead += read;
                }

                if (totalRead == maxResponseLength && read != 0) {
                    throw new ResponseTooBigException("already read " + totalRead + " bytes");
                }

                content = Arrays.copyOfRange(buffer, 0, totalRead);
            }
        } catch (Exception ex) {
            content = null;
            statusCode = -1;
            exception = ex;
        } finally {
            proxyChangedSinceLastRequest = false;
            closeResponse();
            // executionTimeMS held the start timestamp until now
            executionTimeMS = System.currentTimeMillis() - executionTimeMS;
        }

        return statusCode;
    }
}
/**
 * Prepares a request before execution: applies the default User-Agent when the request has
 * none, sets the client-wide request headers, and installs a request configuration carrying
 * the timeout and redirect policy on both the request and the context.
 *
 * @param request request to prepare
 * @param context execution context receiving the request configuration
 */
protected void initializeRequest(HttpRequestBase request, HttpClientContext context){
    if (request.getFirstHeader("user-agent") == null) {
        request.setHeader("User-Agent", useragent);
    }
    // client-wide headers replace same-named headers already on the request
    for (Header header : requestHeaders) {
        request.setHeader(header);
    }

    RequestConfig baseConfig = request.getConfig();
    if (baseConfig == null) {
        baseConfig = RequestConfig.DEFAULT;
    }
    RequestConfig.Builder builder = RequestConfig.copy(baseConfig);

    if (timeoutMS != null) {
        // one value drives connect, connection-request and socket timeouts
        builder.setConnectTimeout(timeoutMS);
        builder.setConnectionRequestTimeout(timeoutMS);
        builder.setSocketTimeout(timeoutMS);
    }

    if (maxRedirect != 0) {
        builder.setMaxRedirects(maxRedirect);
    } else {
        builder.setRedirectsEnabled(false);
    }

    RequestConfig requestConfig = builder.build();
    context.setAttribute(HttpClientContext.REQUEST_CONFIG, requestConfig);
    request.setConfig(requestConfig);
}
/**
 * Closes the last response if there is one. A failure while closing is logged as a
 * warning and not propagated.
 */
public void closeResponse() {
    if (response == null) {
        return;
    }
    try {
        response.close();
    } catch (Exception ex) {
        LOG.warn("Exception while closing response", ex);
    }
}
/**
 * Closes the last response and then the underlying HTTP client, if any.
 *
 * @throws IOException if closing the HTTP client fails
 */
@Override
public void close() throws IOException {
    closeResponse();
    if (client == null) {
        return;
    }
    client.close();
}
/**
 * Registers a host override: requests addressed to {@code to} are routed to {@code via}
 * by the route planner. An existing override for {@code to} is replaced.
 *
 * @param to  host as requested
 * @param via host to use instead
 */
public void setRoute(HttpHost to, HttpHost via) {
    this.routes.put(to, via);
}
/**
 * Removes the host override stored under the given key, i.e. the one registered with
 * {@code setRoute(host, ...)}.
 *
 * <p>NOTE(review): despite the name, the lookup is done on the map key (the {@code to}
 * argument of {@code setRoute}), not on the {@code via} value - confirm this is intended.
 *
 * @param host key of the override to remove
 */
public void removeRouteVia(HttpHost host) {
    this.routes.remove(host);
}
/**
 * Removes every host override whose replacement host (the {@code via} argument of
 * {@code setRoute}) has the given host name.
 *
 * @param host host name to match against the replacement hosts; must not be {@code null}
 */
public void removeRoutesTo(String host) {
    // removing through the values view drops the matching map entries
    routes.values().removeIf((HttpHost via) -> host.equals(via.getHostName()));
}
/** Removes all host overrides. */
public void removeRoutes() {
    this.routes.clear();
}
/**
 * Resolves credentials for the given scope.
 *
 * <p>When the current proxy is an {@link HttpProxy} whose address and port match the scope
 * and which has both a username and a password, those are returned; in every other case
 * the lookup is delegated to the wrapped credentials provider.
 *
 * @param authscope scope the credentials are requested for
 * @return matching credentials, or whatever the wrapped provider returns
 */
@Override
public Credentials getCredentials(AuthScope authscope) {
    if (proxy instanceof HttpProxy) {
        HttpProxy httpProxy = (HttpProxy) proxy;
        boolean sameEndpoint = httpProxy.getIp().equals(authscope.getHost())
            && httpProxy.getPort() == authscope.getPort();
        if (sameEndpoint && httpProxy.getUsername() != null && httpProxy.getPassword() != null) {
            return new UsernamePasswordCredentials(httpProxy.getUsername(), httpProxy.getPassword());
        }
    }
    return credentialProvider.getCredentials(authscope);
}
/**
 * Stores credentials for a scope in the wrapped credentials provider.
 *
 * @param scope scope the credentials apply to
 * @param auth  credentials to store
 */
@Override
public void setCredentials(AuthScope scope, Credentials auth) {
    this.credentialProvider.setCredentials(scope, auth);
}
/** Clears the wrapped credentials provider. */
@Override
public void clear() {
    this.credentialProvider.clear();
}
/**
 * Registers a header sent with every request, replacing any previously registered
 * header of the same name.
 *
 * @param header header to send
 */
public void setRequestHeader(Header header) {
    String headerName = header.getName();
    removeRequestHeadersByName(headerName);
    this.requestHeaders.add(header);
}
/**
 * Removes every registered request header whose name matches, ignoring case.
 *
 * <p>Uses {@code equalsIgnoreCase} rather than comparing {@code toLowerCase()} results:
 * the latter depends on the default locale and can miss a match (e.g. "ID" vs "id" under
 * a Turkish locale).
 *
 * @param name header name to remove; {@code null} matches nothing
 */
public void removeRequestHeadersByName(String name) {
    requestHeaders.removeIf((Header header) -> header.getName().equalsIgnoreCase(name));
}
/**
 * @return how long the last request took, in milliseconds
 */
public long getExecutionTimeMS() {
    return this.executionTimeMS;
}
/**
 * @return the insecure flag of the SSL connection factory
 */
public boolean isInsecureSSL() {
    return this.sslConnectionFactory.isInsecure();
}
/**
 * Sets the insecure flag of the SSL connection factory.
 * NOTE(review): presumably {@code true} relaxes certificate checks - confirm in the factory.
 *
 * @param insecureSSL new value of the flag
 */
public void setInsecureSSL(boolean insecureSSL) {
    sslConnectionFactory.setInsecure(insecureSSL);
}
/**
 * @return the maximum number of redirects followed; 0 means redirects are not followed
 */
public int getMaxRedirect() {
    return this.maxRedirect;
}
/**
 * Sets the maximum number of redirects to follow.
 *
 * @param maxRedirect redirect limit; 0 disables redirect following
 */
public void setMaxRedirect(int maxRedirect) {
    this.maxRedirect = maxRedirect;
}
/** Enables redirect following with a limit of 10 redirects. */
public void enableFollowRedirect(){
    this.maxRedirect = 10;
}
/** Disables redirect following by setting the redirect limit to 0. */
public void disableFollowRedirect(){
    this.maxRedirect = 0;
}
/**
 * @return the last redirect location followed during the last request, or {@code null}
 *         when no redirect was followed
 */
public String getLastRedirect() {
    return this.lastRedirect;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy