All downloads are free. Search and download functionality uses the official Maven repository.

com.serphacker.serposcope.scraper.http.ScrapClient Maven / Gradle / Ivy

/* 
 * Serposcope - SEO rank checker https://serposcope.serphacker.com/
 * 
 * Copyright (c) 2016 SERP Hacker
 * @author Pierre Nogues 
 * @license https://opensource.org/licenses/MIT MIT License
 */
package com.serphacker.serposcope.scraper.http;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.serphacker.serposcope.scraper.http.extensions.CloseableBasicHttpClientConnectionManager;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientPlainConnectionFactory;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientSSLConnectionFactory;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientSocksAuthenticator;
import com.serphacker.serposcope.scraper.http.proxy.BindProxy;
import com.serphacker.serposcope.scraper.http.proxy.DirectNoProxy;
import com.serphacker.serposcope.scraper.http.proxy.HttpProxy;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.routing.HttpRoutePlanner;
import org.apache.http.conn.routing.RouteInfo;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.serphacker.serposcope.scraper.http.proxy.ScrapProxy;
import com.serphacker.serposcope.scraper.http.proxy.SocksProxy;
import com.serphacker.serposcope.scraper.utils.EncodeUtils;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.entity.StringEntity;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
import org.apache.http.impl.client.RedirectLocations;
import org.apache.http.message.BasicNameValuePair;

/**
 * *
 * not thread safe
 *
 * @author admin
 */
public class ScrapClient implements Closeable, CredentialsProvider {

    public enum PostType {
        URL_ENCODED,
        MULTIPART,
        JSON
    }
    
    private static final Logger LOG = LoggerFactory.getLogger(ScrapClient.class);
    
    private final static ObjectMapper jsonMapper = new ObjectMapper();

    public final static String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0";
    public final static int DEFAULT_TIMEOUT_MS = 30000;
    public final static int DEFAULT_MAX_RESPONSE_LENGTH = (1024 * 1024 * 4) - 1;// 4MB

    CloseableHttpClient client;
    BasicCredentialsProvider credentialProvider = new BasicCredentialsProvider();
    BasicCookieStore basicCookieStore = new BasicCookieStore();
    final CloseableBasicHttpClientConnectionManager connManager;
    ScrapClientPlainConnectionFactory plainConnectionFactory = new ScrapClientPlainConnectionFactory();
    ScrapClientSSLConnectionFactory sslConnectionFactory = new ScrapClientSSLConnectionFactory(plainConnectionFactory);

    String useragent = DEFAULT_USER_AGENT;
    Integer timeoutMS = DEFAULT_TIMEOUT_MS;
    ScrapProxy proxy;
    int maxResponseLength;
    byte[] buffer;
    List
requestHeaders = new ArrayList<>(); Map routes = new HashMap<>(); boolean proxyChangedSinceLastRequest; int maxRedirect = 0; long executionTimeMS; CloseableHttpResponse response; byte[] content; int statusCode; Exception exception; String lastRedirect; class SCliConnectionReuseStrategy extends DefaultConnectionReuseStrategy { @Override public boolean keepAlive(HttpResponse response, HttpContext context) { if (!proxyChangedSinceLastRequest && (proxy == null || (proxy instanceof BindProxy))) { return super.keepAlive(response, context); } else { return false; } } } class SCliHttpRoutePlanner implements HttpRoutePlanner { @Override public HttpRoute determineRoute(HttpHost originaltarget, HttpRequest request, HttpContext context) throws HttpException { boolean ssl = "https".equalsIgnoreCase(originaltarget.getSchemeName()); HttpHost target = routes.getOrDefault(originaltarget, originaltarget); if (proxy == null) { return new HttpRoute(target); } if (proxy instanceof SocksProxy) { SocksProxy socksProxy = (SocksProxy) proxy; context.setAttribute("proxy.socks", new InetSocketAddress(socksProxy.getIp(), socksProxy.getPort())); return new HttpRoute(target); } if (proxy instanceof BindProxy) { BindProxy bindProxy = (BindProxy) proxy; try { return new HttpRoute(target, InetAddress.getByName(bindProxy.ip), ssl); } catch (UnknownHostException cause) { throw new HttpException("invalid bind ip", cause); } } if (proxy instanceof HttpProxy) { HttpProxy httpProxy = (HttpProxy) proxy; return new HttpRoute( target, null, new HttpHost(httpProxy.getIp(), httpProxy.getPort()), ssl, ssl ? RouteInfo.TunnelType.TUNNELLED : RouteInfo.TunnelType.PLAIN, ssl ? 
RouteInfo.LayerType.LAYERED : RouteInfo.LayerType.PLAIN ); } throw new UnsupportedOperationException("unsupported proxy type : " + proxy); } } public ScrapClient() { setMaxResponseLength(DEFAULT_MAX_RESPONSE_LENGTH); sslConnectionFactory.setInsecure(false); connManager = new CloseableBasicHttpClientConnectionManager( RegistryBuilder.create() .register("http", plainConnectionFactory) .register("https", sslConnectionFactory) .build() ); client = HttpClients .custom() .setRoutePlanner(this.new SCliHttpRoutePlanner()) .setDefaultCredentialsProvider(this) .setDefaultCookieStore(basicCookieStore) .setConnectionReuseStrategy(this.new SCliConnectionReuseStrategy()) .setConnectionManager(connManager) .build(); setTimeout(timeoutMS); } public void addCookie(Cookie cookie) { basicCookieStore.addCookie(cookie); } public void addCookies(Cookie[] cookies) { basicCookieStore.addCookies(cookies); } public void addCookies(Collection cookies) { for (Cookie cooky : cookies) { basicCookieStore.addCookie(cooky); } } public List getCookies() { return basicCookieStore.getCookies(); } public boolean clearExpiredCookies(Date date) { return basicCookieStore.clearExpired(date); } public void clearCookies() { basicCookieStore.clear(); } public String getUseragent() { return useragent; } public void setUseragent(String useragent) { this.useragent = useragent; } public void setProxy(ScrapProxy proxy) { synchronized (connManager) { connManager.closeConnection(); } proxyChangedSinceLastRequest = true; if (proxy != null && proxy instanceof DirectNoProxy) { this.proxy = null; } else { this.proxy = proxy; } if (proxy instanceof SocksProxy) { ScrapClientSocksAuthenticator.INSTANCE.addProxy((SocksProxy) proxy); } } public ScrapProxy getProxy() { return proxy; } public Integer getTimeout() { return timeoutMS; } public final void setTimeout(Integer timeoutMS) { this.timeoutMS = timeoutMS; SocketConfig.Builder newSocketConfig = SocketConfig.custom(); if (timeoutMS != null) { 
newSocketConfig.setSoTimeout(timeoutMS); } connManager.setSocketConfig(newSocketConfig.build()); } public int getMaxResponseLength() { return maxResponseLength; } public final void setMaxResponseLength(int maxResponseLength) { this.maxResponseLength = maxResponseLength + 1; buffer = new byte[this.maxResponseLength]; } public CloseableHttpResponse getResponse() { return response; } public byte[] getContent() { return content; } public String getContentAsString() { if (response == null || content == null) { return null; } Charset charset = getDetectedCharset(); if (charset == null) { charset = Charset.forName("UTF-8"); } return new String(content, charset); } public Charset getDetectedCharset() { ContentType contentType = null; try { contentType = ContentType.get(response.getEntity()); } catch (Exception ex) { } Charset charset = null; if (contentType != null) { try { charset = contentType.getCharset(); } catch (final Exception ex) { } if (charset == null) { if (contentType.getMimeType().contains("text/html")) { charset = detectCharsetFromHtmlMeta(); } } } return charset; } final static Pattern pcharset = Pattern.compile("charset=['\"]?([^\"'\\s]+)"); protected Charset detectCharsetFromHtmlMeta() { if (content == null) { return null; } int len = content.length > 4096 ? 
4096 : content.length; Matcher matcher = pcharset.matcher(new ByteCharSequence(content, 0, len)); if (matcher.find()) { try { return Charset.forName(matcher.group(1)); } catch (Exception ex) { } } return null; } public String getResponseHeader(String key) { if (response == null) { return null; } Header header = response.getFirstHeader(key); if (header == null) { return null; } return header.getValue(); } public int getStatusCode() { return statusCode; } public Exception getException() { return exception; } public int get(String url) { return get(url, null); } public int get(String url, String referrer) { HttpGet request = new HttpGet(url); if (referrer != null) { request.addHeader("Referer", referrer); } return request(request); } public int post(String url, Map data, PostType dataType) { return post(url, data, dataType, null); } public int post(String url, Map data, PostType dataType, String charset) { return post(url, data, dataType, charset, null); } public int post(String url, Map data, PostType dataType, String charset, String referrer) { clearPreviousRequest(); HttpPost request = new HttpPost(url); HttpEntity entity = null; if (charset == null) { charset = "utf-8"; } Charset detectedCharset = null; try { detectedCharset = Charset.forName(charset); } catch (Exception ex) { LOG.warn("invalid charset name {}, switching to utf-8"); detectedCharset = Charset.forName("utf-8"); } data = handleUnsupportedEncoding(data, detectedCharset); switch (dataType) { case JSON: try { String json = jsonMapper.writeValueAsString(data); entity = new StringEntity(json, ContentType.create("application/json", "utf-8")); }catch(Exception ex){ statusCode = -1; exception = ex; return statusCode; } break; case URL_ENCODED: List formparams = new ArrayList<>(); for (Map.Entry entry : data.entrySet()) { if (entry.getValue() instanceof String) { formparams.add(new BasicNameValuePair(entry.getKey(), (String) entry.getValue())); } else { LOG.warn("trying to url encode non string data"); 
formparams.add(new BasicNameValuePair(entry.getKey(), entry.getValue().toString())); } } try { entity = new UrlEncodedFormEntity(formparams, detectedCharset); } catch (Exception ex) { statusCode = -1; exception = ex; return statusCode; } break; case MULTIPART: MultipartEntityBuilder builder = MultipartEntityBuilder.create() .setCharset(detectedCharset) .setMode(HttpMultipartMode.BROWSER_COMPATIBLE); ContentType formDataCT = ContentType.create("form-data", detectedCharset); // formDataCT = ContentType.DEFAULT_TEXT; for (Map.Entry entry : data.entrySet()) { String key = entry.getKey(); if (entry.getValue() instanceof String) { builder = builder.addTextBody(key, (String) entry.getValue(), formDataCT); } else if (entry.getValue() instanceof byte[]) { builder = builder.addBinaryBody(key, (byte[]) entry.getValue()); } else if (entry.getValue() instanceof ContentBody) { builder = builder.addPart(key, (ContentBody) entry.getValue()); } else { exception = new UnsupportedOperationException("unssuported body type " + entry.getValue().getClass()); return statusCode = -1; } } entity = builder.build(); break; default: exception = new UnsupportedOperationException("unspported PostType " + dataType); return statusCode = -1; } request.setEntity(entity); if (referrer != null) { request.addHeader("Referer", referrer); } return request(request); } protected Map handleUnsupportedEncoding(Map data, Charset detectedCharset) { Map cleanedData = new HashMap<>(); boolean hasUnsupportedEncoding = false; for (Map.Entry entry : data.entrySet()) { String key = entry.getKey(); Object value = entry.getValue(); if (!EncodeUtils.canEncode(key, detectedCharset.name())) { hasUnsupportedEncoding = true; key = EncodeUtils.forceASCII(key); } if (value instanceof String) { if (!EncodeUtils.canEncode((String) value, detectedCharset.name())) { hasUnsupportedEncoding = true; value = EncodeUtils.forceASCII((String) value); } } cleanedData.put(key, value); } if (hasUnsupportedEncoding) { LOG.warn("failed to 
encode some post data to {} forced to ascii", detectedCharset.name()); } return cleanedData; } protected void clearPreviousRequest() { content = null; exception = null; response = null; statusCode = 0; lastRedirect = null; } public int request(HttpRequestBase request) { synchronized (connManager) { try { clearPreviousRequest(); executionTimeMS = System.currentTimeMillis(); HttpClientContext context = HttpClientContext.create(); initializeRequest(request, context); response = client.execute(request, context); statusCode = response.getStatusLine().getStatusCode(); RedirectLocations redirects = context.getAttribute(HttpClientContext.REDIRECT_LOCATIONS, RedirectLocations.class); if(redirects != null && !redirects.isEmpty()){ lastRedirect = redirects.get(redirects.size()-1).toString(); } HttpEntity entity = response.getEntity(); long contentLength = entity.getContentLength(); if (contentLength > maxResponseLength) { throw new ResponseTooBigException( "content length (" + contentLength + ") " + "is greater than max response leength (" + maxResponseLength + ")" ); } InputStream stream = entity.getContent(); int totalRead = 0; int read = 0; while (totalRead < maxResponseLength && (read = stream.read(buffer, totalRead, maxResponseLength - totalRead)) != -1) { totalRead += read; } if (totalRead == maxResponseLength && read != 0) { throw new ResponseTooBigException("already read " + totalRead + " bytes"); } content = Arrays.copyOfRange(buffer, 0, totalRead); } catch (Exception ex) { content = null; statusCode = -1; exception = ex; } finally { proxyChangedSinceLastRequest = false; closeResponse(); executionTimeMS = System.currentTimeMillis() - executionTimeMS; } return statusCode; } } protected void initializeRequest(HttpRequestBase request, HttpClientContext context){ if (request.getFirstHeader("user-agent") == null) { request.setHeader("User-Agent", useragent); } for (Header requestHeader : requestHeaders) { request.setHeader(requestHeader); } RequestConfig.Builder 
configBuilder = RequestConfig.copy(request.getConfig() == null ? RequestConfig.DEFAULT : request.getConfig()); if (timeoutMS != null) { configBuilder.setConnectTimeout(timeoutMS); configBuilder.setConnectionRequestTimeout(timeoutMS); configBuilder.setSocketTimeout(timeoutMS); } if(maxRedirect == 0){ configBuilder.setRedirectsEnabled(false); } else { configBuilder.setMaxRedirects(maxRedirect); } RequestConfig config = configBuilder.build(); context.setAttribute(HttpClientContext.REQUEST_CONFIG, config); request.setConfig(config); } public void closeResponse() { if (response != null) { try { response.close(); } catch (Exception ex) { LOG.warn("Exception while closing response", ex); } } } @Override public void close() throws IOException { closeResponse(); if (client != null) { client.close(); } } public void setRoute(HttpHost to, HttpHost via) { routes.put(to, via); } public void removeRouteVia(HttpHost host) { routes.remove(host); } public void removeRoutesTo(String host) { routes.entrySet().removeIf((Map.Entry t) -> host.equals(t.getValue().getHostName())); } public void removeRoutes() { routes.clear(); } @Override public Credentials getCredentials(AuthScope authscope) { if (proxy != null && proxy instanceof HttpProxy) { HttpProxy httpProxy = (HttpProxy) proxy; if (httpProxy.getIp().equals(authscope.getHost()) && httpProxy.getPort() == authscope.getPort() && httpProxy.getUsername() != null && httpProxy.getPassword() != null) { return new UsernamePasswordCredentials(httpProxy.getUsername(), httpProxy.getPassword()); } } return credentialProvider.getCredentials(authscope); } @Override public void setCredentials(AuthScope scope, Credentials auth) { credentialProvider.setCredentials(scope, auth); } @Override public void clear() { credentialProvider.clear(); } public void setRequestHeader(Header header) { removeRequestHeadersByName(header.getName()); requestHeaders.add(header); } public void removeRequestHeadersByName(String name) { requestHeaders.removeIf((Header t) -> 
t.getName().toLowerCase().equals(name.toLowerCase())); } public long getExecutionTimeMS() { return executionTimeMS; } public boolean isInsecureSSL() { return sslConnectionFactory.isInsecure(); } public void setInsecureSSL(boolean insecureSSL) { this.sslConnectionFactory.setInsecure(insecureSSL); } public int getMaxRedirect() { return maxRedirect; } public void setMaxRedirect(int maxRedirect) { this.maxRedirect = maxRedirect; } public void enableFollowRedirect(){ maxRedirect = 10; } public void disableFollowRedirect(){ maxRedirect = 0; } public String getLastRedirect() { return lastRedirect; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy