All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.xbynet.crawler.http.AbsDownloader Maven / Gradle / Ivy

The newest version!
package com.github.xbynet.crawler.http;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.github.xbynet.crawler.IpProxyProvider;
import com.github.xbynet.crawler.Request;
import com.github.xbynet.crawler.RequestAction;
import com.github.xbynet.crawler.Response;
import com.github.xbynet.crawler.Site;
import com.github.xbynet.crawler.Spider;
import com.github.xbynet.crawler.SpiderListener;
import com.github.xbynet.crawler.Const.HttpMethod;
import com.github.xbynet.crawler.utils.CrawlerUtils;

public abstract class AbsDownloader implements Downloader{
	private Logger log=LoggerFactory.getLogger(AbsDownloader.class);
	
	private CloseableHttpClient client;
	private Spider spider;
	
	public AbsDownloader(){
		
	}
	public void init(){
		HttpClientFactory clientFactory=spider.getHttpClientFactory();
		if(clientFactory==null){
			clientFactory=new HttpClientFactory();
		}
		this.client=clientFactory.getClient();
	}
	protected void doDownload(Request request,Object... extras){
		String url=request.getUrl();
		Site site=getSpider().getSite();
		IpProxyProvider ipProxyProvider=getSpider().getIpProvider();
		HttpHost proxy=null;
		if(ipProxyProvider!=null){
			proxy=ipProxyProvider.getIp();
		}
		
		log.debug(getSpider().getName()+",开始请求"+url);
		HttpUriRequest httpUriRequest=generateHttpRequest(site, request, proxy);
		
		Response response=new Response();
		boolean state=cycleRequest(httpUriRequest,request,site,response,extras);
		
		if(!state){
			log.error("no content crawled for "+request.getUrl());
			notifyListener(false,request,null);
			return;
		}
		addContinueRequest(response);
		notifyListener(true,request,null);
		//循环遍历所有分块请求
		List orderReqList=request.getPartRequest();
		while(orderReqList!=null && orderReqList.size()>0){
			Request req=orderReqList.remove(0);
			spider.getScheduler().getDuplicateRemover().isDuplicate(req, spider);
			Response resp=new Response(response);
			state=cycleRequest(generateHttpRequest(site, req, proxy), req, site, resp, extras);
			if(!state){
				log.error("no content crawled for "+req.getUrl());
				notifyListener(false, req, null);
			}else{
				notifyListener(true,req,null);
			}
			addContinueRequest(resp);
		}
	}
	protected void addContinueRequest(Response response){
		List reqlist=response.getContinueReqeusts();
		if(reqlist!=null){
			for(Request req:reqlist){
				spider.getScheduler().push(req, spider);
			}
		}
	}
	protected boolean cycleRequest(HttpUriRequest httpUriRequest,Request request,Site site,Response response,Object... extras){
		boolean state=false;
		try {
			state = doRequest(httpUriRequest, request,site,response,extras);
		} catch (Exception e) {
			log.error("",e);
		}
		int retryCount=request.getRetryCount()>=0?request.getRetryCount():site.getRetry();
		int retrySleepTimes=request.getRetrySleepTime()>=0?request.getRetrySleepTime():site.getRetrySleep();
		int retryIndex=1;
		while(!state && retryIndex headerEntry : site.getHeaders().entrySet()) {
                requestBuilder.setHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
        if (site != null) {
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeout())
                    .setSocketTimeout(site.getTimeout())
                    .setConnectTimeout(site.getTimeout())
                    .setCookieSpec(CookieSpecs.STANDARD);
        }

        if (proxy != null) {
            requestConfigBuilder.setProxy(proxy);
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        HttpUriRequest httpUriRequest = requestBuilder.build();
        if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
            for (Map.Entry header : request.getHeaders().entrySet()) {
                httpUriRequest.setHeader(header.getKey(), header.getValue());
            }
        }
        return httpUriRequest;
	}
	private RequestBuilder selectRequestMethod(Request request) {
        HttpMethod method = request.getMethod();
        if (method == null || method==HttpMethod.GET) {
            return addFormParams(RequestBuilder.get(),request);
        } else if (method==HttpMethod.POST) {
            return addFormParams(RequestBuilder.post(),request);
        } else if (method==HttpMethod.HEAD) {
            return addFormParams(RequestBuilder.head(),request);
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
        if (request.getEntity() != null && "POST".equalsIgnoreCase(requestBuilder.getMethod())) {
            requestBuilder.setEntity(request.getEntity());
        }else if(request.getParams()!=null){
        	List nameValuePairs=new ArrayList();
        	for(String key:request.getParams().keySet()){
        		BasicNameValuePair pair=new BasicNameValuePair(key, request.getParams().get(key));
        		nameValuePairs.add(pair);
        	}
        	try {
				requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairs, "UTF-8"));
			} catch (UnsupportedEncodingException e) {
				log.error("",e);
			}
        }
        return requestBuilder;
    }

	public Spider getSpider() {
		return spider;
	}

	public void setSpider(Spider spider) {
		this.spider = spider;
	}

	@Override
	public void close() throws IOException {
		spider=null;
		client.close();
		client=null;
	}

	@Override
	public void download(Request request) {
		throw new RuntimeException("not support!");
	}
	
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy