All Downloads are FREE. Search and download functionalities are using the official Maven repository.

us.codecraft.webmagic.downloader.HttpClientDownloader Maven / Gradle / Ivy

The newest version!
package us.codecraft.webmagic.downloader;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;

/**
 * The http downloader based on HttpClient.
 *
 * @author [email protected] 
* @since 0.1.0 */ public class HttpClientDownloader extends AbstractDownloader { private final Map httpClients = new HashMap(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); private ProxyProvider proxyProvider; private boolean responseHeader = true; public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } public void setProxyProvider(ProxyProvider proxyProvider) { this.proxyProvider = proxyProvider; } private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } } return httpClient; } @Override public Page download(Request request, Task task) { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(request); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(page, task); return page; } catch (IOException e) { onError(page, task, e); return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxyProvider != null && proxy != null) { proxyProvider.returnProxy(proxy, page, task); } } } @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { HttpEntity entity = httpResponse.getEntity(); byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null; Page page = new Page(); page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } return page; } private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); } return charset; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy