top.codings.websiphon.core.requester.ApacheWebRequester Maven / Gradle / Ivy
package top.codings.websiphon.core.requester;
import com.alibaba.fastjson.JSON;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.MapUtils;
import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.*;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.protocol.ResponseContentEncoding;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.entity.BasicHttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.http.impl.nio.conn.PoolingNHttpClientConnectionManager;
import org.apache.http.impl.nio.reactor.DefaultConnectingIOReactor;
import org.apache.http.impl.nio.reactor.IOReactorConfig;
import org.apache.http.nio.conn.NoopIOSessionStrategy;
import org.apache.http.nio.conn.SchemeIOSessionStrategy;
import org.apache.http.nio.conn.ssl.SSLIOSessionStrategy;
import org.apache.http.nio.reactor.ConnectingIOReactor;
import org.apache.http.protocol.RequestConnControl;
import org.apache.http.protocol.RequestContent;
import org.apache.http.protocol.RequestTargetHost;
import org.apache.http.ssl.SSLContextBuilder;
import top.codings.websiphon.bean.BasicWebRequest;
import top.codings.websiphon.bean.WebRequest;
import top.codings.websiphon.bean.WebResponse;
import top.codings.websiphon.core.context.CrawlerContext;
import top.codings.websiphon.exception.WebException;
import top.codings.websiphon.exception.WebNetworkException;
import top.codings.websiphon.util.ByteUtils;
import top.codings.websiphon.util.HeadersUtils;
import top.codings.websiphon.util.HttpDecodeUtils;
import top.codings.websiphon.util.HttpOperator;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.nio.charset.Charset;
import java.util.concurrent.CancellationException;
import java.util.concurrent.atomic.AtomicInteger;
@NoArgsConstructor
@Slf4j
public class ApacheWebRequester implements WebRequester {
@Getter
private boolean health = false;
@Setter
@Getter
private boolean redirect = false;
@Setter
@Getter
private boolean ignoreHttpError = false;
@Setter
@Getter
private int maxRedirects = 3;
@Setter
@Getter
private int maxPerRoute = 2;
private CloseableHttpAsyncClient httpAsyncClient;
// private RequestConfig requestConfig;
private AtomicInteger size = new AtomicInteger(0);
public ApacheWebRequester(boolean redirect) {
this.redirect = redirect;
}
@Override
public void init() throws Exception {
int ioThreadCount = Runtime.getRuntime().availableProcessors();
int maxConnTotal = Integer.MAX_VALUE;
try {
/*requestConfig = RequestConfig.custom()
.setMaxRedirects(2)
.setContentCompressionEnabled(false)
.setExpectContinueEnabled(true)
.setRedirectsEnabled(redirect)
.build();*/
IOReactorConfig ioReactorConfig = IOReactorConfig.custom()
// .setBacklogSize(10240)
.setSoTimeout(30000)
.setTcpNoDelay(true)
.setIoThreadCount(ioThreadCount)
.setSoKeepAlive(true)
.build();
ConnectingIOReactor ioReactor = new DefaultConnectingIOReactor(ioReactorConfig);
// 忽略证书验证
SSLIOSessionStrategy sslioSessionStrategy = new SSLIOSessionStrategy(
SSLContextBuilder.create().loadTrustMaterial((x509Certificates, s) -> true).build(),
(hostname, session) -> true);
Registry registry = RegistryBuilder.create()
.register("http", NoopIOSessionStrategy.INSTANCE)
.register("https", sslioSessionStrategy)
.build();
PoolingNHttpClientConnectionManager connManager = new PoolingNHttpClientConnectionManager(ioReactor, registry);
connManager.setMaxTotal(maxConnTotal);
connManager.setDefaultMaxPerRoute(maxPerRoute);
// CredentialsProvider credsProvider = new BasicCredentialsProvider();
httpAsyncClient = HttpAsyncClients.custom()
.setConnectionManager(connManager)
// .setDefaultRequestConfig(requestConfig)
// .setDefaultCredentialsProvider(credsProvider)
// .setSSLContext(SSLContextBuilder.create().loadTrustMaterial((x509Certificates, s) -> true).build())
// .setSSLHostnameVerifier((hostname, session) -> true)
.addInterceptorLast(new RequestTargetHost())
// .addInterceptorLast(new RequestAcceptEncoding(Arrays.asList("compress")))
.addInterceptorLast(new RequestConnControl())
.addInterceptorLast(new RequestContent(true))
.addInterceptorLast((HttpRequestInterceptor) (httpRequest, httpContext) -> {
httpRequest.setHeader("Accept-Encoding", "compress");
// 添加请求头伪装
WebRequest webRequest = ((HttpClientContext) httpContext).getAttribute(WebRequest.class.getName(), WebRequest.class);
if (MapUtils.isEmpty(webRequest.headers())) {
HeadersUtils.getHeaders().forEach((key, value) -> {
if (httpRequest.containsHeader(key)) {
return;
}
httpRequest.addHeader(key, value);
});
} else {
webRequest.headers().forEach((key, value) -> {
if (httpRequest.containsHeader(key)) {
return;
}
httpRequest.addHeader(key, value);
});
}
if (!httpRequest.containsHeader("Referer")) {
httpRequest.addHeader("Referer", webRequest.uri());
}
})
.addInterceptorLast(new ResponseContentEncoding())
.addInterceptorLast((HttpResponseInterceptor) (httpResponse, httpContext) -> {
WebRequest webRequest = ((HttpClientContext) httpContext).getAttribute(WebRequest.class.getName(), WebRequest.class);
WebResponse webResponse = webRequest.response();
int respCode = httpResponse.getStatusLine().getStatusCode();
// 允许跳转且处于跳转状态
if (respCode >= 300 && respCode < 400 && redirect) {
Header locationHeader = httpResponse.getFirstHeader("location");
if (null != locationHeader) {
String location = locationHeader.getValue();
location = location.replace(":80", "").replace(":443", "");
webResponse.setRedirect(true);
webResponse.setRedirectUrl(HttpOperator.recombineLink(location, webRequest.uri()));
}
return;
}
// 装填响应头信息
for (Header header : httpResponse.getAllHeaders()) {
if (webResponse.getHeaders().containsKey(header.getName())) {
String value = webResponse.getHeaders().get(header.getName());
value = value + ";" + header.getValue();
webResponse.getHeaders().put(header.getName(), value);
} else {
webResponse.getHeaders().put(header.getName(), header.getValue());
}
}
})
.build();
httpAsyncClient.start();
health = true;
} catch (Exception e) {
throw new WebException("异步请求器初始化失败", e);
}
}
@Override
public void execute(BasicWebRequest webRequest) throws WebNetworkException {
try {
size.incrementAndGet();
HttpRequestBase httpRequest;
httpRequest = initMethod(webRequest);
initHeaders(webRequest, httpRequest);
if (webRequest.body() instanceof JSON) {
httpRequest.setHeader("content-type", "application/json;charset=UTF-8");
}
initConfig(webRequest, httpRequest);
HttpClientContext context = HttpClientContext.create();
context.setAttribute(WebRequest.class.getName(), webRequest);
httpAsyncClient.execute(
httpRequest,
context,
new AsyncFutureCallback(webRequest));
} catch (Exception e) {
size.decrementAndGet();
throw new WebNetworkException("执行异步请求失败", e);
}
}
/**
* 装填响应主体
*
* @param httpResponse
* @param webRequest
* @throws IOException
*/
private void fillResponseBody(HttpResponse httpResponse, WebRequest webRequest) throws IOException {
WebResponse webResponse = webRequest.response();
// 装填响应状态
webResponse.setResult(WebResponse.Result.valueOf(httpResponse.getStatusLine().getStatusCode()));
// 装填URL
webResponse.setUrl(webRequest.uri());
// 装填响应内容
webResponse.setBytes(ByteUtils.readAllBytes(httpResponse.getEntity().getContent()));
ContentType contentType;
Charset charset;
String encoding;
if ((contentType = ContentType.get(httpResponse.getEntity())) != null && (charset = contentType.getCharset()) != null) {
encoding = charset.name();
} else {
// 查找编码
if (contentType != null && contentType.getMimeType().startsWith("text")) {
encoding = HttpDecodeUtils.findCharset(webResponse.getBytes());
} else {
encoding = "utf-8";
}
}
if (null != contentType) {
// 装填响应类型
webResponse.setContentType(contentType.getMimeType());
if (contentType.getMimeType().contains("json")) {
webResponse.setJson((JSON) JSON.parse(new String(webResponse.getBytes(), encoding)));
} else if (contentType.getMimeType().startsWith("text")) {
webResponse.setHtml(new String(webResponse.getBytes(), encoding));
}
}
}
/**
* 初始化特定配置
*
* @param webRequest
* @param httpRequest
*/
private void initConfig(BasicWebRequest webRequest, HttpRequestBase httpRequest) {
RequestConfig.Builder builder = RequestConfig.custom();
/*Proxy proxy = webRequest.getProxy();
if (proxy != null && proxy != Proxy.NO_PROXY) {
HttpHost proxyHost = new HttpHost(((InetSocketAddress) proxy.address()).getHostName(), ((InetSocketAddress) proxy.address()).getPort());
builder.setProxy(proxyHost);
}*/
RequestConfig config = builder
.setSocketTimeout(30000)
.setConnectTimeout(30000)
.setConnectionRequestTimeout(30000)
.setRedirectsEnabled(redirect)
.setMaxRedirects(maxRedirects)
.setContentCompressionEnabled(false)
.setExpectContinueEnabled(true)
.build();
httpRequest.setConfig(config);
}
/**
* 根据请求方法初始化请求对象
*
* @param webRequest
* @return
*/
private HttpRequestBase initMethod(WebRequest webRequest) {
HttpRequestBase httpRequest;
switch (webRequest.method()) {
case GET:
httpRequest = new HttpGet(webRequest.uri());
break;
case HEAD:
httpRequest = new HttpHead(webRequest.uri());
break;
case POST:
httpRequest = new HttpPost(webRequest.uri());
initBody(webRequest, (HttpEntityEnclosingRequestBase) httpRequest);
break;
case PUT:
httpRequest = new HttpPut(webRequest.uri());
initBody(webRequest, (HttpEntityEnclosingRequestBase) httpRequest);
break;
case PATCH:
httpRequest = new HttpPatch(webRequest.uri());
initBody(webRequest, (HttpEntityEnclosingRequestBase) httpRequest);
break;
case DELETE:
httpRequest = new HttpDelete(webRequest.uri());
break;
default:
throw new IllegalArgumentException(String.format("不支持该请求方法[%s]", webRequest.method().name()));
}
return httpRequest;
}
/**
* 初始化请求承载数据
*
* @param webRequest
* @param httpRequest
*/
private void initBody(WebRequest webRequest, HttpEntityEnclosingRequestBase httpRequest) {
if (webRequest.body() != null) {
HttpEntityEnclosingRequestBase httpEntityEnclosingRequest = httpRequest;
HttpEntity httpEntity;
if (webRequest.body() instanceof String) {
httpEntity = new StringEntity(webRequest.body().toString(), "utf-8");
} else if (webRequest.body() instanceof JSON) {
httpEntity = new StringEntity(webRequest.body().toString(), "utf-8");
} else if (webRequest.body() instanceof byte[]) {
BasicHttpEntity basicHttpEntity = new BasicHttpEntity();
InputStream inputStream = new ByteArrayInputStream((byte[]) webRequest.body());
basicHttpEntity.setContent(inputStream);
basicHttpEntity.setContentLength(((byte[]) webRequest.body()).length);
httpEntity = basicHttpEntity;
} else {
throw new IllegalArgumentException(String.format("请求的body类型不支持 ", webRequest.body().getClass()));
}
httpEntityEnclosingRequest.setEntity(httpEntity);
}
}
/**
* 初始化头信息
*
* @param webRequest
* @param httpRequest
*/
private void initHeaders(WebRequest webRequest, HttpRequestBase httpRequest) {
if (MapUtils.isEmpty(webRequest.headers())) {
HeadersUtils.getHeaders().forEach((key, value) -> {
if (httpRequest.containsHeader(key)) {
return;
}
httpRequest.setHeader(key, value);
});
} else {
webRequest.headers().forEach((key, value) -> {
if (httpRequest.containsHeader(key)) {
return;
}
httpRequest.setHeader(key, value);
});
}
}
@Override
public int size() {
return size.get();
}
@Override
public void close() {
try {
health = false;
httpAsyncClient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
private class AsyncFutureCallback implements FutureCallback {
CrawlerContext crawlerContext;
WebRequest webRequest;
public AsyncFutureCallback(WebRequest webRequest) {
this.webRequest = webRequest;
crawlerContext = webRequest.context();
}
@Override
public void completed(HttpResponse httpResponse) {
size.decrementAndGet();
try {
WebResponse webResponse = webRequest.response();
if (null == webResponse) {
webRequest.succeed();
return;
}
fillResponseBody(httpResponse, webRequest);
if (!ignoreHttpError) {
int respCode = webResponse.getResult().getKey();
// 判断响应码是否正常
if (!(respCode >= 200 && respCode < 300)) {
throw new WebNetworkException(String.format("响应码不是2xx [%d]", httpResponse.getStatusLine().getStatusCode()));
}
}
} catch (Exception e) {
webRequest.failed(e);
return;
} finally {
HttpClientUtils.closeQuietly(httpResponse);
}
webRequest.succeed();
}
@Override
public void failed(Exception e) {
size.decrementAndGet();
webRequest.failed(e);
}
@Override
public void cancelled() {
size.decrementAndGet();
webRequest.failed(new CancellationException("请求被强制取消"));
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy