// ![JAR search and dependency download from the Maven repository](/logo.png)
// fr.whimtrip.ext.jwhtscrapper.service.HtmlAutoScrapperManager Maven / Gradle / Ivy
// Show all versions of whimtrip-ext-scrapper Show documentation
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
package fr.whimtrip.ext.jwhtscrapper.service;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhthtmltopojo.HtmlToPojoEngine;
import fr.whimtrip.ext.jwhtscrapper.annotation.*;
import fr.whimtrip.ext.jwhtscrapper.enm.Action;
import fr.whimtrip.ext.jwhtscrapper.intfr.BasicObjectMapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.ProxyFinder;
import fr.whimtrip.ext.jwhtscrapper.intfr.ScrapperHelper;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.HttpManagerClient;
import fr.whimtrip.ext.jwhtscrapper.service.holder.PostField;
import fr.whimtrip.ext.jwhtscrapper.service.holder.RequestsScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.ScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.DefaultHttpManagerClientBuilder;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.HtmlAutoScrapperImpl;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.HttpWithProxyManagerClient;
import io.netty.handler.codec.http.DefaultHttpHeaders;
import io.netty.handler.codec.http.HttpHeaders;
import io.netty.handler.codec.http.cookie.Cookie;
import io.netty.handler.codec.http.cookie.DefaultCookie;
import org.asynchttpclient.AsyncHttpClient;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* Part of project jwht-scrapper
* Created on 26/07/18
*
*
* This class is in charge of providing factory methods for instantiating
* both {@link HttpWithProxyManagerClient} and {@link HtmlAutoScrapper} from
* the methods' input parameters and the context services bound through the
* original constructor.
*
*
*
* @author Louis-wht
* @since 1.0.0
*/
public final class HtmlAutoScrapperManager {

    /** Exception logger handed to every client and scrapper built by this manager. */
    private final ExceptionLogger exceptionLogger;

    /** HTML to POJO engine handed to every {@link HtmlAutoScrapper} built by this manager. */
    private final HtmlToPojoEngine htmlToPojoEngine;

    /** Object mapper handed to every {@link HtmlAutoScrapper}; may be {@code null}. */
    private final BasicObjectMapper objectMapper;

    /** HTTP client handed to every {@link HttpManagerClient} built by this manager. */
    private final AsyncHttpClient asyncHttpClient;

    /** Proxy finder handed to every {@link HttpManagerClient}; may be {@code null}. */
    private final ProxyFinder proxyFinder;

    /** Request processor handed to both the clients and the scrappers built here. */
    private final BoundRequestBuilderProcessor boundRequestBuilderProcessor;

    /**
     * Flag created once per manager (initially {@code false}) and passed by reference
     * to every client and every scrapper this manager builds, so they all observe the
     * same instance.
     */
    private final AtomicBoolean scrapStopped = new AtomicBoolean(false);

    /**
     * Package private constructor that is only meant to be used through its
     * dedicated builder {@link HtmlAutoScrapperManagerBuilder}.
     *
     * @param exceptionLogger the exception logger that will be used by both the
     *                        {@link HttpWithProxyManagerClient} and the
     *                        {@link HtmlAutoScrapper}
     * @param htmlToPojoEngine the core html to pojo engine used to parse HTML input
     *                         to java POJOs
     * @param objectMapper the object mapper to use for mapping differently formatted
     *                     strings; may be {@code null}
     * @param asyncHttpClient the http client to use for performing the requests
     * @param proxyFinder the proxy finder used for the proxies configuration; may be
     *                    {@code null}
     * @param boundRequestBuilderProcessor the request processor used for headers,
     *                                     cookies etc. modifications as well as other
     *                                     eventual use cases
     */
    HtmlAutoScrapperManager(
            @NotNull final ExceptionLogger exceptionLogger,
            @NotNull final HtmlToPojoEngine htmlToPojoEngine,
            @Nullable final BasicObjectMapper objectMapper,
            @NotNull final AsyncHttpClient asyncHttpClient,
            @Nullable final ProxyFinder proxyFinder,
            @NotNull final BoundRequestBuilderProcessor boundRequestBuilderProcessor
    ) {
        this.exceptionLogger = exceptionLogger;
        this.htmlToPojoEngine = htmlToPojoEngine;
        this.objectMapper = objectMapper;
        this.asyncHttpClient = asyncHttpClient;
        this.proxyFinder = proxyFinder;
        this.boundRequestBuilderProcessor = boundRequestBuilderProcessor;
    }

    /**
     * Simplified factory method.
     * <p>
     * Delegates to the full factory method with the following fixed values:
     * {@code connectToProxyBeforeRequest = false}, {@code rotatingUserAgent = true},
     * {@code allowInfiniteRedirections = false}, {@code followRedirections = true},
     * {@code null} default headers, {@code null} default POST fields and an empty
     * array of default cookies.
     *
     * @param awaitBetweenRequests time to wait between each consecutive http request
     * @param proxyChangeRate the rate at which the proxies should be switched
     * @param timeout the timeout in milliseconds before the request will be retried
     * @param useProxy whether proxies should be used for performing the requests
     * @param maxRequestRetries maximum number of retries before throwing a failure
     *                          exception
     * @return built {@link HttpWithProxyManagerClient}
     */
    public HttpManagerClient createProxyManagerClient(
            int awaitBetweenRequests,
            int proxyChangeRate,
            int timeout,
            boolean useProxy,
            int maxRequestRetries
    ) {
        return createProxyManagerClient(
                awaitBetweenRequests,
                proxyChangeRate,
                timeout,
                useProxy,
                false,              // connectToProxyBeforeRequest
                true,               // rotatingUserAgent
                false,              // allowInfiniteRedirections
                true,               // followRedirections
                maxRequestRetries,
                null,               // no default headers
                null,               // no default POST fields
                new Cookie[0]       // no default cookies
        );
    }

    /**
     * Full factory method: every argument is forwarded as is to a
     * {@link DefaultHttpManagerClientBuilder}, together with this manager's http
     * client, exception logger, request processor, proxy finder and shared stop flag.
     *
     * @param awaitBetweenRequests time to wait between each consecutive http request
     * @param proxyChangeRate the rate at which the proxies should be switched
     * @param timeout the timeout in milliseconds before the request will be retried
     * @param useProxy whether proxies should be used for performing the requests
     * @param connectToProxyBeforeRequest whether a {@code CONNECT} TCP initialization
     *        request should be performed beforehand.
     *        Warning! Only use if you know what you are doing!
     * @param rotatingUserAgent will auto assign rotating user agent headers to each
     *        request using {@link RotatingUserAgent#pickRandomUserAgent()}
     * @param allowInfiniteRedirections lifts the default cap on the number of
     *        redirections ({@code 301} or {@code 302} HTTP status codes) followed for
     *        a single request.
     *        Warning! Only use if you know what you are doing!
     *        NOTE(review): earlier documentation gave the default cap both as "3" and
     *        as "once" per request; confirm against the client implementation.
     * @param followRedirections whether HTTP redirections (301 and 302 HTTP status)
     *        should be followed or not. If false, no redirection will be followed,
     *        even though {@code allowInfiniteRedirections} is set to true.
     * @param maxRequestRetries maximum number of retries before throwing a failure
     *        exception
     * @param headers default headers to use in each request
     * @param fields default POST fields to use in each request.
     *        NOTE(review): declared as a raw {@code List}; the elements are presumably
     *        {@link PostField} instances, confirm against the builder.
     * @param cookies default cookies to use in each request
     * @return built {@link HttpWithProxyManagerClient}
     */
    public HttpManagerClient createProxyManagerClient(
            int awaitBetweenRequests,
            int proxyChangeRate,
            int timeout,
            boolean useProxy,
            boolean connectToProxyBeforeRequest,
            boolean rotatingUserAgent,
            boolean allowInfiniteRedirections,
            boolean followRedirections,
            int maxRequestRetries,
            HttpHeaders headers,
            List fields,
            Cookie... cookies
    ) {
        return new DefaultHttpManagerClientBuilder(asyncHttpClient, exceptionLogger, boundRequestBuilderProcessor)
                .setAwaitBetweenRequests(awaitBetweenRequests)
                .setProxyChangeRate(proxyChangeRate)
                .setTimeout(timeout)
                .setUseProxy(useProxy)
                .setConnectToProxyBeforeRequest(connectToProxyBeforeRequest)
                .setRotatingUserAgent(rotatingUserAgent)
                .setAllowInfiniteRedirections(allowInfiniteRedirections)
                .setFollowRedirections(followRedirections)
                .setMaxRequestRetries(maxRequestRetries)
                .setDefaultHeaders(headers)
                .setDefaultFields(fields)
                .setDefaultCookies(cookies)
                .setProxyFinder(proxyFinder)
                .setScrapStopped(scrapStopped)
                .build();
    }

    /**
     * Manual {@link HtmlAutoScrapper} factory method.
     * <p>
     * The model type the scrapper maps its results to is the one given by
     * {@code clazz}.
     *
     * @param client the {@link HttpWithProxyManagerClient} that will be used under the
     *               hood by the {@link HtmlAutoScrapper}
     * @param clazz the class to map resulting outputs to
     * @param followRedirections whether HTTP redirections should be followed or not
     *                           (an HTTP redirection is valid if the status code is
     *                           {@code 301} or {@code 302} and the {@code Location}
     *                           header is not empty)
     * @param warningSignDelay delay before retrying any action in the case a
     *                         {@link WarningSign} was triggered and only if it was
     *                         set to {@link Action#RETRY}
     * @return built {@link HtmlAutoScrapper}
     */
    public HtmlAutoScrapper createHtmlAutoScrapper(
            final HttpManagerClient client,
            Class clazz,
            boolean followRedirections,
            int warningSignDelay
    ) {
        return new HtmlAutoScrapperImpl<>(
                client,
                htmlToPojoEngine,
                boundRequestBuilderProcessor,
                objectMapper,
                exceptionLogger,
                clazz,
                followRedirections,
                warningSignDelay,
                scrapStopped
        );
    }

    /**
     * Automatic factory method using a {@link RequestsScrappingContext} built from
     * the annotations gathered on top of a {@link ScrapperHelper} implementation.
     * <p>
     * Default headers, cookies and POST fields are converted from their annotation
     * form; all other settings are read from the {@link RequestsConfig} and its
     * {@link ProxyConfig}.
     *
     * @param requestPreparator context of the scrapping request
     * @return built {@link HttpWithProxyManagerClient}
     */
    public HttpManagerClient createProxyManagerClient(RequestsScrappingContext requestPreparator) {
        final RequestsConfig config = requestPreparator.getRequestsConfig();
        final ProxyConfig proxyConfig = config.proxyConfig();

        final HttpHeaders headers = buildDefaultHeaders(config);
        final Cookie[] cookies = buildDefaultCookies(config);
        final List<PostField> fields = buildDefaultPostFields(config);

        return createProxyManagerClient(
                config.waitBetweenRequests(),
                // proxyChangeRate is fixed to 1 on this path.
                // NOTE(review): no annotation member is consulted for it; confirm intended.
                1,
                config.timeout(),
                proxyConfig.useProxy(),
                proxyConfig.connectToProxyBeforeRequest(),
                config.rotatingUserAgent(),
                config.allowInfiniteRedirections(),
                config.followRedirections(),
                config.maxRequestRetries(),
                headers,
                fields,
                cookies
        );
    }

    /**
     * Automatic factory method using a {@link ScrappingContext} built from the
     * annotations gathered on top of a {@link ScrapperHelper} implementation.
     * An already built {@link HttpManagerClient} is required so that the resulting
     * {@link HtmlAutoScrapper} has its underlying processing unit.
     * <p>
     * The model class, the {@code followRedirections} flag and the warning sign delay
     * are all read from {@code context}.
     *
     * @param httpManagerClient previously built {@link HttpWithProxyManagerClient},
     *                          typically obtained from this factory class
     * @param context context of the scrapping request
     * @return built {@link HtmlAutoScrapper}
     */
    public HtmlAutoScrapper createHtmlAutoScrapper(HttpManagerClient httpManagerClient, ScrappingContext context) {
        return createHtmlAutoScrapper(
                httpManagerClient,
                context.getModelClazz(),
                context.getRequestsScrappingContext().getRequestsConfig().followRedirections(),
                context.getRequestsScrappingContext().getRequestsConfig().warningSignDelay()
        );
    }

    /** Copies every {@link Header} annotation of the config into a fresh header set. */
    private static HttpHeaders buildDefaultHeaders(final RequestsConfig config) {
        final HttpHeaders headers = new DefaultHttpHeaders();
        for (Header hdr : config.defaultHeaders()) {
            headers.add(hdr.name(), hdr.value());
        }
        return headers;
    }

    /**
     * Converts every cookie annotation of the config into a netty {@link Cookie}
     * carrying its name, value, domain and path, with an undefined max age.
     */
    private static Cookie[] buildDefaultCookies(final RequestsConfig config) {
        final List<Cookie> cookies = new ArrayList<>();
        for (fr.whimtrip.ext.jwhtscrapper.annotation.Cookie ck : config.defaultCookies()) {
            final Cookie cookie = new DefaultCookie(ck.name(), ck.value());
            cookie.setDomain(ck.domain());
            cookie.setPath(ck.path());
            cookie.setMaxAge(Cookie.UNDEFINED_MAX_AGE);
            cookies.add(cookie);
        }
        return cookies.toArray(new Cookie[0]);
    }

    /** Converts every field annotation of the config into a {@link PostField}. */
    private static List<PostField> buildDefaultPostFields(final RequestsConfig config) {
        final List<PostField> fields = new ArrayList<>();
        for (fr.whimtrip.ext.jwhtscrapper.annotation.Field fld : config.defaultPostFields()) {
            fields.add(new PostField(fld.name(), fld.value()));
        }
        return fields;
    }
}