All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fr.whimtrip.ext.jwhtscrapper.service.HtmlAutoScrapperManager Maven / Gradle / Ivy

The newest version!
/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

package fr.whimtrip.ext.jwhtscrapper.service;

import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhthtmltopojo.HtmlToPojoEngine;
import fr.whimtrip.ext.jwhtscrapper.annotation.*;
import fr.whimtrip.ext.jwhtscrapper.enm.Action;
import fr.whimtrip.ext.jwhtscrapper.intfr.BasicObjectMapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.ProxyFinder;
import fr.whimtrip.ext.jwhtscrapper.intfr.ScrapperHelper;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.HttpManagerClient;
import fr.whimtrip.ext.jwhtscrapper.service.holder.PostField;
import fr.whimtrip.ext.jwhtscrapper.service.holder.RequestsScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.ScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.DefaultHttpManagerClientBuilder;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.HtmlAutoScrapperImpl;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.HttpWithProxyManagerClient;
import io.netty.handler.codec.http.DefaultHttpHeaders;
import io.netty.handler.codec.http.HttpHeaders;
import io.netty.handler.codec.http.cookie.Cookie;
import io.netty.handler.codec.http.cookie.DefaultCookie;
import org.asynchttpclient.AsyncHttpClient;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * <p>Part of project jwht-scrapper</p>
 * <p>Created on 26/07/18</p>
 *
 * <p>
 *     This class is in charge of providing factory methods for instantiating
 *     both {@link HttpWithProxyManagerClient} and {@link HtmlAutoScrapper} from
 *     the methods' input parameters and the context services bound through the
 *     original constructor.
 * </p>
 *
 * <p>
 *     Every client and every scrapper built by one manager instance receives
 *     the same {@link AtomicBoolean} {@code scrapStopped} instance.
 * </p>
 *
 * @author Louis-wht
 * @since 1.0.0
 */
public final class HtmlAutoScrapperManager {

    private final ExceptionLogger exceptionLogger;

    private final HtmlToPojoEngine htmlToPojoEngine;

    private final BasicObjectMapper objectMapper;

    private final AsyncHttpClient asyncHttpClient;

    private final ProxyFinder proxyFinder;

    private final BoundRequestBuilderProcessor boundRequestBuilderProcessor;

    // Single flag instance handed to every client and scrapper this manager builds.
    // NOTE(review): presumably flipped to true to stop an ongoing scrap -- confirm
    // against the consumers, nothing in this class ever writes to it.
    private final AtomicBoolean scrapStopped = new AtomicBoolean(false);


    /**
     * Package private constructor that is yet only meant to be used through its
     * dedicated builder {@link HtmlAutoScrapperManagerBuilder}.
     *
     * @param exceptionLogger the exception logger that will be used by both the
     *                        {@link HttpWithProxyManagerClient} and the
     *                        {@link HtmlAutoScrapper}.
     * @param htmlToPojoEngine the core html to pojo engine allowing us to parse
     *                         HTML input to java POJOs.
     * @param objectMapper the object mapper to use for mapping differently
     *                     formatted strings.
     * @param asyncHttpClient the http client to use for performing the requests.
     * @param proxyFinder the proxy finder to take full usage of proxies
     *                    configuration.
     * @param boundRequestBuilderProcessor the request processor used for headers,
     *                                     cookies etc. modifications as well as
     *                                     other eventual use cases.
     */
    HtmlAutoScrapperManager(
            @NotNull  final ExceptionLogger exceptionLogger,
            @NotNull  final HtmlToPojoEngine htmlToPojoEngine,
            @Nullable final BasicObjectMapper objectMapper,
            @NotNull  final AsyncHttpClient asyncHttpClient,
            @Nullable final ProxyFinder proxyFinder,
            @NotNull  final BoundRequestBuilderProcessor boundRequestBuilderProcessor
    ) {
        this.exceptionLogger = exceptionLogger;
        this.htmlToPojoEngine = htmlToPojoEngine;
        this.objectMapper = objectMapper;
        this.asyncHttpClient = asyncHttpClient;
        this.proxyFinder = proxyFinder;
        this.boundRequestBuilderProcessor = boundRequestBuilderProcessor;
    }


    /**
     * Simplified factory method.
     *
     * <p>
     *     Delegates to the complete factory method with
     *     {@code connectToProxyBeforeRequest = false},
     *     {@code rotatingUserAgent = true},
     *     {@code allowInfiniteRedirections = false},
     *     {@code followRedirections = true}, {@code null} default headers,
     *     {@code null} default POST fields and no default cookie.
     * </p>
     *
     * @param awaitBetweenRequests time to wait between each consecutive http
     *                             request.
     * @param proxyChangeRate the rate at which the proxies should be switched.
     * @param timeout the timeout in milliseconds before the request will be
     *                retried.
     * @param useProxy whether you should use proxies or not for performing your
     *                 request.
     * @param maxRequestRetries maximum number of retries before throwing a
     *                          failure exception.
     * @return built {@link HttpWithProxyManagerClient}
     */
    public HttpManagerClient createProxyManagerClient(
            int awaitBetweenRequests,
            int proxyChangeRate,
            int timeout,
            boolean useProxy,
            int maxRequestRetries
    ) {
        return createProxyManagerClient(
                awaitBetweenRequests,
                proxyChangeRate,
                timeout,
                useProxy,
                false,  // connectToProxyBeforeRequest
                true,   // rotatingUserAgent
                false,  // allowInfiniteRedirections
                true,   // followRedirections
                maxRequestRetries,
                null,   // headers
                null,   // fields
                new Cookie[]{}
        );
    }


    /**
     * Complete factory method: every setting of the resulting client is given
     * explicitly.
     *
     * @param awaitBetweenRequests time to wait between each consecutive http
     *                             request.
     * @param proxyChangeRate the rate at which the proxies should be switched.
     * @param timeout the timeout in milliseconds before the request will be
     *                retried.
     * @param useProxy whether you should use proxies or not for performing your
     *                 request.
     * @param connectToProxyBeforeRequest whether a {@code CONNECT} TCP
     *                                    initialization request should be
     *                                    performed beforehand. Warning! Only use
     *                                    if you know what you are doing!
     * @param rotatingUserAgent will auto assign rotating user agent headers to
     *                          each request using
     *                          {@link RotatingUserAgent#pickRandomUserAgent()}.
     * @param allowInfiniteRedirections will allow infinite redirections.
     *                                  Redirections with {@code 301} or
     *                                  {@code 302} HTTP status codes will be
     *                                  followed as a normal browser would.
     *                                  Redirections are by default limited to 3
     *                                  on the same request. Setting this field to
     *                                  true will let potential endless
     *                                  redirection chains (quite common case when
     *                                  scrapping) happen. Warning! Only use if
     *                                  you know what you are doing!
     * @param followRedirections whether HTTP redirections (301 and 302 HTTP
     *                           status) should be accepted or not. If false, no
     *                           redirection will be followed, even though
     *                           {@code allowInfiniteRedirections} is set to true.
     *                           If set to true with
     *                           {@code allowInfiniteRedirections} set to false,
     *                           redirections will only be followed once per
     *                           single HTTP request but not more.
     *                           NOTE(review): "once" here and "limited to 3"
     *                           above disagree -- confirm the actual limit in the
     *                           client implementation.
     * @param maxRequestRetries maximum number of retries before throwing a
     *                          failure exception.
     * @param headers default headers to use in each request.
     * @param fields default POST fields to use on each request.
     * @param cookies default cookies to use in each request.
     * @return built {@link HttpWithProxyManagerClient}
     */
    public HttpManagerClient createProxyManagerClient(
            int awaitBetweenRequests,
            int proxyChangeRate,
            int timeout,
            boolean useProxy,
            boolean connectToProxyBeforeRequest,
            boolean rotatingUserAgent,
            boolean allowInfiniteRedirections,
            boolean followRedirections,
            int maxRequestRetries,
            HttpHeaders headers,
            List<PostField> fields,
            Cookie... cookies
    ) {
        return new DefaultHttpManagerClientBuilder(asyncHttpClient, exceptionLogger, boundRequestBuilderProcessor)
                .setAwaitBetweenRequests(awaitBetweenRequests)
                .setProxyChangeRate(proxyChangeRate)
                .setTimeout(timeout)
                .setUseProxy(useProxy)
                .setConnectToProxyBeforeRequest(connectToProxyBeforeRequest)
                .setRotatingUserAgent(rotatingUserAgent)
                .setAllowInfiniteRedirections(allowInfiniteRedirections)
                .setFollowRedirections(followRedirections)
                .setMaxRequestRetries(maxRequestRetries)
                .setDefaultHeaders(headers)
                .setDefaultFields(fields)
                .setDefaultCookies(cookies)
                .setProxyFinder(proxyFinder)
                .setScrapStopped(scrapStopped)
                .build();
    }


    /**
     * <p>Manual {@link HtmlAutoScrapper} factory method.</p>
     *
     * @param client the {@link HttpWithProxyManagerClient} that will be used
     *               under the hood by the {@link HtmlAutoScrapper}.
     * @param clazz the class to map resulting outputs to.
     * @param followRedirections whether HTTP redirections should be followed or
     *                           not (an HTTP redirection is valid if the status
     *                           code is {@code 301} or {@code 302} and when the
     *                           {@code Location} header is not empty).
     * @param warningSignDelay delay before retrying any action in the case a
     *                         {@link WarningSign} was triggered and only if it
     *                         was set to {@link Action#RETRY}.
     * @param <T> the type of model this scrapper will cast resulting outputs to.
     * @return built in {@link HtmlAutoScrapper}.
     */
    public <T> HtmlAutoScrapper<T> createHtmlAutoScrapper(
            final HttpManagerClient client,
            Class<T> clazz,
            boolean followRedirections,
            int warningSignDelay
    ) {
        return new HtmlAutoScrapperImpl<>(
                client,
                htmlToPojoEngine,
                boundRequestBuilderProcessor,
                objectMapper,
                exceptionLogger,
                clazz,
                followRedirections,
                warningSignDelay,
                scrapStopped
        );
    }


    /**
     * <p>
     *     Automatic factory method using {@link RequestsScrappingContext}
     *     built using annotations gathered on top of {@link ScrapperHelper}
     *     implementation.
     * </p>
     *
     * <p>
     *     The default headers, cookies and POST fields declared on the
     *     {@link RequestsConfig} are converted and forwarded to the complete
     *     factory method. The proxy change rate is always {@code 1} here: it is
     *     not read from the configuration.
     * </p>
     *
     * @param requestPreparator context of the scrapping request.
     * @return built in {@link HttpWithProxyManagerClient}
     */
    public HttpManagerClient createProxyManagerClient(RequestsScrappingContext requestPreparator) {

        RequestsConfig config = requestPreparator.getRequestsConfig();
        ProxyConfig proxyConfig = config.proxyConfig();

        return createProxyManagerClient(
                config.waitBetweenRequests(),
                1,
                config.timeout(),
                proxyConfig.useProxy(),
                proxyConfig.connectToProxyBeforeRequest(),
                config.rotatingUserAgent(),
                config.allowInfiniteRedirections(),
                config.followRedirections(),
                config.maxRequestRetries(),
                buildDefaultHeaders(config),
                buildDefaultPostFields(config),
                buildDefaultCookies(config)
        );
    }


    /**
     * <p>
     *     Automatic factory method using {@link RequestsScrappingContext}
     *     built using annotations gathered on top of {@link ScrapperHelper}
     *     implementation.
     *     Built {@link HttpWithProxyManagerClient} is already required in order
     *     for this {@link HtmlAutoScrapper} to have the correct underlying
     *     processing unit.
     * </p>
     *
     * @param httpManagerClient previously built in
     *                          {@link HttpWithProxyManagerClient} using this
     *                          factory class.
     * @param context context of the scrapping request.
     * @return built in {@link HtmlAutoScrapper}
     */
    // NOTE(review): the type parameters of ScrappingContext are not visible from
    // this file, so the raw signature is kept as found -- parameterize it once the
    // declaration of ScrappingContext has been checked.
    public HtmlAutoScrapper createHtmlAutoScrapper(HttpManagerClient httpManagerClient, ScrappingContext context) {

        RequestsConfig config = context.getRequestsScrappingContext().getRequestsConfig();

        return createHtmlAutoScrapper(
                httpManagerClient,
                context.getModelClazz(),
                config.followRedirections(),
                config.warningSignDelay()
        );
    }


    /** Copies every default header declared on {@code config} into a new header container. */
    private static HttpHeaders buildDefaultHeaders(RequestsConfig config) {
        HttpHeaders headers = new DefaultHttpHeaders();
        for (Header hdr : config.defaultHeaders()) {
            headers.add(hdr.name(), hdr.value());
        }
        return headers;
    }


    /** Converts every default cookie declared on {@code config} into a netty cookie with an undefined max age. */
    private static Cookie[] buildDefaultCookies(RequestsConfig config) {
        List<Cookie> cookies = new ArrayList<>();
        for (fr.whimtrip.ext.jwhtscrapper.annotation.Cookie ck : config.defaultCookies()) {
            Cookie cookie = new DefaultCookie(ck.name(), ck.value());
            cookie.setDomain(ck.domain());
            cookie.setPath(ck.path());
            cookie.setMaxAge(Cookie.UNDEFINED_MAX_AGE);
            cookies.add(cookie);
        }
        return cookies.toArray(new Cookie[0]);
    }


    /** Converts every default POST field declared on {@code config} into a {@link PostField}. */
    private static List<PostField> buildDefaultPostFields(RequestsConfig config) {
        List<PostField> fields = new ArrayList<>();
        for (fr.whimtrip.ext.jwhtscrapper.annotation.Field fld : config.defaultPostFields()) {
            fields.add(new PostField(fld.name(), fld.value()));
        }
        return fields;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy