/*
 * fr.whimtrip.ext.jwhtscrapper.service.scoped.HtmlAutoScrapperImpl
 * Artifact: whimtrip-ext-scrapper
 * Fully featured highly pluggable and customizable Java scrapping framework
 */
/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

package fr.whimtrip.ext.jwhtscrapper.service.scoped;

import fr.whimtrip.core.util.WhimtripUtils;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhthtmltopojo.HtmlToPojoEngine;
import fr.whimtrip.ext.jwhthtmltopojo.exception.HtmlToPojoException;
import fr.whimtrip.ext.jwhthtmltopojo.intrf.HtmlAdapter;
import fr.whimtrip.ext.jwhtscrapper.enm.Action;
import fr.whimtrip.ext.jwhtscrapper.annotation.WarningSign;
import fr.whimtrip.ext.jwhtscrapper.exception.*;
import fr.whimtrip.ext.jwhtscrapper.impl.ScrapperHtmlAdapterFactory;
import fr.whimtrip.ext.jwhtscrapper.intfr.BasicObjectMapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HttpMetrics;
import fr.whimtrip.ext.jwhtscrapper.intfr.LinksFollower;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.HttpManagerClient;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkListScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkScrappingContext;
import org.asynchttpclient.BoundRequestBuilder;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;

import static fr.whimtrip.ext.jwhtscrapper.enm.PausingBehavior.PAUSE_ALL_THREADS;
import static fr.whimtrip.ext.jwhtscrapper.enm.PausingBehavior.PAUSE_CURRENT_THREAD_ONLY;


/**
 * Part of project jwht-scrapper
 * Created on 30/07/18
 *
 * <p>
 *     Default implementation of {@link HtmlAutoScrapper}. As stated in the interface
 *     javadoc, it is implemented following all the given recommendations, including:
 * </p>
 *
 * <ul>
 *     <li>
 *         {@link HttpWithProxyManagerClient}
 *     </li>
 *     <li>
 *         {@link LinksFollowerImpl}
 *     </li>
 *     <li>
 *         {@link HtmlToPojoEngine} with {@link ScrapperHtmlAdapterFactory}.
 *     </li>
 *     <li>
 *         {@link BasicObjectMapper} is accepted.
 *     </li>
 * </ul>
 *
 * @see HtmlAutoScrapper
 * @author Louis-wht
 * @since 1.0.0
 */
public final class HtmlAutoScrapperImpl implements HtmlAutoScrapper {

    private static final Logger log = LoggerFactory.getLogger(HtmlAutoScrapperImpl.class);

    private final HttpManagerClient httpManagerClient;
    private final BasicObjectMapper objectMapper;
    private final BoundRequestBuilderProcessor boundRequestBuilderProcessor;
    private final HtmlAdapter htmlAdapter;
    private final HtmlToPojoEngine htmlToPojoEngine;
    private final ExceptionLogger exceptionLogger;
    private final Class persistentClass;

    private final int warningSignDelay;
    private final boolean followRedirections;
    private final AtomicBoolean scrapStopped;

    private WarningSignException lastThrownWarningSignException;


    /**
     * Builds a scrapper bound to a single target class.
     *
     * <p>
     *     The {@link HtmlAdapter} used by top level scraps is resolved once, here, by
     *     asking {@code htmlToPojoEngine} for the adapter of {@code clazz}.
     * </p>
     *
     * @param httpManagerClient the HTTP client (for example an {@link HttpWithProxyManagerClient})
     *                          used to prepare requests, fetch responses and expose metrics.
     * @param htmlToPojoEngine the core html to pojo engine; it supplies the adapter for
     *                         {@code clazz} and is handed over to the links follower.
     * @param boundRequestBuilderProcessor the request processor; within this class it is used
     *                                     to print the request that triggered a warning sign
     *                                     and is handed over to the links follower.
     * @param objectMapper the object mapper used to map response bodies; when {@code null},
     *                     bodies are mapped with the {@link HtmlAdapter} instead.
     * @param exceptionLogger the exception logger used for non fatal exceptions.
     * @param clazz the class to map resulting outputs to.
     * @param followRedirections whether HTTP redirections should be followed for top level
     *                           scraps.
     * @param warningSignDelay delay waited when a triggered {@link WarningSign} requests a
     *                         pause; a tenth of it is used as polling interval while the
     *                         scrap is stopped.
     * @param scrapStopped shared atomic boolean indicating if the current scrap process is
     *                     stopped or not; scraps wait while it holds {@code true}.
     */
    public HtmlAutoScrapperImpl(
            HttpManagerClient httpManagerClient,
            HtmlToPojoEngine htmlToPojoEngine,
            BoundRequestBuilderProcessor boundRequestBuilderProcessor,
            BasicObjectMapper objectMapper,
            ExceptionLogger exceptionLogger,
            Class clazz,
            boolean followRedirections,
            int warningSignDelay,
            AtomicBoolean scrapStopped
    ) {
        // Collaborators.
        this.httpManagerClient = httpManagerClient;
        this.htmlToPojoEngine = htmlToPojoEngine;
        this.boundRequestBuilderProcessor = boundRequestBuilderProcessor;
        this.objectMapper = objectMapper;
        this.exceptionLogger = exceptionLogger;

        // Target type and the adapter able to map HTML to it.
        this.persistentClass = clazz;
        this.htmlAdapter = htmlToPojoEngine.adapter(clazz);

        // Behaviour switches and shared state.
        this.followRedirections = followRedirections;
        this.warningSignDelay = warningSignDelay;
        this.scrapStopped = scrapStopped;
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     *     Top level entry point: the adapter and the redirection policy configured at
     *     construction time are used, and the call is flagged as the parent call.
     * </p>
     */
    @NotNull
    @Override
    public T scrap(@NotNull final BoundRequestBuilder req, @Nullable final T obj)
            throws ModelBindingException, LinkException, WarningSignException
    {
        final boolean parentCall = true;
        return scrap(req, obj, htmlAdapter, followRedirections, parentCall);
    }


    /**
     * {@inheritDoc}
     *
     * <p>
     *     The request is built with {@link #prepareScrapPost(String, Map)} and then
     *     scrapped.
     * </p>
     */
    @NotNull
    @Override
    public T scrapPost(@NotNull final String url, @Nullable final Map fields)
            throws ModelBindingException, LinkException, WarningSignException
    {
        final BoundRequestBuilder postRequest = prepareScrapPost(url, fields);
        return scrap(postRequest);
    }


    /**
     * {@inheritDoc}
     *
     * <p>
     *     The request is built with {@link #prepareScrapGet(String)} and then scrapped.
     * </p>
     */
    @NotNull
    @Override
    public T scrapGet(@NotNull final String url)
            throws ModelBindingException, LinkException, WarningSignException
    {
        final BoundRequestBuilder getRequest = prepareScrapGet(url);
        return scrap(getRequest);
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     *     The request is prepared by the underlying HTTP client; every entry of
     *     {@code fields} is then added as a form parameter, its value being converted
     *     with {@link Object#toString()}. An entry holding a {@code null} value is
     *     forwarded with a {@code null} value instead of failing with a
     *     {@link NullPointerException}.
     * </p>
     */
    @NotNull
    @Override
    public BoundRequestBuilder prepareScrapPost(@NotNull final String url, @Nullable final Map fields)
    {
        final BoundRequestBuilder req = httpManagerClient.preparePost(url);

        // No form fields: the bare POST request is returned.
        if (fields == null) {
            return req;
        }

        for (Map.Entry field : fields.entrySet()) {
            final Object value = field.getValue();
            // Guard against null values: calling toString() on them would abort the
            // whole request preparation.
            req.addFormParam(field.getKey(), value == null ? null : value.toString());
        }

        return req;
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     *     Plain delegation to the underlying HTTP client.
     * </p>
     */
    @NotNull
    @Override
    public BoundRequestBuilder prepareScrapGet(@NotNull final String url)
    {
        final BoundRequestBuilder getRequest = httpManagerClient.prepareGet(url);
        return getRequest;
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     *     The metrics are the ones exposed by the underlying HTTP client.
     * </p>
     */
    @NotNull
    @Override
    public HttpMetrics getHttpMetrics() throws ScrapperUnsupportedException
    {
        final HttpMetrics metrics = httpManagerClient.getHttpMetrics();
        return metrics;
    }


    /**
     * <p>
     *     Scraps a request as a non parent call, that is to say a call issued while
     *     following links. It simply delegates to the five arguments overload with
     *     {@code parentCall} set to {@code false}.
     * </p>
     * <p>
     *     The result type {@code U} is the type of the POJO to map the response to.
     *     It may differ from {@code T} because links can be scrapped recursively
     *     into other POJO types.
     * </p>
     *
     * @see HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)
     * @param req the prepared {@link BoundRequestBuilder}
     * @param obj the object to map the resulting scrap to.
     * @param adapter the {@link HtmlAdapter} to use to map the resulting
     *                HTML body to a POJO. If the {@link BasicObjectMapper}
     *                is used instead, the adapter is still used to follow links.
     * @param followRedirections whether HTTP redirections should be followed or not.
     * @return the scrapped and ready to use POJO instance.
     * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     */
    private  U scrap(
            @NotNull        BoundRequestBuilder req,
            @Nullable       U obj,
            @NotNull  final HtmlAdapter adapter,
            final boolean followRedirections
    ) throws ModelBindingException, LinkException, WarningSignException
    {
        final boolean parentCall = false;
        return scrap(req, obj, adapter, followRedirections, parentCall);
    }

    /**
     * <p>
     *     This provides the core private method that performs scrapping
     *     related tasks, in this order:
     * </p>
     * <ol>
     *     <li>wait for as long as the shared {@code scrapStopped} flag is raised;</li>
     *     <li>fetch the raw response through the HTTP client;</li>
     *     <li>map the raw response to a POJO with {@code buildObject};</li>
     *     <li>follow the links of the resulting POJO with {@code resolveLinks}.</li>
     * </ol>
     * <p>
     *     Once the response was fetched, the request context is removed from the HTTP
     *     client whatever the outcome of the mapping is.
     * </p>
     * <p>
     *     The result type {@code U} is the type of the POJO to map the response to.
     *     This method can be called recursively for links scrapping with other POJO
     *     types, which explains why {@code T} is not used here.
     * </p>
     *
     * @see HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)
     * @param req the prepared {@link BoundRequestBuilder}
     * @param obj the object to map the resulting scrap to. When {@code null}, the
     *            response is mapped to the class given at construction time.
     * @param adapter the {@link HtmlAdapter} to use to map the resulting
     *                HTML body to a POJO. If the {@link BasicObjectMapper}
     *                is used instead, the adapter is still used to follow links.
     * @param followRedirections whether HTTP redirections should be followed or not.
     * @param parentCall whether this is a top level call. A
     *                   {@link WarningSignActualScrapStoppedException} is swallowed by a
     *                   parent call, which then returns the object in its current state,
     *                   and is rethrown by any other call.
     * @return the scrapped and ready to use POJO instance.
     * @throws ModelBindingException if the response could not be mapped to the POJO.
     * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     */
    @SuppressWarnings("unchecked")
    private  U scrap(
            @NotNull        BoundRequestBuilder req,
            @Nullable       U obj,
            @NotNull  final HtmlAdapter adapter,
                      final boolean followRedirections,
                      final boolean parentCall
    ) throws ModelBindingException, LinkException, WarningSignException
    {
        final Class mappedClazz =  obj == null ? (Class) persistentClass :  (Class) obj.getClass();

        awaitScrapResumed();

        final String rawResponse = httpManagerClient.getResponse(req, followRedirections);

        try {

            obj = buildObject(obj, adapter, mappedClazz, rawResponse);

            resolveLinks(obj, adapter);

            return obj;

        }
        catch (WarningSignActualScrapStoppedException e)
        {
            // this will propagate up to the parent scrap call which will return
            // the object as it currently is
            if(!parentCall)
                throw e;
            return obj;
        }
        catch(WarningSignException e)
        {
            return handleWarningSign(req, obj, adapter, followRedirections, parentCall, e);
        }
        catch (IOException | HtmlToPojoException e)
        {
            throw new ModelBindingException(e);
        }
        finally
        {
            // removing the http manager client request context to avoid memory
            // overloading
            httpManagerClient.removeContext(req);
        }
    }

    /**
     * Blocks the calling thread for as long as the shared {@code scrapStopped} flag is
     * raised, polling it every tenth of {@code warningSignDelay} (at least 1 ms).
     * <p>
     *     An interruption does not abort the wait: it is logged, and the interrupt
     *     status of the thread is restored once the wait is over so that callers can
     *     still observe it.
     * </p>
     */
    private void awaitScrapResumed()
    {
        // Never sleep 0 ms (busy spinning) nor a negative duration (IllegalArgumentException).
        final long pollingDelay = Math.max(1L, warningSignDelay / 10);
        boolean interrupted = false;

        try
        {
            while(scrapStopped.get())
            {
                try
                {
                    Thread.sleep(pollingDelay);
                }
                catch (InterruptedException e)
                {
                    // Thread.sleep cleared the interrupt status: remember it instead of
                    // re-interrupting right away, which would make every next sleep fail.
                    interrupted = true;
                    exceptionLogger.logException(e);
                }
            }
        }
        finally
        {
            if(interrupted)
                Thread.currentThread().interrupt();
        }
    }


    /**
     * <p>
     *     This provides the core private method that handles a warning sign and
     *     the actions to be taken when triggered.
     * </p>
     * <p>
     *     First, the pausing behavior is applied: for {@code PAUSE_ALL_THREADS} and
     *     {@code PAUSE_CURRENT_THREAD_ONLY} the current thread waits for
     *     {@code warningSignDelay}; for {@code PAUSE_ALL_THREADS} the shared
     *     {@code scrapStopped} flag is additionally raised during the wait so that
     *     the other scraps wait as well. Then the action is applied:
     * </p>
     * <ul>
     *     <li>{@link Action#THROW_EXCEPTION}: the warning sign exception is rethrown;</li>
     *     <li>{@link Action#STOP_ACTUAL_SCRAP}: a
     *         {@link WarningSignActualScrapStoppedException} is thrown;</li>
     *     <li>{@link Action#RETRY}: the same request is scrapped again;</li>
     *     <li>any other action: the object is returned as it is.</li>
     * </ul>
     * <p>
     *     The result type {@code U} is the type of the POJO being scrapped; it may
     *     differ from {@code T} because links are scrapped recursively.
     * </p>
     *
     * @param req the prepared {@link BoundRequestBuilder}
     * @param obj the object to map the resulting scrap to.
     * @param adapter the {@link HtmlAdapter} to use if the scrap is retried.
     * @param followRedirections whether HTTP redirections should be followed or not.
     * @param parentCall whether the scrap that triggered the warning sign is a top level call.
     * @param e the warning exception triggered.
     * @return the scrapped POJO instance, or {@code obj} untouched when no action applies.
     * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     */
    private  U handleWarningSign(
            @NotNull  BoundRequestBuilder req,
            @Nullable U obj,
            @NotNull  HtmlAdapter adapter,
                      boolean followRedirections,
                      boolean parentCall,
                      WarningSignException e

    ) throws ModelBindingException, LinkException, WarningSignException
    {

        log.warn("A warning sign was triggered! {}", e.getMessage());
        boundRequestBuilderProcessor.printReq(req);

        final boolean pauseAllThreads = e.getPausingBehavior() == PAUSE_ALL_THREADS;

        if(pauseAllThreads || e.getPausingBehavior() == PAUSE_CURRENT_THREAD_ONLY)
        {

            if (pauseAllThreads) {
                // Raise the flag: scraps wait for as long as it holds true.
                scrapStopped.set(true);
                lastThrownWarningSignException = e;
            }

            try
            {
                WhimtripUtils.waitFor((long) warningSignDelay, log, 20);
            }
            finally
            {
                // Only the thread that raised the flag last is allowed to lower it, so
                // that an earlier pause ending does not cut a more recent one short.
                // Done in a finally block so the flag cannot stay raised forever.
                // NOTE(review): lastThrownWarningSignException is a plain field shared
                // between threads; consider making it volatile.
                if(pauseAllThreads && e == lastThrownWarningSignException)
                    scrapStopped.set(false);
            }
        }

        final Action action = e.getAction();

        if(action == Action.THROW_EXCEPTION)
        {
            log.warn("Current scrap handled a fatal error which shouldn't lead to further scrapping for that object");
            throw e;
        }

        if(action == Action.STOP_ACTUAL_SCRAP)
        {
            log.warn("Current object shouldn't be further scrapped");
            // this will propagate up to the parent scrap call which will return
            // the object as it currently is
            throw new WarningSignActualScrapStoppedException(e);
        }

        if(action == Action.RETRY)
        {
            return scrap(req, obj, adapter, followRedirections, parentCall);
        }

        // Action.NONE -> nothing to do, the scrap will continue but not on the
        // current POJO branching.
        return obj;
    }


    /**
     * <p>
     *     Scraps a single link: calls
     *     {@link #scrap(BoundRequestBuilder, Object, HtmlAdapter, boolean)} with the
     *     request, target object, adapter and redirection policy carried by the given
     *     {@link LinkScrappingContext}.
     * </p>
     * <p>
     *     The result type {@code U} is the type of POJO instance the link is mapped to.
     * </p>
     *
     * @param lsc the {@link LinkScrappingContext} to use to perform the scrap operation.
     * @return the corresponding {@code U} pojo instance.
     * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
     */
    private  U scrap(LinkScrappingContext lsc)
            throws ModelBindingException, LinkException, WarningSignException
    {
        final BoundRequestBuilder linkRequest = lsc.getBoundRequestBuilder();

        return scrap(
                linkRequest,
                lsc.getNewObj(),
                lsc.getAdapter(),
                lsc.followRedirections()
        );
    }


    /**
     * <p>
     *     Maps the raw response body to a POJO. When no {@link BasicObjectMapper}
     *     was configured, the {@link HtmlAdapter} maps the body as HTML; otherwise
     *     the {@link BasicObjectMapper} is in charge of the mapping, whatever the
     *     input format and rules it implies.
     * </p>
     * <p>
     *     When {@code obj} is given, it is passed to the mapper so that the result
     *     is built onto it; otherwise the mapper creates the instance. The result
     *     type {@code U} is the type of POJO to map the body to.
     * </p>
     *
     * @param obj the object to build the scrapping result to. Might be null.
     * @param adapter the adapter to use. Might be null if {@link BasicObjectMapper}
     *                is provided instead.
     * @param mappedClazz the clazz to map the body to; only used by the
     *                    {@link BasicObjectMapper}.
     * @param rawResponse the raw String body response from the HTTP scrapping request.
     * @return fully built and scrapped {@code U} POJO instance
     * @throws IOException if Object binding didn't work as expected with {@link BasicObjectMapper}.
     * @throws HtmlToPojoException if Object binding failed with {@link HtmlAdapter}.
     */
    private  U buildObject(
            @Nullable U obj,
            @Nullable final HtmlAdapter adapter,
            @Nullable final Class mappedClazz,
            @NotNull final String rawResponse
    ) throws IOException, HtmlToPojoException
    {
        final boolean useHtmlAdapter = objectMapper == null;

        // No existing instance: the mapper creates one.
        if (obj == null) {
            if (useHtmlAdapter) {
                return adapter.fromHtml(rawResponse);
            }
            return objectMapper.readValue(rawResponse, mappedClazz);
        }

        // Existing instance: it is handed over to the mapper.
        if (useHtmlAdapter) {
            return adapter.fromHtml(rawResponse, obj);
        }
        return objectMapper.readValue(rawResponse, mappedClazz, obj);
    }


    /**
     * <p>
     *     Handles everything related to link following for one scrapped object,
     *     in three steps:
     * </p>
     * <ol>
     *     <li>
     *         Resolve the links to follow with a {@link LinksFollowerImpl}, the
     *         default implementation of {@link LinksFollower}.
     *     </li>
     *     <li>
     *         Scrap those links using the {@link #scrap(LinkScrappingContext)} method.
     *     </li>
     *     <li>
     *         Set the resulting values to the corresponding fields.
     *     </li>
     * </ol>
     *
     * <p>
     *     Lists of links are processed first, then single links. They are handled
     *     separately because a list of links requires a list to be instantiated
     *     first, to which every link scrap result is appended.
     * </p>
     * <p>
     *     {@code U} is the type of the POJO instance to scrap links for.
     * </p>
     *
     * @param obj the object to resolve links for.
     * @param adapter the {@link HtmlAdapter} to use to analyse the links to further scrap.
     * @throws LinkException when thrown by underlying {@link LinksFollower#resolveBasicLinks()}
     *                       or if a scrapping exception occurred when scrapping the links, in
     *                       which case the original exception can be retrieved using
     *                       {@link Throwable#getCause()}.
     * @throws ModelBindingException if any field setting operation failed due to POJO reflection
     *                               access failure, in which case the cause should be corrected
     *                               before starting the scrapping once again.
     */
    private  void resolveLinks(@NotNull final U obj, @NotNull final HtmlAdapter adapter) throws LinkException, ModelBindingException{

        final LinksFollower follower = new LinksFollowerImpl(
                httpManagerClient,
                htmlToPojoEngine,
                exceptionLogger,
                boundRequestBuilderProcessor,
                obj,
                adapter
        );

        // Step 1: find out which links have to be followed.
        follower.resolveBasicLinks();

        // Steps 2 and 3: scrap them and inject the results, lists first.
        scrapAndSetLinkLists(follower);
        scrapAndSetBasicLinks(follower);
    }

    /**
     * <p>
     *     Scraps every single link held by the given {@link LinksFollower} and sets
     *     each non null result to the field it belongs to. A link whose scrap failed
     *     without the failure being rethrown is skipped.
     * </p>
     *
     * @param linksFollower the {@link LinksFollower} instance holding resolved links to
     *                      scrap.
     * @throws LinkException if a scrapping exception occurred when scrapping a link whose
     *                       context requires exceptions to be thrown, in which case the
     *                       original exception can be retrieved using {@link Throwable#getCause()}.
     * @throws ModelBindingException if any field setting operation failed due to POJO reflection
     *                               access failure, in which case the cause should be corrected
     *                               before starting the scrapping once again.
     */
    private void scrapAndSetBasicLinks(@NotNull final LinksFollower linksFollower) throws ModelBindingException, LinkException {

        for (LinkScrappingContext lsc : linksFollower.getScrappingContexts()) {

            final Object scrapped;
            try {
                scrapped = scrap(lsc);
            }
            catch (ScrapperException e) {
                // Either rethrown as a LinkException or logged; when only logged
                // there is nothing to set for this link.
                handleScrapperException(lsc.throwExceptions(), e);
                continue;
            }

            if (scrapped == null) {
                continue;
            }

            try {
                WhimtripUtils.setObjectToField(lsc.getFieldToBeSet(), lsc.getParentObject(), scrapped);
            }
            catch (IllegalAccessException e) {
                exceptionLogger.logException(e);
                throw new ModelBindingException(e);
            }
        }
    }

    /**
     * Handles a {@link ScrapperException} raised while scrapping a link: it is either
     * wrapped into a {@link LinkException} and thrown, or only logged.
     *
     * @param throwExceptions whether the exception should be thrown or not.
     * @param e the underlying {@link ScrapperException}
     * @throws LinkException if {@code throwExceptions} is set to true.
     */
    @Contract("true, _ -> fail")
    private void handleScrapperException(boolean throwExceptions, ScrapperException e) throws LinkException {

        if (!throwExceptions) {
            // Non fatal for this link: keep a trace and let the caller go on.
            exceptionLogger.logException(e);
            return;
        }

        throw new LinkException(e);
    }

    /**
     * <p>
     *     Scraps every list of links held by the given {@link LinksFollower} and sets
     *     each resulting list to the field it belongs to. The field is always
     *     assigned, with an empty list when no link produced a result.
     * </p>
     *
     * @param linksFollower the {@link LinksFollower} instance holding resolved links to
     *                      scrap.
     * @throws LinkException if a scrapping exception occurred when scrapping a link whose
     *                       context requires exceptions to be thrown, in which case the
     *                       original exception can be retrieved using {@link Throwable#getCause()}.
     * @throws ModelBindingException if any field setting operation failed due to POJO reflection
     *                               access failure, in which case the cause should be corrected
     *                               before starting the scrapping once again.
     */
    private void scrapAndSetLinkLists(@NotNull final LinksFollower linksFollower) throws ModelBindingException, LinkException {

        for (LinkListScrappingContext listContext : linksFollower.getLinkListsScrappingContexts()) {

            final List scrappedLinks = buildLinkListScraps(listContext);

            try {
                WhimtripUtils.setObjectToField(
                        listContext.getFieldToBeSet(),
                        listContext.getParentObject(),
                        scrappedLinks
                );
            }
            catch (IllegalAccessException e) {
                exceptionLogger.logException(e);
                throw new ModelBindingException(e);
            }
        }
    }

    /**
     * <p>
     *     The link list to POJO list builder method. Each {@link LinkScrappingContext}
     *     of the given {@link LinkListScrappingContext} is scrapped with
     *     {@link #scrap(LinkScrappingContext)} and the result, if not null, is
     *     appended to a freshly created list.
     * </p>
     * <p>
     *     {@code U} is the type of the POJO instances in the returned list.
     * </p>
     *
     * @param llsc the {@link LinkListScrappingContext} to scrap all links for.
     * @return a list of {@code U} typed instances freshly scrapped from the
     *         {@link LinkScrappingContext} contained in {@code llsc}; never null,
     *         possibly empty.
     * @throws LinkException when a scrapping operation failed if {@link LinkScrappingContext#throwExceptions()}
     *                       returned true.
     */
    private  List buildLinkListScraps(LinkListScrappingContext llsc) throws LinkException {

        final List scraps = new ArrayList<>();

        for (LinkScrappingContext lsc : llsc) {
            try {
                final U scrapped = scrap(lsc);
                if (scrapped != null) {
                    scraps.add(scrapped);
                }
            }
            catch (ScrapperException e) {
                // Either rethrown as a LinkException or logged; when only logged
                // the link is simply left out of the list.
                handleScrapperException(lsc.throwExceptions(), e);
            }
        }

        return scraps;
    }



}