fr.whimtrip.ext.jwhtscrapper.service.scoped.DefaultAutomaticInnerScrapperClient Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of whimtrip-ext-scrapper Show documentation
Fully featured highly pluggable and customizable Java scrapping framework
The newest version!
/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

package fr.whimtrip.ext.jwhtscrapper.service.scoped;

import fr.whimtrip.core.util.WhimtripUtils;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhtscrapper.exception.ScrapperAlreadyFinishedException;
import fr.whimtrip.ext.jwhtscrapper.exception.ScrapperAlreadyStartedException;
import fr.whimtrip.ext.jwhtscrapper.exception.ScrapperUnsupportedException;
import fr.whimtrip.ext.jwhtscrapper.impl.ScrappingStatsImpl;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HttpMetrics;
import fr.whimtrip.ext.jwhtscrapper.intfr.ScrapperHelper;
import fr.whimtrip.ext.jwhtscrapper.intfr.ScrappingStats;
import fr.whimtrip.ext.jwhtscrapper.service.base.AutomaticInnerScrapperClient;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.ScrapperThreadCallable;
import fr.whimtrip.ext.jwhtscrapper.service.holder.RequestsScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.ScrappingContext;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;


/**
 * Part of project jwht-scrapper
 * Created on 07/08/18
 *
 * 
 *     Default and standard implementation of {@link AutomaticInnerScrapperClient}.
 * 
 *
 * @param  Parent Type
 * @param  Model on which response body will be mapped
 * @author Louis-wht
 * @since 1.0.0
 */
public final class DefaultAutomaticInnerScrapperClient implements AutomaticInnerScrapperClient {

    // SLF4J logger for this client.
    private static final Logger log = LoggerFactory.getLogger(DefaultAutomaticInnerScrapperClient.class);
    // Pause, in milliseconds (the value is handed to Thread.sleep), taken before
    // each collection of finished tasks.
    private static final long SLEEP_TIME_BETWEEN_GATHERING_OF_RESULTS = 1000;
    // NOTE(review): presumably the dimensions of a textual progress bar written
    // to the log; neither constant is used in the visible part of the file -- confirm.
    private static final int LENGTH_OF_THE_PERCENTAGE_BAR = 100;
    private static final int WIDTH_OF_THE_PERCENTAGE_BAR = 2;
    // NOTE(review): presumably the number of newly finished tasks between two
    // status logs; not used in the visible part of the file -- confirm.
    private static final int LOG_STATUS_EVERY_X_FINISHED_TASKS = 5;

    // Scrapping context given at construction time. It supplies the initial
    // parent objects, the requests scrapping context and the helper.
    private final ScrappingContext> context;

    // Underlying scrapper; in the visible code it is only asked for its HTTP metrics.
    private final HtmlAutoScrapper htmlAutoScrapper;

    // Receives the exceptions this client decides to log rather than rethrow.
    private final ExceptionLogger exceptionLogger;

    // Tasks currently in flight. Also used as the lock object when terminate()
    // cancels them, and exposed as-is by getRunningTasks().
    private final List> runningTasks = new ArrayList<>();

    // Queue of objects waiting to be scrapped. Seeded from the context when the
    // scrap starts and extended by addObjectsToScrap(); also used as a lock object.
    private final List
 pList = new ArrayList<>();
    // List handed back to the caller once the scrapping process is over.
    private final List results = new ArrayList();

    // Request processor given at construction time; not used in the visible part of the file.
    private final BoundRequestBuilderProcessor requestProcessor;

    // Progress counters reported through getScrapingStats().
    private int finishedTasks = 0;
    // NOTE(review): presumably the value of finishedTasks at the last status log -- confirm.
    private int lastFinishedTasksLog = 0;
    // Number of objects handed to startThreads() so far.
    private int startedScrapsCount = 0;
    private int validFinishedTasks = 0;
    private int failedFinishedTasks = 0;
    // Set once scrap() has been entered; a second call to scrap() is rejected.
    private boolean scrapStarted = false;
    // Set when scrap() exits (normally or not) or when terminate() is called.
    private boolean stopped = false;
    // Assigned from the context at the beginning of the inner scrapping process.
    private RequestsScrappingContext requestsScrappingContext;


    /**
     * Builds a scrapping client from its four collaborators. The constructor
     * only stores the references it is given; nothing is started here.
     *
     * @param context the scrapping context driving this client (source of the
     *                objects to scrap and of the requests configuration).
     * @param htmlAutoScrapper the underlying {@link HtmlAutoScrapper} used to
     *                         perform the scraps.
     * @param exceptionLogger the {@link ExceptionLogger} receiving the exceptions
     *                        that are logged instead of being rethrown.
     * @param requestProcessor the {@link BoundRequestBuilderProcessor} request
     *                         processor.
     */
    public DefaultAutomaticInnerScrapperClient(
            ScrappingContext> context,
            HtmlAutoScrapper htmlAutoScrapper,
            ExceptionLogger exceptionLogger,
            BoundRequestBuilderProcessor requestProcessor
    )
    {
        // Collaborators are stored as received, without validation or copy.
        this.requestProcessor = requestProcessor;
        this.exceptionLogger = exceptionLogger;
        this.htmlAutoScrapper = htmlAutoScrapper;
        this.context = context;
    }

    /**
     * {@inheritDoc}
     * <p>
     * A client can only be started once: a second call throws
     * {@link ScrapperAlreadyStartedException}. Whatever the outcome of the inner
     * scrapping process, the client is flagged as stopped when this method exits.
     */
    @Override
    public synchronized List scrap() throws InterruptedException, ExecutionException, ScrapperAlreadyStartedException
    {
        if (scrapStarted) {
            throw new ScrapperAlreadyStartedException(this.getClass());
        }
        scrapStarted = true;

        try {
            // Exceptions raised by the inner process propagate unchanged.
            return innerScrap();
        }
        finally {
            stopped = true;
        }
    }


    /**
     * {@inheritDoc}
     * <p>
     * New objects are only accepted while the client is not stopped and its
     * queue still holds at least one element; otherwise a
     * {@link ScrapperAlreadyFinishedException} carrying the simple class name of
     * the context's helper is thrown. The check and the insertion both happen
     * while holding the lock on the queue.
     */
    public void addObjectsToScrap(List objectsToScrap){
        synchronized (pList) {
            final boolean acceptsNewObjects = !stopped && !pList.isEmpty();
            if (!acceptsNewObjects) {
                throw new ScrapperAlreadyFinishedException(getContext().getHelper().getClass().getSimpleName());
            }
            pList.addAll(objectsToScrap);
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * Flags the client as stopped and cancels every task currently held in the
     * running tasks list, interrupting them if they are already executing.
     * <p>
     * This method deliberately does not lock on the client instance:
     * {@link #scrap()} is {@code synchronized} and keeps the instance monitor
     * for the whole duration of the scrapping process, so a {@code synchronized}
     * {@code terminate()} called from another thread would block until the
     * scrapping is already over and could never cancel anything. Only the lock
     * on the running tasks list is taken.
     */
    public void terminate() {
        synchronized (runningTasks) {
            // NOTE(review): `stopped` is a plain (non-volatile) field; confirm the
            // scrapping loop observes this write in a timely manner, or consider
            // declaring the field volatile.
            stopped = true;
            for (FutureTask ft : runningTasks) {
                // true => interrupt the worker thread if the task is running.
                ft.cancel(true);
            }
        }
    }


    /**
     * {@inheritDoc}
     * <p>
     * The internal list itself is returned, not a copy.
     */
    public List> getRunningTasks() {
        return this.runningTasks;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Returns the context instance received by the constructor.
     */
    public ScrappingContext> getContext() {
        return this.context;
    }

    /**
     * {@inheritDoc}
     * <p>
     * The call is delegated to the underlying {@link HtmlAutoScrapper}.
     */
    public HttpMetrics getHttpMetrics() throws ScrapperUnsupportedException {
        final HttpMetrics metrics = htmlAutoScrapper.getHttpMetrics();
        return metrics;
    }



    /**
     * {@inheritDoc}
     * <p>
     * Before the scrap has started every figure is zero. Afterwards the number
     * of running tasks is computed as started scraps minus finished tasks, and
     * the last figure is the size of the waiting queue plus that number of
     * running tasks.
     */
    public ScrappingStats getScrapingStats() {
        if (scrapStarted) {
            // Named differently from the `runningTasks` field to avoid shadowing it.
            final int stillRunning = startedScrapsCount - finishedTasks;

            return new ScrappingStatsImpl(
                    finishedTasks,
                    stillRunning,
                    validFinishedTasks,
                    failedFinishedTasks,
                    pList.size() + stillRunning
            );
        }

        return new ScrappingStatsImpl(0,0,0,0, 0);
    }

    /**
     * Inner scrapping process, executed on the thread that called
     * {@link #scrap()}.
     * <p>
     * The waiting queue is first seeded with the parent objects of the context.
     * Then, in a loop, a new batch is obtained from {@code newPSublist}, handed
     * to {@code startThreads}, and the tasks that finished in the meantime are
     * collected through {@link #waitAndRemoveFinishedThreads()}. The loop goes
     * on while the scrap limit of the requests scrapping context is not
     * reached, the queue still has elements and the client was not stopped.
     * Once the loop is left, the method keeps collecting finished tasks until
     * no task is running anymore.
     * <p>
     * The valid / failed counters exposed by {@link #getScrapingStats()} are
     * updated after every collection, including the ones performed while
     * draining the last running tasks.
     *
     * @return the list of results accumulated by the scrapping process.
     * @throws ExecutionException if collecting the finished tasks raised an
     *                            execution exception.
     * @throws InterruptedException if collecting the finished tasks was
     *                              interrupted.
     */
    @NotNull
    private List innerScrap() throws ExecutionException, InterruptedException {

        requestsScrappingContext = context.getRequestsScrappingContext();

        synchronized (pList) {
            pList.addAll(context.getParentObjects());
        }

        startedScrapsCount = 0;

        Iterator iterator = pList.iterator();
        List pSublist = new ArrayList<>();

        do {
            pSublist.clear();
            pSublist.addAll(newPSublist(iterator));
            // A fresh iterator is taken for the next round and for the loop
            // condition below (presumably because the queue may have been
            // modified since the previous one was created -- confirm).
            iterator = pList.iterator();
            startThreads(pSublist);

            startedScrapsCount += pSublist.size();

            ScrappingResult scrappingResult = waitAndRemoveFinishedThreads();
            if(log.isTraceEnabled())
                log.trace(
                        "failed = {}, valids = {}.",
                        scrappingResult.failed,
                        scrappingResult.valid
                );

            validFinishedTasks += scrappingResult.valid;
            // Failed tasks are counted from the `failed` figure (the `valid`
            // figure used to be added here by mistake).
            failedFinishedTasks += scrappingResult.failed;

            int delay = context.getRequestsScrappingContext().getRequestsConfig().periodicDelay();

            // The periodic delay is only observed when at least one task
            // completed successfully during this round.
            if(delay > 0 && scrappingResult.valid >= 1) {
                WhimtripUtils.waitFor((long)delay, log, 20);
            }

        } while(startedScrapsCount < requestsScrappingContext.getScrapLimit() && iterator.hasNext() && !stopped);

        // Drain phase: nothing new is started, remaining tasks are awaited.
        while(!runningTasks.isEmpty())
        {
            ScrappingResult drained = waitAndRemoveFinishedThreads();
            // Tasks finishing during the drain count in the stats as well.
            validFinishedTasks += drained.valid;
            failedFinishedTasks += drained.failed;
        }

        return results;
    }

    /**
     * Pauses for {@link #SLEEP_TIME_BETWEEN_GATHERING_OF_RESULTS} milliseconds,
     * then delegates to {@link #removeFinishedThreads()} to collect the tasks
     * that are finished.
     * <p>
     * An interruption received during the pause is not propagated: it is handed
     * to the exception logger and the collection of finished tasks still takes
     * place.
     *
     * @return the {@link ScrappingResult} produced by
     *         {@link #removeFinishedThreads()}.
     * @throws ExecutionException if raised by {@link #removeFinishedThreads()}.
     * @throws InterruptedException if raised by {@link #removeFinishedThreads()}.
     */
    private ScrappingResult waitAndRemoveFinishedThreads()
            throws ExecutionException, InterruptedException {

        final long pauseMillis = SLEEP_TIME_BETWEEN_GATHERING_OF_RESULTS;
        try {
            Thread.sleep(pauseMillis);
        }
        catch (InterruptedException interrupted) {
            // Logged only; the gathering below is performed regardless.
            exceptionLogger.logException(interrupted);
        }

        return removeFinishedThreads();
    }



    /**
     * 
     *     Remove all finished threads while storing stats, logging
     *     status if necessary and storing the results of the scrapping process.
     * 
     * @return the {@link ScrappingResult} for the current threads removal.
     * @throws ExecutionException if an execution exception was triggered
     *                            while calling {@link FutureTask#get()}.
     * @throws InterruptedException if the scrapping was interrupted by another thread calling
     *                              {@link #terminate()}.
     */
    @SuppressWarnings("unchecked")
    private ScrappingResult removeFinishedThreads()
            throws ExecutionException, InterruptedException
    {
        List copiedTasks = new ArrayList<>();
        copiedTasks.addAll(runningTasks);
        FutureTask