/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */
package fr.whimtrip.ext.jwhtscrapper.service.scoped;
import fr.whimtrip.core.util.WhimtripUtils;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhtscrapper.exception.ScrapperAlreadyFinishedException;
import fr.whimtrip.ext.jwhtscrapper.exception.ScrapperAlreadyStartedException;
import fr.whimtrip.ext.jwhtscrapper.exception.ScrapperUnsupportedException;
import fr.whimtrip.ext.jwhtscrapper.impl.ScrappingStatsImpl;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HttpMetrics;
import fr.whimtrip.ext.jwhtscrapper.intfr.ScrapperHelper;
import fr.whimtrip.ext.jwhtscrapper.intfr.ScrappingStats;
import fr.whimtrip.ext.jwhtscrapper.service.base.AutomaticInnerScrapperClient;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.ScrapperThreadCallable;
import fr.whimtrip.ext.jwhtscrapper.service.holder.RequestsScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.ScrappingContext;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
/**
* Part of project jwht-scrapper
* Created on 07/08/18
*
*
* Default and standard implementation of {@link AutomaticInnerScrapperClient}.
*
*
* @param <P> Parent Type
* @param <M> Model on which response body will be mapped
* @author Louis-wht
* @since 1.0.0
*/
public final class DefaultAutomaticInnerScrapperClient implements AutomaticInnerScrapperClient
{
// SLF4J logger for this scrapper client.
private static final Logger log = LoggerFactory.getLogger(DefaultAutomaticInnerScrapperClient.class);
// Poll interval (ms) between two checks for finished scrapping tasks.
private static final long SLEEP_TIME_BETWEEN_GATHERING_OF_RESULTS = 1000;
// Progress-bar rendering constants (used by logging code elsewhere in this class).
private static final int LENGTH_OF_THE_PERCENTAGE_BAR = 100;
private static final int WIDTH_OF_THE_PERCENTAGE_BAR = 2;
// A status line is logged every time this many additional tasks complete.
private static final int LOG_STATUS_EVERY_X_FINISHED_TASKS = 5;
// NOTE(review): generic type arguments appear to have been stripped from the
// declarations below (e.g. "ScrappingContext" split across lines, "List>") —
// presumably ScrappingContext<P, M, ? extends ScrapperHelper<P, M>> and
// List<FutureTask<Object>>; confirm against the upstream sources.
private final ScrappingContext
> context;
// Underlying scrapper performing the actual HTTP calls and response mapping.
private final HtmlAutoScrapper htmlAutoScrapper;
// Logger used for exception reporting (e.g. interrupted sleeps).
private final ExceptionLogger exceptionLogger;
// Tasks currently submitted and not yet reaped; synchronized on in terminate().
private final List> runningTasks = new ArrayList<>();
// Queue of parent objects waiting to be scrapped; guarded by synchronized (pList).
private final List pList = new ArrayList<>();
// Results returned by finished tasks; returned by scrap().
private final List results = new ArrayList();
// Processor applied to outgoing request builders.
private final BoundRequestBuilderProcessor requestProcessor;
// Progress counters.
private int finishedTasks = 0;
private int lastFinishedTasksLog = 0;
private int startedScrapsCount = 0;
private int validFinishedTasks = 0;
private int failedFinishedTasks = 0;
// True once scrap() has been called; a client instance is single-use.
private boolean scrapStarted = false;
// True once the scrap ended or terminate() was called.
private boolean stopped = false;
// Request-level context, resolved at the start of innerScrap().
private RequestsScrappingContext requestsScrappingContext;
/**
 *
 * Default constructor instance.
 *
 * @param context the scrapping context to use to build and drive the current
 * scrapping client.
 * @param htmlAutoScrapper the underlying {@link HtmlAutoScrapper} to use to perform
 * the scraps.
 * @param exceptionLogger the {@link ExceptionLogger} to use to perform exception
 * logging.
 * @param requestProcessor the request processor {@link BoundRequestBuilderProcessor}.
 */
public DefaultAutomaticInnerScrapperClient(
// NOTE(review): the generic arguments of this parameter look stripped
// ("ScrappingContext>") — confirm against the upstream source.
ScrappingContext> context,
HtmlAutoScrapper htmlAutoScrapper,
ExceptionLogger exceptionLogger,
BoundRequestBuilderProcessor requestProcessor
)
{
// Plain dependency injection — no validation is performed here.
this.context = context;
this.htmlAutoScrapper = htmlAutoScrapper;
this.exceptionLogger = exceptionLogger;
this.requestProcessor = requestProcessor;
}
/**
 * {@inheritDoc}
 *
 * <p>A client instance is single-use: a second call fails immediately.
 * The {@code stopped} flag is raised in a finally block so the client is
 * marked stopped whether the scrap ends normally or exceptionally.
 *
 * @return the results gathered from every finished scrapping task.
 * @throws InterruptedException if the scrapping was interrupted.
 * @throws ExecutionException if a scrapping task failed while its result was gathered.
 * @throws ScrapperAlreadyStartedException if this client was already started once.
 */
@Override
public synchronized List scrap() throws InterruptedException, ExecutionException, ScrapperAlreadyStartedException
{
    if (scrapStarted)
        throw new ScrapperAlreadyStartedException(this.getClass());
    scrapStarted = true;
    try {
        // The previous catch-and-rethrow of InterruptedException | ExecutionException
        // was a no-op; try/finally alone has the exact same behavior.
        return innerScrap();
    }
    finally {
        stopped = true;
    }
}
/**
 * {@inheritDoc}
 *
 * <p>Thread-safe: synchronizes on the inner parent-object queue.
 *
 * @param objectsToScrap the new parent objects to enqueue for scrapping.
 * @throws ScrapperAlreadyFinishedException if the client was stopped or the
 *         queue is already empty. NOTE(review): an empty queue while tasks are
 *         still in flight also triggers this — confirm this is the intended
 *         contract.
 */
@Override
public void addObjectsToScrap(List objectsToScrap){
    synchronized (pList) {
        if (stopped || pList.isEmpty())
            throw new ScrapperAlreadyFinishedException(getContext().getHelper().getClass().getSimpleName());
        pList.addAll(objectsToScrap);
    }
}
/**
 * {@inheritDoc}
 *
 * <p>Raises the stopped flag and cancels every running task, interrupting
 * the threads currently executing them. Locks both this instance and the
 * running-task list so termination does not race with task bookkeeping.
 */
@Override
public synchronized void terminate() {
    synchronized (runningTasks) {
        stopped = true;
        for (FutureTask ft : runningTasks) {
            // true: interrupt the worker thread if the task already started.
            ft.cancel(true);
        }
    }
}
/**
 * {@inheritDoc}
 */
// NOTE(review): the return type reads "List>" — generic arguments appear
// stripped by extraction (presumably List<FutureTask<Object>>). Also note this
// exposes the internal mutable list directly; terminate() mutates it under a
// lock callers do not hold — confirm against the upstream source.
public List> getRunningTasks() {
return runningTasks;
}
/**
 * {@inheritDoc}
 */
// NOTE(review): return type reads "ScrappingContext>" — generic arguments
// appear stripped by extraction; confirm against the upstream source.
public ScrappingContext> getContext() {
return context;
}
/**
 * {@inheritDoc}
 *
 * @return the HTTP metrics gathered by the underlying {@link HtmlAutoScrapper}.
 * @throws ScrapperUnsupportedException if the underlying scrapper does not
 *         support metrics collection.
 */
@Override
public HttpMetrics getHttpMetrics() throws ScrapperUnsupportedException {
    return htmlAutoScrapper.getHttpMetrics();
}
/**
 * {@inheritDoc}
 *
 * <p>Returns an all-zero stats object before the scrap has started.
 * Afterwards, the number of running tasks is derived as started minus
 * finished, and the remaining-task estimate is the queued parent objects
 * plus the tasks still in flight.
 */
@Override
public ScrappingStats getScrapingStats() {
    if (!scrapStarted)
        return new ScrappingStatsImpl(0, 0, 0, 0, 0);
    // Renamed from `runningTasks`, which shadowed the field of the same name.
    int currentlyRunning = startedScrapsCount - finishedTasks;
    return new ScrappingStatsImpl(
            finishedTasks,
            currentlyRunning,
            validFinishedTasks,
            failedFinishedTasks,
            pList.size() + currentlyRunning
    );
}
/**
 *
 * Inner scrapping process. It basically will check for unstarted threads
 * in a while loop while the queue is not empty and fill those unstarted
 * threads with new threads ready to be scrapped while creating an
 * {@link ScrapperThreadCallableImpl} under the hood to properly handle
 * each scrapping process.
 *
 * @return a list of objects returned by each different tasks from {@link ScrapperHelper#returnResult(Object, Object)}
 * method at the end of each scrapping process.
 * @throws ExecutionException if any Execution process was triggered along the way.
 * @throws InterruptedException if the scrapping was interrupted by another thread calling
 * {@link #terminate()}.
 */
@NotNull
private List innerScrap() throws ExecutionException, InterruptedException {
    requestsScrappingContext = context.getRequestsScrappingContext();
    synchronized (pList) {
        pList.addAll(context.getParentObjects());
    }
    startedScrapsCount = 0;
    Iterator iterator = pList.iterator();
    List pSublist = new ArrayList<>();
    do {
        // Refill the batch from the queue, then refresh the iterator so the
        // loop condition sees any objects added via addObjectsToScrap().
        pSublist.clear();
        pSublist.addAll(newPSublist(iterator));
        iterator = pList.iterator();
        startThreads(pSublist);
        startedScrapsCount += pSublist.size();
        ScrappingResult scrappingResult = waitAndRemoveFinishedThreads();
        if (log.isTraceEnabled())
            log.trace(
                    "failed = {}, valids = {}.",
                    scrappingResult.failed,
                    scrappingResult.valid
            );
        validFinishedTasks += scrappingResult.valid;
        // BUG FIX: this counter was previously incremented with
        // scrappingResult.valid, double-counting valid tasks and never
        // recording actual failures.
        failedFinishedTasks += scrappingResult.failed;
        // Optional pause between batches, applied only when at least one
        // task of the batch succeeded.
        int delay = context.getRequestsScrappingContext().getRequestsConfig().periodicDelay();
        if (delay > 0 && scrappingResult.valid >= 1) {
            WhimtripUtils.waitFor((long) delay, log, 20);
        }
    } while (startedScrapsCount < requestsScrappingContext.getScrapLimit() && iterator.hasNext() && !stopped);
    // Drain the tasks still in flight before handing back the results.
    while (!runningTasks.isEmpty())
    {
        waitAndRemoveFinishedThreads();
    }
    return results;
}
/**
 *
 * Wait and remove all finished threads while storing stats, logging
 * status if necessary and storing the results of the scrapping process.
 * Most of this is handled by {@link #removeFinishedThreads()}.
 *
 * @return the {@link ScrappingResult} for the current threads removal.
 * @throws ExecutionException if an execution exception was triggered
 * while calling {@link FutureTask#get()}.
 * @throws InterruptedException if the scrapping was interrupted by another thread calling
 * {@link #terminate()}.
 */
private ScrappingResult waitAndRemoveFinishedThreads()
        throws ExecutionException, InterruptedException {
    try {
        Thread.sleep(SLEEP_TIME_BETWEEN_GATHERING_OF_RESULTS);
    }
    catch (InterruptedException e) {
        exceptionLogger.logException(e);
        // BUG FIX: restore the interrupt status instead of swallowing it,
        // so downstream blocking calls (e.g. FutureTask.get in
        // removeFinishedThreads) can observe the interruption.
        Thread.currentThread().interrupt();
    }
    return removeFinishedThreads();
}
/**
*
* Remove all finished threads while storing stats, logging
* status if necessary and storing the results of the scrapping process.
*
* @return the {@link ScrappingResult} for the current threads removal.
* @throws ExecutionException if an execution exception was triggered
* while calling {@link FutureTask#get()}.
* @throws InterruptedException if the scrapping was interrupted by another thread calling
* {@link #terminate()}.
*/
@SuppressWarnings("unchecked")
private ScrappingResult removeFinishedThreads()
throws ExecutionException, InterruptedException
{
List copiedTasks = new ArrayList<>();
copiedTasks.addAll(runningTasks);
FutureTask