// Source listing of fr.whimtrip.ext.jwhtscrapper.service.scoped.HtmlAutoScrapperImpl
// (artifact: whimtrip-ext-scrapper), as published on a Maven repository browser
// (Maven / Gradle / Ivy).
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
/*
* This code is licensed to WhimTrip©. For any question, please contact the author of the file.
*/
package fr.whimtrip.ext.jwhtscrapper.service.scoped;
import fr.whimtrip.core.util.WhimtripUtils;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhthtmltopojo.HtmlToPojoEngine;
import fr.whimtrip.ext.jwhthtmltopojo.exception.HtmlToPojoException;
import fr.whimtrip.ext.jwhthtmltopojo.intrf.HtmlAdapter;
import fr.whimtrip.ext.jwhtscrapper.enm.Action;
import fr.whimtrip.ext.jwhtscrapper.annotation.WarningSign;
import fr.whimtrip.ext.jwhtscrapper.exception.*;
import fr.whimtrip.ext.jwhtscrapper.impl.ScrapperHtmlAdapterFactory;
import fr.whimtrip.ext.jwhtscrapper.intfr.BasicObjectMapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HttpMetrics;
import fr.whimtrip.ext.jwhtscrapper.intfr.LinksFollower;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.HttpManagerClient;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkListScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkScrappingContext;
import org.asynchttpclient.BoundRequestBuilder;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import static fr.whimtrip.ext.jwhtscrapper.enm.PausingBehavior.PAUSE_ALL_THREADS;
import static fr.whimtrip.ext.jwhtscrapper.enm.PausingBehavior.PAUSE_CURRENT_THREAD_ONLY;
/**
 * Part of project jwht-scrapper.
 * Created on 30/07/18.
 *
 * <p>
 * Default implementation of {@link HtmlAutoScrapper}. As stated in the interface
 * javadoc, it is implemented following all the given recommendations, including:
 * </p>
 *
 * <ul>
 *     <li>{@link HttpWithProxyManagerClient}</li>
 *     <li>{@link LinksFollowerImpl}</li>
 *     <li>{@link HtmlToPojoEngine} with {@link ScrapperHtmlAdapterFactory}.</li>
 *     <li>{@link BasicObjectMapper} is accepted.</li>
 * </ul>
 *
 * @see HtmlAutoScrapper
 * @author Louis-wht
 * @since 1.0.0
 */
public final class HtmlAutoScrapperImpl implements HtmlAutoScrapper {
private static final Logger log = LoggerFactory.getLogger(HtmlAutoScrapperImpl.class);
private final HttpManagerClient httpManagerClient;
private final BasicObjectMapper objectMapper;
private final BoundRequestBuilderProcessor boundRequestBuilderProcessor;
private final HtmlAdapter htmlAdapter;
private final HtmlToPojoEngine htmlToPojoEngine;
private final ExceptionLogger exceptionLogger;
private final Class persistentClass;
private final int warningSignDelay;
private final boolean followRedirections;
private final AtomicBoolean scrapStopped;
private WarningSignException lastThrownWarningSignException;
/**
 * Creates a scrapper bound to one target POJO class.
 *
 * @param httpManagerClient the {@link HttpManagerClient} used to prepare and
 *                          execute the HTTP requests.
 * @param htmlToPojoEngine the HTML to POJO engine; it is also asked here for the
 *                         {@link HtmlAdapter} matching {@code clazz}.
 * @param boundRequestBuilderProcessor the request processor handed over to the
 *                                     links follower and used to print requests
 *                                     when a warning sign is triggered.
 * @param objectMapper optional {@link BasicObjectMapper}; when not {@code null}
 *                     it is used instead of the HTML adapter to bind response
 *                     bodies.
 * @param exceptionLogger the logger receiving non fatal exceptions.
 * @param clazz the class resulting outputs are mapped to.
 * @param followRedirections whether HTTP redirections should be followed when
 *                           performing top level scraps.
 * @param warningSignDelay delay applied when a {@link WarningSign} requests a
 *                         pause; a tenth of it is used as polling interval
 *                         while the scrap is stopped.
 * @param scrapStopped shared atomic boolean indicating whether the current
 *                     scrap process is stopped or not.
 */
public HtmlAutoScrapperImpl(
        HttpManagerClient httpManagerClient,
        HtmlToPojoEngine htmlToPojoEngine,
        BoundRequestBuilderProcessor boundRequestBuilderProcessor,
        BasicObjectMapper objectMapper,
        ExceptionLogger exceptionLogger,
        Class clazz,
        boolean followRedirections,
        int warningSignDelay,
        AtomicBoolean scrapStopped
) {
    // Collaborators.
    this.httpManagerClient = httpManagerClient;
    this.htmlToPojoEngine = htmlToPojoEngine;
    this.boundRequestBuilderProcessor = boundRequestBuilderProcessor;
    this.objectMapper = objectMapper;
    this.exceptionLogger = exceptionLogger;

    // Target type and the adapter derived from it.
    this.persistentClass = clazz;
    this.htmlAdapter = htmlToPojoEngine.adapter(clazz);

    // Behavioural settings.
    this.followRedirections = followRedirections;
    this.warningSignDelay = warningSignDelay;
    this.scrapStopped = scrapStopped;
}
/**
 * {@inheritDoc}
 * <p>
 * Runs the internal scrap routine with this instance's default adapter and
 * redirection setting, flagged as the top level (parent) call.
 */
@NotNull
@Override
public T scrap(@NotNull final BoundRequestBuilder req, @Nullable final T obj)
        throws ModelBindingException, LinkException, WarningSignException
{
    // A top level call absorbs the "stop actual scrap" signal instead of rethrowing it.
    final boolean isParentCall = true;
    return scrap(req, obj, htmlAdapter, followRedirections, isParentCall);
}
/**
 * {@inheritDoc}
 * <p>
 * Builds the POST request through {@link #prepareScrapPost(String, Map)} and
 * scraps it right away.
 */
@NotNull
@Override
public T scrapPost(@NotNull final String url, @Nullable final Map fields)
        throws ModelBindingException, LinkException, WarningSignException
{
    final BoundRequestBuilder postRequest = prepareScrapPost(url, fields);
    return scrap(postRequest);
}
/**
 * {@inheritDoc}
 * <p>
 * Builds the GET request through {@link #prepareScrapGet(String)} and scraps
 * it right away.
 */
@NotNull
@Override
public T scrapGet(@NotNull final String url)
        throws ModelBindingException, LinkException, WarningSignException
{
    final BoundRequestBuilder getRequest = prepareScrapGet(url);
    return scrap(getRequest);
}
/**
 * {@inheritDoc}
 * <p>
 * The request is prepared by the underlying HTTP client; every entry of
 * {@code fields}, if any, is then added as a form parameter using the
 * {@code toString()} representation of its value. A {@code null} value inside
 * the map therefore results in a {@link NullPointerException}.
 */
@NotNull
@Override
public BoundRequestBuilder prepareScrapPost(@NotNull final String url, @Nullable final Map fields)
{
    final BoundRequestBuilder postRequest = httpManagerClient.preparePost(url);

    // No form fields supplied: the bare POST request is returned as is.
    if (fields == null) {
        return postRequest;
    }

    for (Map.Entry field : fields.entrySet()) {
        postRequest.addFormParam(field.getKey(), field.getValue().toString());
    }
    return postRequest;
}
/**
 * {@inheritDoc}
 * <p>
 * Simply delegates the request preparation to the underlying HTTP client.
 */
@NotNull
@Override
public BoundRequestBuilder prepareScrapGet(@NotNull final String url)
{
    final BoundRequestBuilder getRequest = httpManagerClient.prepareGet(url);
    return getRequest;
}
/**
 * {@inheritDoc}
 * <p>
 * The metrics are the ones exposed by the underlying HTTP client.
 */
@NotNull
@Override
public HttpMetrics getHttpMetrics() throws ScrapperUnsupportedException
{
    final HttpMetrics clientMetrics = httpManagerClient.getHttpMetrics();
    return clientMetrics;
}
/**
 * Convenience overload of the core scrapping method used for nested (link)
 * scraps: it runs the core routine flagged as a non parent call, so that a
 * "stop actual scrap" signal propagates up to the top level call.
 *
 * @see HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)
 * @param req the prepared {@link BoundRequestBuilder}.
 * @param obj the object to map the resulting scrap to, may be {@code null}.
 * @param adapter the {@link HtmlAdapter} used to map the resulting body and to
 *                follow links.
 * @param followRedirections whether HTTP redirections should be followed or not.
 * @param <U> the type of the POJO to map the result to. It can differ from
 *            {@code T} because links may be scrapped into other POJO types.
 * @return the scrapped POJO instance.
 * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 */
private U scrap(
        @NotNull BoundRequestBuilder req,
        @Nullable U obj,
        @NotNull final HtmlAdapter adapter,
        final boolean followRedirections
) throws ModelBindingException, LinkException, WarningSignException
{
    final boolean isParentCall = false;
    return scrap(req, obj, adapter, followRedirections, isParentCall);
}
/**
 * Core scrapping routine. It conforms to the contract stipulated in
 * {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)} and works in
 * four steps:
 * <ol>
 *     <li>wait for as long as the shared {@code scrapStopped} flag is raised;</li>
 *     <li>perform the HTTP request;</li>
 *     <li>bind the response body to {@code obj} (or to a new instance);</li>
 *     <li>resolve and scrap the links declared on the resulting object.</li>
 * </ol>
 * The request context is always removed from the HTTP client once the binding
 * step was attempted.
 * <p>
 * An interruption received while waiting does not abort the wait, but the
 * thread's interrupt status is restored afterwards so that it is not lost.
 *
 * @see HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)
 * @param req the prepared {@link BoundRequestBuilder}.
 * @param obj the object to map the resulting scrap to, may be {@code null}.
 * @param adapter the {@link HtmlAdapter} to use to map the resulting
 *                HTML body to a POJO. If the {@link BasicObjectMapper}
 *                is used instead, the adapter is still used for links
 *                following.
 * @param followRedirections whether HTTP redirections should be followed or not.
 * @param parentCall {@code true} for a top level call: a
 *                   {@link WarningSignActualScrapStoppedException} is then
 *                   absorbed and the object returned as it stands; otherwise
 *                   the exception is rethrown to the caller.
 * @param <U> the type of the POJO to map the result to. This method can be
 *            called recursively for links scrapping with other POJO types,
 *            which is why {@code T} is not used here.
 * @return the scrapped POJO instance.
 * @throws ModelBindingException if binding the body failed with an
 *                               {@link IOException} or {@link HtmlToPojoException}.
 * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 */
@SuppressWarnings("unchecked")
private U scrap(
        @NotNull BoundRequestBuilder req,
        @Nullable U obj,
        @NotNull final HtmlAdapter adapter,
        final boolean followRedirections,
        final boolean parentCall
) throws ModelBindingException, LinkException, WarningSignException
{
    Class mappedClazz = obj == null ? (Class) persistentClass : (Class) obj.getClass();

    // Wait until the scrap is resumed. The interrupt is remembered and re-asserted
    // once the wait is over instead of being silently discarded.
    boolean interrupted = false;
    try
    {
        while (scrapStopped.get())
        {
            try
            {
                // At least 1ms: avoids a busy spin (or an IllegalArgumentException)
                // when the configured delay is smaller than 10 or negative.
                Thread.sleep(Math.max(1L, warningSignDelay / 10));
            }
            catch (InterruptedException e)
            {
                interrupted = true;
                exceptionLogger.logException(e);
            }
        }
    }
    finally
    {
        if (interrupted)
            Thread.currentThread().interrupt();
    }

    final String rawResponse = httpManagerClient.getResponse(req, followRedirections);

    try
    {
        obj = buildObject(obj, adapter, mappedClazz, rawResponse);
        resolveLinks(obj, adapter);
        return obj;
    }
    catch (WarningSignActualScrapStoppedException e)
    {
        // This propagates up to the parent scrap call, which returns the
        // object as it currently stands.
        if (!parentCall)
            throw e;
        return obj;
    }
    catch (WarningSignException e)
    {
        return handleWarningSign(req, obj, adapter, followRedirections, parentCall, e);
    }
    catch (IOException | HtmlToPojoException e)
    {
        throw new ModelBindingException(e);
    }
    finally
    {
        // Removing the http manager client request context to avoid memory
        // overloading.
        httpManagerClient.removeContext(req);
    }
}
/**
 * Handles a triggered warning sign: optionally pauses the scrapping, then
 * applies the {@link Action} carried by the exception.
 * <p>
 * Pausing: for {@code PAUSE_ALL_THREADS} the shared {@code scrapStopped} flag
 * is raised for the duration of the delay so that every thread entering a
 * scrap waits; for {@code PAUSE_CURRENT_THREAD_ONLY} only the current thread
 * waits. The flag is lowered again only by the thread that triggered the most
 * recent "pause all" warning sign, and this happens even if the wait fails.
 * <p>
 * Actions: {@link Action#THROW_EXCEPTION} rethrows the warning sign,
 * {@link Action#STOP_ACTUAL_SCRAP} throws a
 * {@link WarningSignActualScrapStoppedException}, {@link Action#RETRY} scraps
 * the same request again, any other action returns {@code obj} unchanged.
 *
 * @param req the prepared {@link BoundRequestBuilder}.
 * @param obj the object to map the resulting scrap to, may be {@code null}.
 * @param adapter the {@link HtmlAdapter} to reuse if the scrap is retried.
 * @param followRedirections whether HTTP redirections should be followed or not.
 * @param parentCall whether the scrap being handled is the top level call;
 *                   forwarded as is when the scrap is retried.
 * @param e the warning sign exception that was triggered.
 * @param <U> the type of the POJO to map the result to. It can differ from
 *            {@code T} because links may be scrapped into other POJO types.
 * @return the retried scrap result, or {@code obj} when nothing else has to
 *         be done.
 * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 */
private U handleWarningSign(
        @NotNull BoundRequestBuilder req,
        @Nullable U obj,
        @NotNull HtmlAdapter adapter,
        boolean followRedirections,
        boolean parentCall,
        WarningSignException e
) throws ModelBindingException, LinkException, WarningSignException
{
    log.warn("A warning sign was triggered! {}", e.getMessage());
    boundRequestBuilderProcessor.printReq(req);

    final boolean pauseEveryThread = e.getPausingBehavior() == PAUSE_ALL_THREADS;
    final boolean pauseThisThread = e.getPausingBehavior() == PAUSE_CURRENT_THREAD_ONLY;

    if (pauseEveryThread || pauseThisThread)
    {
        if (pauseEveryThread)
        {
            // Raise the flag: scrap(...) blocks for as long as it reads true.
            scrapStopped.set(true);
            // NOTE(review): this field is shared between scrapping threads without
            // synchronization; consider declaring it volatile.
            lastThrownWarningSignException = e;
        }

        try
        {
            WhimtripUtils.waitFor((long) warningSignDelay, log, 20);
        }
        finally
        {
            // Only the thread owning the latest "pause all" warning sign resumes
            // the scrapping, so that an earlier pause cannot cut a later one short.
            if (pauseEveryThread && e == lastThrownWarningSignException)
                scrapStopped.set(false);
        }
    }

    if (e.getAction() == Action.THROW_EXCEPTION)
    {
        log.warn("Current scrap handled a fatal error which shouldn't lead to further scrapping for that object");
        throw e;
    }

    if (e.getAction() == Action.STOP_ACTUAL_SCRAP)
    {
        log.warn("Current object shouldn't be further scrapped");
        // This propagates up to the parent scrap call, which returns the
        // object as it currently stands.
        throw new WarningSignActualScrapStoppedException(e);
    }

    if (e.getAction() == Action.RETRY)
    {
        return scrap(req, obj, adapter, followRedirections, parentCall);
    }

    // Action.NONE -> nothing to do, the scrap continues but not on the current
    // POJO branch.
    return obj;
}
/**
 * Scraps one followed link by calling
 * {@link #scrap(BoundRequestBuilder, Object, HtmlAdapter, boolean)} with the
 * request, target object, adapter and redirection setting all taken from the
 * given {@link LinkScrappingContext}.
 *
 * @param lsc the {@link LinkScrappingContext} to use to perform the scrap operation.
 * @param <U> the type of POJO instance it should return.
 * @return the corresponding {@code U} pojo instance.
 * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}.
 */
// NOTE(review): the generic type arguments of this signature look truncated
// ("LinkScrappingContext, U>"), presumably lost when the listing was extracted;
// restore them from the original source before compiling.
private U scrap(LinkScrappingContext, U> lsc)
throws ModelBindingException, LinkException, WarningSignException
{
// Nested scrap: goes through the 4-argument overload, i.e. not flagged as a parent call.
return scrap(lsc.getBoundRequestBuilder(), lsc.getNewObj(), lsc.getAdapter(), lsc.followRedirections());
}
/**
 * Binds a raw response body to a POJO. When a {@link BasicObjectMapper} was
 * configured it takes precedence; otherwise the {@link HtmlAdapter} parses
 * the body as HTML. In both cases an existing {@code obj} is populated when
 * one is given, and a new instance is created otherwise.
 *
 * @param obj the object to build the scrapping result to. Might be null.
 * @param adapter the adapter to use. Might be null if {@link BasicObjectMapper}
 *                is provided instead.
 * @param mappedClazz the class to map the body to; only used by the
 *                    {@link BasicObjectMapper}.
 * @param rawResponse the raw String body response from the HTTP scrapping request.
 * @param <U> the type of POJO to map the body to.
 * @return the built {@code U} POJO instance.
 * @throws IOException if object binding didn't work as expected with {@link BasicObjectMapper}.
 * @throws HtmlToPojoException if object binding failed with {@link HtmlAdapter}.
 */
private U buildObject(
        @Nullable U obj,
        @Nullable final HtmlAdapter adapter,
        @Nullable final Class mappedClazz,
        @NotNull final String rawResponse
) throws IOException, HtmlToPojoException
{
    // HTML binding path: no object mapper configured.
    if (objectMapper == null)
    {
        if (obj == null)
        {
            return adapter.fromHtml(rawResponse);
        }
        return adapter.fromHtml(rawResponse, obj);
    }

    // Object mapper path.
    if (obj == null)
    {
        return objectMapper.readValue(rawResponse, mappedClazz);
    }
    return objectMapper.readValue(rawResponse, mappedClazz, obj);
}
/**
 * Handles every link following task for a freshly built object:
 * <ol>
 *     <li>
 *         resolve the links to follow with {@link LinksFollowerImpl}, the
 *         default implementation of {@link LinksFollower};
 *     </li>
 *     <li>scrap the lists of links and set each resulting list on its field;</li>
 *     <li>scrap the single links and set each resulting value on its field.</li>
 * </ol>
 * Link lists and single links are handled separately because a list of links
 * first requires a list to be instantiated, to which every scrapped entry is
 * appended.
 *
 * @param obj the object to resolve links for.
 * @param adapter the {@link HtmlAdapter} to use to analyse the links to further scrap.
 * @param <U> the type of the POJO instance to scrap links for.
 * @throws LinkException when thrown by the underlying {@link LinksFollower#resolveBasicLinks()}
 *                       or when scrapping one of the links failed, in which case
 *                       the original exception can be retrieved using {@link Throwable#getCause()}.
 * @throws ModelBindingException if any field setting operation failed due to a POJO reflection
 *                               access failure.
 */
private void resolveLinks(@NotNull final U obj, @NotNull final HtmlAdapter adapter)
        throws LinkException, ModelBindingException
{
    final LinksFollower follower = new LinksFollowerImpl(
            httpManagerClient,
            htmlToPojoEngine,
            exceptionLogger,
            boundRequestBuilderProcessor,
            obj,
            adapter
    );

    follower.resolveBasicLinks();

    // Lists of links first, then single links.
    scrapAndSetLinkLists(follower);
    scrapAndSetBasicLinks(follower);
}
/**
 * Scraps every single link held by the links follower and sets each non null
 * result on the field designated by its scrapping context.
 *
 * @param linksFollower the {@link LinksFollower} instance holding resolved links to
 *                      scrap.
 * @throws LinkException if scrapping a link failed and its context requires
 *                       exceptions to be thrown, in which case the original
 *                       exception can be retrieved using {@link Throwable#getCause()}.
 * @throws ModelBindingException if a field setting operation failed due to a POJO reflection
 *                               access failure.
 */
private void scrapAndSetBasicLinks(@NotNull final LinksFollower linksFollower)
        throws ModelBindingException, LinkException
{
    for (LinkScrappingContext lsc : linksFollower.getScrappingContexts())
    {
        Object scrapped = null;
        try
        {
            scrapped = scrap(lsc);
        }
        catch (ScrapperException e)
        {
            handleScrapperException(lsc.throwExceptions(), e);
        }

        // Nothing was scrapped (or the failure was only logged): leave the field untouched.
        if (scrapped == null)
        {
            continue;
        }

        try
        {
            WhimtripUtils.setObjectToField(lsc.getFieldToBeSet(), lsc.getParentObject(), scrapped);
        }
        catch (IllegalAccessException e)
        {
            exceptionLogger.logException(e);
            throw new ModelBindingException(e);
        }
    }
}
/**
 * Deals with a {@link ScrapperException} raised while scrapping a link: it is
 * either wrapped into a {@link LinkException} and thrown, or only logged.
 *
 * @param throwExceptions whether the exception should be thrown or not.
 * @param e the underlying {@link ScrapperException}.
 * @throws LinkException if {@code throwExceptions} is set to true.
 */
@Contract("true, _ -> fail")
private void handleScrapperException(boolean throwExceptions, ScrapperException e) throws LinkException
{
    if (!throwExceptions)
    {
        exceptionLogger.logException(e);
        return;
    }
    throw new LinkException(e);
}
/**
 * Scraps every list of links held by the links follower and sets each
 * resulting list on the field designated by its scrapping context.
 *
 * @param linksFollower the {@link LinksFollower} instance holding resolved links to
 *                      scrap.
 * @throws LinkException if scrapping a link failed and its context requires
 *                       exceptions to be thrown, in which case the original
 *                       exception can be retrieved using {@link Throwable#getCause()}.
 * @throws ModelBindingException if a field setting operation failed due to a POJO reflection
 *                               access failure.
 */
private void scrapAndSetLinkLists(@NotNull final LinksFollower linksFollower)
        throws ModelBindingException, LinkException
{
    for (LinkListScrappingContext llsc : linksFollower.getLinkListsScrappingContexts())
    {
        final List scrappedItems = buildLinkListScraps(llsc);
        try
        {
            WhimtripUtils.setObjectToField(llsc.getFieldToBeSet(), llsc.getParentObject(), scrappedItems);
        }
        catch (IllegalAccessException e)
        {
            exceptionLogger.logException(e);
            throw new ModelBindingException(e);
        }
    }
}
/**
 * Builds the list of POJOs matching a list of links. For each
 * {@link LinkScrappingContext} contained in {@code llsc}, it calls
 * {@link #scrap(LinkScrappingContext)} and appends the result to the new list
 * when it is not null.
 *
 * @param llsc the {@link LinkListScrappingContext} to scrap all links for.
 * @param <U> the type of POJO instances in the list to return.
 * @return a list of {@code U} typed instances freshly scrapped from the
 *         {@link LinkScrappingContext} contained in {@code llsc}; possibly empty.
 * @throws LinkException when a scrapping operation failed and
 *                       {@link LinkScrappingContext#throwExceptions()} returned true.
 */
// NOTE(review): the generic type arguments in this signature and in the loop header
// look truncated ("LinkListScrappingContext, U>"), presumably lost when the listing
// was extracted; restore them from the original source before compiling.
private List buildLinkListScraps(LinkListScrappingContext, U> llsc) throws LinkException {
List uList = new ArrayList<>();
for(LinkScrappingContext, U> lsc : llsc) {
U newObj = null;
try{
newObj = scrap(lsc);
}
catch (ScrapperException e) {
// Throws a LinkException when the context asks for it, otherwise only logs the failure.
handleScrapperException(lsc.throwExceptions(), e);
}
// Failed (logged only) or empty scraps are left out of the resulting list.
if(newObj != null)
uList.add(newObj);
}
return uList;
}
}