All downloads are free. The search and download features use the official Maven repository.

fr.whimtrip.ext.jwhtscrapper.service.scoped.HtmlAutoScrapperImpl Maven / Gradle / Ivy

The newest version!
/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

/*
 * This code is licensed to WhimTrip©. For any question, please contact the author of the file.
 */

package fr.whimtrip.ext.jwhtscrapper.service.scoped;

import fr.whimtrip.core.util.WhimtripUtils;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhthtmltopojo.HtmlToPojoEngine;
import fr.whimtrip.ext.jwhthtmltopojo.exception.HtmlToPojoException;
import fr.whimtrip.ext.jwhthtmltopojo.intrf.HtmlAdapter;
import fr.whimtrip.ext.jwhtscrapper.enm.Action;
import fr.whimtrip.ext.jwhtscrapper.annotation.WarningSign;
import fr.whimtrip.ext.jwhtscrapper.exception.*;
import fr.whimtrip.ext.jwhtscrapper.impl.ScrapperHtmlAdapterFactory;
import fr.whimtrip.ext.jwhtscrapper.intfr.BasicObjectMapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HttpMetrics;
import fr.whimtrip.ext.jwhtscrapper.intfr.LinksFollower;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.HttpManagerClient;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkListScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkScrappingContext;
import org.asynchttpclient.BoundRequestBuilder;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;

import static fr.whimtrip.ext.jwhtscrapper.enm.PausingBehavior.PAUSE_ALL_THREADS;
import static fr.whimtrip.ext.jwhtscrapper.enm.PausingBehavior.PAUSE_CURRENT_THREAD_ONLY;


/**
 * 

Part of project jwht-scrapper

*

Created on 30/07/18

* *

* Default implementation of {@link HtmlAutoScrapper}. As stated in the interface * javadoc, we implemented it with all given recommendations including : *

* *
    *
  • * {@link HttpWithProxyManagerClient} *
  • *
  • * {@link LinksFollowerImpl} *
  • *
  • * {@link HtmlToPojoEngine} with {@link ScrapperHtmlAdapterFactory}. *
  • *
  • * {@link BasicObjectMapper} is accepted. *
  • *
* * @see HtmlAutoScrapper * @author Louis-wht * @since 1.0.0 */ public final class HtmlAutoScrapperImpl implements HtmlAutoScrapper { private static final Logger log = LoggerFactory.getLogger(HtmlAutoScrapperImpl.class); private final HttpManagerClient httpManagerClient; private final BasicObjectMapper objectMapper; private final BoundRequestBuilderProcessor boundRequestBuilderProcessor; private final HtmlAdapter htmlAdapter; private final HtmlToPojoEngine htmlToPojoEngine; private final ExceptionLogger exceptionLogger; private final Class persistentClass; private final int warningSignDelay; private final boolean followRedirections; private final AtomicBoolean scrapStopped; private WarningSignException lastThrownWarningSignException; /** *

Default Constructor

* * @param exceptionLogger the exception logger that will be used by both the * {@link HttpWithProxyManagerClient} and the {@link HtmlAutoScrapper} * @param htmlToPojoEngine the core html to pojo engine allowing us to parse * HTML input to java POJOs. * @param objectMapper the object mapper to use for mapping differently formatted * strings. * @param boundRequestBuilderProcessor the request processor used for headers, * cookies etc modfications as well as other * eventual use cases. * @param httpManagerClient the {@link HttpWithProxyManagerClient} that will be used under the * hood by the {@link HtmlAutoScrapper}.
* * @param clazz the class to map resulting outputs to.
* * * @param followRedirections wether HTTP redirections should be followed * or not (HTTP redirections is valid if status * code is {@code 301} or {@code 302} and when * the {@code Location} header is not empty.
* * @param warningSignDelay delay before retrying any action in the case * a {@link WarningSign} was triggered and only if it * was set to {@link Action#RETRY}.
* @param scrapStopped shared atomic boolean indicating if the current scrap * process is stopped or not. */ public HtmlAutoScrapperImpl( HttpManagerClient httpManagerClient, HtmlToPojoEngine htmlToPojoEngine, BoundRequestBuilderProcessor boundRequestBuilderProcessor, BasicObjectMapper objectMapper, ExceptionLogger exceptionLogger, Class clazz, boolean followRedirections, int warningSignDelay, AtomicBoolean scrapStopped ) { this.httpManagerClient = httpManagerClient; this.boundRequestBuilderProcessor = boundRequestBuilderProcessor; this.objectMapper = objectMapper; this.exceptionLogger = exceptionLogger; this.persistentClass = clazz; htmlAdapter = htmlToPojoEngine.adapter(persistentClass); this.htmlToPojoEngine = htmlToPojoEngine; this.warningSignDelay = warningSignDelay; this.followRedirections = followRedirections; this.scrapStopped = scrapStopped; } /** * {@inheritDoc} */ @NotNull @Override public T scrap(@NotNull final BoundRequestBuilder req, @Nullable final T obj) throws ModelBindingException, LinkException, WarningSignException { return scrap(req, obj, htmlAdapter, followRedirections, true); } /** * {@inheritDoc} */ @NotNull @Override public T scrapPost(@NotNull final String url, @Nullable final Map fields) throws ModelBindingException, LinkException, WarningSignException { return scrap(prepareScrapPost(url, fields)); } /** * {@inheritDoc} */ @NotNull @Override public T scrapGet(@NotNull final String url) throws ModelBindingException, LinkException, WarningSignException { return scrap(prepareScrapGet(url)); } /** * {@inheritDoc} */ @NotNull @Override public BoundRequestBuilder prepareScrapPost(@NotNull final String url, @Nullable final Map fields) { BoundRequestBuilder req = httpManagerClient.preparePost(url); if(fields != null) for (Map.Entry field : fields.entrySet()) req.addFormParam(field.getKey(), field.getValue().toString()); return req; } /** * {@inheritDoc} */ @NotNull @Override public BoundRequestBuilder prepareScrapGet(@NotNull final String url) { 
return httpManagerClient.prepareGet(url); } /** * {@inheritDoc} */ @NotNull @Override public HttpMetrics getHttpMetrics() throws ScrapperUnsupportedException { return httpManagerClient.getHttpMetrics(); } /** *

* This provide the core private method that will perform scrapping * related tasks. It conforms to all recommandations and contracts * stipulated in {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. *

* @see HtmlAutoScrapper#scrap(BoundRequestBuilder, Object) * @param req the prepared {@link BoundRequestBuilder} * @param obj the object to map the resulting scrap to. * @param adapter the {@link HtmlAdapter} to use to map the resulting * HTML body to a POJO. if the {@link BasicObjectMapper} * is used instead, the adapter will still be used to perform * field injection, links following and warning sign triggering. * @param followRedirections wether HTTP redirections should be followed or not. * @param the type of the POJO to map it to. This inner method can be called * recursively for links scrapping with other POJOs type. this explain * why {@code T} is not used here. * @return the scrapped and ready to use POJO instance. * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. */ private U scrap( @NotNull BoundRequestBuilder req, @Nullable U obj, @NotNull final HtmlAdapter adapter, final boolean followRedirections ) throws ModelBindingException, LinkException, WarningSignException { return scrap(req, obj, adapter, followRedirections, false); } /** *

* This provide the core private method that will perform scrapping * related tasks. It conforms to all recommandations and contracts * stipulated in {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. *

* @see HtmlAutoScrapper#scrap(BoundRequestBuilder, Object) * @param req the prepared {@link BoundRequestBuilder} * @param obj the object to map the resulting scrap to. * @param adapter the {@link HtmlAdapter} to use to map the resulting * HTML body to a POJO. if the {@link BasicObjectMapper} * is used instead, the adapter will still be used to perform * field injection, links following and warning sign triggering. * @param followRedirections wether HTTP redirections should be followed or not. * @param the type of the POJO to map it to. This inner method can be called * recursively for links scrapping with other POJOs type. this explain * why {@code T} is not used here. * @return the scrapped and ready to use POJO instance. * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. */ @SuppressWarnings("unchecked") private U scrap( @NotNull BoundRequestBuilder req, @Nullable U obj, @NotNull final HtmlAdapter adapter, final boolean followRedirections, final boolean parentCall ) throws ModelBindingException, LinkException, WarningSignException { Class mappedClazz = obj == null ? 
(Class) persistentClass : (Class) obj.getClass(); String rawResponse = null; while(scrapStopped.get()){ try { Thread.sleep(warningSignDelay / 10); } catch (InterruptedException e) { exceptionLogger.logException(e); } } rawResponse = httpManagerClient.getResponse(req, followRedirections); try { obj = buildObject(obj, adapter, mappedClazz, rawResponse); resolveLinks(obj, adapter); return obj; } catch (WarningSignActualScrapStoppedException e) { // this will propagate up to the parent scrap call which will return // untouched object if(!parentCall) throw e; return obj; } catch(WarningSignException e) { return handleWarningSign(req, obj, adapter, followRedirections, parentCall, e); } catch (IOException | HtmlToPojoException e) { throw new ModelBindingException(e); } finally { // removing the http manager client request context to avoid memory // overloading httpManagerClient.removeContext(req); } } /** *

* This provide the core private method that will handle a warning sign and * the actions to be taken when triggered. *

* @param req the prepared {@link BoundRequestBuilder} * @param obj the object to map the resulting scrap to. * @param adapter the {@link HtmlAdapter} to use to map the resulting * HTML body to a POJO. if the {@link BasicObjectMapper} * is used instead, the adapter will still be used to perform * field injection, links following and warning sign triggering. * @param followRedirections wether HTTP redirections should be followed or not. * @param e the warning exception triggered. * @param the type of the POJO to map it to. This inner method can be called * recursively for links scrapping with other POJOs type. this explain * why {@code T} is not used here. * @return the scrapped and ready to use POJO instance. * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. */ private U handleWarningSign( @NotNull BoundRequestBuilder req, @Nullable U obj, @NotNull HtmlAdapter adapter, boolean followRedirections, boolean parentCall, WarningSignException e ) throws ModelBindingException, LinkException, WarningSignException { log.warn("A warning sign was triggered! {}", e.getMessage()); boundRequestBuilderProcessor.printReq(req); if( e.getPausingBehavior() == PAUSE_ALL_THREADS || e.getPausingBehavior() == PAUSE_CURRENT_THREAD_ONLY) { if (e.getPausingBehavior() == PAUSE_ALL_THREADS) { scrapStopped.set(false); lastThrownWarningSignException = e; } WhimtripUtils.waitFor((long) warningSignDelay, log, 20); // this is to ensure that the same pausing threads is the one that is // commanding the scrapping to stop. 
if(e.getPausingBehavior() == PAUSE_ALL_THREADS && e == lastThrownWarningSignException) scrapStopped.set(true); } if(e.getAction() == Action.THROW_EXCEPTION) { log.warn("Current scrap handled a fatal error which shouldn't lead to further scrapping for that object"); throw e; } if(e.getAction() == Action.STOP_ACTUAL_SCRAP) { log.warn("Current object shouldn't be further scrapped"); // this will propagate up to the parent scrap call which will return // untouched object throw new WarningSignActualScrapStoppedException(e); } if(e.getAction() == Action.RETRY) { return scrap(req, obj, adapter, followRedirections, parentCall); } // Action.NONE -> Won't do nothing, scrap will continue but not on the current // POJO branching. return obj; } /** *

* Will call {@link #scrap(BoundRequestBuilder, Object, HtmlAdapter, boolean)} * with all context parameters given by an {@link LinkScrappingContext}. *

* @param lsc the {@link LinkScrappingContext} to use to perform the scrap operation. * @param the type of POJO instance it should return. * @return the corresponding {@code U} pojo instance. * @throws ModelBindingException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws LinkException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. * @throws WarningSignException see {@link HtmlAutoScrapper#scrap(BoundRequestBuilder, Object)}. */ private U scrap(LinkScrappingContext lsc) throws ModelBindingException, LinkException, WarningSignException { return scrap(lsc.getBoundRequestBuilder(), lsc.getNewObj(), lsc.getAdapter(), lsc.followRedirections()); } /** *

* This method will build String body to a POJO using either * {@link HtmlAdapter} to map HTML or {@link BasicObjectMapper} * to map any other input format to POJO with any other rules it * might imply. *

* @param obj the object to build the scrapping result to. Might be null. * @param adapter the adapter to use. Might be null if {@link BasicObjectMapper} * is provided instead. * @param mappedClazz the clazz to map the body to. * @param rawResponse the raw String body response from the HTTP scrapping request. * @param the type of POJO to map it to. * @return fully built and scrapped {@code U} POJO instance * @throws IOException if Object binding didn't work as expected with {@link BasicObjectMapper}. * @throws HtmlToPojoException if Object binding failed with {@link HtmlAdapter}. */ private U buildObject( @Nullable U obj, @Nullable final HtmlAdapter adapter, @Nullable final Class mappedClazz, @NotNull final String rawResponse ) throws IOException, HtmlToPojoException { if (obj == null) { obj = objectMapper == null ? adapter.fromHtml(rawResponse) : objectMapper.readValue(rawResponse, mappedClazz); } else { obj = objectMapper == null ? adapter.fromHtml(rawResponse, obj) : objectMapper.readValue(rawResponse, mappedClazz, obj); } return obj; } /** *

* This method will handle all link following tasks related. * This includes three main steps: *

*
    *
  • * Resolving all links to be searched and polled using {@link LinksFollowerImpl} * default implementation of {@link LinksFollower}. *
  • *
  • * Scrap those links using {@link #scrap(LinkScrappingContext)} method. *
  • *
  • * Set the resulting the values to the field. *
  • *
* *

* Additionally Link lists and single links are handled separately because * list of links requires to first instanciate a list to append every link * entry scrap result to. *

* @param obj the object to resolve links for. * @param adapter the {@link HtmlAdapter} to use to analyse the links to further scrap. * @param the type of the POJO instance to scrap links for. * @throws LinkException when thrown by underlying {@link LinksFollower#resolveBasicLinks()} * or if a scrapping exception when scrapping the links in which case * the original exception can be retrieved using {@link Throwable#getCause()}. * @throws ModelBindingException if any field setting operation failed due to POJO reflection * access failure in which case the execption should be corrected * before starting the scrapping once again. */ private void resolveLinks(@NotNull final U obj, @NotNull final HtmlAdapter adapter) throws LinkException, ModelBindingException{ LinksFollower linksFollower = new LinksFollowerImpl(httpManagerClient, htmlToPojoEngine, exceptionLogger, boundRequestBuilderProcessor, obj, adapter); linksFollower.resolveBasicLinks(); scrapAndSetLinkLists(linksFollower); scrapAndSetBasicLinks(linksFollower); } /** *

* This method will start a simple Link scrap and set the value to the * corresponding field. *

* @param linksFollower the {@link LinksFollower} instance holding resolved links to * scrap. * @throws LinkException when thrown by underlying {@link LinksFollower#resolveBasicLinks()} * or if a scrapping exception when scrapping the links in which case * the original exception can be retrieved using {@link Throwable#getCause()}. * @throws ModelBindingException if any field setting operation failed due to POJO reflection * access failure in which case the execption should be corrected * before starting the scrapping once again. */ private void scrapAndSetBasicLinks(@NotNull final LinksFollower linksFollower) throws ModelBindingException, LinkException { for(LinkScrappingContext lsc : linksFollower.getScrappingContexts()) { Object newObj = null; try { newObj = scrap(lsc); } catch (ScrapperException e) { handleScrapperException(lsc.throwExceptions(), e); } if(newObj != null) { try { WhimtripUtils.setObjectToField(lsc.getFieldToBeSet(), lsc.getParentObject(), newObj); } catch (IllegalAccessException e) { exceptionLogger.logException(e); throw new ModelBindingException(e); } } } } /** *

Will handle Scrapper Exception and turn them into {@link ScrapperException}

* @param throwExceptions wether the exception should be thrown or not. * @param e the underlying {@link ScrapperException} * @throws LinkException if {@code throwExceptions} is set to true. */ @Contract("true, _ -> fail") private void handleScrapperException(boolean throwExceptions, ScrapperException e) throws LinkException { if(throwExceptions) throw new LinkException(e); exceptionLogger.logException(e); } /** *

* This method will start a link list scrap and set the resulting list * value to the corresponding field. *

* @param linksFollower the {@link LinksFollower} instance holding resolved links to * scrap. * @throws LinkException when thrown by underlying {@link LinksFollower#resolveBasicLinks()} * or if a scrapping exception when scrapping the links in which case * the original exception can be retrieved using {@link Throwable#getCause()}. * @throws ModelBindingException if any field setting operation failed due to POJO reflection * access failure in which case the execption should be corrected * before starting the scrapping once again. */ private void scrapAndSetLinkLists(@NotNull final LinksFollower linksFollower) throws ModelBindingException, LinkException { for(LinkListScrappingContext llsc : linksFollower.getLinkListsScrappingContexts()) { List ulist = buildLinkListScraps(llsc); try { WhimtripUtils.setObjectToField(llsc.getFieldToBeSet(), llsc.getParentObject(), ulist); } catch (IllegalAccessException e) { exceptionLogger.logException(e); throw new ModelBindingException(e); } } } /** *

* The Link List to POJO List builder method. For each {@link LinkListScrappingContext} * of the origin list, it will simply call {@link #scrap(LinkScrappingContext)} * and then add the resulting value if not null to the new list being created. *

* @param llsc the {@link LinkListScrappingContext} to scrap all links for. * @param the type of POJO instances in the list to return. * @return a list of {@code U} typed instances freshly scrapped from the * {@link LinkScrappingContext} contained in {@code llsc}. * @throws LinkException when a scrapping operation failed if {@link LinkScrappingContext#throwExceptions()} * returned true. */ private List buildLinkListScraps(LinkListScrappingContext llsc) throws LinkException { List uList = new ArrayList<>(); for(LinkScrappingContext lsc : llsc) { U newObj = null; try{ newObj = scrap(lsc); } catch (ScrapperException e) { handleScrapperException(lsc.throwExceptions(), e); } if(newObj != null) uList.add(newObj); } return uList; } }