/*
 * All Downloads are FREE. Search and download functionalities are using the official Maven repository.
 *
 * fr.whimtrip.ext.jwhtscrapper.service.scoped.LinksFollowerImpl Maven / Gradle / Ivy
 *
 * The newest version!
 */
package fr.whimtrip.ext.jwhtscrapper.service.scoped;

import com.fasterxml.jackson.databind.ObjectMapper;
import fr.whimtrip.core.util.WhimtripUtils;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhthtmltopojo.HtmlToPojoEngine;
import fr.whimtrip.ext.jwhthtmltopojo.adapter.HtmlToPojoAnnotationMap;
import fr.whimtrip.ext.jwhthtmltopojo.intrf.HtmlAdapter;
import fr.whimtrip.ext.jwhtscrapper.annotation.HasLink;
import fr.whimtrip.ext.jwhtscrapper.annotation.Link;
import fr.whimtrip.ext.jwhtscrapper.annotation.LinkListsFromBuilder;
import fr.whimtrip.ext.jwhtscrapper.annotation.LinkObject;
import fr.whimtrip.ext.jwhtscrapper.annotation.LinkObjects;
import fr.whimtrip.ext.jwhtscrapper.enm.Method;
import fr.whimtrip.ext.jwhtscrapper.exception.LinkClassCastException;
import fr.whimtrip.ext.jwhtscrapper.exception.LinkException;
import fr.whimtrip.ext.jwhtscrapper.exception.LinkListCastException;
import fr.whimtrip.ext.jwhtscrapper.exception.NullLinkException;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HttpRequestEditor;
import fr.whimtrip.ext.jwhtscrapper.intfr.LinkListFactory;
import fr.whimtrip.ext.jwhtscrapper.intfr.LinksFollower;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.HttpManagerClient;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkListScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkPreparatorHolder;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.PostField;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.req.RequestUtils;
import org.asynchttpclient.BoundRequestBuilder;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.Field;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.util.*;
import java.util.regex.Pattern;

import static fr.whimtrip.ext.jwhtscrapper.annotation.Link.DEFAULT_REGEX_COND;

/**
 * 

Part of project jwht-scrapper

*

Created on 29/07/18

* *

* Default implementation of {@link LinksFollower} interface that * respect all of its contracts and prerequisite. *

* * @see LinksFollower * @author Louis-wht * @since 1.0.0 */ public final class LinksFollowerImpl implements LinksFollower { private static final Logger log = LoggerFactory.getLogger(LinksFollowerImpl.class); private final HttpManagerClient httpManagerClient; private final HtmlToPojoEngine htmlToPojoEngine; private final BoundRequestBuilderProcessor requestProcessor; private final Object model; private final HtmlAdapter adapter; private final ExceptionLogger exceptionLogger; private final List scrappingContexts = new ArrayList<>(); private final List linkListsScrappingContexts = new ArrayList<>(); private final Map mappedLinkedListsScrappingContexts = new HashMap<>(); private boolean resolved = false; /** * *

Default Public constructor of this class.

* @param httpManagerClient the {@link HttpManagerClient} to be used to prepare * {@link BoundRequestBuilder} as one of the last steps * of the links following process, using method * {@link HttpManagerClient#prepareGet(String)} or * {@link HttpManagerClient#preparePost(String)}.
* @param htmlToPojoEngine the htmlToPojoEngine that will be used here only * to perform POJO class fields analysis. The mapping * will be triggered from the {@link HtmlAutoScrapper} * and therefore still support custom {@link ObjectMapper} * implementation for other input formats conversion.
* @param exceptionLogger the {@link ExceptionLogger} to use to log potentially thrown * and catched exceptions if any.
* @param requestProcessor the {@link BoundRequestBuilderProcessor} to be used with * {@link HttpRequestEditor#editRequest(BoundRequestBuilder, LinkPreparatorHolder, BoundRequestBuilderProcessor)} * @param model the original parent model to assign fields to.
* @param adapter the parent POJO adapter that will also be used to retrieve * child POJO adapters.
* @param

the type of both the parent model and the parent adapter. */ public

LinksFollowerImpl( @NotNull final HttpManagerClient httpManagerClient, @NotNull final HtmlToPojoEngine htmlToPojoEngine, @NotNull final ExceptionLogger exceptionLogger, @NotNull final BoundRequestBuilderProcessor requestProcessor, @NotNull final P model, @NotNull final HtmlAdapter

adapter ){ this.httpManagerClient = httpManagerClient; this.htmlToPojoEngine = htmlToPojoEngine; this.exceptionLogger = exceptionLogger; this.requestProcessor = requestProcessor; this.model = model; this.adapter = adapter; } /** * {@inheritDoc} */ @Override @SuppressWarnings("unchecked") public void resolveBasicLinks() throws LinkException { if(resolved) return; resolved = true; resolveBasicLinks(model, adapter); linkListsScrappingContexts.addAll(mappedLinkedListsScrappingContexts.values()); } /** * {@inheritDoc} */ @Override @NotNull public List getScrappingContexts() { return scrappingContexts; } /** * {@inheritDoc} */ @Override @NotNull public List getLinkListsScrappingContexts() { return linkListsScrappingContexts; } /** *

* This is the core method of this implementation basically doing all the * work to scan links through and prepare them. This method requires the * two additional parameters because it is supposed to be used recursively * for child pojos which will require a new {@link HtmlAdapter} with a new * parent POJO instance. *

*

* This method will basically just call {@link #resolveChildPojosLinks(Object, List)}, * {@link #resolveListLinks(Object, List)} and {@link #followLinks(List, Object, HtmlAdapter)} * method to respect the three key points of the {@link LinksFollower} contract mentionned * here {@link LinksFollower#resolveBasicLinks()}. *

* @param model the parent POJO instance to search links for. * @param adapter the {@link HtmlAdapter} for the model POJO. * @param

the POJO type of {@code model}. * @throws LinkException when one link could not be properly resolved and prepared. * This will only happend if {@code throwExceptions} was * enabled in any underlying {@link LinkScrappingContext}. * Otherwise, the exception will be logged within the furnished * {@link ExceptionLogger}. */ private

void resolveBasicLinks( @NotNull final P model, @NotNull final HtmlAdapter

adapter ) throws LinkException { if(log.isInfoEnabled()) log.info("Resolving basic links for model type {}.",model.getClass()); List> links = adapter.getFieldList(Link.class); List> hasLinks = adapter.getFieldList(HasLink.class); List> linkListsFromBuilders = adapter.getFieldList(LinkListsFromBuilder.class); if(hasLinks != null) resolveChildPojosLinks(model, hasLinks); if(linkListsFromBuilders != null) resolveListLinks(model, linkListsFromBuilders); if(links != null) followLinks(links, model, adapter); } /** * Resolve the {@link LinkListsFromBuilder} annotations of the current POJO. * @param model the model to scan {@link LinkListsFromBuilder} annotations for. * @param linkListsFromBuilders the List of {@link HtmlToPojoAnnotationMap} for * {@link LinkListsFromBuilder} currently retrieved. * @param

the Parent POJO type. * @param the Child Pojo type of the list to create and populate. */ @SuppressWarnings("unchecked") private void resolveListLinks(P model, List> linkListsFromBuilders) { for(HtmlToPojoAnnotationMap linkList : linkListsFromBuilders) { if(List.class.isAssignableFrom(linkList.getField().getType())) { Type genericType = linkList.getField().getGenericType(); Type type = ((ParameterizedType) genericType).getActualTypeArguments()[0]; LinkListScrappingContext ulist = followLinkLists(linkList, model, htmlToPojoEngine.adapter((Class) type)); linkListsScrappingContexts.add(ulist); } else throw new LinkListCastException(linkList.getField()); } } /** *

* This method will scan a POJO searching for {@link HasLink} annotation on top of a * child POJO or list of child POJO typed field. If found, it will recursively call the * {@link #resolveBasicLinks(Object, HtmlAdapter)} method for each of those POJOs. *

*

* Depending on the field being a POJO typed field or a list of POJO typed field, * it will either be directly pre-processed (POJO typed field) or pre-processed * in a for loop for each single element of the original collection. *

* @param model the parent model POJO instance to scan for {@link HasLink} fields. * @param hasLinks the list of {@link HtmlToPojoAnnotationMap} containing {@link HasLink} * annotations already retrieved. * @param

the Parent POJO type. * @param the Child Pojo type. This type will be inferred in the for loop and * might therefore represent several different types during this method * call. * @throws LinkException if any underlying {@link #resolveBasicLinks(Object, HtmlAdapter)} * call itself throws a {@link LinkException}. */ @SuppressWarnings("unchecked") private void resolveChildPojosLinks(P model, List> hasLinks) throws LinkException { for(HtmlToPojoAnnotationMap hasLink : hasLinks) { if(Collection.class.isAssignableFrom(hasLink.getField().getType())) { try { Collection list = WhimtripUtils.getObjectFromField(hasLink.getField(), model); for(U element : list) { resolveBasicLinks(element, (HtmlAdapter) htmlToPojoEngine.adapter(element.getClass())); } } catch(IllegalAccessException e) { e.printStackTrace(); throw new LinkException(e); } } else { try { U element = WhimtripUtils.getObjectFromField(hasLink.getField(), model); resolveBasicLinks(element,(HtmlAdapter) htmlToPojoEngine.adapter(element.getClass())); } catch(IllegalAccessException e) { e.printStackTrace(); throw new LinkException(e); } } } } /** *

* This method will follow direct links of a POJO annotated with {@link Link} for * the String typed fields containing the URL to poll, and {@link LinkObject} * or {@link LinkObjects} for the fields to populate with the future resulting * scrap. *

* @param links the raw {@link HtmlToPojoAnnotationMap} for {@link Link} annotated fields * to further analyse. * @param model the parent POJO instance to map resulting values to. * @param adapter the parent POJO class {@link HtmlAdapter} * @param

the Parent POJO type. * @param the Child Pojo type. This type will be inferred in the for loop and * might therefore represent several different types during this method * call. * @throws LinkException if any of the {@link Link} annotated fields contains a * null value {@link NullLinkException}, isn't a String * typed field {@link LinkClassCastException} or if it cannot * be retrieved using reflection. */ @SuppressWarnings("unchecked") private void followLinks( @NotNull final List> links, @NotNull final U model, @NotNull final HtmlAdapter adapter ) throws LinkException { for (HtmlToPojoAnnotationMap link : links) { HtmlToPojoAnnotationMap objFieldMap = ((ScrapperHtmlAdapter

) adapter).getLinkObject(link); if (objFieldMap == null) throw new LinkException(link.getField()); Field objField = objFieldMap.getField(); String url = getLinkUrl(link, model); if(checkRegexCondition(link, url)) { boolean isListField = objFieldMap.getAnnotation() instanceof LinkObjects; HtmlAdapter newFieldAdapter = (HtmlAdapter) htmlToPojoEngine.adapter( isListField ? WhimtripUtils.getClassFromListField(objField) : objField.getType() ); boolean editRequest = link.getAnnotation().editRequest(); LinkPreparatorHolder container = new LinkPreparatorHolder( model, url, link.getAnnotation().method(), RequestUtils.buildFields(link.getAnnotation().fields()), objField, link.getAnnotation().requestEditor(), link.getAnnotation().followRedirections(), link.getAnnotation().throwExceptions() ); HttpRequestEditor requestEditor = null; if (editRequest) { requestEditor = (HttpRequestEditor) (WhimtripUtils.createNewInstance(container.getRequestEditorClazz())); requestEditor.init(container.getParentField()); } LinkScrappingContext lsc = buildContext(container, newFieldAdapter, requestEditor, editRequest); if(!isListField) { if(lsc != null) scrappingContexts.add(lsc); } else { LinkListScrappingContext llsc = mappedLinkedListsScrappingContexts .computeIfAbsent( objField, field -> new LinkListScrappingContext(field, model) ); llsc.add(lsc); } } } } /** *

* This method will check if the {@link Link#regexCondition()} is met * by the annotated field parsed and casted value. *

* @param link the {@link Link} annotated {@link HtmlToPojoAnnotationMap} field * to analyse. * @param linkVal the retrieved field value to check regex against. * @return a boolean indicating if the {@link Link#regexCondition()} is met or not. */ @Contract("_, null -> false") private boolean checkRegexCondition(HtmlToPojoAnnotationMap link, String linkVal){ if(linkVal == null) return false; if(link.getAnnotation().regexCondition().equals(DEFAULT_REGEX_COND)) return true; Pattern pattern = Pattern.compile(link.getAnnotation().regexCondition()); return pattern.matcher(linkVal).find(); } /** *

* Gather the link value using reflection. *

* @param link the {@link Link} annotated {@link HtmlToPojoAnnotationMap} field * to analyse. * @param model the model to extract stringified link from. * @param the type of the model to extract stringified link from. * @return the url retrieved from the corresponding {@link Link} annotated field. * @throws LinkException if any of the {@link Link} annotated fields contains a * null value {@link NullLinkException}, isn't a String * typed field {@link LinkClassCastException} or if it cannot * be retrieved using reflection. */ @Nullable private String getLinkUrl(@NotNull final HtmlToPojoAnnotationMap link, @NotNull final U model) throws LinkException { String linkVal = null; try{ Object linkRawVal = WhimtripUtils.getObjectFromField(link.getField(), model); if(linkRawVal == null) throw new NullLinkException(link.getField()); if(!(linkRawVal instanceof String)) throw new LinkClassCastException(link.getField()); linkVal = (String) linkRawVal; } catch(IllegalAccessException e) { throw new LinkException(e); } catch (LinkException e) { if(link.getAnnotation().throwExceptions()) throw e; exceptionLogger.logException(e); } return linkVal; } /** *

* This method will follow and prepare the {@link LinkListsFromBuilder} * annotated fields. *

* @param links the list of {@link LinkListsFromBuilder} annotated * {@link HtmlToPojoAnnotationMap} fields to analyse. * @param parent the parent POJO instance to assign scrap results to. * @param adapter the {@link HtmlAdapter} of type {@code U}. * @param

the Parent POJO type. * @param the child POJO type in the List to be created and assigned * to {@code parent} POJO instance. * @return the resulting {@link LinkListScrappingContext} built. */ @NotNull @SuppressWarnings("unchecked") private LinkListScrappingContext followLinkLists( @NotNull final HtmlToPojoAnnotationMap links, @NotNull final P parent, @NotNull final HtmlAdapter adapter ){ LinkListFactory

listFactory = WhimtripUtils.createNewInstance(links.getAnnotation().value()); LinkListScrappingContext ulist = new LinkListScrappingContext<>(links.getField(), parent); List containers = listFactory.createLinkPreparatorLists(parent, links.getField()); HttpRequestEditor requestEditor = null; for(LinkPreparatorHolder cntn : containers) { boolean editRequest = links.getAnnotation().editRequest(); if (editRequest && requestEditor == null) { requestEditor = (HttpRequestEditor) (WhimtripUtils.createNewInstance(cntn.getRequestEditorClazz())); requestEditor.init(cntn.getParentField()); } LinkScrappingContext newObjCntx = (LinkScrappingContext) buildContext(cntn, adapter, requestEditor, editRequest); if (newObjCntx != null) ulist.add(newObjCntx); } return ulist; } /** *

* This method will build a single {@link LinkScrappingContext} for * a single link to scrap. It will be called under the hood by both * {@link #followLinks(List, Object, HtmlAdapter)} and * {@link #followLinkLists(HtmlToPojoAnnotationMap, Object, HtmlAdapter)}. *

* @param container the {@link LinkPreparatorHolder} to use to prepare and * build {@link LinkScrappingContext}. * @param adapter the child {@link HtmlAdapter} to use by the {@link HtmlAutoScrapper} * to parse and scrap the {@code U} typed child to be scrapped element. * @param requestEditor the {@link HttpRequestEditor} instance to further prepare the * {@link BoundRequestBuilder} and {@code U} typed yet to come child * object. * @param editRequest wether the request should or shouldn't be edited. * @param

the parent POJO type. * @param the child POJO type. * @return the built and prepared. and ready to use {@link LinkScrappingContext}. */ @Nullable private LinkScrappingContext buildContext( @NotNull final LinkPreparatorHolder

container, @NotNull final HtmlAdapter adapter, @Nullable final HttpRequestEditor requestEditor, final boolean editRequest ){ if (!editRequest || (requestEditor != null && requestEditor.shouldDoRequest(container.getParent()))) { U newObj = adapter.createNewInstance(container.getParent()); BoundRequestBuilder req = buildReq(container, requestEditor, newObj, editRequest); return new LinkScrappingContext<>( req, newObj, adapter, container.getParentField(), container.getParent(), container.followRedirections(), container.throwExceptions() ); } return null; } /** *

* Inner method used by {@link #buildContext(LinkPreparatorHolder, HtmlAdapter, HttpRequestEditor, boolean)} * to prepare the {@link BoundRequestBuilder} to use for the {@link LinkScrappingContext}. *

* * @param container the {@link LinkPreparatorHolder} to use to prepare and * build {@link LinkScrappingContext}. * @param requestEditor the {@link HttpRequestEditor} instance to further prepare the * {@link BoundRequestBuilder} and {@code U} typed yet to come child * object. * @param editRequest wether the request should or shouldn't be edited. * @param newObj the instanciated new child object. (With handled POJO injection). * @param

the parent POJO type. * @param the child POJO type. * @return the new and prepared {@link BoundRequestBuilder} to be used for the * scrapping operation of the {@link HtmlAutoScrapper}. */ @NotNull private BoundRequestBuilder buildReq( @NotNull final LinkPreparatorHolder

container, @Nullable final HttpRequestEditor requestEditor, @Nullable final U newObj, final boolean editRequest ){ if (editRequest && requestEditor != null) requestEditor.prepareObject(newObj, container.getParent(), container); BoundRequestBuilder req; if (container.getMethod() == Method.GET) { req = httpManagerClient.prepareGet(container.getUrl()); } else { req = httpManagerClient.preparePost(container.getUrl()); for(PostField field: container.getFields()) { req.addFormParam(field.getName(), field.getValue()); } } if (editRequest && requestEditor != null) requestEditor.editRequest(req, container, requestProcessor); return req; } }