// ![JAR search and dependency download from the Maven repository](/logo.png)
// fr.whimtrip.ext.jwhtscrapper.service.scoped.LinksFollowerImpl Maven / Gradle / Ivy
// Show all versions of whimtrip-ext-scrapper Show documentation
package fr.whimtrip.ext.jwhtscrapper.service.scoped;
import com.fasterxml.jackson.databind.ObjectMapper;
import fr.whimtrip.core.util.WhimtripUtils;
import fr.whimtrip.core.util.intrf.ExceptionLogger;
import fr.whimtrip.ext.jwhthtmltopojo.HtmlToPojoEngine;
import fr.whimtrip.ext.jwhthtmltopojo.adapter.HtmlToPojoAnnotationMap;
import fr.whimtrip.ext.jwhthtmltopojo.intrf.HtmlAdapter;
import fr.whimtrip.ext.jwhtscrapper.annotation.HasLink;
import fr.whimtrip.ext.jwhtscrapper.annotation.Link;
import fr.whimtrip.ext.jwhtscrapper.annotation.LinkListsFromBuilder;
import fr.whimtrip.ext.jwhtscrapper.annotation.LinkObject;
import fr.whimtrip.ext.jwhtscrapper.annotation.LinkObjects;
import fr.whimtrip.ext.jwhtscrapper.enm.Method;
import fr.whimtrip.ext.jwhtscrapper.exception.LinkClassCastException;
import fr.whimtrip.ext.jwhtscrapper.exception.LinkException;
import fr.whimtrip.ext.jwhtscrapper.exception.LinkListCastException;
import fr.whimtrip.ext.jwhtscrapper.exception.NullLinkException;
import fr.whimtrip.ext.jwhtscrapper.intfr.HtmlAutoScrapper;
import fr.whimtrip.ext.jwhtscrapper.intfr.HttpRequestEditor;
import fr.whimtrip.ext.jwhtscrapper.intfr.LinkListFactory;
import fr.whimtrip.ext.jwhtscrapper.intfr.LinksFollower;
import fr.whimtrip.ext.jwhtscrapper.service.base.BoundRequestBuilderProcessor;
import fr.whimtrip.ext.jwhtscrapper.service.base.HttpManagerClient;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkListScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkPreparatorHolder;
import fr.whimtrip.ext.jwhtscrapper.service.holder.LinkScrappingContext;
import fr.whimtrip.ext.jwhtscrapper.service.holder.PostField;
import fr.whimtrip.ext.jwhtscrapper.service.scoped.req.RequestUtils;
import org.asynchttpclient.BoundRequestBuilder;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.reflect.Field;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.util.*;
import java.util.regex.Pattern;
import static fr.whimtrip.ext.jwhtscrapper.annotation.Link.DEFAULT_REGEX_COND;
/**
* Part of project jwht-scrapper
* Created on 29/07/18
*
*
* Default implementation of the {@link LinksFollower} interface that
* respects all of its contracts and prerequisites.
*
*
* @see LinksFollower
* @author Louis-wht
* @since 1.0.0
*/
public final class LinksFollowerImpl implements LinksFollower {
/** Class-wide SLF4J logger. */
private static final Logger log = LoggerFactory.getLogger(LinksFollowerImpl.class);

/** Client used to prepare the GET / POST {@link BoundRequestBuilder} of each followed link. */
private final HttpManagerClient httpManagerClient;

/** Engine used to look up the {@link HtmlAdapter} of child POJO classes. */
private final HtmlToPojoEngine htmlToPojoEngine;

/** Processor handed over to {@link HttpRequestEditor} instances when they edit a request. */
private final BoundRequestBuilderProcessor requestProcessor;

/**
 * Root model whose links are resolved, and its adapter.
 * NOTE(review): stored as {@code Object} / raw {@code HtmlAdapter} because this class
 * declares no type parameter of its own; only the constructor ties the two together.
 */
private final Object model;
private final HtmlAdapter adapter;

/** Receives link exceptions that must be logged rather than rethrown. */
private final ExceptionLogger exceptionLogger;

/** Contexts built for single-object links. */
private final List<LinkScrappingContext> scrappingContexts = new ArrayList<>();

/** Contexts built for list links; the contexts of the map below are appended once resolution ends. */
private final List<LinkListScrappingContext> linkListsScrappingContexts = new ArrayList<>();

/** List contexts keyed by the target field they populate, so links sharing a field share a context. */
private final Map<Field, LinkListScrappingContext> mappedLinkedListsScrappingContexts = new HashMap<>();

/** Set once {@link #resolveBasicLinks()} has been triggered; later calls become no-ops. */
private boolean resolved = false;
/**
 * Default public constructor of this class.
 *
 * @param httpManagerClient the {@link HttpManagerClient} to be used to prepare
 *                          {@link BoundRequestBuilder} as one of the last steps
 *                          of the links following process, using method
 *                          {@link HttpManagerClient#prepareGet(String)} or
 *                          {@link HttpManagerClient#preparePost(String)}.
 * @param htmlToPojoEngine  the htmlToPojoEngine that will be used here only
 *                          to perform POJO class fields analysis. The mapping
 *                          will be triggered from the {@link HtmlAutoScrapper}
 *                          and therefore still support custom {@link ObjectMapper}
 *                          implementation for other input formats conversion.
 * @param exceptionLogger   the {@link ExceptionLogger} to use to log potentially thrown
 *                          and caught exceptions if any.
 * @param requestProcessor  the {@link BoundRequestBuilderProcessor} to be used with
 *                          {@link HttpRequestEditor#editRequest(BoundRequestBuilder, LinkPreparatorHolder, BoundRequestBuilderProcessor)}
 * @param model             the original parent model to assign fields to.
 * @param adapter           the parent POJO adapter that will also be used to retrieve
 *                          child POJO adapters.
 * @param <P>               the type of both the parent model and the parent adapter.
 */
public <P> LinksFollowerImpl(
        @NotNull final HttpManagerClient httpManagerClient,
        @NotNull final HtmlToPojoEngine htmlToPojoEngine,
        @NotNull final ExceptionLogger exceptionLogger,
        @NotNull final BoundRequestBuilderProcessor requestProcessor,
        @NotNull final P model,
        @NotNull final HtmlAdapter<P> adapter
) {
    this.httpManagerClient = httpManagerClient;
    this.htmlToPojoEngine = htmlToPojoEngine;
    this.exceptionLogger = exceptionLogger;
    this.requestProcessor = requestProcessor;
    // The <P> link between model and adapter is only enforced here; the fields do not keep it.
    this.model = model;
    this.adapter = adapter;
}
/**
 * {@inheritDoc}
 *
 * Only the first invocation does any work; every later call returns immediately.
 */
@Override
@SuppressWarnings("unchecked")
public void resolveBasicLinks() throws LinkException {
    if (!resolved) {
        // Raised before resolving, so an attempt that throws is not retried either.
        resolved = true;

        resolveBasicLinks(model, adapter);

        // List contexts grouped per target field are published alongside the others.
        linkListsScrappingContexts.addAll(mappedLinkedListsScrappingContexts.values());
    }
}
/**
 * {@inheritDoc}
 *
 * The returned list is the internal one, not a copy.
 */
@Override
@NotNull
public List<LinkScrappingContext> getScrappingContexts() {
    return scrappingContexts;
}
/**
 * {@inheritDoc}
 *
 * The returned list is the internal one, not a copy.
 */
@Override
@NotNull
public List<LinkListScrappingContext> getLinkListsScrappingContexts() {
    return linkListsScrappingContexts;
}
/**
 * Core method of this implementation: scans one POJO for links and prepares them.
 * It takes the model and its adapter explicitly because it is called recursively
 * for child POJOs, each with its own {@link HtmlAdapter} and instance.
 *
 * <p>
 * It delegates, in this order, to {@link #resolveChildPojosLinks(Object, List)}
 * ({@link HasLink} fields), {@link #resolveListLinks(Object, List)}
 * ({@link LinkListsFromBuilder} fields) and
 * {@link #followLinks(List, Object, HtmlAdapter)} ({@link Link} fields), as described
 * by {@link LinksFollower#resolveBasicLinks()}. A category whose field list is
 * {@code null} is skipped.
 * </p>
 *
 * @param model   the parent POJO instance to search links for.
 * @param adapter the {@link HtmlAdapter} for the model POJO.
 * @param <P>     the POJO type of {@code model}.
 * @throws LinkException when one link could not be properly resolved and prepared.
 */
private <P> void resolveBasicLinks(
        @NotNull final P model,
        @NotNull final HtmlAdapter<P> adapter
) throws LinkException {
    // Parameterized logging already skips formatting when INFO is disabled.
    log.info("Resolving basic links for model type {}.", model.getClass());

    final List<? extends HtmlToPojoAnnotationMap<Link>> links =
            adapter.getFieldList(Link.class);
    final List<? extends HtmlToPojoAnnotationMap<HasLink>> hasLinks =
            adapter.getFieldList(HasLink.class);
    final List<? extends HtmlToPojoAnnotationMap<LinkListsFromBuilder>> linkListsFromBuilders =
            adapter.getFieldList(LinkListsFromBuilder.class);

    if (hasLinks != null) {
        resolveChildPojosLinks(model, hasLinks);
    }
    if (linkListsFromBuilders != null) {
        resolveListLinks(model, linkListsFromBuilders);
    }
    if (links != null) {
        followLinks(links, model, adapter);
    }
}
/**
* Resolve the {@link LinkListsFromBuilder} annotations of the current POJO.
* @param model the model to scan {@link LinkListsFromBuilder} annotations for.
* @param linkListsFromBuilders the List of {@link HtmlToPojoAnnotationMap} for
* {@link LinkListsFromBuilder} currently retrieved.
* @param the Parent POJO type.
* @param the Child Pojo type of the list to create and populate.
*/
@SuppressWarnings("unchecked")
private
void resolveListLinks(P model, List> linkListsFromBuilders) {
for(HtmlToPojoAnnotationMap linkList : linkListsFromBuilders)
{
if(List.class.isAssignableFrom(linkList.getField().getType())) {
Type genericType = linkList.getField().getGenericType();
Type type = ((ParameterizedType) genericType).getActualTypeArguments()[0];
LinkListScrappingContext ulist = followLinkLists(linkList, model, htmlToPojoEngine.adapter((Class) type));
linkListsScrappingContexts.add(ulist);
}
else throw new LinkListCastException(linkList.getField());
}
}
/**
*
* This method will scan a POJO searching for {@link HasLink} annotation on top of a
* child POJO or list of child POJO typed field. If found, it will recursively call the
* {@link #resolveBasicLinks(Object, HtmlAdapter)} method for each of those POJOs.
*
*
* Depending on the field being a POJO typed field or a list of POJO typed field,
* it will either be directly pre-processed (POJO typed field) or pre-processed
* in a for loop for each single element of the original collection.
*
* @param model the parent model POJO instance to scan for {@link HasLink} fields.
* @param hasLinks the list of {@link HtmlToPojoAnnotationMap} containing {@link HasLink}
* annotations already retrieved.
* @param the Parent POJO type.
* @param the Child Pojo type. This type will be inferred in the for loop and
* might therefore represent several different types during this method
* call.
* @throws LinkException if any underlying {@link #resolveBasicLinks(Object, HtmlAdapter)}
* call itself throws a {@link LinkException}.
*/
@SuppressWarnings("unchecked")
private
void resolveChildPojosLinks(P model, List> hasLinks) throws LinkException {
for(HtmlToPojoAnnotationMap hasLink : hasLinks)
{
if(Collection.class.isAssignableFrom(hasLink.getField().getType()))
{
try {
Collection list = WhimtripUtils.getObjectFromField(hasLink.getField(), model);
for(U element : list)
{
resolveBasicLinks(element, (HtmlAdapter) htmlToPojoEngine.adapter(element.getClass()));
}
}
catch(IllegalAccessException e)
{
e.printStackTrace();
throw new LinkException(e);
}
}
else
{
try {
U element = WhimtripUtils.getObjectFromField(hasLink.getField(), model);
resolveBasicLinks(element,(HtmlAdapter) htmlToPojoEngine.adapter(element.getClass()));
}
catch(IllegalAccessException e)
{
e.printStackTrace();
throw new LinkException(e);
}
}
}
}
/**
*
* This method will follow direct links of a POJO annotated with {@link Link} for
* the String typed fields containing the URL to poll, and {@link LinkObject}
* or {@link LinkObjects} for the fields to populate with the future resulting
* scrap.
*
* @param links the raw {@link HtmlToPojoAnnotationMap} for {@link Link} annotated fields
* to further analyse.
* @param model the parent POJO instance to map resulting values to.
* @param adapter the parent POJO class {@link HtmlAdapter}
* @param the Parent POJO type.
* @param the Child Pojo type. This type will be inferred in the for loop and
* might therefore represent several different types during this method
* call.
* @throws LinkException if any of the {@link Link} annotated fields contains a
* null value {@link NullLinkException}, isn't a String
* typed field {@link LinkClassCastException} or if it cannot
* be retrieved using reflection.
*/
@SuppressWarnings("unchecked")
private
void followLinks(
@NotNull final List> links,
@NotNull final U model,
@NotNull final HtmlAdapter adapter
) throws LinkException {
for (HtmlToPojoAnnotationMap link : links) {
HtmlToPojoAnnotationMap objFieldMap = ((ScrapperHtmlAdapter) adapter).getLinkObject(link);
if (objFieldMap == null) throw new LinkException(link.getField());
Field objField = objFieldMap.getField();
String url = getLinkUrl(link, model);
if(checkRegexCondition(link, url))
{
boolean isListField = objFieldMap.getAnnotation() instanceof LinkObjects;
HtmlAdapter newFieldAdapter =
(HtmlAdapter)
htmlToPojoEngine.adapter(
isListField ? WhimtripUtils.getClassFromListField(objField)
: objField.getType()
);
boolean editRequest = link.getAnnotation().editRequest();
LinkPreparatorHolder container =
new LinkPreparatorHolder(
model,
url,
link.getAnnotation().method(),
RequestUtils.buildFields(link.getAnnotation().fields()),
objField,
link.getAnnotation().requestEditor(),
link.getAnnotation().followRedirections(),
link.getAnnotation().throwExceptions()
);
HttpRequestEditor
requestEditor = null;
if (editRequest)
{
requestEditor =
(HttpRequestEditor
)
(WhimtripUtils.createNewInstance(container.getRequestEditorClazz()));
requestEditor.init(container.getParentField());
}
LinkScrappingContext
lsc = buildContext(container, newFieldAdapter, requestEditor, editRequest);
if(!isListField)
{
if(lsc != null)
scrappingContexts.add(lsc);
}
else
{
LinkListScrappingContext
llsc =
mappedLinkedListsScrappingContexts
.computeIfAbsent(
objField,
field -> new LinkListScrappingContext(field, model)
);
llsc.add(lsc);
}
}
}
}
/**
 * Tells whether the {@link Link#regexCondition()} of a link is met by the value
 * read from the annotated field.
 *
 * @param link    the {@link Link} annotated {@link HtmlToPojoAnnotationMap} field
 *                to analyse.
 * @param linkVal the retrieved field value to check the regex against.
 * @return {@code false} when {@code linkVal} is {@code null}; {@code true} when the
 *         condition is left to {@code DEFAULT_REGEX_COND}; otherwise whether the
 *         regex is found anywhere in {@code linkVal}.
 */
@Contract("_, null -> false")
private boolean checkRegexCondition(HtmlToPojoAnnotationMap<Link> link, String linkVal) {
    if (linkVal == null) {
        return false;
    }

    final String regexCondition = link.getAnnotation().regexCondition();
    if (regexCondition.equals(DEFAULT_REGEX_COND)) {
        return true;
    }

    // find(), not matches(): a partial match is enough.
    return Pattern.compile(regexCondition).matcher(linkVal).find();
}
/**
 * Reads the link value of a {@link Link} annotated field through reflection.
 *
 * <p>
 * A {@code null} value ({@link NullLinkException}) or a non-String value
 * ({@link LinkClassCastException}) is rethrown only when
 * {@link Link#throwExceptions()} is enabled; otherwise it is passed to the
 * {@link ExceptionLogger} and {@code null} is returned. A reflection access
 * failure is always rethrown.
 * </p>
 *
 * @param link  the {@link Link} annotated {@link HtmlToPojoAnnotationMap} field
 *              to analyse.
 * @param model the model to extract the stringified link from.
 * @param <U>   the type of the model to extract the stringified link from.
 * @return the url read from the {@link Link} annotated field, or {@code null} when
 *         the value was invalid and exceptions are not to be thrown.
 * @throws LinkException if the field cannot be read using reflection, or if its
 *                       value is null / not a String while
 *                       {@link Link#throwExceptions()} is enabled.
 */
@Nullable
private <U> String getLinkUrl(
        @NotNull final HtmlToPojoAnnotationMap<Link> link,
        @NotNull final U model
) throws LinkException {
    try {
        final Object rawValue = WhimtripUtils.getObjectFromField(link.getField(), model);
        if (rawValue == null) {
            throw new NullLinkException(link.getField());
        }
        if (!(rawValue instanceof String)) {
            throw new LinkClassCastException(link.getField());
        }
        return (String) rawValue;
    } catch (IllegalAccessException e) {
        throw new LinkException(e);
    } catch (LinkException e) {
        if (link.getAnnotation().throwExceptions()) {
            throw e;
        }
        exceptionLogger.logException(e);
        return null;
    }
}
/**
 * Follows and prepares one {@link LinkListsFromBuilder} annotated field.
 *
 * <p>
 * The {@link LinkListFactory} named by the annotation is instantiated and asked
 * for the {@link LinkPreparatorHolder}s to process. One context is built per
 * holder; holders for which {@code buildContext} returns {@code null} are left
 * out. When request editing is enabled, a single {@link HttpRequestEditor} is
 * created from the first holder and reused for all of them.
 * </p>
 *
 * @param links   the {@link LinkListsFromBuilder} annotated
 *                {@link HtmlToPojoAnnotationMap} field to analyse.
 * @param parent  the parent POJO instance to assign scrap results to.
 * @param adapter the {@link HtmlAdapter} of type {@code U}.
 * @param <P>     the parent POJO type.
 * @param <U>     the child POJO type in the list to be created and assigned
 *                to the {@code parent} POJO instance.
 * @return the resulting {@link LinkListScrappingContext} built.
 */
@NotNull
@SuppressWarnings("unchecked")
private <P, U> LinkListScrappingContext<P, U> followLinkLists(
        @NotNull final HtmlToPojoAnnotationMap<LinkListsFromBuilder> links,
        @NotNull final P parent,
        @NotNull final HtmlAdapter<U> adapter
) {
    // NOTE(review): factory and holders are left raw; their type arguments could not
    // be recovered with confidence.
    final LinkListFactory listFactory =
            WhimtripUtils.createNewInstance(links.getAnnotation().value());

    final LinkListScrappingContext<P, U> ulist =
            new LinkListScrappingContext<>(links.getField(), parent);

    final List<LinkPreparatorHolder> containers =
            listFactory.createLinkPreparatorLists(parent, links.getField());

    // Same answer for every holder of this field, so read it once.
    final boolean editRequest = links.getAnnotation().editRequest();

    HttpRequestEditor requestEditor = null;
    for (LinkPreparatorHolder cntn : containers) {
        if (editRequest && requestEditor == null) {
            requestEditor =
                    (HttpRequestEditor<P, U>)
                            WhimtripUtils.createNewInstance(cntn.getRequestEditorClazz());
            requestEditor.init(cntn.getParentField());
        }

        final LinkScrappingContext<P, U> newObjCntx =
                (LinkScrappingContext<P, U>)
                        buildContext(cntn, adapter, requestEditor, editRequest);

        if (newObjCntx != null) {
            ulist.add(newObjCntx);
        }
    }
    return ulist;
}
/**
 * Builds a single {@link LinkScrappingContext} for a single link to scrap. Called by
 * both {@link #followLinks(List, Object, HtmlAdapter)} and
 * {@link #followLinkLists(HtmlToPojoAnnotationMap, Object, HtmlAdapter)}.
 *
 * <p>
 * When {@code editRequest} is enabled, the context is only built if a request
 * editor is supplied and its {@code shouldDoRequest} accepts the parent; otherwise
 * {@code null} is returned. When {@code editRequest} is disabled the context is
 * always built.
 * </p>
 *
 * @param container     the {@link LinkPreparatorHolder} to use to prepare and
 *                      build the {@link LinkScrappingContext}.
 * @param adapter       the child {@link HtmlAdapter}; also used here to create the
 *                      new child instance.
 * @param requestEditor the {@link HttpRequestEditor} instance to further prepare the
 *                      {@link BoundRequestBuilder} and the {@code U} typed child object.
 * @param editRequest   whether the request should or shouldn't be edited.
 * @param <P>           the parent POJO type.
 * @param <U>           the child POJO type.
 * @return the built and ready to use {@link LinkScrappingContext}, or {@code null}
 *         when the request must not be performed.
 */
@Nullable
private <P, U> LinkScrappingContext<P, U> buildContext(
        @NotNull final LinkPreparatorHolder<P> container,
        @NotNull final HtmlAdapter<U> adapter,
        @Nullable final HttpRequestEditor<P, U> requestEditor,
        final boolean editRequest
) {
    if (editRequest
            && (requestEditor == null || !requestEditor.shouldDoRequest(container.getParent()))) {
        return null;
    }

    final U newObj = adapter.createNewInstance(container.getParent());
    final BoundRequestBuilder req = buildReq(container, requestEditor, newObj, editRequest);

    return new LinkScrappingContext<>(
            req,
            newObj,
            adapter,
            container.getParentField(),
            container.getParent(),
            container.followRedirections(),
            container.throwExceptions()
    );
}
/**
 * Inner method used by {@link #buildContext(LinkPreparatorHolder, HtmlAdapter, HttpRequestEditor, boolean)}
 * to prepare the {@link BoundRequestBuilder} of a {@link LinkScrappingContext}.
 *
 * <p>
 * Steps: let the editor prepare the child object (if editing is on), build a GET
 * request or a POST request carrying the holder's form fields, then let the editor
 * edit the request (if editing is on).
 * </p>
 *
 * @param container     the {@link LinkPreparatorHolder} describing url, method and fields.
 * @param requestEditor the {@link HttpRequestEditor} instance to further prepare the
 *                      {@link BoundRequestBuilder} and the {@code U} typed child object.
 * @param newObj        the instantiated new child object.
 * @param editRequest   whether the request should or shouldn't be edited.
 * @param <P>           the parent POJO type.
 * @param <U>           the child POJO type.
 * @return the new and prepared {@link BoundRequestBuilder} to be used for the
 *         scrapping operation of the {@link HtmlAutoScrapper}.
 */
@NotNull
private <P, U> BoundRequestBuilder buildReq(
        @NotNull final LinkPreparatorHolder<P> container,
        @Nullable final HttpRequestEditor<P, U> requestEditor,
        @Nullable final U newObj,
        final boolean editRequest
) {
    final boolean useEditor = editRequest && requestEditor != null;

    if (useEditor) {
        requestEditor.prepareObject(newObj, container.getParent(), container);
    }

    final BoundRequestBuilder req;
    if (container.getMethod() == Method.GET) {
        req = httpManagerClient.prepareGet(container.getUrl());
    } else {
        // Any method other than GET is sent as a POST with form parameters.
        req = httpManagerClient.preparePost(container.getUrl());
        for (PostField field : container.getFields()) {
            req.addFormParam(field.getName(), field.getValue());
        }
    }

    if (useEditor) {
        requestEditor.editRequest(req, container, requestProcessor);
    }
    return req;
}
}