All downloads are free. Search and download functionality uses the official Maven repository.

us.codecraft.webmagic.model.PageModelExtractor Maven / Gradle / Ivy

The newest version!
package us.codecraft.webmagic.model;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import lombok.Getter;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
import us.codecraft.webmagic.model.sources.Source.*;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;

import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText;

/**
 * The main internal logic of page model extractor.
 *
 * <p>An instance is built for one annotated model class via {@link #create(Class)}. At creation
 * time the class-level annotations ({@code TargetUrl}, {@code HelpUrl}, {@code ExtractBy}) and the
 * field-level annotations ({@code ExtractBy}, {@code ComboExtract}, {@code ExtractByUrl}) are read
 * once and turned into URL patterns, selectors and field extractors. {@link #process(Page)} then
 * applies them to a page.
 *
 * @author [email protected]
 * @since 0.2.0
 */
class PageModelExtractor {

    /** Message used when a field carries more than one of the mutually exclusive annotations. */
    private static final String CONFLICTING_ANNOTATIONS_MESSAGE =
            "Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!";

    /** Patterns a page URL must fully match for {@link #process(Page)} to extract anything. */
    @Getter
    private final List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();

    /** XPath region from {@code TargetUrl.sourceRegion()}; {@code null} when it is empty or absent. */
    @Getter
    private Selector targetUrlRegionSelector;

    /** Patterns built from {@code HelpUrl.value()}; empty when the class has no {@code HelpUrl}. */
    @Getter
    private final List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();

    /** XPath region from {@code HelpUrl.sourceRegion()}; {@code null} when it is empty or absent. */
    @Getter
    private Selector helpUrlRegionSelector;

    /**
     * The model class this extractor was created for.
     *
     * <p>Deliberately left as a raw {@code Class} so the generated {@code getClazz()} keeps the
     * return type existing callers compile against.
     */
    @Getter
    private Class clazz;

    /** One extractor per annotated field, in the order the fields were discovered. */
    private List<FieldExtractor> fieldExtractors;

    /** Built from a class-level {@code ExtractBy}; {@code null} when the class has none. */
    private Extractor objectExtractor;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Creates and initialises an extractor for the given model class.
     *
     * @param clazz the annotated model class
     * @return a fully initialised extractor
     * @throws IllegalStateException if a field carries more than one of {@code ExtractBy},
     *         {@code ComboExtract} and {@code ExtractByUrl}
     */
    public static PageModelExtractor create(Class<?> clazz) {
        PageModelExtractor pageModelExtractor = new PageModelExtractor();
        pageModelExtractor.init(clazz);
        return pageModelExtractor;
    }

    /**
     * Reads the class-level annotations, then builds one {@link FieldExtractor} for every field
     * (including inherited ones) that carries an extraction annotation.
     */
    private void init(Class<?> clazz) {
        this.clazz = clazz;
        initClassExtractors();
        fieldExtractors = new ArrayList<FieldExtractor>();
        for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) {
            field.setAccessible(true);
            // The three annotations are checked in the same order as before so that the
            // conflict is reported at the same point.
            FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
            fieldExtractor = requireAtMostOne(fieldExtractor, getAnnotationExtractCombo(clazz, field));
            fieldExtractor = requireAtMostOne(fieldExtractor, getAnnotationExtractByUrl(clazz, field));
            if (fieldExtractor != null) {
                fieldExtractor.setObjectFormatter(new ObjectFormatterBuilder().setField(field).build());
                fieldExtractors.add(fieldExtractor);
            }
        }
    }

    /**
     * Merges two optional extractors for the same field.
     *
     * @return whichever argument is non-null, or {@code null} when both are
     * @throws IllegalStateException if both are non-null
     */
    private static FieldExtractor requireAtMostOne(FieldExtractor current, FieldExtractor candidate) {
        if (current != null && candidate != null) {
            throw new IllegalStateException(CONFLICTING_ANNOTATIONS_MESSAGE);
        }
        return current != null ? current : candidate;
    }

    /**
     * Builds the extractor for a field annotated with {@code ExtractByUrl}.
     *
     * @return the extractor, or {@code null} when the field has no {@code ExtractByUrl}
     */
    private FieldExtractor getAnnotationExtractByUrl(Class<?> clazz, Field field) {
        ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
        if (extractByUrl == null) {
            return null;
        }
        String regexPattern = extractByUrl.value();
        if (regexPattern.trim().equals("")) {
            // A blank value means "take the whole URL".
            regexPattern = ".*";
        }
        boolean multi = extractByUrl.multi() || List.class.isAssignableFrom(field.getType());
        FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern),
                new Url(), extractByUrl.notNull(), multi);
        Method setterMethod = getSetterMethod(clazz, field);
        if (setterMethod != null) {
            fieldExtractor.setSetterMethod(setterMethod);
        }
        return fieldExtractor;
    }

    /**
     * Builds the extractor for a field annotated with {@code ComboExtract}.
     *
     * @return the extractor, or {@code null} when the field has no {@code ComboExtract}
     */
    private FieldExtractor getAnnotationExtractCombo(Class<?> clazz, Field field) {
        ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
        if (comboExtract == null) {
            return null;
        }
        ExtractBy[] extractBies = comboExtract.value();
        final Selector selector;
        switch (comboExtract.op()) {
            case Or:
                selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
                break;
            default:
                // "And" and any other operator combine the selectors with AND.
                selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
        }
        final Source source;
        if (comboExtract.source() == ComboExtract.Source.RawHtml) {
            source = new RawHtml();
        } else {
            source = new SelectedHtml();
        }
        boolean multi = comboExtract.multi() || List.class.isAssignableFrom(field.getType());
        FieldExtractor fieldExtractor =
                new FieldExtractor(field, selector, source, comboExtract.notNull(), multi);
        Method setterMethod = getSetterMethod(clazz, field);
        if (setterMethod != null) {
            fieldExtractor.setSetterMethod(setterMethod);
        }
        return fieldExtractor;
    }

    /**
     * Builds the extractor for a field annotated with {@code ExtractBy}.
     *
     * @return the extractor, or {@code null} when the field has no {@code ExtractBy}
     */
    private FieldExtractor getAnnotationExtractBy(Class<?> clazz, Field field) {
        ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
        if (extractBy == null) {
            return null;
        }
        Selector selector = ExtractorUtils.getSelector(extractBy);
        ExtractBy.Source extractSource = extractBy.source();
        if (extractBy.type() == ExtractBy.Type.JsonPath) {
            // JsonPath selectors always read the raw text, whatever source was declared.
            extractSource = RawText;
        }
        final Source source;
        switch (extractSource) {
            case RawText:
                source = new RawText();
                break;
            case RawHtml:
                source = new RawHtml();
                break;
            default:
                // SelectedHtml is both an explicit choice and the fallback.
                source = new SelectedHtml();
        }
        FieldExtractor fieldExtractor = new FieldExtractor(field, selector, source,
                extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
        fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
        return fieldExtractor;
    }

    /**
     * Looks up the setter {@code set<FieldName>(fieldType)} declared directly on {@code clazz}
     * and makes it accessible.
     *
     * @param clazz the class to search; only methods declared on this class are considered
     * @param field the field whose setter is wanted
     * @return the setter, or {@code null} when no method with that name and parameter type exists
     */
    public static Method getSetterMethod(Class<?> clazz, Field field) {
        String name = "set" + StringUtils.capitalize(field.getName());
        try {
            Method declaredMethod = clazz.getDeclaredMethod(name, field.getType());
            declaredMethod.setAccessible(true);
            return declaredMethod;
        } catch (NoSuchMethodException e) {
            // Having no setter is a normal case; callers handle the null.
            return null;
        }
    }

    /**
     * Reads {@code TargetUrl}, {@code HelpUrl} and the class-level {@code ExtractBy} from the
     * model class and fills in the URL patterns, region selectors and object extractor.
     */
    private void initClassExtractors() {
        // Typed view of the raw field so getAnnotation returns the annotation type directly.
        final Class<?> modelType = clazz;

        TargetUrl targetUrl = modelType.getAnnotation(TargetUrl.class);
        if (targetUrl == null) {
            // Without TargetUrl every page URL is accepted.
            targetUrlPatterns.add(Pattern.compile(".*"));
        } else {
            for (String urlTemplate : targetUrl.value()) {
                targetUrlPatterns.add(compileUrlPattern(urlTemplate));
            }
            if (!targetUrl.sourceRegion().equals("")) {
                targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
            }
        }

        HelpUrl helpUrl = modelType.getAnnotation(HelpUrl.class);
        if (helpUrl != null) {
            for (String urlTemplate : helpUrl.value()) {
                helpUrlPatterns.add(compileUrlPattern(urlTemplate));
            }
            if (!helpUrl.sourceRegion().equals("")) {
                helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
            }
        }

        ExtractBy extractBy = modelType.getAnnotation(ExtractBy.class);
        if (extractBy != null) {
            objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(),
                    extractBy.notNull(), extractBy.multi());
        }
    }

    /**
     * Converts a URL template from {@code TargetUrl}/{@code HelpUrl} into a regex: every
     * {@code .} is escaped to a literal dot and every {@code *} becomes {@code [^"'#]*}.
     */
    private static Pattern compileUrlPattern(String urlTemplate) {
        String regex = urlTemplate.replace(".", "\\.").replace("*", "[^\"'#]*");
        return Pattern.compile(regex);
    }

    /**
     * Extracts the model object(s) from a page.
     *
     * @param page the page to extract from
     * @return {@code null} when the page URL fully matches none of the target URL patterns;
     *         otherwise a single model object (possibly {@code null}) or, when the class-level
     *         {@code ExtractBy} is declared {@code multi}, a list of the non-null objects
     *         extracted from each selected region
     */
    public Object process(Page page) {
        if (!matchesTargetUrl(page)) {
            return null;
        }
        if (objectExtractor == null) {
            // No class-level ExtractBy: the fields are extracted from the page as a whole.
            return processSingle(page, null, true);
        }
        if (objectExtractor.multi) {
            List<Object> os = new ArrayList<Object>();
            List<String> regions = objectExtractor.getSelector().selectList(page.getRawText());
            for (String region : regions) {
                Object o = processSingle(page, region, false);
                if (o != null) {
                    os.add(o);
                }
            }
            return os;
        }
        String region = objectExtractor.getSelector().select(page.getRawText());
        return processSingle(page, region, false);
    }

    /** Returns whether the page URL fully matches at least one target URL pattern. */
    private boolean matchesTargetUrl(Page page) {
        for (Pattern targetPattern : targetUrlPatterns) {
            if (targetPattern.matcher(page.getUrl().toString()).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Instantiates the model class and populates it from one region of the page.
     *
     * @param page  the page being processed
     * @param html  the selected region, or {@code null} when extracting from the whole page
     * @param isRaw whether extraction works on the whole page rather than on {@code html}
     * @return the populated object; {@code null} when a field operation reports failure. If an
     *         exception occurs it is logged and whatever has been built so far is returned
     *         ({@code null} when instantiation itself failed, otherwise a possibly partially
     *         populated object).
     */
    private Object processSingle(Page page, String html, boolean isRaw) {
        Object o = null;
        try {
            final Class<?> modelType = clazz;
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            o = modelType.getDeclaredConstructor().newInstance();
            for (FieldExtractor fieldExtractor : fieldExtractors) {
                PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
                if (!field.operation(o, fieldExtractor, logger)) {
                    return null;
                }
            }
            if (AfterExtractor.class.isAssignableFrom(modelType)) {
                ((AfterExtractor) o).afterProcess(page);
            }
        } catch (Exception e) {
            logger.error("extract fail", e);
        }
        return o;
    }
}