us.codecraft.webmagic.model.PageModelExtractor Maven / Gradle / Ivy
package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.formatter.BasicTypeFormatter;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.model.formatter.ObjectFormatters;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
* The main internal logic of page model extractor.
*
* @author code4crafter@gmail.com
* @since 0.2.0
*/
class PageModelExtractor {
/** Compiled URL regexes; {@code process} only handles a page whose URL fully matches one of them. */
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();
/** XPath selector built from {@code TargetUrl.sourceRegion()}; stays {@code null} when no region is configured. */
private Selector targetUrlRegionSelector;
/** Compiled URL regexes built from the values of the class-level {@code HelpUrl} annotation. */
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();
/** XPath selector built from {@code HelpUrl.sourceRegion()}; stays {@code null} when no region is configured. */
private Selector helpUrlRegionSelector;
/** The page model class this extractor was initialised for. */
private Class clazz;
/** One extractor per field that carries an extraction annotation; populated by {@code init}. */
private List<FieldExtractor> fieldExtractors;
/** Extractor built from a class-level {@code ExtractBy} annotation; {@code null} when the class has none. */
private Extractor objectExtractor;
private Logger logger = LoggerFactory.getLogger(getClass());
/**
 * Builds an extractor for the given page model class.
 *
 * @param clazz the page model class to analyse
 * @return a new extractor that has already been initialised for {@code clazz}
 */
public static PageModelExtractor create(Class clazz) {
    final PageModelExtractor extractor = new PageModelExtractor();
    extractor.init(clazz);
    return extractor;
}
/**
 * Prepares this extractor for {@code clazz}: reads the class-level annotations,
 * then builds one {@link FieldExtractor} for every field (superclass fields
 * included) that carries an extraction annotation.
 *
 * @param clazz the page model class to analyse
 * @throws IllegalStateException if a field carries more than one of the
 *         ExtractBy / ComboExtract / ExtractByUrl annotations
 */
private void init(Class clazz) {
    this.clazz = clazz;
    initClassExtractors();
    fieldExtractors = new ArrayList();
    for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) {
        field.setAccessible(true);
        // The three annotation kinds are mutually exclusive on a single field.
        FieldExtractor chosen = requireAtMostOne(
                getAnnotationExtractBy(clazz, field),
                getAnnotationExtractCombo(clazz, field));
        chosen = requireAtMostOne(chosen, getAnnotationExtractByUrl(clazz, field));
        if (chosen == null) {
            // Field carries no extraction annotation: nothing to register.
            continue;
        }
        checkFormat(field, chosen);
        fieldExtractors.add(chosen);
    }
}

/**
 * Returns whichever of the two extractors is present, or {@code null} when neither is.
 *
 * @throws IllegalStateException if both are present
 */
private static FieldExtractor requireAtMostOne(FieldExtractor current, FieldExtractor candidate) {
    if (current != null && candidate != null) {
        throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
    }
    return current != null ? current : candidate;
}
/**
 * Chooses the {@link ObjectFormatter} for a field and installs it on its extractor.
 *
 * <p>Resolution order:
 * <ol>
 * <li>A {@code Formatter} annotation naming a class other than
 *     {@code ObjectFormatter.class}: that class is instantiated, initialised with
 *     the annotation's {@code value()} and installed.</li>
 * <li>A single-valued, non-String field: the formatter registered for the field's
 *     basic class is installed.</li>
 * <li>A multi-valued field: the field must be a {@link List}; when a
 *     {@code Formatter} annotation declares a {@code subClazz()} other than
 *     {@code Void.class}, the formatter registered for that class is installed.</li>
 * </ol>
 * A single-valued String field without a custom formatter gets no formatter.
 *
 * @param field          the annotated model field
 * @param fieldExtractor the extractor that will populate {@code field}
 * @throws IllegalStateException if a required formatter cannot be created or
 *         found, or if a multi-valued field is not a {@code List}
 */
private void checkFormat(Field field, FieldExtractor fieldExtractor) {
    // check custom formatter
    Formatter formatter = field.getAnnotation(Formatter.class);
    if (formatter != null && !formatter.formatter().equals(ObjectFormatter.class)) {
        ObjectFormatter objectFormatter = initFormatter(formatter.formatter());
        if (objectFormatter == null) {
            // initFormatter logs and returns null on failure; fail with a clear
            // message instead of a NullPointerException on the next line.
            throw new IllegalStateException("Can't init formatter " + formatter.formatter() + " for field " + field.getName());
        }
        objectFormatter.initParam(formatter.value());
        fieldExtractor.setObjectFormatter(objectFormatter);
        return;
    }
    if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) {
        Class<?> fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType());
        ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz, formatter);
        if (objectFormatter == null) {
            throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz);
        }
        fieldExtractor.setObjectFormatter(objectFormatter);
    } else if (fieldExtractor.isMulti()) {
        if (!List.class.isAssignableFrom(field.getType())) {
            throw new IllegalStateException("Field " + field.getName() + " must be list");
        }
        // Void.class is compared against as the "no element type declared" marker.
        if (formatter != null && !formatter.subClazz().equals(Void.class)) {
            ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz(), formatter);
            if (objectFormatter == null) {
                throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz());
            }
            fieldExtractor.setObjectFormatter(objectFormatter);
        }
    }
}
private ObjectFormatter getObjectFormatter(Field field, Class> fieldClazz, Formatter formatter) {
return initFormatter(ObjectFormatters.get(fieldClazz));
}
private ObjectFormatter initFormatter(Class extends ObjectFormatter> formatterClazz) {
try {
return formatterClazz.newInstance();
} catch (InstantiationException e) {
logger.error("init ObjectFormatter fail", e);
} catch (IllegalAccessException e) {
logger.error("init ObjectFormatter fail", e);
}
return null;
}
/**
 * Builds the extractor for a field annotated with {@code ExtractByUrl}.
 *
 * @param clazz the model class, searched for a setter of {@code field}
 * @param field the field to inspect
 * @return a URL-sourced regex extractor, or {@code null} when the field does not
 *         carry the annotation
 */
private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
    ExtractByUrl urlAnnotation = field.getAnnotation(ExtractByUrl.class);
    if (urlAnnotation == null) {
        return null;
    }
    String regex = urlAnnotation.value();
    if (regex.trim().isEmpty()) {
        // A blank pattern falls back to matching the whole URL.
        regex = ".*";
    }
    RegexSelector urlSelector = new RegexSelector(regex);
    boolean notNull = urlAnnotation.notNull();
    // A List-typed field is treated as multi-valued even without multi = true.
    boolean multi = urlAnnotation.multi() || List.class.isAssignableFrom(field.getType());
    FieldExtractor extractor = new FieldExtractor(field, urlSelector, FieldExtractor.Source.Url, notNull, multi);
    Method setter = getSetterMethod(clazz, field);
    if (setter != null) {
        extractor.setSetterMethod(setter);
    }
    return extractor;
}
/**
 * Builds the extractor for a field annotated with {@code ComboExtract}.
 *
 * <p>The nested {@code ExtractBy} entries are combined with an {@link OrSelector}
 * when the operator is {@code Or}; every other operator value uses an
 * {@link AndSelector}.
 *
 * @param clazz the model class, searched for a setter of {@code field}
 * @param field the field to inspect
 * @return the combined extractor, or {@code null} when the field does not carry
 *         the annotation
 */
private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
    ComboExtract combo = field.getAnnotation(ComboExtract.class);
    if (combo == null) {
        return null;
    }
    ExtractBy[] parts = combo.value();
    Selector combined;
    switch (combo.op()) {
        case Or:
            combined = new OrSelector(ExtractorUtils.getSelectors(parts));
            break;
        case And:
        default:
            combined = new AndSelector(ExtractorUtils.getSelectors(parts));
            break;
    }
    FieldExtractor.Source source;
    if (combo.source() == ComboExtract.Source.RawHtml) {
        source = FieldExtractor.Source.RawHtml;
    } else {
        source = FieldExtractor.Source.Html;
    }
    boolean notNull = combo.notNull();
    // A List-typed field is treated as multi-valued even without multi = true.
    boolean multi = combo.multi() || List.class.isAssignableFrom(field.getType());
    FieldExtractor extractor = new FieldExtractor(field, combined, source, notNull, multi);
    Method setter = getSetterMethod(clazz, field);
    if (setter != null) {
        extractor.setSetterMethod(setter);
    }
    return extractor;
}
/**
 * Builds the extractor for a field annotated with {@code ExtractBy}.
 *
 * @param clazz the model class, searched for a setter of {@code field}
 * @param field the field to inspect
 * @return the extractor described by the annotation, or {@code null} when the
 *         field does not carry it
 */
private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
    ExtractBy annotation = field.getAnnotation(ExtractBy.class);
    if (annotation == null) {
        return null;
    }
    Selector selector = ExtractorUtils.getSelector(annotation);
    FieldExtractor.Source source;
    if (annotation.source() == ExtractBy.Source.RawHtml) {
        source = FieldExtractor.Source.RawHtml;
    } else {
        source = FieldExtractor.Source.Html;
    }
    boolean notNull = annotation.notNull();
    // A List-typed field is treated as multi-valued even without multi = true.
    boolean multi = annotation.multi() || List.class.isAssignableFrom(field.getType());
    FieldExtractor extractor = new FieldExtractor(field, selector, source, notNull, multi);
    Method setter = getSetterMethod(clazz, field);
    if (setter != null) {
        extractor.setSetterMethod(setter);
    }
    return extractor;
}
/**
 * Finds the setter for {@code field}: a method named {@code set} + capitalised
 * field name that takes a single parameter of the field's type.
 *
 * <p>The search starts at {@code clazz} and continues up the superclass chain,
 * so a setter declared next to an inherited field is found as well. The
 * returned method is made accessible.
 *
 * @param clazz the class to start searching from
 * @param field the field whose setter is wanted
 * @return the accessible setter, or {@code null} when no class in the hierarchy
 *         declares one
 */
public static Method getSetterMethod(Class clazz, Field field) {
    String name = "set" + StringUtils.capitalize(field.getName());
    for (Class<?> current = clazz; current != null; current = current.getSuperclass()) {
        try {
            Method declaredMethod = current.getDeclaredMethod(name, field.getType());
            declaredMethod.setAccessible(true);
            return declaredMethod;
        } catch (NoSuchMethodException ignored) {
            // Not declared on this class; getDeclaredMethod ignores inherited
            // methods, so keep looking in the superclass.
        }
    }
    return null;
}
/**
 * Reads the class-level annotations of the model class and fills in the
 * URL patterns, region selectors and object extractor.
 *
 * <ul>
 * <li>{@code TargetUrl}: each value becomes a target pattern; without the
 *     annotation a single match-everything pattern is registered.</li>
 * <li>{@code HelpUrl}: each value becomes a help pattern.</li>
 * <li>{@code ExtractBy}: an object extractor is built from the annotation value.</li>
 * </ul>
 */
private void initClassExtractors() {
    TargetUrl targetUrl = (TargetUrl) clazz.getAnnotation(TargetUrl.class);
    if (targetUrl != null) {
        for (String wildcard : targetUrl.value()) {
            targetUrlPatterns.add(wildcardToPattern(wildcard));
        }
        String region = targetUrl.sourceRegion();
        if (!region.equals("")) {
            targetUrlRegionSelector = new XpathSelector(region);
        }
    } else {
        // No explicit target: every URL counts as a target page.
        targetUrlPatterns.add(Pattern.compile("(.*)"));
    }

    HelpUrl helpUrl = (HelpUrl) clazz.getAnnotation(HelpUrl.class);
    if (helpUrl != null) {
        for (String wildcard : helpUrl.value()) {
            helpUrlPatterns.add(wildcardToPattern(wildcard));
        }
        String region = helpUrl.sourceRegion();
        if (!region.equals("")) {
            helpUrlRegionSelector = new XpathSelector(region);
        }
    }

    ExtractBy extractBy = (ExtractBy) clazz.getAnnotation(ExtractBy.class);
    if (extractBy != null) {
        // NOTE(review): the value is always wrapped in an XpathSelector and the source is
        // always Html here; the annotation's other attributes are not consulted - confirm intended.
        objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
    }
}

/**
 * Turns a URL wildcard expression into a capturing regex: dots are taken
 * literally and {@code *} matches any run of characters other than quotes and {@code #}.
 */
private static Pattern wildcardToPattern(String wildcard) {
    String regex = wildcard.replace(".", "\\.").replace("*", "[^\"'#]*");
    return Pattern.compile("(" + regex + ")");
}
public Object process(Page page) {
boolean matched = false;
for (Pattern targetPattern : targetUrlPatterns) {
if (targetPattern.matcher(page.getUrl().toString()).matches()) {
matched = true;
}
}
if (!matched) {
return null;
}
if (objectExtractor == null) {
return processSingle(page, null, true);
} else {
if (objectExtractor.multi) {
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy