// us.codecraft.webmagic.model.ModelPageProcessor (Maven / Gradle / Ivy) - the newest version
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* The extension to PageProcessor for page model extractor.
*
* @author code4crafter@gmail.com
* @since 0.2.0
*/
class ModelPageProcessor implements PageProcessor {

    /** One extractor per registered page model class, in registration order. */
    private final List<PageModelExtractor> pageModelExtractorList = new ArrayList<PageModelExtractor>();

    /** Site configuration handed back unchanged by {@link #getSite()}. */
    private final Site site;

    /** When {@code true} (the default), {@link #process(Page)} also queues matching links as new requests. */
    private boolean extractLinks = true;

    /**
     * Creates a processor for the given site and registers every supplied page model class.
     *
     * @param site   the site configuration returned by {@link #getSite()}
     * @param clazzs the page model classes to register, in order
     * @return the newly created processor
     */
    public static ModelPageProcessor create(Site site, Class<?>... clazzs) {
        ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site);
        for (Class<?> clazz : clazzs) {
            modelPageProcessor.addPageModel(clazz);
        }
        return modelPageProcessor;
    }

    /**
     * Registers one more page model class by building a {@link PageModelExtractor} for it.
     *
     * @param clazz the page model class to register
     * @return this processor, so calls can be chained
     */
    public ModelPageProcessor addPageModel(Class<?> clazz) {
        PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz);
        pageModelExtractorList.add(pageModelExtractor);
        return this;
    }

    /** Instances are obtained through {@link #create(Site, Class[])}. */
    private ModelPageProcessor(Site site) {
        this.site = site;
    }

    /**
     * Runs every registered extractor against the page.
     *
     * <p>For each extractor, and only while link extraction is enabled, links matching the
     * extractor's help-URL and target-URL patterns are added to the page as target requests.
     * The extractor is then applied to the page; a {@code null} or empty-list result is ignored,
     * any other result is passed to {@link #postProcessPageModel(Class, Object)} and stored as a
     * page field keyed by the model class's canonical name. If the page ends up with no result
     * entries at all, the skip flag is set on its result items.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
            if (extractLinks) {
                extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(),
                        pageModelExtractor.getHelpUrlPatterns());
                extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(),
                        pageModelExtractor.getTargetUrlPatterns());
            }
            Object extracted = pageModelExtractor.process(page);
            // Nothing extracted for this model: leave the page's fields untouched.
            if (extracted == null || (extracted instanceof List && ((List<?>) extracted).isEmpty())) {
                continue;
            }
            postProcessPageModel(pageModelExtractor.getClazz(), extracted);
            page.putField(pageModelExtractor.getClazz().getCanonicalName(), extracted);
        }
        if (page.getResultItems().getAll().isEmpty()) {
            page.getResultItems().setSkip(true);
        }
    }

    /**
     * Adds a target request to the page for every (link, pattern) pair where the pattern is
     * found in the link; the requested URL is the matched portion of the link (group 0).
     *
     * @param page              the page whose links are scanned and which receives the requests
     * @param urlRegionSelector region of the page to take links from, or {@code null} to use
     *                          all links of the page
     * @param urlPatterns       patterns a link must match to be queued
     */
    private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
        List<String> links;
        if (urlRegionSelector == null) {
            links = page.getHtml().links().all();
        } else {
            links = page.getHtml().selectList(urlRegionSelector).links().all();
        }
        for (String link : links) {
            for (Pattern urlPattern : urlPatterns) {
                Matcher matcher = urlPattern.matcher(link);
                if (matcher.find()) {
                    page.addTargetRequest(new Request(matcher.group(0)));
                }
            }
        }
    }

    /**
     * Hook invoked by {@link #process(Page)} for every non-empty extraction result, before the
     * result is stored on the page. The default implementation does nothing.
     *
     * @param clazz  the page model class the result was extracted for
     * @param object the extraction result as returned by the extractor
     */
    protected void postProcessPageModel(Class<?> clazz, Object object) {
    }

    /**
     * @return the site configuration supplied at creation time
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * @return {@code true} if {@link #process(Page)} queues links matching the models' URL patterns
     */
    public boolean isExtractLinks() {
        return extractLinks;
    }

    /**
     * Enables or disables link extraction during {@link #process(Page)}.
     *
     * @param extractLinks {@code true} to queue matching links, {@code false} to only extract models
     */
    public void setExtractLinks(boolean extractLinks) {
        this.extractLinks = extractLinks;
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy