All Downloads are FREE. Search and download functionality uses the official Maven repository.

us.codecraft.webmagic.model.ModelPageProcessor Maven / Gradle / Ivy

The newest version!
package us.codecraft.webmagic.model;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The extension to PageProcessor for page model extractor.
 *
 * @author [email protected] 
* @since 0.2.0 */ class ModelPageProcessor implements PageProcessor { private List pageModelExtractorList = new ArrayList(); private Site site; private boolean extractLinks = true; public static ModelPageProcessor create(Site site, Class... clazzs) { ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { modelPageProcessor.addPageModel(clazz); } return modelPageProcessor; } public ModelPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); pageModelExtractorList.add(pageModelExtractor); return this; } private ModelPageProcessor(Site site) { this.site = site; } @Override public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { if (extractLinks) { extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); } Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { continue; } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } if (page.getResultItems().getAll().size() == 0) { page.getResultItems().setSkip(true); } } private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { List links; if (urlRegionSelector == null) { links = page.getHtml().links().all(); } else { links = page.getHtml().selectList(urlRegionSelector).links().all(); } for (String link : links) { for (Pattern targetUrlPattern : urlPatterns) { Matcher matcher = targetUrlPattern.matcher(link); if (matcher.find()) { page.addTargetRequest(new Request(matcher.group(0))); } } } } protected void postProcessPageModel(Class clazz, Object object) { } @Override public Site getSite() { return site; } public boolean 
isExtractLinks() { return extractLinks; } public void setExtractLinks(boolean extractLinks) { this.extractLinks = extractLinks; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy