All downloads are free. Search and download functionality uses the official Maven repository.

us.codecraft.webmagic.example.GithubRepoPageMapper Maven / Gradle / Ivy

The newest version!
package us.codecraft.webmagic.example;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.PageMapper;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * @author [email protected] 
* @since 0.3.2 */ public class GithubRepoPageMapper implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(0); private PageMapper githubRepoPageMapper = new PageMapper(GithubRepo.class); @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); GithubRepo githubRepo = githubRepoPageMapper.get(page); if (githubRepo == null) { page.setSkip(true); } else { page.putField("repo", githubRepo); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new GithubRepoPageMapper()).addUrl("https://github.com/code4craft").thread(5).run(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy