All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cienter.webmagic-scripts.v2020.6.17.source-code.Github.kt Maven / Gradle / Ivy

The newest version!

import us.codecraft.webmagic.Page
import us.codecraft.webmagic.Site
import us.codecraft.webmagic.Spider
import us.codecraft.webmagic.processor.PageProcessor
import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor

/**
 * WebMagic [PageProcessor] that follows GitHub repository and user links and records,
 * for each page, the fields `author`, `name` and `readme`.
 *
 * Pages on which no repository name can be extracted are marked as skipped.
 *
 * @author [email protected]
 * Date: 2017/5/31
 * Time: 11:33 PM
 *
 */
class GithubRepoPageProcessor : PageProcessor {

    // Crawl configuration handed back to the framework through getSite().
    // NOTE(review): sleep time and timeout are presumably milliseconds — confirm against the WebMagic docs.
    private val site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000)

    /**
     * Queues further GitHub links found on [page] and stores the extracted fields on it.
     *
     * @param page the downloaded page supplied by the framework.
     */
    override fun process(page: Page) {
        // Queue both repository pages (owner/repo) and owner pages for crawling.
        val links = page.html.links()
        page.addTargetRequests(links.regex(REPO_URL_PATTERN).all())
        page.addTargetRequests(links.regex(OWNER_URL_PATTERN).all())

        page.putField("author", page.url.regex(AUTHOR_PATTERN).toString())
        page.putField("name", page.html.xpath("//h1[@class='public']/strong/a/text()").toString())
        if (page.resultItems.get("name") == null) {
            // No repository name was extracted from this page, so mark it as skipped.
            page.setSkip(true)
        }
        page.putField("readme", page.html.xpath("//div[@id='readme']/tidyText()"))
    }

    /** Returns the crawl configuration used by this processor. */
    override fun getSite(): Site {
        return site
    }

    companion object {
        // Owner and repository names may contain hyphens, so every pattern uses the same [\w\-]+ class.
        private const val REPO_URL_PATTERN = "(https://github\\.com/[\\w\\-]+/[\\w\\-]+)"

        // The '+' matters: without it only a single character after the host would be captured.
        private const val OWNER_URL_PATTERN = "(https://github\\.com/[\\w\\-]+)"

        private const val AUTHOR_PATTERN = "https://github\\.com/([\\w\\-]+)/.*"

        /**
         * Starts a crawl with 5 threads from the `code4craft` GitHub page.
         *
         * @param args command-line arguments (unused).
         */
        @JvmStatic fun main(args: Array<String>) {
            // NOTE(review): this file also imports a class with the same simple name from
            // us.codecraft.webmagic.processor.example; verify that this call resolves to the
            // class declared here and not to the imported example.
            Spider.create(GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run()
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy