All downloads are free. The search and download functionality uses the official Maven repository.

r.blog-hunter.1.0.4.source-code.HunterConfig.json Maven / Gradle / Ivy

Go to download

博客猎手,基于webMagic的博客爬取工具,支持慕课、csdn、iteye、cnblogs、掘金和V2EX等各大主流博客平台。博客千万篇,版权第一条。狩猎不规范,亲人两行泪。

The newest version!
{
  "imooc": {
    "domain": "www.imooc.com",
    "titleRegex": "//span[@class=js-title]/html()",
    "authorRegex": "//div[@class=name_con]/p[@class=name]/a[@class=nick]/html()",
    "releaseDateRegex": "//div[@class='dc-profile']/div[@class='l']/span[@class='spacer']/text()",
    "contentRegex": "//div[@class=detail-content]/html()",
    "tagRegex": "//div[@class=cat-box]/div[@class=cat-wrap]/a[@class=cat]/html()",
    "descriptionRegex": "//meta[@name=Description]/@content",
    "targetLinksRegex": "/article/[0-9]{1,10}",
    "header": [
      "Host=www.imooc.com",
      "Referer=https://www.imooc.com"
    ],
    "entryUrls": [
      "https://www.imooc.com/u/{uid}/articles?page=1"
    ]
  },
  "csdn": {
    "domain": "blog.csdn.net",
    "titleRegex": "//h1[@class=title-article]/html()",
    "authorRegex": "//a[@class=follow-nickName]/html()",
    "releaseDateRegex": "//div[@class=article-bar-top]/div[@class='bar-content']/span[@class=time]/html()",
    "contentRegex": "//div[@id=content_views]/html()",
    "tagRegex": "//span[@class=artic-tag-box]/a[@class=tag-link]/html()",
    "targetLinksRegex": "(((http|ftp|https):\\/\\/[0-9a-zA-Z]{1,15}\\.blog\\.csdn\\.net/article/details/[0-9a-zA-Z]{1,15})|((http|ftp|https):\\/\\/blog\\.csdn\\.net/{uid}/article/details/[0-9a-zA-Z]{1,15}))",
    "header": [
      "Host=blog.csdn.net",
      "Referer=https://blog.csdn.net/{uid}/article/list/1"
    ],
    "entryUrls": [
      "https://blog.csdn.net/{uid}/article/list/1"
    ]
  },
  "iteye": {
    "domain": "{uid}.iteye.com",
    "titleRegex": "//div[@class=blog_title]/h3/text()",
    "authorRegex": "//div[@id=blog_owner_name]/html()",
    "releaseDateRegex": "//div[@class=blog_bottom]/ul/li[1]/html()",
    "contentRegex": "//div[@class=iteye-blog-content-contain]/html()",
    "tagRegex": "//div[@class=news_tag]/a/html()",
    "targetLinksRegex": ".*{uid}\\.iteye\\.com/blog/[0-9]+",
    "header": [
      "Host={uid}.iteye.com",
      "Referer=http://{uid}.iteye.com/"
    ],
    "entryUrls": [
      "http://{uid}.iteye.com/?page=1"
    ]
  },
  "cnblogs": {
    "domain": "www.cnblogs.com",
    "titleRegex": "//a[@id=cb_post_title_url]/html()",
    "authorRegex": "//div[@class=postDesc]/a[1]/html()",
    "releaseDateRegex": "//span[@id=post-date]/html()",
    "contentRegex": "//div[@id=cnblogs_post_body]/html()",
    "tagRegex": "//div[@id=EntryTag]/a/html()",
    "descriptionRegex": "//meta[@property=\"og:description\"]/@content",
    "targetLinksRegex": ".*www\\.cnblogs\\.com/{uid}/p/[\\w\\d]+\\.html",
    "header": [
      "Host=www.cnblogs.com",
      "Referer=https://www.cnblogs.com/"
    ],
    "entryUrls": [
      "https://www.cnblogs.com/{uid}/default.html?page=1"
    ]
  },
  "juejin": {
    "domain": "juejin.im",
    "titleRegex": "//h1[@class=article-title]/html()",
    "authorRegex": "//div[@itemprop=author]/meta[@itemprop=\"name\"]/@content",
    "releaseDateRegex": "//meta[@itemprop=\"datePublished\"]/@content",
    "contentRegex": "//div[@class=article-content]/html()",
    "tagRegex": "//div[@class=tag-title]/html()",
    "targetLinksRegex": ".*juejin\\.im/post/[\\w\\d]+",
    "header": [
      "Host=juejin.im",
      "Referer=https://juejin.im"
    ],
    "entryUrls": [
      "https://juejin.im/user/{uid}/posts"
    ]
  },
  "v2ex": {
    "domain": "v2ex.com",
    "titleRegex": "//*[@id=Main]/div[@class=box]/div[@class=header]/h1/html()",
    "authorRegex": "//*[@id=Main]/div[@class=box]/div[@class=header]/small/a/html()",
    "releaseDateRegex": "//meta[@property=\"article:published_time\"]/@content",
    "contentRegex": "//div[@class=markdown_body]/html()",
    "tagRegex": "//*[@id=\"Main\"]/div[6]/div/a/html()",
    "descriptionRegex": "//meta[@property=\"og:description\"]/@content",
    "targetLinksRegex": ".*www\\.v2ex\\.com/t/[\\w\\d]+",
    "header": [
      "Host=www.v2ex.com",
      "Referer=https://www.v2ex.com"
    ],
    "entryUrls": [
      "https://www.v2ex.com/member/{uid}"
    ]
  },
  "oschina": {
    "domain": "oschina.net",
    "titleRegex": "//h1[@class=article-box__title]/a/text()",
    "authorRegex": "//div[@class=article-box__meta]/div[@class=item-list]/div[2]/a/html()",
    "releaseDateRegex": "//div[@class=article-box__meta]/div[@class=item-list]/div[4]/html()",
    "contentRegex": "//div[@class=content]/html()",
    "tagRegex": "//div[@class=tags-box]/div[@class=tags-box__inner]/a/html()",
    "targetLinksRegex": "https://my\\.oschina\\.net/.*/blog/[0-9]{1,10}",
    "header": [
      "Host=my.oschina.net",
      "Referer=https://my.oschina.net"
    ],
    "entryUrls": [
      "https://my.oschina.net/{uid}",
      "https://my.oschina.net/u/{uid}"
    ]
  },
  "jianshu": {
    "resolver": {
      "releaseDate": {
        "type": "regex",
        "clazz": "java.lang.Long",
        "operator": "* 1000"
      }
    },
    "domain": "jianshu.com",
    "titleRegex": "//h1[@class=_1RuRku]/text()",
    "authorRegex": "//span[@class=_22gUMi]/html()",
    "releaseDateRegex": ".*\"first_shared_at\":([0-9]+),.*",
    "contentRegex": "//article[@class=_2rhmJa]/html()",
    "tagRegex": "//div",
    "targetLinksRegex": "/p/[0-9a-zA-Z]{1,15}",
    "header": [
      "Host=www.jianshu.com",
      "Referer=https://www.jianshu.com/p/{uid}"
    ],
    "entryUrls": [
      "https://www.jianshu.com/p/{uid}",
      "https://www.jianshu.com/u/{uid}"
    ]
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy