
me.zhyd.hunter.resolver.JsonResolver Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of blog-hunter Show documentation
Show all versions of blog-hunter Show documentation
博客猎手,基于webMagic的博客爬取工具,支持慕课、csdn、iteye、cnblogs、掘金和V2EX等各大主流博客平台。博客千万篇,版权第一条。狩猎不规范,亲人两行泪。
The newest version!
package me.zhyd.hunter.resolver;
import me.zhyd.hunter.config.HunterConfig;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.selector.JsonPathSelector;
/**
 * Resolver for pages whose content is rendered via Ajax, i.e. where the raw
 * response text is handed to JSONPath selectors (work in progress).
 *
 * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
 * @version 1.0
 */
public class JsonResolver implements Resolver {

    /**
     * Extracts article fields from the page's raw text and queues follow-up links.
     *
     * <p>The title is selected first. Only when it is neither empty nor the
     * literal string {@code "null"} are the fields {@code title},
     * {@code releaseDate}, {@code author}, {@code content} and {@code source}
     * stored on the page. Link discovery runs in either case.
     *
     * <p>Note: the {@code get*Regex()} values read from {@code model} for the
     * title, release date, author and content are passed to
     * {@link JsonPathSelector} here, so despite the getter names they are used
     * as selector expressions rather than as regular expressions.
     *
     * @param page  the page being processed; receives the extracted fields and target requests
     * @param model the configuration supplying the selector expressions and the target-link regex
     */
    @Override
    public void process(Page page, HunterConfig model) {
        final String json = page.getRawText();
        final String extractedTitle = extract(model.getTitleRegex(), json);

        final boolean titleUnusable = StringUtils.isEmpty(extractedTitle) || "null".equals(extractedTitle);
        if (!titleUnusable) {
            page.putField("title", extractedTitle);
            page.putField("releaseDate", extract(model.getReleaseDateRegex(), json));
            page.putField("author", extract(model.getAuthorRegex(), json));
            page.putField("content", extract(model.getContentRegex(), json));
            page.putField("source", page.getRequest().getUrl());
        }

        // Follow-up links are collected whether or not a usable title was found.
        page.addTargetRequests(
                page.getHtml()
                        .links()
                        .regex(model.getTargetLinksRegex())
                        .all());
    }

    /**
     * Builds a {@link JsonPathSelector} from the given expression and applies it to the text.
     *
     * @param expression the selector expression taken from the configuration
     * @param json       the raw page text to select from
     * @return whatever the selector yields for {@code json}
     */
    private static String extract(String expression, String json) {
        return new JsonPathSelector(expression).select(json);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy