
// mtons.spider.fetcher.impl.ListFetcher Maven / Gradle / Ivy
package mtons.spider.fetcher.impl;
import org.apache.log4j.Logger;
import org.jsoup.select.Elements;
import mtons.spider.fetcher.Fetcher;
import mtons.spider.http.Request;
import mtons.spider.http.Response;
import mtons.spider.http.Style;
import mtons.spider.kit.Kit;
import mtons.spider.selector.Selectable;
/**
 * List-page fetcher.
 *
 * <p>Implements the {@link Fetcher} interface and intercepts {@link #accept(Response)}:
 * responses are dispatched by the {@link Style} of their originating request, and the
 * {@link #onAccept(Response)} and {@link #onDetail()} hooks are left for subclasses to
 * implement.
 *
 * Created by langhsu on 16/6/15.
 */
public abstract class ListFetcher extends FetcherSupport implements Fetcher {
    /** Logger shared by fetchers; kept as a protected instance field so subclasses can use it. */
    protected Logger logger = Logger.getLogger("spider.fetcher");

    /**
     * Returns the stub name of this fetcher.
     *
     * @return the constant {@code "default"}
     */
    @Override
    public String getStub() {
        return "default";
    }

    /**
     * Dispatches a response according to the style of the request that produced it.
     *
     * <p>A {@link Style#DETAIL} response is handed to {@link #onAccept(Response)}; a
     * {@link Style#LIST} response is handed to {@link #parseDetail(Response)}. Responses
     * with any other style are ignored.
     *
     * @param response the response to process
     */
    @Override
    public void accept(Response response) {
        if (response.getRequest().getStyle() == Style.DETAIL) {
            onAccept(response);
        }
        if (response.getRequest().getStyle() == Style.LIST) {
            parseDetail(response);
        }
    }

    /**
     * Handles a detail-page response; implemented by the concrete business class.
     *
     * @param response the detail-page response
     */
    public abstract void onAccept(Response response);

    /**
     * Extracts detail-page links from a list-page response and registers each of them on the
     * response as a follow-up {@link Style#DETAIL} request.
     *
     * <p>Links are selected with the rule returned by {@link #onDetail()}. A link is skipped
     * when its raw {@code href} is rejected by {@code Kit.isNotEmptyUrl}, or when it cannot be
     * resolved to an absolute URL (jsoup yields an empty string for {@code abs:href} in that
     * case), so that no request with an empty URL is ever enqueued.
     *
     * @param response the list-page response
     */
    public final void parseDetail(Response response) {
        Selectable selectable = onDetail();
        if (selectable == null) {
            // A missing rule means there is nothing to select; report it instead of failing with an NPE.
            logger.warn("onDetail() returned null, no detail links parsed");
            return;
        }

        Elements els = selectable.accept(response.getDocument());
        if (els == null || els.isEmpty()) {
            return;
        }

        els.forEach(el -> {
            String href = el.attr("href");
            logger.debug("view - " + href);
            if (!Kit.isNotEmptyUrl(href)) {
                return;
            }

            // "abs:href" resolves against the document base URI and is empty when that fails.
            String absoluteUrl = el.attr("abs:href");
            if (absoluteUrl == null || absoluteUrl.isEmpty()) {
                return;
            }

            Request request = Request.get(Style.DETAIL, absoluteUrl);
            // Carry over state from the originating list request (as defined by Request.cloneFrom).
            request.cloneFrom(response.getRequest());
            response.addNextRequest(request);
        });
    }

    /**
     * Supplies the selection rule used to locate detail-page links on a list page.
     *
     * <p>Example:
     * <pre>
     * new Selectable() {
     *     public Elements accept(Document document) {
     *         return document.select("#id");
     *     }
     * }
     * </pre>
     *
     * @return the selector applied to the list-page document
     */
    public abstract Selectable onDetail();
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy