com.yishuifengxiao.common.crawler.content.ContentExtract Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of crawler Show documentation

风铃虫-一个超级棒的爬虫内核

There is a newer version: 2.3.2

package com.yishuifengxiao.common.crawler.content;

import java.util.List;

import com.yishuifengxiao.common.crawler.domain.entity.Page;
import com.yishuifengxiao.common.crawler.domain.model.ContentRule;
import com.yishuifengxiao.common.crawler.domain.model.ExtractRule;
import com.yishuifengxiao.common.tool.exception.ServiceException;

/**
 * Content extractor.
 * <p>
 * Used to pull the desired target data out of a web page.
 * <p>
 * The core usage pattern for an implementation is:
 *
 * <pre>{@code
 * // obtain the raw source of the web page
 * String rawtxt = page.getRawTxt();
 * // run the parsing logic
 * Object value = ...; // the data that was parsed out
 * // store the result under a key that is unique within the instance
 * page.addResultItem(key, value);
 * }</pre>
 *
 * @author yishui
 * @date 2019-11-26
 * @version 1.0.0
 */
public interface ContentExtract {

	/**
	 * Extracts all data matching the given rules from the content of a web page.
	 *
	 * @param contentRule the content parsing rule
	 * @param rules       the content extraction rules; typed as
	 *                    {@code List<ExtractRule>} rather than a raw {@code List}
	 *                    so that callers and implementers get compile-time
	 *                    checking of the element type
	 * @param page        the web page object
	 * @throws ServiceException if a problem is encountered while parsing
	 */
	void extract(ContentRule contentRule, List<ExtractRule> rules, Page page) throws ServiceException;
}