All downloads are free. Search and download functionality uses the official Maven repository.

com.saucesubfresh.starter.crawler.pipeline.AbstractParserPipeline Maven / Gradle / Ivy

package com.saucesubfresh.starter.crawler.pipeline;

import com.saucesubfresh.starter.crawler.domain.FieldExtractor;
import com.saucesubfresh.starter.crawler.domain.SpiderRequest;
import com.saucesubfresh.starter.crawler.domain.SpiderResponse;
import com.saucesubfresh.starter.crawler.enums.ExpressionType;
import com.saucesubfresh.starter.crawler.parser.ElementSelector;
import com.saucesubfresh.starter.crawler.parser.provider.JsonPathSelector;
import com.saucesubfresh.starter.crawler.utils.ExtractorUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Abstract base class for parser pipelines. According to the original author's notes,
 * subclass implementations include:
 * <ol>
 *   <li>dynamic rule parsing (automatic mode)</li>
 *   <li>class-annotation-based parsing (manual mode)</li>
 * </ol>
 *
 * <p>{@link #process(SpiderRequest, SpiderResponse)} delegates to
 * {@link #doParse(SpiderRequest, SpiderResponse)} and stores whatever it returns on the
 * response. {@link #parseJson(String, List)} and {@link #parseHtml(String, List)} are
 * helpers that subclasses may call to turn a body into a field-name-to-value map.
 *
 * @author 李俊平
 * Date: 2022-04-19 22:49
 */
public abstract class AbstractParserPipeline implements ParserPipeline {

    /**
     * Runs the subclass-specific parse step and records its result on the response.
     *
     * @param request  the request handed to {@link #doParse(SpiderRequest, SpiderResponse)}
     * @param response the response handed to {@code doParse}; its parse result is set to
     *                 the map that {@code doParse} returns
     */
    @Override
    public void process(SpiderRequest request, SpiderResponse response) {
        Map<String, Object> parseResult = doParse(request, response);
        response.setParseResult(parseResult);
    }

    /**
     * Extracts fields from a JSON document.
     *
     * <p>Each extractor's expression value is evaluated through a {@link JsonPathSelector}
     * built from {@code json}. When {@link FieldExtractor#isMulti()} is true the list
     * returned by {@code selectList} is stored; otherwise the single string returned by
     * {@code select} is stored.
     *
     * @param json            the JSON text to query
     * @param fieldExtractors the extraction rules, one per output field; must not be
     *                        {@code null} (the method iterates it directly)
     * @return a new mutable map from each extractor's field name to the extracted value;
     *         a later extractor with the same field name overwrites an earlier one
     */
    protected Map<String, Object> parseJson(String json, final List<FieldExtractor> fieldExtractors){
        Map<String, Object> fields = new HashMap<>();
        JsonPathSelector jsonPathSelector = new JsonPathSelector(json);
        for (FieldExtractor extractor : fieldExtractors) {
            String expressionValue = extractor.getExpressionValue();
            Object value;
            if (extractor.isMulti()) {
                // List<?> keeps this independent of the selector's element type.
                List<?> results = jsonPathSelector.selectList(expressionValue);
                value = results;
            } else {
                String result = jsonPathSelector.select(expressionValue);
                value = result;
            }
            fields.put(extractor.getFieldName(), value);
        }
        return fields;
    }

    /**
     * Extracts fields from an HTML page.
     *
     * <p>The page is parsed once with {@link Jsoup#parse(String)}. For each extractor, a
     * selector is obtained from {@link ExtractorUtils} using the extractor's expression
     * type and expression value, and is applied to the parsed document. When
     * {@link FieldExtractor#isMulti()} is true the list returned by {@code selectList}
     * is stored; otherwise the single string returned by {@code select} is stored.
     *
     * @param html            the HTML text to parse
     * @param fieldExtractors the extraction rules, one per output field; must not be
     *                        {@code null} (the method iterates it directly)
     * @return a new mutable map from each extractor's field name to the extracted value;
     *         a later extractor with the same field name overwrites an earlier one
     */
    protected Map<String, Object> parseHtml(String html, final List<FieldExtractor> fieldExtractors){
        Map<String, Object> fields = new HashMap<>();
        Document document = Jsoup.parse(html);
        for (FieldExtractor extractor : fieldExtractors) {
            String expression = extractor.getExpressionValue();
            ExpressionType type = ExpressionType.of(extractor.getExpressionType());
            // The document-based API used below lives on ElementSelector, hence the cast.
            ElementSelector selector = (ElementSelector) ExtractorUtils.getSelector(type, expression);
            Object value;
            if (extractor.isMulti()) {
                // List<?> keeps this independent of the selector's element type.
                List<?> results = selector.selectList(document);
                value = results;
            } else {
                String result = selector.select(document);
                value = result;
            }
            fields.put(extractor.getFieldName(), value);
        }
        return fields;
    }

    /**
     * Performs the concrete parse for one request/response pair.
     *
     * @param request  the request being processed
     * @param response the response being processed
     * @return the parsed fields; {@link #process(SpiderRequest, SpiderResponse)} passes
     *         this map to {@code response.setParseResult}
     */
    protected abstract Map<String, Object> doParse(SpiderRequest request, SpiderResponse response);

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy