All downloads are free. Search and download functionality uses the official Maven repository.

org.seimicrawler.xpath.util.Scanner Maven / Gradle / Ivy

Go to download

一个非常好用而且强大的基于xpath的html解析器。html的DOM树生成依赖Jsoup。Lexer 和 Parser基于Antlr4,支持完备的W3C XPATH 1.0标准语法,W3C规范:http://www.w3.org/TR/1999/REC-xpath-19991116。

There is a newer version: 2.5.3
Show newest version
package org.seimicrawler.xpath.util;

import org.apache.commons.lang3.exception.ExceptionUtils;
import org.seimicrawler.xpath.core.AxisSelector;
import org.seimicrawler.xpath.core.Function;
import org.seimicrawler.xpath.core.NodeTest;
import org.seimicrawler.xpath.core.axis.AncestorOrSelfSelector;
import org.seimicrawler.xpath.core.axis.AncestorSelector;
import org.seimicrawler.xpath.core.axis.AttributeSelector;
import org.seimicrawler.xpath.core.axis.ChildSelector;
import org.seimicrawler.xpath.core.axis.DescendantOrSelfSelector;
import org.seimicrawler.xpath.core.axis.DescendantSelector;
import org.seimicrawler.xpath.core.axis.FollowingSelector;
import org.seimicrawler.xpath.core.axis.FollowingSiblingOneSelector;
import org.seimicrawler.xpath.core.axis.FollowingSiblingSelector;
import org.seimicrawler.xpath.core.axis.ParentSelector;
import org.seimicrawler.xpath.core.axis.PrecedingSelector;
import org.seimicrawler.xpath.core.axis.PrecedingSiblingOneSelector;
import org.seimicrawler.xpath.core.axis.PrecedingSiblingSelector;
import org.seimicrawler.xpath.core.axis.SelfSelector;
import org.seimicrawler.xpath.core.function.*;
import org.seimicrawler.xpath.core.node.AllText;
import org.seimicrawler.xpath.core.node.Html;
import org.seimicrawler.xpath.core.node.Node;
import org.seimicrawler.xpath.core.node.Num;
import org.seimicrawler.xpath.core.node.OuterHtml;
import org.seimicrawler.xpath.core.node.Text;
import org.seimicrawler.xpath.exception.NoSuchAxisException;
import org.seimicrawler.xpath.exception.NoSuchFunctionException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.Map;

/**
 * Static registry of the XPath building blocks: axis selectors, node tests and functions.
 *
 * <p>For broader compatibility, FastClasspathScanner was replaced by manual registration:
 * the built-in implementations are registered explicitly in the static initializer below,
 * and additional ones can be added through the {@code register*} / {@code init*} methods.
 * Each implementation is stored under the key returned by its own {@code name()} method.
 *
 * <p>The registries are plain {@link HashMap}s; this class performs no synchronization.
 *
 * @author github.com/zhegexiaohuozi [email protected]
 * @since 2018/2/28.
 */
public class Scanner {
    /** Registered axis selectors, keyed by {@code AxisSelector.name()}. */
    private static final Map<String, AxisSelector> axisSelectorMap = new HashMap<>();
    /** Registered node tests, keyed by {@code NodeTest.name()}. */
    private static final Map<String, NodeTest> nodeTestMap = new HashMap<>();
    /** Registered functions, keyed by {@code Function.name()}. */
    private static final Map<String, Function> functionMap = new HashMap<>();
    private static final Logger logger = LoggerFactory.getLogger(Scanner.class);

    static {
        // Manual registration of every built-in implementation (replaces classpath scanning).
        initAxis(AncestorOrSelfSelector.class,AncestorSelector.class,AttributeSelector.class,ChildSelector.class,DescendantOrSelfSelector.class,DescendantSelector.class,FollowingSelector.class,FollowingSiblingOneSelector.class,FollowingSiblingSelector.class,ParentSelector.class,PrecedingSelector.class,PrecedingSiblingOneSelector.class,PrecedingSiblingSelector.class,SelfSelector.class);
        initFunction(Concat.class,Contains.class,Count.class,First.class,Last.class,Not.class,Position.class,StartsWith.class,StringLength.class,SubString.class,SubStringAfter.class,SubStringBefore.class,SubStringEx.class, FormatDate.class,SubStringAfterLast.class,SubStringBeforeLast.class);
        initNode(AllText.class,Html.class,Node.class,Num.class,OuterHtml.class,Text.class);
    }

    /**
     * Looks up a registered axis selector.
     *
     * @param selectorName the key the selector was registered under (its {@code name()})
     * @return the registered selector, never {@code null}
     * @throws NoSuchAxisException if no selector is registered under {@code selectorName}
     */
    public static AxisSelector findSelectorByName(String selectorName) {
        AxisSelector selector = axisSelectorMap.get(selectorName);
        if (selector == null) {
            throw new NoSuchAxisException("not support axis: " + selectorName);
        }
        return selector;
    }

    /**
     * Looks up a registered node test.
     *
     * @param nodeTestName the key the node test was registered under (its {@code name()})
     * @return the registered node test, never {@code null}
     * @throws NoSuchFunctionException if no node test is registered under {@code nodeTestName}
     */
    public static NodeTest findNodeTestByName(String nodeTestName) {
        NodeTest nodeTest = nodeTestMap.get(nodeTestName);
        if (nodeTest == null) {
            throw new NoSuchFunctionException("not support nodeTest: " + nodeTestName);
        }
        return nodeTest;
    }

    /**
     * Looks up a registered function.
     *
     * @param funcName the key the function was registered under (its {@code name()})
     * @return the registered function, never {@code null}
     * @throws NoSuchFunctionException if no function is registered under {@code funcName}
     */
    public static Function findFunctionByName(String funcName) {
        Function function = functionMap.get(funcName);
        if (function == null) {
            throw new NoSuchFunctionException("not support function: " + funcName);
        }
        return function;
    }

    /**
     * Instantiates {@code func} through its no-argument constructor and registers the instance
     * under its {@code name()}, replacing any previous entry with the same name.
     *
     * <p>Registration is best-effort: any failure is logged and the registry is left unchanged.
     *
     * @param func the function implementation class to register
     */
    public static void registerFunction(Class<? extends Function> func){
        Function function;
        try {
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            function = func.getDeclaredConstructor().newInstance();
            functionMap.put(function.name(), function);
        } catch (Exception e) {
            logger.info(ExceptionUtils.getRootCauseMessage(e),e);
        }
    }

    /**
     * Instantiates {@code nodeTestClass} through its no-argument constructor and registers the
     * instance under its {@code name()}, replacing any previous entry with the same name.
     *
     * <p>Registration is best-effort: any failure is logged and the registry is left unchanged.
     *
     * @param nodeTestClass the node test implementation class to register
     */
    public static void registerNodeTest(Class<? extends NodeTest> nodeTestClass){
        NodeTest nodeTest;
        try {
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            nodeTest = nodeTestClass.getDeclaredConstructor().newInstance();
            nodeTestMap.put(nodeTest.name(), nodeTest);
        }  catch (Exception e) {
            logger.info(ExceptionUtils.getRootCauseMessage(e),e);
        }
    }

    /**
     * Instantiates {@code axisSelectorClass} through its no-argument constructor and registers
     * the instance under its {@code name()}, replacing any previous entry with the same name.
     *
     * <p>Registration is best-effort: any failure is logged and the registry is left unchanged.
     *
     * @param axisSelectorClass the axis selector implementation class to register
     */
    public static void registerAxisSelector(Class<? extends AxisSelector> axisSelectorClass){
        AxisSelector axisSelector;
        try {
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            axisSelector = axisSelectorClass.getDeclaredConstructor().newInstance();
            axisSelectorMap.put(axisSelector.name(), axisSelector);
        }  catch (Exception e) {
            logger.info(ExceptionUtils.getRootCauseMessage(e),e);
        }
    }

    /**
     * Registers every given axis selector class via {@link #registerAxisSelector(Class)}.
     *
     * @param cls the axis selector classes to register, in order
     */
    @SafeVarargs
    public static void initAxis(Class<? extends AxisSelector>... cls){
        for (Class<? extends AxisSelector> axis:cls){
            registerAxisSelector(axis);
        }
    }

    /**
     * Registers every given function class via {@link #registerFunction(Class)}.
     *
     * @param cls the function classes to register, in order
     */
    @SafeVarargs
    public static void initFunction(Class<? extends Function>... cls){
        for (Class<? extends Function> func:cls){
            registerFunction(func);
        }
    }

    /**
     * Registers every given node test class via {@link #registerNodeTest(Class)}.
     *
     * @param cls the node test classes to register, in order
     */
    @SafeVarargs
    public static void initNode(Class<? extends NodeTest>... cls){
        for (Class<? extends NodeTest> node:cls){
            registerNodeTest(node);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy