All downloads are free. Search and download functionality uses the official Maven repository.

us.codecraft.webmagic.selector.Xpath2Selector Maven / Gradle / Ivy

The newest version!
package us.codecraft.webmagic.selector;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import us.codecraft.webmagic.utils.BaseSelectorUtils;

import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;

/**
 * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* * @author [email protected], hooy
* Date: 13-4-21 * Time: 上午9:39 */ public class Xpath2Selector implements Selector, NodeSelector { private final String xpathStr; private XPathExpression xPathExpression; private final Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; try { init(); } catch (XPathExpressionException e) { throw new IllegalArgumentException("XPath error!", e); } } public static Xpath2Selector newInstance(String xpathStr) { return new Xpath2Selector(xpathStr); } enum XPath2NamespaceContext implements NamespaceContext { INSTANCE; private final Map prefix2NamespaceMap = new ConcurrentHashMap<>(); private final Map> namespace2PrefixMap = new ConcurrentHashMap<>(); private void put(String prefix, String namespaceURI) { prefix2NamespaceMap.put(prefix, namespaceURI); List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>()); prefixes.add(prefix); } XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); put("xhtml", NamespaceConstant.XHTML); } @Override public String getNamespaceURI(String prefix) { return prefix2NamespaceMap.get(prefix); } @Override public String getPrefix(String namespaceURI) { List prefixes = namespace2PrefixMap.get(namespaceURI); if (prefixes == null || prefixes.size() < 1) { return null; } return prefixes.get(0); } @Override public Iterator getPrefixes(String namespaceURI) { List prefixes = namespace2PrefixMap.get(namespaceURI); if (prefixes == null || prefixes.size() < 1) { return null; } return prefixes.iterator(); } } private void init() throws XPathExpressionException { XPathEvaluator xPathEvaluator = new XPathEvaluator(); xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE); xPathExpression = xPathEvaluator.compile(xpathStr); } @Override public String select(String text) { try { Document doc = parse(text); return select(doc); } catch (Exception e) { logger.error("select text error! 
" + xpathStr, e); } return null; } @Override public String select(Node node) { try { return (String) xPathExpression.evaluate(node, XPathConstants.STRING); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } @Override public List selectList(String text) { try { Document doc = parse(text); return selectList(doc); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } @Override public List selectList(Node node) { try { NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); List nodes = NodeListToArrayList(result); return nodesToStrings(nodes); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public Node selectNode(String text) { try { Document doc = parse(text); return selectNode(doc); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public Node selectNode(Node node) { try { return (Node) xPathExpression.evaluate(node, XPathConstants.NODE); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public List selectNodes(String text) { try { Document doc = parse(text); return selectNodes(doc); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public List selectNodes(Node node) { try { NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); return NodeListToArrayList(result); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } protected static Document parse(String text) throws ParserConfigurationException { // HtmlCleaner could not parse or tag directly text = BaseSelectorUtils.preParse(text); HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); return new DomSerializer(new CleanerProperties()).createDOM(tagNode); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy