All downloads are free. The search and download functionality uses the official Maven repository.
Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download or modify it as often as you want.
cn.wanghaomiao.xpath.core.XpathEvaluator Maven / Gradle / Ivy
Go to download
一个非常好用而且强大的基于xpath的html解析器。html的DOM树生成依赖Jsoup。Lexer 和 Parser基于Antlr4,支持完备的W3C XPATH 1.0标准语法,W3C规范:http://www.w3.org/TR/1999/REC-xpath-19991116。
package cn.wanghaomiao.xpath.core;
import cn.wanghaomiao.xpath.exception.NoSuchAxisException;
import cn.wanghaomiao.xpath.exception.NoSuchFunctionException;
import cn.wanghaomiao.xpath.model.Node;
import cn.wanghaomiao.xpath.model.Predicate;
import cn.wanghaomiao.xpath.util.CommonUtil;
import cn.wanghaomiao.xpath.util.ScopeEm;
import org.apache.commons.lang.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.lang.reflect.Method;
import java.util.LinkedList;
import java.util.List;
/**
* @author 汪浩淼 [ [email protected] ]
* @since 14-3-12 下午3:42
*/
public class XpathEvaluator {
/**
 * Main entry point of the XPath evaluator. Pre-processes the union
 * operator '|': the expression is split into its branches, every non-empty
 * branch is evaluated against {@code root} with {@link #evaluate}, and the
 * branch results are concatenated in branch order.
 *
 * @param xpath the XPath expression, possibly a union of several expressions
 * @param root  the elements the expression is evaluated against
 * @return the concatenated results of all branches
 * @throws NoSuchAxisException     if {@link #evaluate} rejects an axis
 * @throws NoSuchFunctionException if {@link #evaluate} rejects a function
 */
public List xpathParser(String xpath,Elements root) throws NoSuchAxisException, NoSuchFunctionException {
    // No union operator at all: hand the expression over untouched.
    if (!xpath.contains("|")) {
        return evaluate(xpath, root);
    }
    List<Object> rs = new LinkedList<Object>();
    for (String branch : splitUnion(xpath)) {
        // Trim BEFORE the emptiness check so that a whitespace-only branch
        // (e.g. the tail of "//a | ") is skipped instead of being evaluated
        // as an empty expression.
        String trimmed = branch.trim();
        if (trimmed.length() > 0) {
            rs.addAll(evaluate(trimmed, root));
        }
    }
    return rs;
}

/**
 * Splits a union expression on every '|' that is not inside a single- or
 * double-quoted string literal, so that literals such as
 * {@code '(a|b)'} stay intact.
 */
private static List<String> splitUnion(String xpath) {
    List<String> branches = new LinkedList<String>();
    StringBuilder current = new StringBuilder();
    char quote = 0; // 0 = outside a literal, otherwise the opening quote char
    for (int i = 0; i < xpath.length(); i++) {
        char c = xpath.charAt(i);
        if (quote != 0) {
            // Inside a literal only the matching quote character closes it.
            if (c == quote) {
                quote = 0;
            }
            current.append(c);
        } else if (c == '\'' || c == '"') {
            quote = c;
            current.append(c);
        } else if (c == '|') {
            branches.add(current.toString());
            current.setLength(0);
        } else {
            current.append(c);
        }
    }
    branches.add(current.toString());
    return branches;
}
/**
 * Parses an XPath expression by driving a fresh
 * {@code NodeTreeBuilderStateMachine} until it reaches its {@code END}
 * state.
 *
 * @param xpath the XPath expression to parse
 * @return the node list held in the state machine's context
 *         ({@code context.xpathTr}) once parsing has finished
 */
public List getXpathNodeTree(String xpath){
    NodeTreeBuilderStateMachine st = new NodeTreeBuilderStateMachine();
    // toCharArray() copies the whole string, so convert once instead of on
    // every state transition.
    // NOTE(review): assumes parser(...) only reads the array - confirm.
    char[] chars = xpath.toCharArray();
    while (st.state != NodeTreeBuilderStateMachine.BuilderState.END){
        st.state.parser(st, chars);
    }
    return st.context.xpathTr;
}
/**
* 根据xpath求出结果
* @param xpath
* @param root
* @return
*/
public List evaluate(String xpath,Elements root) throws NoSuchAxisException, NoSuchFunctionException {
List res = new LinkedList();
Elements context = root;
List xpathNodes=getXpathNodeTree(xpath);
for (int i=0;i contextTmp = new LinkedList();
if (n.getScopeEm()== ScopeEm.RECURSIVE||n.getScopeEm()==ScopeEm.CURREC){
if (n.getTagName().startsWith("@")){
for (Element e:context){
//处理上下文自身节点
String key = n.getTagName().substring(1);
if (key.equals("*")){
res.add(e.attributes().toString());
}else {
String value = e.attr(key);
if (StringUtils.isNotBlank(value)){
res.add(value);
}
}
//处理上下文子代节点
for (Element dep:e.getAllElements()){
if (key.equals("*")){
res.add(dep.attributes().toString());
}else {
String value = dep.attr(key);
if (StringUtils.isNotBlank(value)){
res.add(value);
}
}
}
}
}else if (n.getTagName().endsWith("()")){
//递归执行方法默认只支持text()
res.add(context.text());
}else {
Elements searchRes = context.select(n.getTagName());
for (Element e:searchRes){
Element filterR = filter(e,n);
if (filterR!=null){
contextTmp.add(filterR);
}
}
context = new Elements(contextTmp);
}
}else {
if (n.getTagName().startsWith("@")){
for (Element e:context){
String key = n.getTagName().substring(1);
if (key.equals("*")){
res.add(e.attributes().toString());
}else {
String value = e.attr(key);
if (StringUtils.isNotBlank(value)){
res.add(value);
}
}
}
}else if (n.getTagName().endsWith("()")){
res = (List) callFunc(n.getTagName().substring(0,n.getTagName().length()-2),context);
}else {
for (Element e:context){
Elements filterScope = e.children();
if (StringUtils.isNotBlank(n.getAxis())){
filterScope = getAxisScopeEls(n.getAxis(),e);
}
for (Element chi:filterScope){
Element fchi=filter(chi,n);
if (fchi!=null){
contextTmp.add(fchi);
}
}
}
context=new Elements(contextTmp);
if (i==xpathNodes.size()-1){
res.addAll(contextTmp);
}
}
}
}
return res;
}
/**
 * Element filter: decides whether an element satisfies an XPath step.
 *
 * The element passes when the step's tag name is "*" or equals the
 * element's node name and, if the step carries a predicate, that predicate
 * holds for the element.
 *
 * @param e    the candidate element
 * @param node the XPath step (tag name plus optional predicate)
 * @return {@code e} when it passes, otherwise {@code null}
 * @throws NoSuchFunctionException if the predicate uses an unsupported function
 * @throws NoSuchAxisException     if a nested XPath in the predicate uses an
 *                                 unsupported axis
 */
public Element filter(Element e,Node node) throws NoSuchFunctionException, NoSuchAxisException {
    String tagName = node.getTagName();
    if (!tagName.equals("*") && !tagName.equals(e.nodeName())) {
        return null;
    }
    Predicate p = node.getPredicate();
    if (p == null) {
        return e;
    }
    return matchesPredicate(e, p) ? e : null;
}

/**
 * Evaluates a predicate for one element. A result that is not a Boolean
 * {@code true} counts as "no match" instead of raising a
 * ClassCastException or NullPointerException.
 */
private boolean matchesPredicate(Element e, Predicate p) throws NoSuchFunctionException, NoSuchAxisException {
    if (p.getOpEm() == null) {
        // No operator: the predicate is either a plain index or a function call.
        String value = p.getValue();
        if (value.matches("\\d+") && getElIndex(e) == Integer.parseInt(value)) {
            return true;
        }
        // todo p.value ~= contains(./@href,'renren.com')
        return value.endsWith("()")
                && Boolean.TRUE.equals(callFilterFunc(value.substring(0, value.length() - 2), e));
    }

    String left = p.getLeft();
    if (left.matches("[^/]+\\(\\)")) {
        // Left operand is a function call on the element.
        String funcName = left.substring(0, left.length() - 2);
        Object filterRes = p.getOpEm().excute(callFilterFunc(funcName, e).toString(), p.getRight());
        if (filterRes instanceof Boolean) {
            return (Boolean) filterRes;
        }
        // An Integer result is compared with the element's sibling index.
        return filterRes instanceof Integer && e.siblingIndex() == ((Integer) filterRes).intValue();
    }
    if (left.startsWith("@")) {
        // Left operand is an attribute of the element.
        String lValue = e.attr(left.substring(1));
        return Boolean.TRUE.equals(p.getOpEm().excute(lValue, p.getRight()));
    }
    // Neither a function nor an attribute: treat the left operand as a
    // nested XPath evaluated with the element as its only context, and
    // compare the concatenated results.
    List<Element> self = new LinkedList<Element>();
    self.add(e);
    List nested = evaluate(left, new Elements(self));
    return Boolean.TRUE.equals(p.getOpEm().excute(StringUtils.join(nested, ""), p.getRight()));
}
/**
 * Invokes the axis selector for the given axis.
 *
 * The axis name is converted to a Java method name with
 * {@code CommonUtil.getJMethodNameFromStr}; the method of that name taking
 * one {@code Element} is looked up on {@code AxisSelector} and invoked
 * reflectively on the shared selector instance.
 *
 * @param axis the axis name as written in the XPath
 * @param e    the element the axis starts from
 * @return the elements produced by the axis method
 * @throws NoSuchAxisException if no such axis method exists or invoking it fails
 */
public Elements getAxisScopeEls(String axis,Element e) throws NoSuchAxisException {
    try {
        String functionName = CommonUtil.getJMethodNameFromStr(axis);
        Method axisSelector = AxisSelector.class.getMethod(functionName, Element.class);
        return (Elements) axisSelector.invoke(SingletonProducer.getInstance().getAxisSelector(),e);
    }catch (NoSuchMethodException e1) {
        throw new NoSuchAxisException("this axis is not supported,plase use other instead of '"+axis+"'");
    } catch (Exception e2) {
        // A reflective failure wraps the real problem and usually has a
        // null message of its own, so report the underlying cause.
        Throwable cause = e2.getCause() != null ? e2.getCause() : e2;
        throw new NoSuchAxisException("axis '" + axis + "' could not be evaluated: " + cause);
    }
}
/**
 * Invokes a function that appears as a step on the main XPath.
 *
 * The method named {@code funcname} taking one {@code Elements} argument is
 * looked up on {@code Functions} and invoked reflectively on the shared
 * functions instance.
 *
 * @param funcname the function name without the trailing "()"
 * @param context  the current context elements
 * @return whatever the function returns
 * @throws NoSuchFunctionException if no such function exists or invoking it fails
 */
public Object callFunc(String funcname,Elements context) throws NoSuchFunctionException {
    try {
        Method function = Functions.class.getMethod(funcname,Elements.class);
        return function.invoke(SingletonProducer.getInstance().getFunctions(),context);
    } catch (NoSuchMethodException e) {
        throw new NoSuchFunctionException("This function is not supported");
    } catch (Exception e1) {
        // A reflective failure wraps the real problem and usually has a
        // null message of its own, so report the underlying cause.
        Throwable cause = e1.getCause() != null ? e1.getCause() : e1;
        throw new NoSuchFunctionException("function '" + funcname + "' failed: " + cause);
    }
}
/**
 * Invokes a function that appears inside a predicate.
 *
 * The method named {@code funcname} taking one {@code Element} argument is
 * looked up on {@code Functions} and invoked reflectively on the shared
 * functions instance.
 *
 * @param funcname the function name without the trailing "()"
 * @param el       the element the predicate is being tested on
 * @return whatever the function returns
 * @throws NoSuchFunctionException if no such function exists or invoking it fails
 */
public Object callFilterFunc(String funcname,Element el) throws NoSuchFunctionException {
    try {
        Method function = Functions.class.getMethod(funcname,Element.class);
        return function.invoke(SingletonProducer.getInstance().getFunctions(),el);
    } catch (NoSuchMethodException e) {
        throw new NoSuchFunctionException("This function is not supported");
    } catch (Exception et) {
        // A reflective failure wraps the real problem and usually has a
        // null message of its own, so report the underlying cause.
        Throwable cause = et.getCause() != null ? et.getCause() : et;
        throw new NoSuchFunctionException("function '" + funcname + "' failed: " + cause);
    }
}
/**
 * Returns the index of an element as computed by
 * {@code CommonUtil.getElIndexInSameTags}, or 1 when the element is
 * {@code null}.
 *
 * @param e the element, may be {@code null}
 * @return the element's index, or 1 for a {@code null} element
 */
public int getElIndex(Element e){
    return e == null ? 1 : CommonUtil.getElIndexInSameTags(e);
}
}