org.apdplat.qa.util.NekoHTMLUtils Maven / Gradle / Ivy
/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, [email protected]
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*
*/
package org.apdplat.qa.util;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.apdplat.qa.model.Evidence;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import com.sun.org.apache.xpath.internal.XPathAPI;
/**
*
* @author 杨尚川
*/
public class NekoHTMLUtils {
private static final Logger LOG = LoggerFactory.getLogger(NekoHTMLUtils.class);
public static List parse(String url, String xpathExpression) {
InputStream in = null;
try {
in = new URL(url).openStream();
return parse(in, xpathExpression);
} catch (Exception e) {
LOG.error("错误", e);
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e) {
LOG.error("错误", e);
}
}
}
return null;
}
public static List parse(InputStream in, String xpathExpression) {
return parse(in, xpathExpression, "UTF-8");
}
public static List parse(InputStream in, String xpathExpression, String encoding) {
DOMParser parser = new DOMParser();
List list = new ArrayList();
try {
// 设置网页的默认编码
parser.setProperty(
"http://cyberneko.org/html/properties/default-encoding",
encoding);
/*
* The Xerces HTML DOM implementation does not support namespaces
* and cannot represent XHTML documents with namespace information.
* Therefore, in order to use the default HTML DOM implementation
* with NekoHTML's DOMParser to parse XHTML documents, you must turn
* off namespace processing.
*/
parser.setFeature("http://xml.org/sax/features/namespaces", false);
parser.parse(new InputSource(new BufferedReader(new InputStreamReader(in, encoding))));
Document doc = parser.getDocument();
NodeList products = XPathAPI.selectNodeList(doc, xpathExpression.toUpperCase());
for (int i = 0; i < products.getLength(); i++) {
Node node = products.item(i);
list.add(node.getTextContent());
}
} catch (Exception e) {
LOG.error("错误", e);
}
return list;
}
public static List searchBaidu(String url) {
InputStream in = null;
try {
in = new URL(url).openStream();
return searchBaidu(in);
} catch (Exception e) {
LOG.error("错误", e);
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e) {
LOG.error("错误", e);
}
}
}
return null;
}
public static List searchBaidu(InputStream in) {
//保证只读一次
byte[] datas = Tools.readAll(in);
if (LOG.isDebugEnabled()) {
try {
LOG.debug("内容:" + new String(datas, "UTF-8"));
} catch (UnsupportedEncodingException e) {
LOG.error("错误", e);
}
}
in = new ByteArrayInputStream(datas);
String totalXpathExpression = "//html/body/div/div[4]/div/p/span";
List totals = parse(in, totalXpathExpression);
int total;
int len = 10;
if (totals != null && !totals.isEmpty()) {
String str = totals.get(totals.size()-1);
int start = 10;
if (str.indexOf("约") != -1) {
start = 11;
}
total = Integer.parseInt(str.substring(start).replace(",", "").replace("个", ""));
LOG.info("搜索结果数:" + total);
} else {
return null;
}
if (total < 1) {
return null;
}
if (total < 10) {
len = total;
}
List evidences = new ArrayList<>();
for (int i = 0; i < len; i++) {
String titleXpathExpression = "//html/body/div/div[4]/div/div[2]/div[" + (i + 1) + "]/h3/a";
String contentXpathExpression = "//html/body/div/div[4]/div/div[2]/div[" + (i + 1) + "]/div[1]";
LOG.debug("titleXpathExpression:" + titleXpathExpression);
LOG.debug("contentXpathExpression:" + contentXpathExpression);
//重新构造输入流
in = new ByteArrayInputStream(datas);
List titles = parse(in, titleXpathExpression);
//重新构造输入流
in = new ByteArrayInputStream(datas);
List snippets = parse(in, contentXpathExpression);
if (titles != null && titles.size() == 1 && snippets != null && snippets.size() == 1) {
Evidence evidence = new Evidence();
evidence.setTitle(titles.get(0));
evidence.setSnippet(snippets.get(0));
evidences.add(evidence);
} else {
LOG.error("获取搜索结果列表项出错:" + titles + " - " + snippets);
}
}
if (evidences.isEmpty()) {
return null;
}
return evidences;
}
/**
* @param args
*/
public static void main(String[] args) {
String url = "http://www.baidu.com/s?pn=0&wd=java";
List evidences = searchBaidu(url);
if (evidences != null) {
for (Evidence evidence : evidences) {
LOG.info(evidence.getTitle());
LOG.info(evidence.getSnippet());
LOG.info("");
}
} else {
LOG.error("没有搜索到结果");
}
}
} © 2015 - 2025 Weber Informatics LLC | Privacy Policy