// org.apdplat.extractor.html.ExtractRegular Maven / Gradle / Ivy
/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, [email protected]
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.extractor.html;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.lang.StringUtils;
import org.apdplat.extractor.html.model.CssPath;
import org.apdplat.extractor.html.model.ExtractFunction;
import org.apdplat.extractor.html.model.HtmlTemplate;
import org.apdplat.extractor.html.model.UrlPattern;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import redis.clients.jedis.JedisPubSub;
/**
* URL抽取规则
* 订阅Redis服务器Channel:pr,当规则改变的时候会收到通知消息CHANGE并重新初始化规则集合
* 初始化:
* 1、从配置管理web服务器获取完整的规则集合
* 2、抽取规则
* 3、构造规则查找结构
*
* @author 杨尚川
*
*/
public class ExtractRegular {
private static final Logger LOGGER = LoggerFactory.getLogger(ExtractRegular.class);
private static final ObjectMapper MAPPER = new ObjectMapper();
private static ExtractRegular extractRegular = null;
private volatile Map> urlPatternMap = null;
/**
 * Private constructor: instances are created only by the static getInstance factories.
 */
private ExtractRegular() {
}
/**
 * Returns the shared extraction-rule instance, creating it from the given URL
 * patterns on the first call.
 * If the instance already exists, the argument is ignored and the existing
 * instance is returned.
 *
 * @param urlPatterns URL patterns used to build the rule lookup structure on first use
 * @return the shared extraction-rule instance
 */
public static ExtractRegular getInstance(List<UrlPattern> urlPatterns) {
    ExtractRegular instance = extractRegular;
    if (instance != null) {
        return instance;
    }
    synchronized (ExtractRegular.class) {
        instance = extractRegular;
        if (instance == null) {
            // Initialise the rules BEFORE storing the instance in the static field:
            // the unlocked check above returns whatever the field holds, so publishing
            // first would let another thread use an instance that has no rules yet.
            instance = new ExtractRegular();
            instance.init(urlPatterns);
            extractRegular = instance;
        }
    }
    return instance;
}
/**
 * Returns the shared extraction-rule instance, creating it on the first call by
 * subscribing to Redis change notifications and downloading the rules from the
 * configuration web server.
 * If the instance already exists, the arguments are ignored and the existing
 * instance is returned.
 *
 * @param serverUrl download address of the extraction rules on the configuration web server
 * @param redisHost Redis server host
 * @param redisPort Redis server port
 * @return the shared extraction-rule instance
 */
public static ExtractRegular getInstance(String serverUrl, String redisHost, int redisPort) {
    ExtractRegular instance = extractRegular;
    if (instance != null) {
        return instance;
    }
    synchronized (ExtractRegular.class) {
        instance = extractRegular;
        if (instance == null) {
            instance = new ExtractRegular();
            // Subscribe first so that a CHANGE notification sent while the initial
            // download is running is not missed.
            instance.subscribeRedis(redisHost, redisPort, serverUrl);
            instance.init(serverUrl);
            // Publish only after initialisation: the unlocked check above returns
            // whatever the static field holds, so it must never expose an instance
            // whose rules have not been loaded yet.
            extractRegular = instance;
        }
    }
    return instance;
}
/**
 * Reloads the rules from the configuration web server:
 * 1. downloads the JSON representation of the complete rule set,
 * 2. parses the JSON into URL pattern objects,
 * 3. hands the patterns to init(List) to rebuild the lookup structure.
 *
 * @param serverUrl download address of the extraction rules on the configuration web server
 */
private synchronized void init(String serverUrl) {
    LOGGER.info("开始下载URL抽取规则");
    LOGGER.info("serverUrl: {}", serverUrl);
    // Download the complete rule set as JSON.
    String json = downJson(serverUrl);
    LOGGER.info("完成下载URL抽取规则");
    // Parse the JSON into URL pattern objects.
    LOGGER.info("开始解析URL抽取规则");
    List<UrlPattern> urlPatterns = parseJson(json);
    LOGGER.info("完成解析URL抽取规则");
    init(urlPatterns);
}
/**
 * Rebuilds the rule lookup structure from the given URL patterns.
 * The new structure replaces the current one only when it is non-empty, so an
 * empty result leaves the rules that are already loaded in place.
 *
 * @param urlPatterns URL patterns to index
 */
private synchronized void init(List<UrlPattern> urlPatterns) {
    LOGGER.info("开始初始化URL抽取规则");
    // Build the complete replacement structure before touching the live one.
    // NOTE(review): generic arguments were lost in this copy of the file; the
    // String key type is restored from context and should be confirmed against toMap.
    Map<String, List<UrlPattern>> newUrlPatterns = toMap(urlPatterns);
    if (!newUrlPatterns.isEmpty()) {
        // Swap in the new structure with a single volatile write. The previous map
        // is deliberately NOT cleared: a thread that read the old reference just
        // before the swap may still be iterating it, and emptying it underneath
        // that thread would corrupt its lookup. The old map becomes garbage once
        // the last such reader drops it.
        urlPatternMap = newUrlPatterns;
    }
    LOGGER.info("完成初始化URL抽取规则");
}
/**
 * Starts a daemon thread that subscribes to the Redis channel "pr". The listener
 * re-initialises the rule set from serverUrl when the message CHANGE arrives.
 * If the connection fails, the thread waits one minute and tries again; it ends
 * when the subscription returns normally or when the thread is interrupted.
 * Nothing is started when redisHost is null or redisPort is less than 1.
 *
 * @param redisHost Redis server host
 * @param redisPort Redis server port
 * @param serverUrl rule download address passed to the change listener
 */
private void subscribeRedis(final String redisHost, final int redisPort, final String serverUrl) {
    if (null == redisHost || redisPort < 1) {
        LOGGER.error("没有指定redis服务器配置!");
        return;
    }
    Thread thread = new Thread(new Runnable() {
        @Override
        public void run() {
            final String channel = "pr";
            // One minute, as promised by the retry log message below.
            final long retryDelayMillis = 60000L;
            LOGGER.info("redis服务器配置信息 host:" + redisHost + ",port:" + redisPort + ",channel:" + channel);
            while (true) {
                JedisPool jedisPool = null;
                Jedis jedis = null;
                try {
                    jedisPool = new JedisPool(new JedisPoolConfig(), redisHost, redisPort);
                    jedis = jedisPool.getResource();
                    LOGGER.info("redis守护线程启动");
                    // Blocks until the listener is unsubscribed or the connection fails.
                    jedis.subscribe(new ExtractRegularChangeRedisListener(serverUrl), new String[]{channel});
                    jedisPool.returnResource(jedis);
                    // Already returned cleanly; must not be handed back again as broken.
                    jedis = null;
                    LOGGER.info("redis守护线程结束");
                    return;
                } catch (Exception e) {
                    // Keep the cause in the log so connection problems can be diagnosed.
                    LOGGER.info("redis未启动,暂停一分钟后重新连接", e);
                } finally {
                    // A fresh pool is created on every attempt, so the old one has to be
                    // released here or each failed attempt would leak its connections.
                    releaseRedis(jedisPool, jedis);
                }
                try {
                    Thread.sleep(retryDelayMillis);
                } catch (InterruptedException ex) {
                    LOGGER.error(ex.getMessage(), ex);
                    // Restore the interrupt flag and stop instead of retrying forever.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    });
    thread.setDaemon(true);
    thread.setName("redis守护线程,用于动态加载抽取规则");
    thread.start();
}
/**
 * Releases the Redis resources of one connection attempt without letting a
 * release failure escape to the caller.
 *
 * @param jedisPool pool created for the attempt, or null if none was created
 * @param brokenConnection connection that was borrowed but not returned, or null
 */
private static void releaseRedis(JedisPool jedisPool, Jedis brokenConnection) {
    if (jedisPool == null) {
        return;
    }
    if (brokenConnection != null) {
        try {
            jedisPool.returnBrokenResource(brokenConnection);
        } catch (Exception e) {
            LOGGER.debug("释放redis连接失败", e);
        }
    }
    try {
        jedisPool.destroy();
    } catch (Exception e) {
        LOGGER.debug("销毁redis连接池失败", e);
    }
}
/**
* Redis监听器,监听抽取规则的变化
*
* @author 杨尚川
*
*/
private class ExtractRegularChangeRedisListener extends JedisPubSub {
private final String serverUrl;
public ExtractRegularChangeRedisListener(String serverUrl) {
this.serverUrl = serverUrl;
}
@Override
public void onMessage(String channel, String message) {
LOGGER.debug("onMessage channel:" + channel + " and message:" + message);
if ("pr".equals(channel) && "CHANGE".equals(message)) {
synchronized (ExtractRegularChangeRedisListener.class) {
init(serverUrl);
}
}
}
@Override
public void onPMessage(String pattern, String channel, String message) {
LOGGER.debug("pattern:" + pattern + " and channel:" + channel + " and message:" + message);
onMessage(channel, message);
}
@Override
public void onPSubscribe(String pattern, int subscribedChannels) {
LOGGER.debug("psubscribe pattern:" + pattern + " and subscribedChannels:" + subscribedChannels);
}
@Override
public void onPUnsubscribe(String pattern, int subscribedChannels) {
LOGGER.debug("punsubscribe pattern:" + pattern + " and subscribedChannels:" + subscribedChannels);
}
@Override
public void onSubscribe(String channel, int subscribedChannels) {
LOGGER.debug("subscribe channel:" + channel + " and subscribedChannels:" + subscribedChannels);
}
@Override
public void onUnsubscribe(String channel, int subscribedChannels) {
LOGGER.debug("unsubscribe channel:" + channel + " and subscribedChannels:" + subscribedChannels);
}
}
/**
 * Downloads the rules (as JSON) from the configuration web server with an HTTP GET.
 *
 * @param url rule download address on the configuration web server
 * @return the response body decoded as UTF-8, or an empty string when the request
 *         fails, the status is not 200 OK, or the response has no body
 */
private String downJson(String url) {
    HttpClient httpClient = new HttpClient();
    GetMethod method = new GetMethod(url);
    try {
        int statusCode = httpClient.executeMethod(method);
        LOGGER.info("响应代码:" + statusCode);
        if (statusCode != HttpStatus.SC_OK) {
            LOGGER.error("请求失败: " + method.getStatusLine());
            // An error page is not rule JSON; do not pass it on to the parser.
            return "";
        }
        byte[] responseBody = method.getResponseBody();
        if (responseBody == null) {
            // getResponseBody() yields null when no body is available; decoding
            // null would throw a NullPointerException.
            return "";
        }
        return new String(responseBody, "utf-8");
    } catch (IOException e) {
        LOGGER.error("检查请求的路径:" + url, e);
        return "";
    } finally {
        // Always hand the connection back, on success and on failure.
        method.releaseConnection();
    }
}
/**
* 将json格式的URL模式转换为JAVA对象表示
*
* @param json URL模式的JSON表示
* @return URL模式的JAVA对象表示
*/
private List parseJson(String json) {
List urlPatterns = new ArrayList<>();
try {
List © 2015 - 2025 Weber Informatics LLC | Privacy Policy