All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.pslib.jtool.util.BadWordsUtil Maven / Gradle / Ivy

package com.pslib.jtool.util;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * 敏感词判断和过滤
 * 
 *
 */
public class BadWordsUtil {
	protected final static Logger logger = LoggerFactory.getLogger(BadWordsUtil.class);
	/**
	 * 敏感词文件,如果不指定,则读取类根目录下的badwords.txt
	 */
	public static String filename = null;
	/**
	 * 文件大小限制
	 */
	public static long filesizeLimit = 20 * 1024 * 1024;

	private static long lastModified = 0;
	private static File file;
	private static Pattern badwords = null;
	private static Pattern replace_badwords = null;
	private static String filter1_rule = "\"`~!@#$%^&*()=':;,\\\\\\[\\]\\.<>/?~¥%…()—+|{}【】‘”“’。,、?《》 \t";
	private static Pattern filter1 = Pattern.compile("[" + filter1_rule + "]+");

	/**
	 * 初始化
	 */
	public static void init(String _filename) {
		filename = _filename;
		file = getFile();
		reload(true);
		fileChecker();
	}

	public static boolean fileCheckerShutdown = false;

	public static void fileChecker() {
		Thread daemon = new Thread("BADWORDS-FILE-CHECK") {
			@Override
			public void run() {
				synchronized (this) {
					while (!fileCheckerShutdown) {
						try {
							wait(30 * 1000);
							/// logger.info("file checker");
							reload(false);
						} catch (Exception ex) {
							logger.error("", ex);
						}
					}
				}
			}
		};
		daemon.setDaemon(true);
		daemon.start();
	}

	public static File getFile() {
		return new File(filename);
	}

	/**
	 * 加载数据
	 * 
	 * @param force
	 */
	public static void reload(boolean force) {
		if (force || lastModified != file.lastModified()) {
			try {
				logger.info("加载敏感词数据:" + file.getAbsolutePath());
				logger.info("文件大小:" + StringUtil.getFormatSize(file.length()));
				if (!file.exists()) {
					logger.error("文件不存在 ");
					return;
				}
				if (!file.canRead()) {
					logger.error("文件不可读 ");
					return;
				}

				if (file.length() > filesizeLimit) {
					logger.error("文件大小超过限制:limit(" + StringUtil.getFormatSize(file.length()) + ")>file(" + StringUtil.getFormatSize(filesizeLimit) + ")");
					return;
				}
				String words = FileUtil.fileRead(file);

				words = filterStr(words);

				String[] wordarr = words.split("\n");
				int len;
				StringBuffer tmp = new StringBuffer("(");
				StringBuffer tmp2 = new StringBuffer("(");
				for (int idx = 0; idx < wordarr.length; idx++) {
					wordarr[idx] = wordarr[idx].trim();
					len = wordarr[idx].length();
					if (len == 0) {
						continue;
					}
					tmp.append(wordarr[idx]).append("|");
					if (len > 1) {
						for (int i = 0; i < len; i++) {
							tmp2.append(wordarr[idx].substring(i, i + 1));
							tmp2.append("[" + filter1_rule + "]*");
						}
						tmp2.append("|");
					}

				}
				tmp.setLength(tmp.length() - 1);
				tmp.append(")");
				badwords = Pattern.compile(tmp.toString());

				tmp2.setLength(tmp2.length() - 1);
				tmp2.append(")");
				replace_badwords = Pattern.compile(tmp2.toString());

				lastModified = file.lastModified();

				logger.info("敏感词数据加载完成");
			} catch (Exception ex) {
				logger.error("", ex);
			}
		}
	}

	/**
	 * 过滤字符中的标点符号
	 * 
	 * @param str
	 * @return
	 */
	public static String filterStr(String str) {
		str = filter1.matcher(str).replaceAll("");
		return str;
	}

	/**
	 * 判断是否包含敏感词
	 * 
	 * @param str
	 * @return
	 */
	public static boolean check(String str) {
		if (badwords == null || str == null || str.length() == 0) {
			return true;
		}
		str = filterStr(str);
		return badwords.matcher(str).find();
	}

	/**
	 * 替换敏感词为指定字符串
	 * 
	 * @param str
	 * @return
	 */
	public static String replace(String str, String replace) {
		if (badwords == null || str == null || str.length() == 0) {
			return str;
		}
		str = badwords.matcher(str).replaceAll(replace);
		str = replace_badwords.matcher(str).replaceAll(replace);
		return str;
	}

	/**
	 * 替换敏感词为**
	 * 
	 * @param str
	 * @return
	 */
	public static String replace(String str) {
		return replace(str, "**");
	}

	/**
	 * 重复词过滤
	 * 
	 * @param str
	 * @return
	 */
	public static void filterWordsFile(String filename) {
		File file = new File(filename);
		String words = FileUtil.fileRead(file);

		words = filterStr(words);

		String[] wordarr = words.split("\n");
		List wordList = new ArrayList();
		for (int i = 0; i < wordarr.length; i++) {
			wordarr[i] = wordarr[i].trim();
			if (wordarr[i].length() == 0) {
				continue;
			}
			if (!wordList.contains(wordarr[i])) {
				wordList.add(wordarr[i]);
			}
		}
		// System.out.println(ArrayUtil.join("\r\n", wordList));
		FileUtil.fileWrite(file, ArrayUtil.join("\r\n", wordList));
	}

	public static void main(String[] args) {

	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy