cn.hutool.dfa.WordTree Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hutool-all Show documentation
Hutool是一个小而全的Java工具类库，通过静态方法封装，降低相关API的学习成本，提高工作效率，使Java拥有函数式语言般的优雅，让Java语言也可以“甜甜的”。
There is a newer version: 5.8.34
package cn.hutool.dfa;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.lang.Filter;
import cn.hutool.core.util.StrUtil;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * DFA（Deterministic Finite Automaton 确定有穷自动机）
 * DFA单词树（以下简称单词树），常用于在某大段文字中快速查找某几个关键词是否存在。

 * 单词树使用group区分不同的关键字集合，不同的分组可以共享树枝，避免重复建树。

 * 单词树使用树状结构表示一组单词。

 * 例如：红领巾，红河构建树后为：

 * 红                    

 * /      \                 

 * 领         河             

 * /                            

 * 巾                            

 * 其中每个节点都是一个WordTree对象，查找时从上向下查找。

 *
 * @author Looly
 */
public class WordTree extends HashMap {
	private static final long serialVersionUID = -4646423269465809276L;

	/**
	 * 单词字符末尾标识，用于标识单词末尾字符
	 */
	private final Set endCharacterSet = new HashSet<>();
	/**
	 * 字符过滤规则，通过定义字符串过滤规则，过滤不需要的字符，当accept为false时，此字符不参与匹配
	 */
	private Filter charFilter = StopChar::isNotStopChar;

	//--------------------------------------------------------------------------------------- Constructor start

	/**
	 * 默认构造
	 */
	public WordTree() {
	}
	//--------------------------------------------------------------------------------------- Constructor start

	/**
	 * 设置字符过滤规则，通过定义字符串过滤规则，过滤不需要的字符

	 * 当accept为false时，此字符不参与匹配
	 *
	 * @param charFilter 过滤函数
	 * @return this
	 * @since 5.2.0
	 */
	public WordTree setCharFilter(Filter charFilter) {
		this.charFilter = charFilter;
		return this;
	}

	//------------------------------------------------------------------------------- add word

	/**
	 * 增加一组单词
	 *
	 * @param words 单词集合
	 * @return this
	 */
	public WordTree addWords(Collection words) {
		if (false == (words instanceof Set)) {
			words = new HashSet<>(words);
		}
		for (String word : words) {
			addWord(word);
		}
		return this;
	}

	/**
	 * 增加一组单词
	 *
	 * @param words 单词数组
	 *              @return this
	 */
	public WordTree addWords(String... words) {
		for (String word : CollUtil.newHashSet(words)) {
			addWord(word);
		}
		return this;
	}

	/**
	 * 添加单词，使用默认类型
	 *
	 * @param word 单词
	 * @return this
	 */
	public WordTree addWord(String word) {
		final Filter charFilter = this.charFilter;
		WordTree parent = null;
		WordTree current = this;
		WordTree child;
		char currentChar = 0;
		final int length = word.length();
		for (int i = 0; i < length; i++) {
			currentChar = word.charAt(i);
			if (charFilter.accept(currentChar)) {//只处理合法字符
				child = current.get(currentChar);
				if (child == null) {
					//无子类，新建一个子节点后存放下一个字符
					child = new WordTree();
					current.put(currentChar, child);
				}
				parent = current;
				current = child;
			}
		}
		if (null != parent) {
			parent.setEnd(currentChar);
		}
		return this;
	}
	//------------------------------------------------------------------------------- match

	/**
	 * 指定文本是否包含树中的词
	 *
	 * @param text 被检查的文本
	 * @return 是否包含
	 */
	public boolean isMatch(String text) {
		if (null == text) {
			return false;
		}
		return null != matchWord(text);
	}

	/**
	 * 获得第一个匹配的关键字
	 *
	 * @param text 被检查的文本
	 * @return 匹配到的关键字
	 */
	public String match(String text) {
		final FoundWord foundWord = matchWord(text);
		return null != foundWord ? foundWord.toString() : null;
	}

	/**
	 * 获得第一个匹配的关键字
	 *
	 * @param text 被检查的文本
	 * @return 匹配到的关键字
	 * @since 5.5.3
	 */
	public FoundWord matchWord(String text) {
		if (null == text) {
			return null;
		}
		final List matchAll = matchAllWords(text, 1);
		return CollUtil.get(matchAll, 0);
	}

	//------------------------------------------------------------------------------- match all

	/**
	 * 找出所有匹配的关键字
	 *
	 * @param text 被检查的文本
	 * @return 匹配的词列表
	 */
	public List matchAll(String text) {
		return matchAll(text, -1);
	}

	/**
	 * 找出所有匹配的关键字
	 *
	 * @param text 被检查的文本
	 * @return 匹配的词列表
	 * @since 5.5.3
	 */
	public List matchAllWords(String text) {
		return matchAllWords(text, -1);
	}

	/**
	 * 找出所有匹配的关键字
	 *
	 * @param text  被检查的文本
	 * @param limit 限制匹配个数
	 * @return 匹配的词列表
	 */
	public List matchAll(String text, int limit) {
		return matchAll(text, limit, false, false);
	}

	/**
	 * 找出所有匹配的关键字
	 *
	 * @param text  被检查的文本
	 * @param limit 限制匹配个数
	 * @return 匹配的词列表
	 * @since 5.5.3
	 */
	public List matchAllWords(String text, int limit) {
		return matchAllWords(text, limit, false, false);
	}

	/**
	 * 找出所有匹配的关键字

	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]

	 * 贪婪匹配（最长匹配）原则：假如关键字a,ab，最长匹配将匹配[a, ab]
	 *
	 * @param text           被检查的文本
	 * @param limit          限制匹配个数
	 * @param isDensityMatch 是否使用密集匹配原则
	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
	 * @return 匹配的词列表
	 */
	public List matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
		final List matchAllWords = matchAllWords(text, limit, isDensityMatch, isGreedMatch);
		return CollUtil.map(matchAllWords, FoundWord::toString, true);
	}

	/**
	 * 找出所有匹配的关键字

	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]

	 * 贪婪匹配（最长匹配）原则：假如关键字a,ab，最长匹配将匹配[a, ab]
	 *
	 * @param text           被检查的文本
	 * @param limit          限制匹配个数
	 * @param isDensityMatch 是否使用密集匹配原则
	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
	 * @return 匹配的词列表
	 * @since 5.5.3
	 */
	public List matchAllWords(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
		if (null == text) {
			return null;
		}

		List foundWords = new ArrayList<>();
		WordTree current = this;
		final int length = text.length();
		final Filter charFilter = this.charFilter;
		//存放查找到的字符缓存。完整出现一个词时加到findedWords中，否则清空
		final StringBuilder wordBuffer = StrUtil.builder();
		final StringBuilder keyBuffer = StrUtil.builder();
		char currentChar;
		for (int i = 0; i < length; i++) {
			wordBuffer.setLength(0);
			keyBuffer.setLength(0);
			for (int j = i; j < length; j++) {
				currentChar = text.charAt(j);
//				Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
				if (false == charFilter.accept(currentChar)) {
					if (wordBuffer.length() > 0) {
						//做为关键词中间的停顿词被当作关键词的一部分被返回
						wordBuffer.append(currentChar);
					} else {
						//停顿词做为关键词的第一个字符时需要跳过
						i++;
					}
					continue;
				} else if (false == current.containsKey(currentChar)) {
					//非关键字符被整体略过，重新以下个字符开始检查
					break;
				}
				wordBuffer.append(currentChar);
				keyBuffer.append(currentChar);
				if (current.isEnd(currentChar)) {
					//到达单词末尾，关键词成立，从此词的下一个位置开始查找
					foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j));
					if (limit > 0 && foundWords.size() >= limit) {
						//超过匹配限制个数，直接返回
						return foundWords;
					}
					if (false == isDensityMatch) {
						//如果非密度匹配，跳过匹配到的词
						i = j;
						break;
					}
					if (false == isGreedMatch) {
						//如果懒惰匹配（非贪婪匹配）。当遇到第一个结尾标记就结束本轮匹配
						break;
					}
				}
				current = current.get(currentChar);
				if (null == current) {
					break;
				}
			}
			current = this;
		}
		return foundWords;
	}
	//--------------------------------------------------------------------------------------- Private method start

	/**
	 * 是否末尾
	 *
	 * @param c 检查的字符
	 * @return 是否末尾
	 */
	private boolean isEnd(Character c) {
		return this.endCharacterSet.contains(c);
	}

	/**
	 * 设置是否到达末尾
	 *
	 * @param c 设置结尾的字符
	 */
	private void setEnd(Character c) {
		if (null != c) {
			this.endCharacterSet.add(c);
		}
	}

	/**
	 * 清除所有的词,
	 * 此方法调用后, wordTree 将被清空
	 * endCharacterSet 也将清空
	 */
	@Override
	public void clear() {
		super.clear();
		this.endCharacterSet.clear();
	}
	//--------------------------------------------------------------------------------------- Private method end
}