All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.aoju.bus.sensitive.WordTree Maven / Gradle / Ivy

There is a newer version: 8.0.0
Show newest version
/*********************************************************************************
 *                                                                               *
 * The MIT License (MIT)                                                         *
 *                                                                               *
 * Copyright (c) 2015-2020 aoju.org and other contributors.                      *
 *                                                                               *
 * Permission is hereby granted, free of charge, to any person obtaining a copy  *
 * of this software and associated documentation files (the "Software"), to deal *
 * in the Software without restriction, including without limitation the rights  *
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell     *
 * copies of the Software, and to permit persons to whom the Software is         *
 * furnished to do so, subject to the following conditions:                      *
 *                                                                               *
 * The above copyright notice and this permission notice shall be included in    *
 * all copies or substantial portions of the Software.                           *
 *                                                                               *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR    *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,      *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE   *
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER        *
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, *
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN     *
 * THE SOFTWARE.                                                                 *
 *                                                                               *
 ********************************************************************************/
package org.aoju.bus.sensitive;

import org.aoju.bus.core.lang.Filter;
import org.aoju.bus.core.text.Builders;
import org.aoju.bus.core.toolkit.CollKit;
import org.aoju.bus.core.toolkit.StringKit;

import java.util.*;

/**
 * DFA (Deterministic Finite Automaton) word tree.
 * <p>
 * Keywords are stored character by character in a tree of nested maps: every node maps a
 * character to the child node reached by that character, and additionally remembers which of
 * its characters terminate a keyword. The tree is then used to find those keywords inside a
 * larger text.
 * <p>
 * Characters rejected by the configured character filter (stop characters) are ignored both
 * when adding keywords and when matching text.
 *
 * @author Kimi Liu
 * @version 6.1.3
 * @since JDK 1.8+
 */
public class WordTree extends HashMap<Character, WordTree> {

    private static final long serialVersionUID = -1L;

    /**
     * Characters of this node that terminate a keyword.
     */
    private final Set<Character> endCharacterSet = new HashSet<>();
    /**
     * Character filter: a character for which {@code accept} returns {@code false} does not
     * take part in matching.
     */
    private Filter<Character> charFilter = StopChar::isNotStopChar;

    /**
     * Creates an empty tree using the default stop-character filter.
     */
    public WordTree() {
    }

    /**
     * Replaces the character filter. Characters for which the filter returns {@code false}
     * do not take part in matching.
     *
     * @param charFilter the filter function
     * @return this
     */
    public WordTree setCharFilter(Filter<Character> charFilter) {
        this.charFilter = charFilter;
        return this;
    }

    /**
     * Adds a group of keywords. Duplicates are removed before insertion.
     *
     * @param words the keywords; {@code null} is treated as an empty collection
     */
    public void addWords(Collection<String> words) {
        if (null == words) {
            return;
        }
        final Collection<String> uniqueWords = (words instanceof Set) ? words : new HashSet<>(words);
        for (String word : uniqueWords) {
            addWord(word);
        }
    }

    /**
     * Adds a group of keywords. Duplicates are removed before insertion.
     *
     * @param words the keywords
     */
    public void addWords(String... words) {
        final HashSet<String> wordsSet = CollKit.newHashSet(words);
        for (String word : wordsSet) {
            addWord(word);
        }
    }

    /**
     * Adds a single keyword. Characters rejected by the character filter are skipped, so a
     * keyword consisting only of rejected characters adds nothing.
     *
     * @param word the keyword; {@code null} is ignored
     */
    public void addWord(String word) {
        if (null == word) {
            return;
        }
        final Filter<Character> charFilter = this.charFilter;
        WordTree parent = null;
        WordTree current = this;
        // Last character that was actually inserted into the tree. The end marker must be set
        // for this character, not for a trailing character that the filter rejected.
        char lastAccepted = 0;
        final int length = word.length();
        for (int i = 0; i < length; i++) {
            final char currentChar = word.charAt(i);
            if (!charFilter.accept(currentChar)) {
                // Rejected characters are not stored in the tree
                continue;
            }
            WordTree child = current.get(currentChar);
            if (null == child) {
                // No branch for this character yet: create the node that will hold the next one
                child = new WordTree();
                current.put(currentChar, child);
            }
            parent = current;
            current = child;
            lastAccepted = currentChar;
        }
        if (null != parent) {
            parent.setEnd(lastAccepted);
        }
    }

    /**
     * Checks whether the text contains any keyword of this tree.
     *
     * @param text the text to check
     * @return {@code true} if at least one keyword is found; {@code false} otherwise or if
     * {@code text} is {@code null}
     */
    public boolean isMatch(String text) {
        if (null == text) {
            return false;
        }
        return null != match(text);
    }

    /**
     * Returns the first keyword found in the text.
     *
     * @param text the text to check
     * @return the first match, or {@code null} if there is none or {@code text} is {@code null}
     */
    public String match(String text) {
        if (null == text) {
            return null;
        }
        final List<String> matchAll = matchAll(text, 1);
        if (CollKit.isNotEmpty(matchAll)) {
            return matchAll.get(0);
        }
        return null;
    }

    /**
     * Finds all keywords in the text, without a limit, using non-dense and non-greedy matching.
     *
     * @param text the text to check
     * @return the matches, or {@code null} if {@code text} is {@code null}
     */
    public List<String> matchAll(String text) {
        return matchAll(text, -1);
    }

    /**
     * Finds keywords in the text using non-dense and non-greedy matching.
     *
     * @param text  the text to check
     * @param limit maximum number of matches; values not greater than 0 mean no limit
     * @return the matches, or {@code null} if {@code text} is {@code null}
     */
    public List<String> matchAll(String text, int limit) {
        return matchAll(text, limit, false, false);
    }

    /**
     * Finds keywords in the text.
     * <p>
     * Dense matching: after a match the search restarts at the character following the start
     * of that match instead of after its end. With keywords {@code ab} and {@code b}, the text
     * {@code abab} yields {@code [ab, b, ab, b]} (non-dense: {@code [ab, ab]}).
     * <p>
     * Greedy matching: a scan does not stop at the first keyword end but keeps extending. With
     * keywords {@code a} and {@code ab}, the text {@code ab} yields {@code [a, ab]}
     * (non-greedy: {@code [a]}).
     * <p>
     * Filtered (stop) characters that occur inside a match are included in the returned string.
     *
     * @param text           the text to check
     * @param limit          maximum number of matches; values not greater than 0 mean no limit
     * @param isDensityMatch whether to use dense matching
     * @param isGreedMatch   whether to use greedy (longest) matching
     * @return the matches in order of discovery, or {@code null} if {@code text} is {@code null}
     */
    public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
        if (null == text) {
            return null;
        }

        List<String> foundWords = new ArrayList<>();
        WordTree current = this;
        int length = text.length();
        final Filter<Character> charFilter = this.charFilter;
        // Buffer of the characters matched so far in the current scan; it is added to
        // foundWords when a keyword end is reached and reset at the start of every scan
        final Builders wordBuffer = StringKit.builders();
        char currentChar;
        for (int i = 0; i < length; i++) {
            wordBuffer.reset();
            for (int j = i; j < length; j++) {
                currentChar = text.charAt(j);
                if (false == charFilter.accept(currentChar)) {
                    if (wordBuffer.length() > 0) {
                        // A stop character in the middle of a keyword is returned as part of it
                        wordBuffer.append(currentChar);
                    } else {
                        // A stop character before the first keyword character is skipped,
                        // moving the scan start forward together with j
                        i++;
                    }
                    continue;
                } else if (false == current.containsKey(currentChar)) {
                    // Not a keyword character at this node: abandon this scan and restart
                    // from the next start position
                    break;
                }
                wordBuffer.append(currentChar);
                if (current.isEnd(currentChar)) {
                    // Reached the end of a keyword: record the match
                    foundWords.add(wordBuffer.toString());
                    if (limit > 0 && foundWords.size() >= limit) {
                        // Match limit reached, return immediately
                        return foundWords;
                    }
                    if (false == isDensityMatch) {
                        // Non-dense matching: continue searching after the matched word
                        i = j;
                    }
                    if (false == isGreedMatch) {
                        // Lazy (non-greedy) matching: stop this scan at the first end marker
                        break;
                    }
                }
                current = current.get(currentChar);
                if (null == current) {
                    break;
                }
            }
            // Every scan starts again from the root
            current = this;
        }
        return foundWords;
    }

    /**
     * Tells whether the given character terminates a keyword at this node.
     *
     * @param c the character to check
     * @return {@code true} if the character is marked as a keyword end
     */
    private boolean isEnd(Character c) {
        return this.endCharacterSet.contains(c);
    }

    /**
     * Marks the given character as terminating a keyword at this node.
     *
     * @param c the terminating character; {@code null} is ignored
     */
    private void setEnd(Character c) {
        if (null != c) {
            this.endCharacterSet.add(c);
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy