All downloads are free. The search and download functionality uses the official Maven repository.

com.github.houbb.segment.support.trie.impl.SegmentTrieTree Maven / Gradle / Ivy

package com.github.houbb.segment.support.trie.impl;

import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.util.MapUtil;
import com.github.houbb.log.integration.core.Log;
import com.github.houbb.log.integration.core.LogFactory;
import com.github.houbb.segment.api.ISegmentContext;
import com.github.houbb.segment.constant.SegmentConst;
import com.github.houbb.segment.data.phrase.api.ISegmentPhraseData;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

/**
 * Prefix-tree (trie) implementation for word segmentation, stored as nested maps.
 *
 * <p>Each node is a {@link Map} whose keys are the next character of a phrase
 * (mapping to the child node) plus the marker key {@link SegmentConst#IS_END},
 * whose boolean value tells whether a phrase ends at that node. The root node
 * carries no {@link SegmentConst#IS_END} entry.</p>
 *
 * <p>The trie is held in a static field and therefore shared by all instances.
 * It is built lazily on the first {@link #getTrieTree(ISegmentContext)} call and
 * is published only after it has been fully built, so callers never observe a
 * half-initialized tree. Callers must treat the returned map as read-only.</p>
 *
 * @author binbin.hou
 * @since 0.0.1
 */
@ThreadSafe
public class SegmentTrieTree extends AbstractSegmentTrieTree {

    private static final Log LOG = LogFactory.getLog(SegmentTrieTree.class);

    /**
     * Root node of the shared trie; an empty map means "not initialized yet".
     *
     * <p>The reference is only ever replaced (never mutated in place) while holding
     * the {@code SegmentTrieTree.class} lock; {@code volatile} makes the fully built
     * tree visible to lock-free readers.</p>
     *
     * @since 0.0.1
     */
    private static volatile Map innerWordMap = Guavas.newHashMap();

    /**
     * Returns the shared trie, building it from {@code context.data()} on first use.
     *
     * @param context segmentation context supplying the phrase data; only consulted
     *                when the trie has not been built yet
     * @return the root node of the trie
     */
    @Override
    public Map getTrieTree(final ISegmentContext context) {
        // Read the volatile field once so the check and the return use the same map.
        Map trie = innerWordMap;
        if(ObjectUtil.isNotEmpty(trie)) {
            return trie;
        }

        LOG.debug("[Segment]-[data-trie] init start");
        synchronized(SegmentTrieTree.class) {
            // Re-check under the lock: another thread may have finished the build.
            trie = innerWordMap;
            if(MapUtil.isEmpty(trie)) {
                final ISegmentPhraseData segmentData = context.data();
                // Build into a private map first and publish it in a single write.
                // Filling the shared map in place would let the lock-free fast path
                // above return a tree that is still being mutated.
                trie = buildWordMap(segmentData);
                innerWordMap = trie;
            }
        }

        LOG.debug("[Segment]-[data-trie] init end");
        return trie;
    }

    /**
     * Discards the shared trie so that the next {@link #getTrieTree(ISegmentContext)}
     * call rebuilds it.
     *
     * <p>The field is swapped to a fresh empty map under the same lock used for
     * initialization. Maps handed out earlier are left untouched rather than cleared,
     * because clearing a {@link HashMap} that other threads may be reading is unsafe.</p>
     */
    @Override
    public void destroy() {
        LOG.debug("[Segment]-[data-trie] destroy start");
        synchronized (SegmentTrieTree.class) {
            innerWordMap = Guavas.newHashMap();
        }
        LOG.debug("[Segment]-[data-trie] destroy end");
    }

    /**
     * Builds a new trie from the phrases supplied by {@code segmentData}.
     *
     * @param segmentData source of the phrase set
     * @return the root node of a freshly built trie, not shared with any other thread
     * @since 0.0.1
     */
    @SuppressWarnings("unchecked")
    private Map buildWordMap(final ISegmentPhraseData segmentData) {
        // Load the dictionary.
        // NOTE(review): assumes getPhraseSet() yields String elements, as the loop below requires.
        final Set<String> wordSet = segmentData.getPhraseSet();
        final Map<Object, Object> root = new HashMap<>();

        for (String key : wordSet) {
            // Every phrase is inserted starting from the root node.
            Map<Object, Object> currentMap = root;

            for (char charKey : key.toCharArray()) {
                Object wordMap = currentMap.get(charKey);

                if (wordMap != null) {
                    // Child node already exists: descend into it.
                    currentMap = (Map<Object, Object>) wordMap;
                } else {
                    // No child yet: create one, initially not marked as the end of a phrase.
                    Map<Object, Object> newWordMap = new HashMap<>(8);
                    newWordMap.put(SegmentConst.IS_END, false);

                    currentMap.put(charKey, newWordMap);
                    currentMap = newWordMap;
                }
            }

            // Mark the node reached by the last character as the end of a phrase.
            // An empty phrase never leaves the root, which must stay unmarked.
            if (currentMap != root) {
                currentMap.put(SegmentConst.IS_END, true);
            }
        }

        return root;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy