org.apdplat.word.util.GenericTrie Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of word Show documentation
word分词是一个Java实现的中文分词组件，提供了多种基于词典的分词算法，并利用ngram模型来消除歧义。能准确识别英文、数字，以及日期、时间等数量词，能识别人名、地名、组织机构名等未登录词。同时提供了Lucene、Solr、ElasticSearch插件。
The newest version!
/**
 *
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, [email protected]
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.word.util;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

/**
 * 词首字索引式通用前缀树，高效存储，快速搜索
 * 为前缀树的一级节点（词首字）建立索引（比二分查找要快）
 * @author 杨尚川
 */
public class GenericTrie {
    private static final Logger LOGGER = LoggerFactory.getLogger(GenericTrie.class);
    //词表的首字母数量在一个可控范围内，默认值为12000
    private static final int INDEX_LENGTH = 12000;
    private final TrieNode[] ROOT_NODES_INDEX = new TrieNode[INDEX_LENGTH];
    
    public void clear() {
        for(int i=0; i map = new HashMap<>();
        for(TrieNode node : ROOT_NODES_INDEX){
            if(node == null){
                emptySlot++;
            }else{
                int i=0;
                while((node = node.getSibling()) != null){
                    i++;
                }
                if(i > 0){
                    Integer count = map.get(i);
                    if(count == null){
                        count = 1;
                    }else{
                        count++;
                    }
                    map.put(i, count);
                }
            }
        }
        int count=0;
        for(int key : map.keySet()){
            int value = map.get(key);
            count += key*value;
            LOGGER.info("冲突次数为："+key+" 的元素个数："+value);
        }
        LOGGER.info("冲突次数："+count);
        LOGGER.info("总槽数："+INDEX_LENGTH);
        LOGGER.info("用槽数："+(INDEX_LENGTH-emptySlot));
        LOGGER.info("使用率："+(float)(INDEX_LENGTH-emptySlot)/INDEX_LENGTH*100+"%");
        LOGGER.info("剩槽数："+emptySlot);
    }
    /**
     * 获取字符对应的根节点
     * 如果节点不存在
     * 则增加根节点后返回新增的节点
     * @param character 字符
     * @return 字符对应的根节点
     */
    private TrieNode getRootNodeIfNotExistThenCreate(char character){
        TrieNode trieNode = getRootNode(character);
        if(trieNode == null){
            trieNode = new TrieNode(character);
            addRootNode(trieNode);
        }
        return trieNode;
    }
    /**
     * 新增一个根节点
     * @param rootNode 根节点
     */
    private void addRootNode(TrieNode rootNode){
        //计算节点的存储索引
        int index = rootNode.getCharacter()%INDEX_LENGTH;
        //检查索引是否和其他节点冲突
        TrieNode existTrieNode = ROOT_NODES_INDEX[index];
        if(existTrieNode != null){
            //有冲突，将冲突节点附加到当前节点之后
            rootNode.setSibling(existTrieNode);
        }
        //新增的节点总是在最前
        ROOT_NODES_INDEX[index] = rootNode;
    }
    /**
     * 获取字符对应的根节点
     * 如果不存在，则返回NULL
     * @param character 字符
     * @return 字符对应的根节点
     */
    private TrieNode getRootNode(char character){
        //计算节点的存储索引
        int index = character%INDEX_LENGTH;
        TrieNode trieNode = ROOT_NODES_INDEX[index];
        while(trieNode != null && character != trieNode.getCharacter()){
            //如果节点和其他节点冲突，则需要链式查找
            trieNode = trieNode.getSibling();
        }
        return trieNode;
    }
    public V get(String item){
        return get(item, 0, item.length());
    }
    public V get(String item, int start, int length){
        if(start < 0 || length < 1){
            return null;
        }
        if(item == null || item.length() < length){
            return null;
        }
        //从根节点开始查找
        //获取根节点
        TrieNode node = getRootNode(item.charAt(start));
        if(node == null){
            //不存在根节点，结束查找
            return null;
        }
        //存在根节点，继续查找
        for(int i=1;i child = node.getChild(character);
            if(child == null){
                //未找到匹配节点
                return null;
            }else{
                //找到节点，继续往下找
                node = child;
            }
        }
        if(node.isTerminal()){
            return node.getValue();
        }
        return null;
    }
    /**
     * 移除词性
     * @param item 
     */
    public void remove(String item) {
        if(item == null || item.isEmpty()){
            return;
        }
        if(LOGGER.isDebugEnabled()) {
            LOGGER.debug("移除词性：" + item);
        }
        //从根节点开始查找
        //获取根节点
        TrieNode node = getRootNode(item.charAt(0));
        if(node == null){
            //不存在根节点，结束查找
            if(LOGGER.isDebugEnabled()) {
                LOGGER.debug("词性不存在：" + item);
            }
            return;
        }
        int length = item.length();
        //存在根节点，继续查找
        for(int i=1;i child = node.getChild(character);
            if(child == null){
                //未找到匹配节点
                if(LOGGER.isDebugEnabled()) {
                    LOGGER.debug("词性不存在：" + item);
                }
                return;
            }else{
                //找到节点，继续往下找
                node = child;
            }
        }
        if(node.isTerminal()){
            //设置为非叶子节点，效果相当于移除词性
            node.setTerminal(false);
            node.setValue(null);
            if(LOGGER.isDebugEnabled()) {
                LOGGER.debug("成功移除词性：" + item);
            }
        }else{
            if(LOGGER.isDebugEnabled()) {
                LOGGER.debug("词性不存在：" + item);
            }
        }
    }
    
    public void put(String item, V value){
        //去掉首尾空白字符
        item=item.trim();
        int len = item.length();
        if(len < 1){
            //长度小于1则忽略
            return;
        }
        //从根节点开始添加
        //获取根节点
        TrieNode node = getRootNodeIfNotExistThenCreate(item.charAt(0));
        for(int i=1;i child = node.getChildIfNotExistThenCreate(character);
            //改变顶级节点
            node = child;
        }
        //设置终结字符，表示从根节点遍历到此是一个合法的词
        node.setTerminal(true);
        //设置分值
        node.setValue(value);
    }
    private static class TrieNode implements Comparable{
        private char character;
        private V value;
        private boolean terminal;
        private TrieNode sibling;
        private TrieNode[] children = new TrieNode[0];
        public TrieNode(char character){
            this.character = character;
        }
        public boolean isTerminal() {
            return terminal;
        }
        public void setTerminal(boolean terminal) {
            this.terminal = terminal;
        }        
        public char getCharacter() {
            return character;
        }
        public void setCharacter(char character) {
            this.character = character;
        }
        public V getValue() {
            return value;
        }
        public void setValue(V value) {
            this.value = value;
        }
        public TrieNode getSibling() {
            return sibling;
        }
        public void setSibling(TrieNode sibling) {
            this.sibling = sibling;
        }
        public Collection> getChildren() {
            return Arrays.asList(children);            
        }
        /**
         * 利用二分搜索算法从有序数组中找到特定的节点
         * @param character 待查找节点
         * @return NULL OR 节点数据
         */
        public TrieNode getChild(char character) {
            int index = Arrays.binarySearch(children, character);
            if(index >= 0){
                return children[index];
            }
            return null;
        }        
        public TrieNode getChildIfNotExistThenCreate(char character) {
            TrieNode child = getChild(character);
            if(child == null){
                child = new TrieNode(character);
                addChild(child);
            }
            return child;
        }
        public void addChild(TrieNode child) {
            children = insert(children, child);
        }
        /**
         * 将一个字符追加到有序数组
         * @param array 有序数组
         * @param element 字符
         * @return 新的有序数字
         */
        private TrieNode[] insert(TrieNode[] array, TrieNode element){
            int length = array.length;
            if(length == 0){
                array = new TrieNode[1];
                array[0] = element;
                return array;
            }
            TrieNode[] newArray = new TrieNode[length+1];
            boolean insert=false;
            for(int i=0; i node : ROOT_NODES_INDEX){
            if(node != null){
                show(node, "");
            }
        }
    }
    private void show(TrieNode node, String indent){
        if(node.isTerminal()){
            LOGGER.info(indent+node.getCharacter()+"="+node.getValue()+"(T)");
        }else{
            LOGGER.info(indent+node.getCharacter());
        }        
        for(TrieNode item : node.getChildren()){
            show(item,indent+"\t");
        }
    }
    public static void main(String[] args){
        GenericTrie trie = new GenericTrie<>();
        trie.put("杨尚川", "nr");
        trie.put("杨尚喜", "nr");
        trie.put("人", "n");
        trie.put("写代码", "v");
        trie.showConflict();
        trie.show();
        LOGGER.info("杨尚川：" + trie.get("杨尚川"));
        LOGGER.info("杨尚喜："+trie.get("杨尚喜"));
        LOGGER.info("人："+trie.get("人"));
        LOGGER.info("写代码："+trie.get("写代码"));

        GenericTrie trie2 = new GenericTrie<>();
        trie2.put("杨尚川", 100);
        trie2.put("杨尚喜", 90);
        trie2.put("人", 10);
        trie2.put("写代码", 80);
        trie2.showConflict();
        trie2.show();
        LOGGER.info("杨尚川：" + trie2.get("杨尚川"));
        LOGGER.info("杨尚喜："+trie2.get("杨尚喜"));
        LOGGER.info("人："+trie2.get("人"));
        LOGGER.info("写代码："+trie2.get("写代码"));
    }
}