All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wltea.analyzer.dic.DictSegment Maven / Gradle / Ivy

The newest version!
/**
 *
 * IK 中文分词  版本 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * 源代码由林良益(linliangyi2005@gmail.com)提供
 * 版权声明 2012,乌龙茶工作室
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 */
package org.wltea.analyzer.dic;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * 词典树分段,表示词典树的一个分枝
 */
/**
 * One node (branch) of the dictionary trie.
 *
 * <p>Each node stores a single character and its child nodes. Children are kept
 * in a small sorted array while there are at most {@code ARRAY_LENGTH_LIMIT} of
 * them, and are moved into a {@link HashMap} once that limit is exceeded.
 *
 * <p>Mutation goes through the {@code synchronized} private {@code fillSegment};
 * {@code match} takes no lock and works on local snapshots of the child
 * containers instead.
 */
class DictSegment implements Comparable<DictSegment> {

    // Shared table of Character instances, so that every node holding the same
    // character references the same Character object.
    // NOTE(review): this static map is only written from fillSegment, which locks
    // the individual node it is invoked on, not a common lock. Confirm that callers
    // never fill unrelated trees from different threads at the same time.
    private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16, 0.95f);
    // Maximum number of children kept in array form
    private static final int ARRAY_LENGTH_LIMIT = 3;


    // Children stored as a map (used once storeSize > ARRAY_LENGTH_LIMIT)
    private Map<Character, DictSegment> childrenMap;
    // Children stored as an array sorted by character (used while storeSize <= ARRAY_LENGTH_LIMIT)
    private DictSegment[] childrenArray;


    // Character stored on this node
    private final Character nodeChar;
    // Number of child segments stored under this node.
    // storeSize <= ARRAY_LENGTH_LIMIT: array storage; storeSize > ARRAY_LENGTH_LIMIT: map storage
    private int storeSize = 0;
    // Node state, 0 by default; 1 means the path from the root to this node spells a word
    private int nodeState = 0;


    /**
     * Creates a node for the given character.
     *
     * @param nodeChar character stored on the node; must not be {@code null}
     * @throws IllegalArgumentException if {@code nodeChar} is {@code null}
     */
    DictSegment(Character nodeChar) {
        if (nodeChar == null) {
            throw new IllegalArgumentException("参数为空异常,字符不能为空");
        }
        this.nodeChar = nodeChar;
    }

    /**
     * @return the character stored on this node
     */
    Character getNodeChar() {
        return nodeChar;
    }

    /**
     * Tells whether this node has at least one child node.
     *
     * @return {@code true} if the child count is greater than zero
     */
    boolean hasNextNode() {
        return this.storeSize > 0;
    }

    /**
     * Matches the whole character array against the sub-tree below this node.
     *
     * @param charArray characters to match; indexing starts at 0, so an empty array
     *                  results in an {@link ArrayIndexOutOfBoundsException}
     * @return the Hit describing the match result
     */
    Hit match(char[] charArray) {
        return this.match(charArray, 0, charArray.length, null);
    }

    /**
     * Matches a slice of the character array against the sub-tree below this node,
     * using a newly created Hit.
     *
     * @param charArray characters to match
     * @param begin     index of the first character to match
     * @param length    number of characters to match
     * @return the Hit describing the match result
     */
    Hit match(char[] charArray, int begin, int length) {
        return this.match(charArray, begin, length, null);
    }

    /**
     * Matches a slice of the character array against the sub-tree below this node.
     *
     * <p>The child for {@code charArray[begin]} is looked up; while more than one
     * character remains the search recurses into that child. On the last character
     * the Hit is flagged as a match when the child ends a word, and as a prefix
     * (with the child recorded on the Hit) when the child has children of its own.
     * If no child is found the Hit is returned without either flag being set here.
     *
     * @param charArray characters to match
     * @param begin     index of the character examined at this level
     * @param length    number of characters still to match, including {@code begin}
     * @param searchHit Hit to reuse, or {@code null} to create a new one; a reused Hit
     *                  is reset with {@code setUnmatch()} and keeps its begin position
     * @return the (new or reused) Hit; its end position is the index of the last
     *         character examined
     */
    Hit match(char[] charArray, int begin, int length, Hit searchHit) {

        if (searchHit == null) {
            // No Hit supplied: create one and record where the text starts
            searchHit = new Hit();
            searchHit.setBegin(begin);
        } else {
            // Reused Hit: clear the state left over from the previous lookup
            searchHit.setUnmatch();
        }
        // Record the position currently being processed
        searchHit.setEnd(begin);

        Character keyChar = charArray[begin];
        DictSegment ds = null;

        // Copy the instance fields to locals so that a concurrent update cannot
        // swap the containers in the middle of this lookup
        DictSegment[] segmentArray = this.childrenArray;
        Map<Character, DictSegment> segmentMap = this.childrenMap;

        // STEP1: find the child segment for keyChar
        if (segmentArray != null) {
            // Array storage: binary search over the occupied part of the array.
            // storeSize is bumped past ARRAY_LENGTH_LIMIT before the array reference
            // is released during migration to the map, so the bound is clamped to the
            // snapshot's length to keep the search range valid.
            DictSegment keySegment = new DictSegment(keyChar);
            int searchEnd = Math.min(this.storeSize, segmentArray.length);
            int position = Arrays.binarySearch(segmentArray, 0, searchEnd, keySegment);
            if (position >= 0) {
                ds = segmentArray[position];
            }

        } else if (segmentMap != null) {
            // Map storage
            ds = segmentMap.get(keyChar);
        }

        // STEP2: a child was found; either keep descending or report the result
        if (ds != null) {
            if (length > 1) {
                // Characters remain: continue the search one level down
                return ds.match(charArray, begin + 1, length - 1, searchHit);
            } else if (length == 1) {

                // Last character of the slice
                if (ds.nodeState == 1) {
                    // The path ends a word: full match
                    searchHit.setMatch();
                }
                if (ds.hasNextNode()) {
                    // Longer words share this path: prefix match
                    searchHit.setPrefix();
                    // Remember the node reached for this position
                    searchHit.setMatchedDictSegment(ds);
                }
                return searchHit;
            }

        }
        // STEP3: no child found (or nothing left to match); the Hit stays unmatched
        return searchHit;
    }

    /**
     * Adds a word to the sub-tree below this node.
     *
     * @param charArray characters of the word; indexing starts at 0, so an empty array
     *                  results in an {@link ArrayIndexOutOfBoundsException}
     */
    void fillSegment(char[] charArray) {
        this.fillSegment(charArray, 0, charArray.length, 1);
    }

    /**
     * Disables (masks) a word in the sub-tree below this node. Nodes missing on the
     * word's path are not created; if the path does not exist nothing changes.
     *
     * @param charArray characters of the word; indexing starts at 0, so an empty array
     *                  results in an {@link ArrayIndexOutOfBoundsException}
     */
    void disableSegment(char[] charArray) {
        this.fillSegment(charArray, 0, charArray.length, 0);
    }

    /**
     * Walks (and, when enabling, builds) the path for a word, then sets the state of
     * the node reached by its last character.
     *
     * @param charArray characters of the word
     * @param begin     index of the character handled at this level
     * @param length    number of characters still to handle, including {@code begin}
     * @param enabled   1 to mark the word as present (creating missing nodes),
     *                  0 to mask it (missing nodes are not created)
     */
    private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
        // Fetch the shared Character instance for this char, registering it on first use
        Character beginChar = charArray[begin];
        Character keyChar = charMap.get(beginChar);
        if (keyChar == null) {
            charMap.put(beginChar, beginChar);
            keyChar = beginChar;
        }

        // Find the child for keyChar; it is created only when enabled == 1
        DictSegment ds = lookforSegment(keyChar, enabled);
        if (ds != null) {
            if (length > 1) {
                // The word is not fully placed in the tree yet: descend
                ds.fillSegment(charArray, begin + 1, length - 1, enabled);
            } else if (length == 1) {
                // Last character of the word: store the requested state.
                // enabled=1 marks a complete word, enabled=0 masks the word
                ds.nodeState = enabled;
            }
        }

    }

    /**
     * Looks up the child segment of this node for the given character.
     *
     * @param keyChar character to look up
     * @param create  1: create and store a new segment when none is found;
     *                any other value: do not create, return {@code null} instead
     * @return the child segment, or {@code null} if it is absent and was not created
     */
    private DictSegment lookforSegment(Character keyChar, int create) {

        DictSegment ds = null;

        if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
            // Array storage; the array is allocated on first use
            DictSegment[] segmentArray = getChildrenArray();
            // Binary search over the occupied part of the array
            DictSegment keySegment = new DictSegment(keyChar);
            int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
            if (position >= 0) {
                ds = segmentArray[position];
            }

            // Not present in the array
            if (ds == null && create == 1) {
                ds = keySegment;
                if (this.storeSize < ARRAY_LENGTH_LIMIT) {
                    // Room left: append, count it, then re-sort so that binary
                    // search keeps working
                    segmentArray[this.storeSize] = ds;
                    this.storeSize++;
                    Arrays.sort(segmentArray, 0, this.storeSize);

                } else {
                    // Array full: switch to map storage (map is allocated on first use)
                    Map<Character, DictSegment> segmentMap = getChildrenMap();
                    // Move the existing children into the map
                    migrate(segmentArray, segmentMap);
                    // Store the new child
                    segmentMap.put(keyChar, ds);
                    // The count must be increased BEFORE the array is released
                    // (ordering required by the original implementation)
                    this.storeSize++;
                    // Release the array reference
                    this.childrenArray = null;
                }

            }

        } else {
            // Map storage; the map is allocated on first use
            Map<Character, DictSegment> segmentMap = getChildrenMap();
            ds = segmentMap.get(keyChar);
            if (ds == null && create == 1) {
                // Build and store a new child, then count it
                ds = new DictSegment(keyChar);
                segmentMap.put(keyChar, ds);
                this.storeSize++;
            }
        }

        return ds;
    }


    /**
     * Returns the child array, allocating it under this node's monitor if it does
     * not exist yet.
     *
     * @return the child array, never {@code null}
     */
    private DictSegment[] getChildrenArray() {
        if (this.childrenArray == null) {
            synchronized (this) {
                if (this.childrenArray == null) {
                    this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
                }
            }
        }
        return this.childrenArray;
    }

    /**
     * Returns the child map, allocating it under this node's monitor if it does
     * not exist yet.
     *
     * @return the child map, never {@code null}
     */
    private Map<Character, DictSegment> getChildrenMap() {
        if (this.childrenMap == null) {
            synchronized (this) {
                if (this.childrenMap == null) {
                    this.childrenMap = new HashMap<Character, DictSegment>(ARRAY_LENGTH_LIMIT * 2, 0.8f);
                }
            }
        }
        return this.childrenMap;
    }

    /**
     * Copies every non-null segment of the array into the map, keyed by the
     * segment's character.
     *
     * @param segmentArray source array
     * @param segmentMap   destination map
     */
    private void migrate(DictSegment[] segmentArray, Map<Character, DictSegment> segmentMap) {
        for (DictSegment segment : segmentArray) {
            if (segment != null) {
                segmentMap.put(segment.nodeChar, segment);
            }
        }
    }

    /**
     * Orders segments by the character they store, which is what the sorted child
     * array and its binary search rely on.
     *
     * @param o segment to compare with
     * @return negative, zero or positive as this node's character is less than,
     *         equal to or greater than the other node's character
     */
    @Override
    public int compareTo(DictSegment o) {
        return this.nodeChar.compareTo(o.nodeChar);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy