org.wltea.analyzer.dic.DictSegment Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of IK_Analyzer Show documentation
IK 分词器项目maven版.发布到maven中央仓库
The newest version!
/**
 * 
 * IK 中文分词  版本 5.0
 * IK Analyzer release 5.0
 * 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * 源代码由林良益([email protected])提供
 * 版权声明 2012，乌龙茶工作室
 * provided by Linliangyi and copyright 2012 by Oolong studio
 * 
 */
package org.wltea.analyzer.dic;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * 词典树分段，表示词典树的一个分枝
 */
class DictSegment implements Comparable{
	
	//公用字典表，存储汉字
	private static final Map charMap = new HashMap(16 , 0.95f);
	//数组大小上限
	private static final int ARRAY_LENGTH_LIMIT = 3;

	
	//Map存储结构
	private Map childrenMap;
	//数组方式存储结构
	private DictSegment[] childrenArray;
	
	
	//当前节点上存储的字符
	private Character nodeChar;
	//当前节点存储的Segment数目
	//storeSize <=ARRAY_LENGTH_LIMIT ，使用数组存储， storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
	private int storeSize = 0;
	//当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
	private int nodeState = 0;	
	
	
	DictSegment(Character nodeChar){
		if(nodeChar == null){
			throw new IllegalArgumentException("参数为空异常，字符不能为空");
		}
		this.nodeChar = nodeChar;
	}

	Character getNodeChar() {
		return nodeChar;
	}
	
	/*
	 * 判断是否有下一个节点
	 */
	boolean hasNextNode(){
		return  this.storeSize > 0;
	}
	
	/**
	 * 匹配词段
	 * @param charArray
	 * @return Hit
	 */
	Hit match(char[] charArray){
		return this.match(charArray , 0 , charArray.length , null);
	}
	
	/**
	 * 匹配词段
	 * @param charArray
	 * @param begin
	 * @param length
	 * @return Hit 
	 */
	Hit match(char[] charArray , int begin , int length){
		return this.match(charArray , begin , length , null);
	}
	
	/**
	 * 匹配词段
	 * @param charArray
	 * @param begin
	 * @param length
	 * @param searchHit
	 * @return Hit 
	 */
	Hit match(char[] charArray , int begin , int length , Hit searchHit){
		
		if(searchHit == null){
			//如果hit为空，新建
			searchHit= new Hit();
			//设置hit的其实文本位置
			searchHit.setBegin(begin);
		}else{
			//否则要将HIT状态重置
			searchHit.setUnmatch();
		}
		//设置hit的当前处理位置
		searchHit.setEnd(begin);
		
		Character keyChar = new Character(charArray[begin]);
		DictSegment ds = null;
		
		//引用实例变量为本地变量，避免查询时遇到更新的同步问题
		DictSegment[] segmentArray = this.childrenArray;
		Map segmentMap = this.childrenMap;		
		
		//STEP1 在节点中查找keyChar对应的DictSegment
		if(segmentArray != null){
			//在数组中查找
			DictSegment keySegment = new DictSegment(keyChar);
			int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment);
			if(position >= 0){
				ds = segmentArray[position];
			}

		}else if(segmentMap != null){
			//在map中查找
			ds = (DictSegment)segmentMap.get(keyChar);
		}
		
		//STEP2 找到DictSegment，判断词的匹配状态，是否继续递归，还是返回结果
		if(ds != null){			
			if(length > 1){
				//词未匹配完，继续往下搜索
				return ds.match(charArray, begin + 1 , length - 1 , searchHit);
			}else if (length == 1){
				
				//搜索最后一个char
				if(ds.nodeState == 1){
					//添加HIT状态为完全匹配
					searchHit.setMatch();
				}
				if(ds.hasNextNode()){
					//添加HIT状态为前缀匹配
					searchHit.setPrefix();
					//记录当前位置的DictSegment
					searchHit.setMatchedDictSegment(ds);
				}
				return searchHit;
			}
			
		}
		//STEP3 没有找到DictSegment， 将HIT设置为不匹配
		return searchHit;		
	}

	/**
	 * 加载填充词典片段
	 * @param charArray
	 */
	void fillSegment(char[] charArray){
		this.fillSegment(charArray, 0 , charArray.length , 1); 
	}
	
	/**
	 * 屏蔽词典中的一个词
	 * @param charArray
	 */
	void disableSegment(char[] charArray){
		this.fillSegment(charArray, 0 , charArray.length , 0); 
	}
	
	/**
	 * 加载填充词典片段
	 * @param charArray
	 * @param begin
	 * @param length
	 * @param enabled
	 */
	private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){
		//获取字典表中的汉字对象
		Character beginChar = new Character(charArray[begin]);
		Character keyChar = charMap.get(beginChar);
		//字典中没有该字，则将其添加入字典
		if(keyChar == null){
			charMap.put(beginChar, beginChar);
			keyChar = beginChar;
		}
		
		//搜索当前节点的存储，查询对应keyChar的keyChar，如果没有则创建
		DictSegment ds = lookforSegment(keyChar , enabled);
		if(ds != null){
			//处理keyChar对应的segment
			if(length > 1){
				//词元还没有完全加入词典树
				ds.fillSegment(charArray, begin + 1, length - 1 , enabled);
			}else if (length == 1){
				//已经是词元的最后一个char,设置当前节点状态为enabled，
				//enabled=1表明一个完整的词，enabled=0表示从词典中屏蔽当前词
				ds.nodeState = enabled;
			}
		}

	}
	
	/**
	 * 查找本节点下对应的keyChar的segment	 * 
	 * @param keyChar
	 * @param create  =1如果没有找到，则创建新的segment ; =0如果没有找到，不创建，返回null
	 * @return
	 */
	private DictSegment lookforSegment(Character keyChar ,  int create){
		
		DictSegment ds = null;

		if(this.storeSize <= ARRAY_LENGTH_LIMIT){
			//获取数组容器，如果数组未创建则创建数组
			DictSegment[] segmentArray = getChildrenArray();			
			//搜寻数组
			DictSegment keySegment = new DictSegment(keyChar);
			int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment);
			if(position >= 0){
				ds = segmentArray[position];
			}
		
			//遍历数组后没有找到对应的segment
			if(ds == null && create == 1){
				ds = keySegment;
				if(this.storeSize < ARRAY_LENGTH_LIMIT){
					//数组容量未满，使用数组存储
					segmentArray[this.storeSize] = ds;
					//segment数目+1
					this.storeSize++;
					Arrays.sort(segmentArray , 0 , this.storeSize);
					
				}else{
					//数组容量已满，切换Map存储
					//获取Map容器，如果Map未创建,则创建Map
					Map segmentMap = getChildrenMap();
					//将数组中的segment迁移到Map中
					migrate(segmentArray ,  segmentMap);
					//存储新的segment
					segmentMap.put(keyChar, ds);
					//segment数目+1 ，  必须在释放数组前执行storeSize++ ， 确保极端情况下，不会取到空的数组
					this.storeSize++;
					//释放当前的数组引用
					this.childrenArray = null;
				}

			}			
			
		}else{
			//获取Map容器，如果Map未创建,则创建Map
			Map segmentMap = getChildrenMap();
			//搜索Map
			ds = (DictSegment)segmentMap.get(keyChar);
			if(ds == null && create == 1){
				//构造新的segment
				ds = new DictSegment(keyChar);
				segmentMap.put(keyChar , ds);
				//当前节点存储segment数目+1
				this.storeSize ++;
			}
		}

		return ds;
	}
	
	
	/**
	 * 获取数组容器
	 * 线程同步方法
	 */
	private DictSegment[] getChildrenArray(){
		if(this.childrenArray == null){
			synchronized(this){
				if(this.childrenArray == null){
					this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
				}
			}
		}
		return this.childrenArray;
	}
	
	/**
	 * 获取Map容器
	 * 线程同步方法
	 */	
	private Map getChildrenMap(){
		if(this.childrenMap == null){
			synchronized(this){
				if(this.childrenMap == null){
					this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2,0.8f);
				}
			}
		}
		return this.childrenMap;
	}
	
	/**
	 * 将数组中的segment迁移到Map中
	 * @param segmentArray
	 */
	private void migrate(DictSegment[] segmentArray , Map segmentMap){
		for(DictSegment segment : segmentArray){
			if(segment != null){
				segmentMap.put(segment.nodeChar, segment);
			}
		}
	}

	/**
	 * 实现Comparable接口
	 * @param o
	 * @return int
	 */
	public int compareTo(DictSegment o) {
		//对当前节点存储的char进行比较
		return this.nodeChar.compareTo(o.nodeChar);
	}
	
}