/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.paoding.analysis.dictionary;
import java.util.HashMap;
import java.util.Map;
/**
 * Hash + binary-search implementation of {@link Dictionary}.
 *
 * <p>Intended for large word lists in which many words share the same leading
 * character, e.g. a dictionary of Chinese words. According to the original
 * author, lookups in that situation are faster than with a plain binary
 * dictionary.
 *
 * <p>A HashBinaryDictionary takes an array of words sorted in ascending order.
 * All words that have the same character at position {@code hashIndex} are
 * grouped into one sub-dictionary: a {@link BinaryDictionary} for small groups,
 * or a nested HashBinaryDictionary keyed on the next character for larger ones
 * (see {@link #createSubDictionary(Word[], int, int)}). A lookup first selects
 * the sub-dictionary by that character and then locates the word inside it.
 *
 * @author Zhiliang Wang [[email protected] ]
 *
 * @see BinaryDictionary
 *
 * @since 1.0
 */
public class HashBinaryDictionary implements Dictionary {

    // -------------------------------------------------

    /**
     * All words in ascending order. The same array is handed to every
     * sub-dictionary; this instance covers the range {@code [start, end)}.
     * Also backs {@link #get(int)}.
     */
    private final Word[] ascWords;

    /**
     * Maps {@link #keyOf(char)} of a hash character to the
     * {@link SubDictionaryWrap} holding the words with that character.
     */
    private final Map subs;

    /** Position, within each word, of the character used as the hash key. */
    private final int hashIndex;

    /** Index in {@link #ascWords} of the first word covered (inclusive). */
    private final int start;

    /** Index in {@link #ascWords} just past the last word covered (exclusive). */
    private final int end;

    /** Number of words covered, i.e. {@code end - start}. */
    private final int count;

    // -------------------------------------------------

    /**
     * Creates a dictionary over the whole array, hashing on the first
     * character of each word.
     *
     * @param ascWords
     *            words sorted in ascending order
     * @param initialCapacity
     *            initial capacity of the internal {@link HashMap}
     * @param loadFactor
     *            load factor of the internal {@link HashMap}
     */
    public HashBinaryDictionary(Word[] ascWords, int initialCapacity, float loadFactor) {
        this(ascWords, 0, 0, ascWords.length, initialCapacity, loadFactor);
    }

    /**
     * Creates a dictionary over the range {@code [start, end)} of
     * {@code ascWords}, hashing on the character at {@code hashIndex}.
     *
     * @param ascWords
     *            words sorted in ascending order
     * @param hashIndex
     *            position within each word of the character used as hash key
     * @param start
     *            index of the first word covered (inclusive)
     * @param end
     *            index just past the last word covered (exclusive)
     * @param initialCapacity
     *            initial capacity of the internal {@link HashMap}
     * @param loadFactor
     *            load factor of the internal {@link HashMap}
     */
    public HashBinaryDictionary(Word[] ascWords, int hashIndex, int start, int end,
            int initialCapacity, float loadFactor) {
        this.ascWords = ascWords;
        this.hashIndex = hashIndex;
        this.start = start;
        this.end = end;
        this.count = end - start;
        this.subs = new HashMap(initialCapacity, loadFactor);
        // All fields are assigned before this overridable hook runs.
        createSubDictionaries();
    }

    // -------------------------------------------------

    /**
     * Builds the hash-character to sub-dictionary mapping. Called by the
     * constructor.
     *
     * <p>Walks the covered range once and cuts it into maximal runs of words
     * that share the same character at {@code hashIndex}; each run becomes one
     * sub-dictionary.
     */
    protected void createSubDictionaries() {
        // Nothing to index for an empty range. Without the start >= end check
        // an empty sub-range would read ascWords[start], which lies outside
        // the range, and register an empty sub-dictionary for its character.
        if (this.start >= this.end || this.start >= ascWords.length) {
            return;
        }

        // Locate the first and last word of each run with the same hash
        // character to determine the sub-dictionaries.
        int runBegin = this.start;
        char runChar = getChar(ascWords[runBegin], hashIndex);
        for (int i = this.start + 1; i < this.end; i++) {
            char current = getChar(ascWords[i], hashIndex);
            if (current != runChar) {
                addSubDictionary(runChar, runBegin, i);
                runBegin = i;
                runChar = current;
            }
        }
        // The final run extends to the end of the covered range.
        addSubDictionary(runChar, runBegin, this.end);
    }

    /**
     * Returns the character of {@code s} at {@code index}, or {@code (char) 0}
     * when {@code s} is too short to have one.
     *
     * @param s
     *            the character sequence to read
     * @param index
     *            the position to read
     * @return the character at {@code index}, or {@code (char) 0}
     */
    protected char getChar(CharSequence s, int index) {
        return index < s.length() ? s.charAt(index) : (char) 0;
    }

    /**
     * Registers the words between {@code beginIndex} (inclusive) and
     * {@code endIndex} (exclusive) as the sub-dictionary for {@code hashChar}.
     *
     * @param hashChar
     *            the hash character shared by the words
     * @param beginIndex
     *            index in the word array of the first word (inclusive)
     * @param endIndex
     *            index in the word array just past the last word (exclusive)
     */
    protected void addSubDictionary(char hashChar, int beginIndex, int endIndex) {
        Dictionary subDic = createSubDictionary(ascWords, beginIndex, endIndex);
        subs.put(keyOf(hashChar), new SubDictionaryWrap(hashChar, subDic, beginIndex));
    }

    /**
     * Creates the sub-dictionary for the range {@code [beginIndex, endIndex)}.
     *
     * <p>Ranges of fewer than 16 words, or any range once {@code hashIndex}
     * has reached 16, use a {@link BinaryDictionary}. Larger ranges use a
     * nested HashBinaryDictionary that hashes on the next character position.
     *
     * @param ascWords
     *            words sorted in ascending order
     * @param beginIndex
     *            index of the first word (inclusive)
     * @param endIndex
     *            index just past the last word (exclusive)
     * @return the sub-dictionary covering the range
     */
    protected Dictionary createSubDictionary(Word[] ascWords, int beginIndex, int endIndex) {
        int count = endIndex - beginIndex;
        if (count < 16 || hashIndex >= 16) {
            return new BinaryDictionary(ascWords, beginIndex, endIndex);
        }
        return new HashBinaryDictionary(ascWords, hashIndex + 1, beginIndex, endIndex,
                getCapacity(count), 0.75f);
    }

    /**
     * Candidate initial capacities for nested dictionaries, in ascending order.
     * NOTE(review): the last entry, 10192, is not a power of two like the
     * others; possibly 8192 was intended - confirm before changing.
     */
    protected static final int[] capacityCandiate = { 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 10192 };

    /**
     * Picks the initial capacity for a nested dictionary of {@code count}
     * words: the smallest candidate that is at least {@code count * 4 / 3 + 1},
     * or the largest candidate when none is big enough.
     *
     * @param count
     *            number of words in the nested dictionary
     * @return one of the values of {@link #capacityCandiate}
     */
    protected int getCapacity(int count) {
        int required = (count << 2) / 3 + 1;
        for (int i = 0; i < capacityCandiate.length; i++) {
            if (required <= capacityCandiate[i]) {
                return capacityCandiate[i];
            }
        }
        return capacityCandiate[capacityCandiate.length - 1];
    }

    // -------------------------------------------------

    /**
     * Returns the word at {@code index}, counted from the first word covered
     * by this dictionary. The index is not checked against {@link #size()}.
     *
     * @param index
     *            position relative to the start of the covered range
     * @return the word at {@code ascWords[start + index]}
     */
    public Word get(int index) {
        return ascWords[start + index];
    }

    /**
     * Searches for the {@code count} characters of {@code input} starting at
     * {@code begin}.
     *
     * <p>The sub-dictionary is chosen by the character at
     * {@code hashIndex + begin}. The code assumes the caller passes
     * {@code count >= hashIndex + 1}, so that this character is part of the
     * searched text.
     *
     * @param input
     *            the text containing the characters to look up
     * @param begin
     *            offset in {@code input} of the first character
     * @param count
     *            number of characters to look up
     * @return {@link Hit#UNDEFINED} when no sub-dictionary exists for the hash
     *         character; otherwise a hit built here or by the sub-dictionary
     */
    public Hit search(CharSequence input, int begin, int count) {
        SubDictionaryWrap subDic = (SubDictionaryWrap) subs.get(keyOf(input.charAt(hashIndex + begin)));
        if (subDic == null) {
            return Hit.UNDEFINED;
        }
        Dictionary dic = subDic.dic;

        // count == hashIndex + 1: the searched text ends at the hash
        // character, so only the first word of the sub-dictionary can equal it.
        if (count == hashIndex + 1) {
            Word header = dic.get(0);
            if (header.length() == hashIndex + 1) {
                // Exact match; attach the following word of the array if any.
                if (subDic.wordIndexOffset + 1 < this.ascWords.length) {
                    return new Hit(subDic.wordIndexOffset, header, this.ascWords[subDic.wordIndexOffset + 1]);
                } else {
                    return new Hit(subDic.wordIndexOffset, header, null);
                }
            } else {
                // The first word is longer than the searched text.
                return new Hit(Hit.UNCLOSED_INDEX, null, header);
            }
        }

        // count > hashIndex + 1: delegate to the sub-dictionary.
        Hit word = dic.search(input, begin, count);
        if (word.isHit()) {
            int index = subDic.wordIndexOffset + word.getIndex();
            word.setIndex(index);
            // get(index + 1) reads ascWords[start + index + 1]; make sure that
            // position exists. "index < size()" alone let the last word of the
            // array fall through to an ArrayIndexOutOfBoundsException.
            // NOTE(review): wordIndexOffset is an absolute position in
            // ascWords while get() adds start again, so for nested
            // dictionaries (start > 0) the offset may be applied twice -
            // confirm against the index semantics of the sub-dictionaries.
            if (word.getNext() == null && index < size()
                    && start + index + 1 < ascWords.length) {
                word.setNext(get(index + 1));
            }
        }
        return word;
    }

    /**
     * Returns the number of words covered by this dictionary.
     *
     * @return {@code end - start}
     */
    public int size() {
        return count;
    }

    // -------------------------------------------------

    /**
     * Returns the key under which a character is stored in {@link #subs}.
     *
     * @param theChar
     *            the hash character
     * @return an {@link Integer} holding the character's code
     *
     * @see #subs
     */
    protected Object keyOf(char theChar) {
        // NOTE(review): Integer.valueOf(theChar) would avoid the deprecated
        // constructor, but needs Java 5+; this file appears to target an
        // older source level - confirm before switching.
        return new Integer(theChar);
    }

    /**
     * Holder that pairs a sub-dictionary with its hash character and its
     * position in the word array.
     */
    static class SubDictionaryWrap {

        /** The hash character shared by the words of the sub-dictionary. */
        char hashChar;

        /** The sub-dictionary itself. */
        Dictionary dic;

        /**
         * Index in the full word array of the first word of the
         * sub-dictionary.
         */
        int wordIndexOffset;

        public SubDictionaryWrap(char hashChar, Dictionary dic, int wordIndexOffset) {
            this.hashChar = hashChar;
            this.dic = dic;
            this.wordIndexOffset = wordIndexOffset;
        }
    }
}