org.ansj.library.DATDictionary Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ansj_seg Show documentation
Show all versions of ansj_seg Show documentation
Best Java Chinese word segmentation!
package org.ansj.library;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;
import org.ansj.dic.DicReader;
import org.ansj.domain.AnsjItem;
import org.ansj.domain.PersonNatureAttr;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.name.PersonAttrLibrary;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.dat.DoubleArrayTire;
import org.nlpcn.commons.lang.dat.Item;
import org.nlpcn.commons.lang.util.logging.Log;
/**
 * Static access point to the core dictionary, which is loaded once from {@code core.dic}
 * into a {@link DoubleArrayTire} during class initialization.
 */
public class DATDictionary {

    private static final Log logger = MyStaticValue.getLog();

    /**
     * Every character that appears in a dictionary word, indexed by its char value; also
     * takes on the task of simplified/traditional Chinese conversion.
     * Must be declared before {@link #DAT} because {@link #loadDAT()} writes into it.
     */
    public static final char[] IN_SYSTEM = new char[65536];

    /**
     * Core dictionary.
     */
    private static final DoubleArrayTire DAT = loadDAT();

    /**
     * Array length of the core dictionary.
     */
    public static int arrayLength = DAT.arrayLength;

    /**
     * Loads the core dictionary, merges the person-name attributes into it and fills
     * {@link #IN_SYSTEM}.
     *
     * @return the loaded dictionary, never {@code null}
     * @throws IllegalStateException if {@code core.dic} cannot be read or parsed; the
     *         failure is logged first and the original exception is kept as the cause
     */
    private static DoubleArrayTire loadDAT() {
        long start = System.currentTimeMillis();
        try {
            DoubleArrayTire dat = DoubleArrayTire.loadText(DicReader.getInputStream("core.dic"), AnsjItem.class);
            // Required for person-name recognition.
            personNameFull(dat);
            // Record the characters of the dictionary words, and clear part of the data.
            for (Item item : dat.getDAT()) {
                if (item == null) {
                    continue;
                }
                String name = item.getName();
                if (name == null) {
                    continue;
                }
                if (item.getStatus() < 4) {
                    for (int i = 0; i < name.length(); i++) {
                        char ch = name.charAt(i);
                        IN_SYSTEM[ch] = ch;
                    }
                }
                // Items with status below 2 have their name dropped once its characters are recorded.
                if (item.getStatus() < 2) {
                    item.setName(null);
                }
            }
            // Special-character normalization (as written here, '%' maps to itself).
            IN_SYSTEM['%'] = '%';
            logger.info("init core library ok use time : " + (System.currentTimeMillis() - start));
            return dat;
        } catch (InstantiationException e) {
            MyStaticValue.LIBRARYLOG.warn("无法实例化", e);
            throw new IllegalStateException("Unable to load core dictionary core.dic", e);
        } catch (IllegalAccessException e) {
            MyStaticValue.LIBRARYLOG.warn("非法访问", e);
            throw new IllegalStateException("Unable to load core dictionary core.dic", e);
        } catch (NumberFormatException e) {
            MyStaticValue.LIBRARYLOG.warn("数字格式异常", e);
            throw new IllegalStateException("Unable to load core dictionary core.dic", e);
        } catch (IOException e) {
            MyStaticValue.LIBRARYLOG.warn("IO异常", e);
            throw new IllegalStateException("Unable to load core dictionary core.dic", e);
        }
    }

    /**
     * Supplements the dictionary items with the person-name attributes provided by
     * {@link PersonAttrLibrary}.
     *
     * @param dat the dictionary being loaded; its items are modified in place
     * @throws NumberFormatException propagated from reading the person-name library
     * @throws IOException propagated from reading the person-name library
     */
    private static void personNameFull(DoubleArrayTire dat) throws NumberFormatException, IOException {
        HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap();
        // Supplement the person-name natures.
        for (Entry<String, PersonNatureAttr> entry : personMap.entrySet()) {
            String word = entry.getKey();
            AnsjItem ansjItem;
            if (word.length() == 1 && dat.getDAT()[word.charAt(0)] == null) {
                // A single character with no slot yet: create one directly at its char index.
                ansjItem = new AnsjItem();
                ansjItem.setBase(0);
                ansjItem.setCheck(-1);
                ansjItem.setStatus((byte) 3);
                ansjItem.setName(word);
                dat.getDAT()[word.charAt(0)] = ansjItem;
            } else {
                ansjItem = dat.getItem(word);
            }
            if (ansjItem == null) {
                continue;
            }
            if (ansjItem.termNatures == null) {
                if (word.length() == 1 && word.charAt(0) < 256) {
                    // NOTE(review): the setPersonNatureAttr call below then writes into the
                    // shared TermNatures.NULL instance - confirm this is intended.
                    ansjItem.termNatures = TermNatures.NULL;
                } else {
                    ansjItem.termNatures = new TermNatures(TermNature.NR);
                }
            }
            ansjItem.termNatures.setPersonNatureAttr(entry.getValue());
        }
    }

    /**
     * Returns the status of the dictionary item stored at the index of the given character.
     *
     * @param c the character to look up
     * @return the item's status, or {@code 0} when there is no item at that index
     */
    public static int status(char c) {
        Item item = DAT.getDAT()[c];
        if (item == null) {
            return 0;
        }
        return item.getStatus();
    }

    /**
     * Determines whether a word is in the dictionary.
     *
     * @param word the word to look up
     * @return {@code true} if an item exists for the word and its status is greater than 1
     */
    public static boolean isInSystemDic(String word) {
        Item item = DAT.getItem(word);
        return item != null && item.getStatus() > 1;
    }

    /**
     * Looks up a dictionary item by its index.
     *
     * @param index the index of the item
     * @return the item, or {@link AnsjItem#NULL} when the lookup yields nothing
     */
    public static AnsjItem getItem(int index) {
        AnsjItem item = DAT.getItem(index);
        return item == null ? AnsjItem.NULL : item;
    }

    /**
     * Looks up a dictionary item by its word.
     *
     * @param str the word to look up
     * @return the item, or {@link AnsjItem#NULL} when the lookup yields nothing
     */
    public static AnsjItem getItem(String str) {
        AnsjItem item = DAT.getItem(str);
        return item == null ? AnsjItem.NULL : item;
    }

    /**
     * Returns the dictionary id of a word, as reported by the underlying dictionary.
     *
     * @param str the word to look up
     * @return the id returned by the underlying dictionary for that word
     */
    public static int getId(String str) {
        return DAT.getId(str);
    }
}