All downloads are FREE. The search and download functionality uses the official Maven repository.

org.ansj.library.DATDictionary Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
package org.ansj.library;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;

import org.ansj.dic.DicReader;
import org.ansj.domain.AnsjItem;
import org.ansj.domain.PersonNatureAttr;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.name.PersonAttrLibrary;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.dat.DoubleArrayTire;
import org.nlpcn.commons.lang.dat.Item;
import org.nlpcn.commons.lang.util.logging.Log;

/**
 * Static access point to the core dictionary, which is loaded once from {@code core.dic}
 * into a double-array trie while this class is being initialized.
 *
 * <p>The static fields below are order-sensitive: {@code logger} and {@link #IN_SYSTEM} are
 * declared before {@code DAT} because {@code loadDAT()} uses both of them while it runs.
 */
public class DATDictionary {

	private static final Log logger = MyStaticValue.getLog();

	/**
	 * Table of characters that occur in dictionary words; each recorded character is stored
	 * at its own code-unit index. According to the original author's note this table also
	 * takes part in simplified/traditional Chinese conversion.
	 */
	public static final char[] IN_SYSTEM = new char[65536];

	/**
	 * Core dictionary.
	 */
	private static final DoubleArrayTire DAT = loadDAT();

	/**
	 * Array length, copied from the core dictionary's {@code arrayLength}.
	 */
	public static int arrayLength = DAT.arrayLength;

	/**
	 * Loads the core dictionary from {@code core.dic}, merges the person-name attributes into
	 * it, and records the characters of dictionary words in {@link #IN_SYSTEM}.
	 *
	 * @return the loaded dictionary, never {@code null}
	 * @throws IllegalStateException
	 *             if the dictionary cannot be read or its items cannot be created; the
	 *             underlying exception is logged and attached as the cause
	 */
	private static DoubleArrayTire loadDAT() {
		long start = System.currentTimeMillis();
		Exception failure;
		try {
			DoubleArrayTire dat = DoubleArrayTire.loadText(DicReader.getInputStream("core.dic"), AnsjItem.class);
			// Required for person-name recognition.
			personNameFull(dat);
			// Record the characters of dictionary words and drop data that is no longer needed.
			for (Item item : dat.getDAT()) {
				if (item == null) {
					continue;
				}
				String name = item.getName();
				if (name == null) {
					continue;
				}

				if (item.getStatus() < 4) {
					for (int i = 0; i < name.length(); i++) {
						char ch = name.charAt(i);
						IN_SYSTEM[ch] = ch;
					}
				}
				// Items with status below 2 do not keep their name.
				if (item.getStatus() < 2) {
					item.setName(null);
				}
			}
			// Normalization of special characters.
			IN_SYSTEM['%'] = '%';
			logger.info("init core library ok use time : " + (System.currentTimeMillis() - start));
			return dat;
		} catch (InstantiationException e) {
			MyStaticValue.LIBRARYLOG.warn("无法实例化", e);
			failure = e;
		} catch (IllegalAccessException e) {
			MyStaticValue.LIBRARYLOG.warn("非法访问", e);
			failure = e;
		} catch (NumberFormatException e) {
			MyStaticValue.LIBRARYLOG.warn("数字格式异常", e);
			failure = e;
		} catch (IOException e) {
			MyStaticValue.LIBRARYLOG.warn("IO异常", e);
			failure = e;
		}

		// Returning null here would only postpone the failure to the first dereference of DAT
		// (already in the arrayLength initializer, assuming arrayLength is an instance field),
		// as a NullPointerException that hides the real cause. Fail with the cause instead.
		throw new IllegalStateException("unable to load core dictionary core.dic", failure);
	}

	/**
	 * Attaches the person-name attributes provided by {@link PersonAttrLibrary} to the matching
	 * dictionary items. A single-character key whose slot in the trie array is empty gets a
	 * newly created item; any other key that is not found in the dictionary is skipped.
	 *
	 * @param dat
	 *            the dictionary to update in place
	 * @throws NumberFormatException
	 *             declared by the original signature
	 * @throws IOException
	 *             declared by the original signature
	 */
	private static void personNameFull(DoubleArrayTire dat) throws NumberFormatException, IOException {
		HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap();

		// Base value given to items created here.
		final char base = 0;

		// Supplement the part-of-speech information for person names.
		Set<Entry<String, PersonNatureAttr>> entrySet = personMap.entrySet();
		for (Entry<String, PersonNatureAttr> entry : entrySet) {
			String word = entry.getKey();

			AnsjItem ansjItem;
			if (word.length() == 1 && dat.getDAT()[word.charAt(0)] == null) {
				// The character has no slot yet: create one so the attribute can be stored.
				ansjItem = new AnsjItem();
				ansjItem.setBase(base);
				ansjItem.setCheck(-1);
				ansjItem.setStatus((byte) 3);
				ansjItem.setName(word);
				dat.getDAT()[word.charAt(0)] = ansjItem;
			} else {
				ansjItem = dat.getItem(word);
			}

			if (ansjItem == null) {
				continue;
			}

			if (ansjItem.termNatures == null) {
				if (word.length() == 1 && word.charAt(0) < 256) {
					// NOTE(review): TermNatures.NULL looks like a shared constant, and the
					// setPersonNatureAttr call below is then applied to it - confirm that
					// this is intended.
					ansjItem.termNatures = TermNatures.NULL;
				} else {
					ansjItem.termNatures = new TermNatures(TermNature.NR);
				}
			}
			ansjItem.termNatures.setPersonNatureAttr(entry.getValue());
		}
	}

	/**
	 * Returns the status of the dictionary item stored at the index of the given character.
	 *
	 * @param c
	 *            the character to look up
	 * @return the item's status, or {@code 0} if there is no item for the character
	 */
	public static int status(char c) {
		Item item = (AnsjItem) DAT.getDAT()[c];
		if (item == null) {
			return 0;
		}
		return item.getStatus();
	}

	/**
	 * Tells whether a word is in the dictionary.
	 *
	 * @param word
	 *            the word to look up
	 * @return {@code true} if an item exists for the word and its status is greater than 1
	 */
	public static boolean isInSystemDic(String word) {
		Item item = DAT.getItem(word);
		return item != null && item.getStatus() > 1;
	}

	/**
	 * Looks up a dictionary item by index.
	 *
	 * @param index
	 *            the index passed to the core dictionary
	 * @return the item, or {@link AnsjItem#NULL} if the dictionary returns {@code null}
	 */
	public static AnsjItem getItem(int index) {
		AnsjItem item = DAT.getItem(index);
		if (item == null) {
			return AnsjItem.NULL;
		}

		return item;
	}

	/**
	 * Looks up a dictionary item by word.
	 *
	 * @param str
	 *            the word to look up
	 * @return the item, or {@link AnsjItem#NULL} if the dictionary returns {@code null}
	 */
	public static AnsjItem getItem(String str) {
		AnsjItem item = DAT.getItem(str);
		if (item == null) {
			return AnsjItem.NULL;
		}

		return item;
	}

	/**
	 * Returns the id that the core dictionary reports for the given word.
	 *
	 * @param str
	 *            the word to look up
	 * @return the value of the core dictionary's {@code getId(str)}
	 */
	public static int getId(String str) {
		return DAT.getId(str);
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy