org.ansj.library.DicLibrary Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ansj_seg Show documentation
Show all versions of ansj_seg Show documentation
The best Java Chinese word segmenter!
package org.ansj.library;
import java.io.BufferedReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
public class DicLibrary {
private static final Log LOG = LogFactory.getLog();
public static final String DEFAULT = "dic";
public static final String DEFAULT_NATURE = "userDefine";
public static final Integer DEFAULT_FREQ = 1000;
public static final String DEFAULT_FREQ_STR = "1000";
// 用户自定义词典
private static final Map> DIC = new HashMap<>();
static {
for (Entry entry : MyStaticValue.ENV.entrySet()) {
if (entry.getKey().startsWith(DEFAULT)) {
put(entry.getKey(), entry.getValue());
}
}
putIfAbsent(DEFAULT, "library/default.dic");
Forest forest = get();
if (forest == null) {
put(DEFAULT, DEFAULT, new Forest());
}
}
/**
* 关键词增加
*
* @param keyword 所要增加的关键词
* @param nature 关键词的词性
* @param freq 关键词的词频
*/
public static void insert(String key, String keyword, String nature, int freq) {
Forest dic = get(key);
String[] paramers = new String[2];
paramers[0] = nature;
paramers[1] = String.valueOf(freq);
Value value = new Value(keyword, paramers);
Library.insertWord(dic, value);
}
/**
* 增加关键词
*
* @param keyword
*/
public static void insert(String key, String keyword) {
insert(key, keyword, DEFAULT_NATURE, DEFAULT_FREQ);
}
/**
* 删除关键词
*/
public static void delete(String key, String word) {
Forest dic = get(key);
if (dic != null) {
Library.removeWord(dic, word);
}
}
/**
* 将用户自定义词典清空
*/
public static void clear(String key) {
get(key).clear();
}
public static Forest get() {
if (!DIC.containsKey(DEFAULT)) {
return null;
}
return get(DEFAULT);
}
/**
* 根据模型名称获取crf模型
*
* @param modelName
* @return
*/
public static Forest get(String key) {
KV kv = DIC.get(key);
if (kv == null) {
if (MyStaticValue.ENV.containsKey(key)) {
putIfAbsent(key, MyStaticValue.ENV.get(key));
return get(key);
}
LOG.warn("dic " + key + " not found in config ");
return null;
}
Forest forest = kv.getV();
if (forest == null) {
forest = init(key, kv);
}
return forest;
}
/**
* 根据keys获取词典集合
*
* @param keys
* @return
*/
public static Forest[] gets(String... keys) {
Forest[] forests = new Forest[keys.length];
for (int i = 0; i < forests.length; i++) {
forests[i] = get(keys[i]);
}
return forests;
}
/**
* 根据keys获取词典集合
*
* @param keys
* @return
*/
public static Forest[] gets(Collection keys) {
return gets(keys.toArray(new String[keys.size()]));
}
/**
* 用户自定义词典加载
*
* @param key
* @param path
* @return
*/
private synchronized static Forest init(String key, KV kv) {
Forest forest = kv.getV();
if (forest != null) {
return forest;
}
try {
forest = new Forest();
LOG.debug("begin init dic !");
long start = System.currentTimeMillis();
String temp = null;
String[] strs = null;
Value value = null;
try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "UTF-8")) {
while ((temp = br.readLine()) != null) {
if (StringUtil.isNotBlank(temp)) {
temp = StringUtil.trim(temp);
strs = temp.split("\t");
strs[0] = strs[0].toLowerCase();
// 如何核心辞典存在那么就放弃
if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
continue;
}
if (strs.length != 3) {
value = new Value(strs[0], DEFAULT_NATURE, DEFAULT_FREQ_STR);
} else {
value = new Value(strs[0], strs[1], strs[2]);
}
Library.insertWord(forest, value);
}
}
}
LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
kv.setV(forest);
return forest;
} catch (Exception e) {
LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getK());
DIC.remove(key);
return null;
}
}
/**
* 动态添加词典
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static void put(String key, String path, Forest forest) {
DIC.put(key, KV.with(path, forest));
}
/**
* 动态添加词典
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static void putIfAbsent(String key, String path) {
if (!DIC.containsKey(key)) {
DIC.put(key, KV.with(path, (Forest) null));
}
}
/**
* 动态添加词典
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static void put(String key, String path) {
put(key, path, null);
}
/**
* 动态添加词典
*
* @param
* @param
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static synchronized Forest putIfAbsent(String key, String path, Forest forest) {
KV kv = DIC.get(key);
if (kv != null && kv.getV() != null) {
return kv.getV();
}
put(key, path, forest);
return forest;
}
public static KV remove(String key) {
return DIC.remove(key);
}
public static Set keys() {
return DIC.keySet();
}
public static void reload(String key) {
KV kv = DIC.get(key);
if (kv != null) {
DIC.get(key).setV(null);
}
}
}