com.chenlb.mmseg4j.Dictionary Maven / Gradle / Ivy
package com.chenlb.mmseg4j;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* 词典类. 词库目录单例模式.
* 保存单字与其频率,还有词库.
* 有检测词典变更的接口,外部程序可以使用 {@link #wordsFileIsChange()} 和 {@link #reload()} 来完成检测与加载的工作.
*
* @author chenlb 2009-2-20 下午11:34:29
*/
public class Dictionary {
private static final Logger log = Logger.getLogger(Dictionary.class.getName());
private File dicPath; //词库目录
private volatile Map dict;
private volatile Map unit; //单个字的单位
/** 记录 word 文件的最后修改时间 */
private Map wordsLastTime = null;
private long lastLoadTime = 0;
/** 不要直接使用, 通过 {@link #getDefalutPath()} 使用*/
private static File defalutPath = null;
private static final ConcurrentHashMap dics = new ConcurrentHashMap();
protected void finalize() throws Throwable {
/*
* 使 class reload 的时也可以释放词库
*/
destroy();
}
/**
* 从默认目录加载词库文件.
* 查找默认目录顺序:
*
* - 从系统属性mmseg.dic.path指定的目录中加载
* - 从classpath/data目录
* - 从user.dir/data目录
*
* @see #getDefalutPath()
*/
public static Dictionary getInstance() {
File path = getDefalutPath();
return getInstance(path);
}
/**
* @param path 词典的目录
*/
public static Dictionary getInstance(String path) {
return getInstance(new File(path));
}
/**
* @param path 词典的目录
*/
public static Dictionary getInstance(File path) {
log.info("try to load dir="+path);
File normalizeDir = normalizeFile(path);
Dictionary dic = dics.get(normalizeDir);
if(dic == null) {
dic = new Dictionary(normalizeDir);
dics.put(normalizeDir, dic);
}
return dic;
}
public static File normalizeFile(File file) {
if(file == defalutPath) {
return defalutPath;
}
try {
return file.getCanonicalFile();
} catch (IOException e) {
throw new RuntimeException("normalize file=["+file+"] fail", e);
}
}
/**
* 销毁, 释放资源. 此后此对像不再可用.
*/
void destroy() {
clear(dicPath);
dicPath = null;
dict = null;
unit = null;
}
/**
* @see Dictionary#clear(File)
*/
public static Dictionary clear(String path) {
return clear(new File(path));
}
/**
* 从单例缓存中去除
* @param path
* @return 没有返回 null
*/
public static Dictionary clear(File path) {
File normalizeDir = normalizeFile(path);
return dics.remove(normalizeDir);
}
/**
* 词典的目录
*/
private Dictionary(File path) {
init(path);
}
private void init(File path) {
dicPath = path;
wordsLastTime = new HashMap();
reload(); //加载词典
}
private static long now() {
return System.currentTimeMillis();
}
/**
* 只要 wordsXXX.dic的文件
* @return
*/
protected File[] listWordsFiles() {
return dicPath.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.startsWith("words") && name.endsWith(".dic");
}
});
}
private Map loadDic(File wordsPath) throws IOException {
InputStream charsIn = null;
File charsFile = new File(wordsPath, "chars.dic");
if(charsFile.exists()) {
charsIn = new FileInputStream(charsFile);
addLastTime(charsFile); //chars.dic 也检测是否变更
} else { //从 jar 里加载
charsIn = this.getClass().getResourceAsStream("/data/chars.dic");
charsFile = new File(this.getClass().getResource("/data/chars.dic").getFile()); //only for log
}
final Map dic = new HashMap();
int lineNum = 0;
long s = now();
long ss = s;
lineNum = load(charsIn, new FileLoading() { //单个字的
public void row(String line, int n) {
if(line.length() < 1) {
return;
}
String[] w = line.split(" ");
CharNode cn = new CharNode();
switch(w.length) {
case 2:
try {
cn.setFreq((int)(Math.log(Integer.parseInt(w[1]))*100));//字频计算出自由度
} catch(NumberFormatException e) {
//eat...
}
case 1:
dic.put(w[0].charAt(0), cn);
}
}
});
log.info("chars loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+charsFile);
//try load words.dic in jar
InputStream wordsDicIn = this.getClass().getResourceAsStream("/data/words.dic");
if(wordsDicIn != null) {
File wordsDic = new File(this.getClass().getResource("/data/words.dic").getFile());
loadWord(wordsDicIn, dic, wordsDic);
}
File[] words = listWordsFiles(); //只要 wordsXXX.dic的文件
if(words != null) { //扩展词库目录
for(File wordsFile : words) {
loadWord(new FileInputStream(wordsFile), dic, wordsFile);
addLastTime(wordsFile); //用于检测是否修改
}
}
log.info("load all dic use time="+(now()-ss)+"ms");
return dic;
}
/**
* @param is 词库文件流
* @param dic 加载的词保存在结构中
* @param wordsFile 日志用
* @throws IOException from {@link #load(InputStream, FileLoading)}
*/
private void loadWord(InputStream is, Map dic, File wordsFile) throws IOException {
long s = now();
int lineNum = load(is, new WordsFileLoading(dic)); //正常的词库
log.info("words loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+wordsFile);
}
private Map loadUnit(File path) throws IOException {
InputStream fin = null;
File unitFile = new File(path, "units.dic");
if(unitFile.exists()) {
fin = new FileInputStream(unitFile);
addLastTime(unitFile);
} else { //在jar包里的/data/unit.dic
fin = Dictionary.class.getResourceAsStream("/data/units.dic");
unitFile = new File(Dictionary.class.getResource("/data/units.dic").getFile());
}
final Map unit = new HashMap();
long s = now();
int lineNum = load(fin, new FileLoading() {
public void row(String line, int n) {
if(line.length() != 1) {
return;
}
unit.put(line.charAt(0), Dictionary.class);
}
});
log.info("unit loaded time="+(now()-s)+"ms, line="+lineNum+", on file="+unitFile);
return unit;
}
/**
* 加载 wordsXXX.dic 文件类。
*
* @author chenlb 2009-10-15 下午02:12:55
*/
private static class WordsFileLoading implements FileLoading {
final Map dic;
/**
* @param dic 加载的词,保存在此结构中。
*/
public WordsFileLoading(Map dic) {
this.dic = dic;
}
public void row(String line, int n) {
if(line.length() < 2) {
return;
}
CharNode cn = dic.get(line.charAt(0));
if(cn == null) {
cn = new CharNode();
dic.put(line.charAt(0), cn);
}
cn.addWordTail(tail(line));
}
}
/**
* 加载词文件的模板
* @return 文件总行数
*/
public static int load(InputStream fin, FileLoading loading) throws IOException {
BufferedReader br = new BufferedReader(
new InputStreamReader(new BufferedInputStream(fin), "UTF-8"));
String line = null;
int n = 0;
while((line = br.readLine()) != null) {
if(line == null || line.startsWith("#")) {
continue;
}
n++;
loading.row(line, n);
}
return n;
}
/**
* 取得 str 除去第一个char的部分
* @author chenlb 2009-3-3 下午10:05:26
*/
private static char[] tail(String str) {
char[] cs = new char[str.length()-1];
str.getChars(1, str.length(), cs, 0);
return cs;
}
public static interface FileLoading {
/**
* @param line 读出的一行
* @param n 当前第几行
* @author chenlb 2009-3-3 下午09:55:54
*/
void row(String line, int n);
}
/**
* 把 wordsFile 文件的最后更新时间加记录下来.
* @param wordsFile 非 null
*/
private synchronized void addLastTime(File wordsFile) {
if(wordsFile != null) {
wordsLastTime.put(wordsFile, wordsFile.lastModified());
}
}
/**
* 词典文件是否有修改过
* @return
*/
public synchronized boolean wordsFileIsChange() {
//检查是否有修改文件,包括删除的
for(Entry flt : wordsLastTime.entrySet()) {
File words = flt.getKey();
if(!words.canRead()) { //可能是删除了
return true;
}
if(words.lastModified() > flt.getValue()) { //更新了文件
return true;
}
}
//检查是否有新文件
File[] words = listWordsFiles();
if(words != null) {
for(File wordsFile : words) {
if(!wordsLastTime.containsKey(wordsFile)) { //有新词典文件
return true;
}
}
}
return false;
}
/**
* 全新加载词库,没有成功加载会回滚。
* 注意:重新加载时,务必有两倍的词库树结构的内存,默认词库是 50M/个 左右。否则抛出 OOM。
* @return 是否成功加载
*/
public synchronized boolean reload() {
Map oldWordsLastTime = new HashMap(wordsLastTime);
Map oldDict = dict;
Map oldUnit = unit;
try {
wordsLastTime.clear();
dict = loadDic(dicPath);
unit = loadUnit(dicPath);
lastLoadTime = System.currentTimeMillis();
} catch (IOException e) {
//rollback
wordsLastTime.putAll(oldWordsLastTime);
dict = oldDict;
unit = oldUnit;
if(log.isLoggable(Level.WARNING)) {
log.log(Level.WARNING, "reload dic error! dic="+dicPath+", and rollbacked.", e);
}
return false;
}
return true;
}
/**
* word 能否在词库里找到
* @author chenlb 2009-3-3 下午11:10:45
*/
public boolean match(String word) {
if(word == null || word.length() < 2) {
return false;
}
CharNode cn = dict.get(word.charAt(0));
return search(cn, word.toCharArray(), 0, word.length()-1) >= 0;
}
public CharNode head(char ch) {
return dict.get(ch);
}
/**
* sen[offset] 后 tailLen 长的词是否存在.
* @see CharNode#indexOf(char[], int, int)
* @author chenlb 2009-4-8 下午11:13:49
*/
public int search(CharNode node, char[] sen, int offset, int tailLen) {
if(node != null) {
return node.indexOf(sen, offset, tailLen);
}
return -1;
}
public int maxMatch(char[] sen, int offset) {
CharNode node = dict.get(sen[offset]);
return maxMatch(node, sen, offset);
}
public int maxMatch(CharNode node, char[] sen, int offset) {
if(node != null) {
return node.maxMatch(sen, offset+1);
}
return 0;
}
public ArrayList maxMatch(CharNode node, ArrayList tailLens, char[] sen, int offset) {
tailLens.clear();
tailLens.add(0);
if(node != null) {
return node.maxMatch(tailLens, sen, offset+1);
}
return tailLens;
}
public boolean isUnit(Character ch) {
return unit.containsKey(ch);
}
/**
* 当 words.dic 是从 jar 里加载时, 可能 defalut 不存在
*/
public static File getDefalutPath() {
if(defalutPath == null) {
String defPath = System.getProperty("mmseg.dic.path");
log.info("look up in mmseg.dic.path="+defPath);
if(defPath == null) {
URL url = Dictionary.class.getClassLoader().getResource("data");
if(url != null) {
defPath = url.getFile();
log.info("look up in classpath="+defPath);
} else {
defPath = System.getProperty("user.dir")+"/data";
log.info("look up in user.dir="+defPath);
}
}
defalutPath = new File(defPath);
if(!defalutPath.exists()) {
log.warning("defalut dic path="+defalutPath+" not exist");
}
}
return defalutPath;
}
/**
* 仅仅用来观察词库.
*/
public Map getDict() {
return dict;
}
/**
* 注意:当 words.dic 是从 jar 里加载时,此时 File 可能是不存在的。
*/
public File getDicPath() {
return dicPath;
}
/** 最后加载词库的时间 */
public long getLastLoadTime() {
return lastLoadTime;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy