All downloads are free. Search and download functionality uses the official Maven repository.

org.wltea.analyzer.dic.Dictionary Maven / Gradle / Ivy

The newest version!
package org.wltea.analyzer.dic;

import lombok.extern.slf4j.Slf4j;
import org.wltea.analyzer.cfg.Configuration;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
import java.util.Locale;

/**
 * 词典管理类,单子模式
 */
@Slf4j
public class Dictionary {

	/*
	 * 词典单子实例
	 */
	private static Dictionary singleton;

	/*
	 * 主词典对象
	 */
	private DictSegment _MainDict;

	/*
	 * 停止词词典 
	 */
	private DictSegment _StopWordDict;
	/*
	 * 量词词典
	 */
	private DictSegment _QuantifierDict;

	/**
	 * 配置对象
	 */
	private final Configuration cfg;

	private Dictionary(Configuration cfg) {
		this.cfg = cfg;
		this.loadMainDict();
		this.loadStopWordDict();
		this.loadQuantifierDict();
	}

	/**
	 * 词典初始化
	 * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
	 * 只有当Dictionary类被实际调用时,才会开始载入词典,
	 * 这将延长首次分词操作的时间
	 * 该方法提供了一个在应用加载阶段就初始化字典的手段
	 *
	 * @param cfg cfg
	 * @return Dictionary
	 */
	public static Dictionary initial(Configuration cfg) {
		if (singleton == null) {
			synchronized (Dictionary.class) {
				if (singleton == null) {
					singleton = new Dictionary(cfg);
					return singleton;
				}
			}
		}
		return singleton;
	}

	/**
	 * 获取词典单子实例
	 *
	 * @return Dictionary 单例对象
	 */
	public static Dictionary getSingleton() {
		if (singleton == null) {
			throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
		}
		return singleton;
	}

	/**
	 * 批量加载新词条
	 *
	 * @param words 词条列表
	 */
	public void addWords(Collection words) {
		if (words != null) {
			//批量加载词条到主内存词典中
			words.stream().filter(word -> word != null).forEach(word -> {
				//批量加载词条到主内存词典中
				singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
			});
		}
	}

	/**
	 * 批量移除(屏蔽)词条
	 *
	 * @param words words
	 */
	public void disableWords(Collection words) {
		if (words != null) {
			//批量屏蔽词条
			words.stream().filter(word -> word != null).forEach(word -> {
				//批量屏蔽词条
				singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
			});
		}
	}

	/**
	 * 检索匹配主词典
	 *
	 * @param charArray charArray
	 * @return Hit 匹配结果描述
	 */
	public Hit matchInMainDict(char[] charArray) {
		return singleton._MainDict.match(charArray);
	}

	/**
	 * 检索匹配主词典
	 *
	 * @param charArray charArray
	 * @param begin     begin
	 * @param length    length
	 * @return Hit 匹配结果描述
	 */
	public Hit matchInMainDict(char[] charArray, int begin, int length) {
		return singleton._MainDict.match(charArray, begin, length);
	}

	/**
	 * 检索匹配量词词典
	 *
	 * @param charArray charArray
	 * @param begin     begin
	 * @param length    length
	 * @return Hit 匹配结果描述
	 */
	public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
		return singleton._QuantifierDict.match(charArray, begin, length);
	}

	/**
	 * 从已匹配的Hit中直接取出DictSegment,继续向下匹配
	 *
	 * @param charArray    charArray
	 * @param currentIndex currentIndex
	 * @param matchedHit   matchedHit
	 * @return Hit
	 */
	public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
		DictSegment ds = matchedHit.getMatchedDictSegment();
		return ds.match(charArray, currentIndex, 1, matchedHit);
	}

	/**
	 * 判断是否是停止词
	 *
	 * @param charArray charArray
	 * @param begin     begin
	 * @param length    length
	 * @return boolean
	 */
	public boolean isStopWord(char[] charArray, int begin, int length) {
		return singleton._StopWordDict.match(charArray, begin, length).isMatch();
	}

	/**
	 * 加载主词典及扩展词典
	 */
	private void loadMainDict() {
		//建立一个主词典实例
		_MainDict = new DictSegment((char) 0);
		//读取主词典文件
		InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary());
		if (is == null) {
			throw new RuntimeException("Main Dictionary not found!!!");
		}

		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
			String theWord;
			do {
				theWord = br.readLine();
				if (theWord != null && !"".equals(theWord.trim())) {
					_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
				}
			} while (theWord != null);

		} catch (IOException ioe) {
			log.error("Main Dictionary loading exception.", ioe);
		} finally {
			try {
				if (is != null) {
					is.close();
				}
			} catch (IOException e) {
				log.error(e.getMessage(), e);
			}
		}
		//加载扩展词典
		this.loadExtDict();
	}

	/**
	 * 加载用户配置的扩展词典到主词库表
	 */
	private void loadExtDict() {
		//加载扩展词典配置
		List extDictFiles = cfg.getExtDictionarys();
		if (extDictFiles != null) {
			InputStream is;
			for (String extDictName : extDictFiles) {
				//读取扩展词典文件
				if (log.isDebugEnabled()) {
					log.debug("加载扩展词典:" + extDictName);
				}
				is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
				//如果找不到扩展的字典,则忽略
				if (is == null) {
					continue;
				}
				try {
					BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
					String theWord;
					do {
						theWord = br.readLine();
						if (theWord != null && !"".equals(theWord.trim())) {
							//加载扩展词典数据到主内存词典中
							_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
						}
					} while (theWord != null);

				} catch (IOException ioe) {
					log.error("Extension Dictionary loading exception.", ioe);
				} finally {
					try {
						if (is != null) {
							is.close();
						}
					} catch (IOException e) {
						log.error(e.getMessage(), e);
					}
				}
			}
		}
	}

	/**
	 * 加载用户扩展的停止词词典
	 */
	private void loadStopWordDict() {
		//建立一个主词典实例
		_StopWordDict = new DictSegment((char) 0);
		//加载扩展停止词典
		List extStopWordDictFiles = cfg.getExtStopWordDictionarys();
		if (extStopWordDictFiles != null) {
			InputStream is;
			for (String extStopWordDictName : extStopWordDictFiles) {
				if (log.isDebugEnabled()) {
					log.debug("加载扩展停止词典:" + extStopWordDictName);
				}
				//读取扩展词典文件
				is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
				//如果找不到扩展的字典,则忽略
				if (is == null) {
					continue;
				}
				try {
					BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
					String theWord;
					do {
						theWord = br.readLine();
						if (theWord != null && !"".equals(theWord.trim())) {
							//加载扩展停止词典数据到内存中
							_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
						}
					} while (theWord != null);

				} catch (IOException ioe) {
					log.error("Extension Stop word Dictionary loading exception.", ioe);
				} finally {
					try {
						if (is != null) {
							is.close();
						}
					} catch (IOException e) {
						log.error(e.getMessage(), e);
					}
				}
			}
		}
	}

	/**
	 * 加载量词词典
	 */
	private void loadQuantifierDict() {
		//建立一个量词典实例
		_QuantifierDict = new DictSegment((char) 0);
		//读取量词词典文件
		InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
		if (is == null) {
			throw new RuntimeException("Quantifier Dictionary not found!!!");
		}
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
			String theWord;
			do {
				theWord = br.readLine();
				if (theWord != null && !"".equals(theWord.trim())) {
					_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
				}
			} while (theWord != null);

		} catch (IOException ioe) {
			log.error("Quantifier Dictionary loading exception.", ioe);
		} finally {
			try {
				if (is != null) {
					is.close();
				}
			} catch (IOException e) {
				log.error(e.getMessage(), e);
			}
		}
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy