All downloads are free. The search and download features use the official Maven repository.

org.wltea.analyzer.dic.Dictionary Maven / Gradle / Ivy

The newest version!
/**
 * IK 中文分词  版本 5.0
 * IK Analyzer release 5.0
 * 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * 源代码由林良益([email protected])提供
 * 版权声明 2012,乌龙茶工作室
 * provided by Linliangyi and copyright 2012 by Oolong studio
 * 
 * 
 */
package org.wltea.analyzer.dic;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;

import org.apache.log4j.Logger;

import com.google.common.collect.Lists;
import com.ld.zxw.config.LuceneDataSource;
import com.ld.zxw.config.LucenePlusConfig;

/**
 * Dictionary manager for the IK analyzer, exposed as a process-wide singleton.
 *
 * <p>An instance owns three in-memory dictionaries: the main dictionary
 * (built-in words plus user extension words), the stop-word dictionary and the
 * quantifier dictionary. Create it once with {@link #initial(LucenePlusConfig)}
 * and fetch it afterwards with {@link #getSingleton()}.
 */
public class Dictionary {

	private static final Logger log = Logger.getLogger(Dictionary.class);

	/** Buffer size, in chars, of the reader used while loading dictionary files. */
	private static final int READ_BUFFER_SIZE = 512;

	/** File-name suffix that marks a user dictionary file. */
	private static final String DIC_SUFFIX = ".dic";

	/*
	 * The singleton instance. Declared volatile so that the double-checked
	 * locking in initial() publishes a fully constructed object to other threads.
	 */
	private static volatile Dictionary singleton;

	/*
	 * Main dictionary (built-in words plus extension words).
	 */
	private DictSegment _MainDict;

	/*
	 * Stop-word dictionary.
	 */
	private DictSegment _StopWordDict;

	/*
	 * Quantifier dictionary.
	 */
	private DictSegment _QuantifierDict;

	/**
	 * Builds all dictionaries.
	 *
	 * @param lucenePlusConfig source of the extension / stop-word dictionary
	 *        paths in static mode; may be {@code null}, in which case no user
	 *        dictionaries are loaded
	 */
	private Dictionary(LucenePlusConfig lucenePlusConfig){
		// Create the stop-word dictionary up front so isStopWord() works in
		// every mode, including dynamic mode where nothing is loaded into it.
		_StopWordDict = new DictSegment((char)0);

		this.loadMainDict();
		// Load either the dynamic or the static user dictionaries.
		if(LuceneDataSource.build().DynamicDictionary){
			log.info("Dictionary 加载动态词典");
			loadDynamicDictionary("ext");
			loadDynamicDictionary("stopword");
		}else{
			log.info("Dictionary 加载静态词典");
			String extWordPath = null;
			String stopWordPath = null;
			if(lucenePlusConfig != null) {
				extWordPath = lucenePlusConfig.getExtWordPath();
				stopWordPath = lucenePlusConfig.getStopWordPath();
			}
			this.loadExtDict(extWordPath);
			this.loadStopWordDict(stopWordPath);
		}
		// Load the quantifiers.
		this.loadQuantifierDict();
	}

	/**
	 * Initializes the dictionary singleton.
	 *
	 * <p>Dictionaries are only loaded when this class is first used, which
	 * lengthens the first segmentation call. Calling this method during
	 * application start-up moves that cost out of the first request.
	 * Subsequent calls return the existing instance and ignore their argument.
	 *
	 * @param lucenePlusConfig configuration providing the user dictionary paths; may be {@code null}
	 * @return the singleton instance
	 */
	public static Dictionary initial(LucenePlusConfig lucenePlusConfig){
		Dictionary instance = singleton;
		if(instance == null){
			synchronized(Dictionary.class){
				instance = singleton;
				if(instance == null){
					instance = new Dictionary(lucenePlusConfig);
					singleton = instance;
				}
			}
		}
		return instance;
	}

	/**
	 * Returns the dictionary singleton.
	 *
	 * @return the singleton instance
	 * @throws IllegalStateException if {@link #initial(LucenePlusConfig)} has not been called yet
	 */
	public static Dictionary getSingleton(){
		Dictionary instance = singleton;
		if(instance == null){
			throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
		}
		return instance;
	}

	/**
	 * Adds a batch of words to the in-memory main dictionary.
	 * {@code null} and blank entries are skipped.
	 *
	 * @param words words to add; {@code null} is treated as empty
	 */
	public void addWords(Collection<String> words){
		if(words == null){
			return;
		}
		for(String word : words){
			String normalized = normalize(word);
			if(!normalized.isEmpty()){
				_MainDict.fillSegment(normalized.toCharArray());
			}
		}
	}

	/**
	 * Disables (masks) a batch of words in the main dictionary.
	 * {@code null} and blank entries are skipped.
	 *
	 * @param words words to disable; {@code null} is treated as empty
	 */
	public void disableWords(Collection<String> words){
		if(words == null){
			return;
		}
		for(String word : words){
			String normalized = normalize(word);
			if(!normalized.isEmpty()){
				_MainDict.disableSegment(normalized.toCharArray());
			}
		}
	}

	/**
	 * Matches a character array against the main dictionary.
	 *
	 * @param charArray characters to match
	 * @return the match result
	 */
	public Hit matchInMainDict(char[] charArray){
		return _MainDict.match(charArray);
	}

	/**
	 * Matches a slice of a character array against the main dictionary.
	 *
	 * @param charArray characters to match
	 * @param begin start offset of the slice
	 * @param length length of the slice
	 * @return the match result
	 */
	public Hit matchInMainDict(char[] charArray , int begin, int length){
		return _MainDict.match(charArray, begin, length);
	}

	/**
	 * Matches a slice of a character array against the quantifier dictionary.
	 *
	 * @param charArray characters to match
	 * @param begin start offset of the slice
	 * @param length length of the slice
	 * @return the match result
	 */
	public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
		return _QuantifierDict.match(charArray, begin, length);
	}

	/**
	 * Continues matching one character further, starting from the dictionary
	 * segment recorded in an earlier hit.
	 *
	 * @param charArray characters to match
	 * @param currentIndex index of the next character to match
	 * @param matchedHit earlier hit whose matched segment is the starting point
	 * @return the match result
	 */
	public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
		DictSegment segment = matchedHit.getMatchedDictSegment();
		return segment.match(charArray, currentIndex, 1 , matchedHit);
	}

	/**
	 * Tells whether a slice of a character array is a stop word.
	 *
	 * @param charArray characters to test
	 * @param begin start offset of the slice
	 * @param length length of the slice
	 * @return {@code true} if the slice matches the stop-word dictionary
	 */
	public boolean isStopWord(char[] charArray , int begin, int length){
		return _StopWordDict.match(charArray, begin, length).isMatch();
	}

	/**
	 * Creates the main dictionary and fills it from the built-in
	 * {@code main2012.dic} resource.
	 *
	 * @throws RuntimeException if the resource cannot be found
	 */
	private void loadMainDict(){
		_MainDict = new DictSegment((char)0);
		InputStream is = getDic("main2012.dic");
		if(is == null){
			throw new RuntimeException("Main Dictionary not found!!!");
		}
		fillFromStream(is, _MainDict, "Main Dictionary loading exception.");
	}

	/**
	 * Hook for loading dictionaries from a dynamic source. Currently a no-op.
	 *
	 * <p>The previous implementation, now disabled, read the entries stored
	 * under the key {@code "IkThesaurus_LD"} through
	 * {@code LuceneDataSource.build().jedis}. Entries of type {@code "ext"}
	 * went into the main dictionary and entries of type {@code "stopword"}
	 * into a fresh stop-word dictionary.
	 *
	 * @param type dictionary kind, {@code "ext"} or {@code "stopword"}; currently unused
	 */
	public void loadDynamicDictionary(String type){
		// Intentionally empty: the dynamic source is not wired in.
	}

	/**
	 * Loads the user extension dictionaries into the main dictionary.
	 *
	 * @param path file or directory to scan for {@code .dic} files; {@code null} means "not configured"
	 */
	private void loadExtDict(String path){
		if(path == null) {
			log.info("警告:没有设置扩展启动词库");
			return;
		}
		loadDicFiles(path, _MainDict, "Extension Dictionary loading exception.");
	}

	/**
	 * Loads the user stop-word dictionaries into the stop-word dictionary.
	 *
	 * @param path file or directory to scan for {@code .dic} files; {@code null} means "not configured"
	 */
	private void loadStopWordDict(String path){
		if(path == null) {
			log.info("警告:没有设置扩展停用词库");
			return;
		}
		loadDicFiles(path, _StopWordDict, "Extension Stop word Dictionary loading exception.");
	}

	/**
	 * Creates the quantifier dictionary and fills it from the built-in
	 * {@code quantifier.dic} resource.
	 *
	 * @throws RuntimeException if the resource cannot be found
	 */
	private void loadQuantifierDict(){
		_QuantifierDict = new DictSegment((char)0);
		InputStream is = getDic("quantifier.dic");
		if(is == null){
			throw new RuntimeException("Quantifier Dictionary not found!!!");
		}
		fillFromStream(is, _QuantifierDict, "Quantifier Dictionary loading exception.");
	}

	/**
	 * Opens a built-in dictionary from the {@code /dic/} classpath folder.
	 *
	 * @param fileName resource name inside {@code /dic/}
	 * @return the open stream, or {@code null} if the resource does not exist;
	 *         the caller is responsible for closing it
	 */
	public InputStream getDic(String fileName) {
		InputStream in = this.getClass().getResourceAsStream("/dic/"+fileName);
		if(in == null){
			log.warn("找不到基础词库:"+fileName);
		}else{
			log.info("加载内置词库:"+fileName);
		}
		return in;
	}

	/**
	 * Recursively finds every {@code .dic} file under {@code path} and appends
	 * an open stream for each one to {@code list}.
	 *
	 * <p>{@code path} may also name a single {@code .dic} file. Files that
	 * cannot be opened are logged and skipped. The caller owns the returned
	 * streams and must close them.
	 *
	 * @param list receives one open stream per dictionary file found
	 * @param path file or directory to scan; must not be {@code null}
	 */
	public void getExtend(List<InputStream> list, String path){
		List<File> dicFiles = new ArrayList<File>();
		collectDicFiles(new File(path), dicFiles);
		for(File dicFile : dicFiles){
			try {
				list.add(new FileInputStream(dicFile));
			} catch (FileNotFoundException e) {
				log.warn("Dictionary file cannot be opened: "+dicFile, e);
			}
		}
	}

	/**
	 * Leftover manual check that prints whether a sample file name ends with
	 * {@code ".dics"}.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String jkds = "djksjd.jdsjd.dic";
		System.out.println(jkds.endsWith(".dics"));
	}

	/**
	 * Trims a word and lower-cases it independently of the default locale.
	 *
	 * @param word raw word; may be {@code null}
	 * @return the normalized word, or an empty string for {@code null}
	 */
	private static String normalize(String word){
		if(word == null){
			return "";
		}
		return word.trim().toLowerCase(Locale.ROOT);
	}

	/**
	 * Reads a UTF-8 stream line by line and adds every non-blank line to
	 * {@code target}. The stream is always closed, even on failure.
	 *
	 * @param is stream to read; closed by this method
	 * @param target dictionary receiving the words
	 * @param failureMessage message logged if reading fails
	 */
	private void fillFromStream(InputStream is, DictSegment target, String failureMessage){
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(is, StandardCharsets.UTF_8), READ_BUFFER_SIZE)) {
			String line;
			while((line = reader.readLine()) != null){
				String word = normalize(line);
				// Blank lines carry no word; skip them in every dictionary alike.
				if(!word.isEmpty()){
					target.fillSegment(word.toCharArray());
				}
			}
		} catch (IOException ioe) {
			log.error(failureMessage, ioe);
		}
	}

	/**
	 * Loads every {@code .dic} file found under {@code path} into {@code target},
	 * opening and closing one file at a time.
	 *
	 * @param path file or directory to scan
	 * @param target dictionary receiving the words
	 * @param failureMessage message logged if reading a file fails
	 */
	private void loadDicFiles(String path, DictSegment target, String failureMessage){
		List<File> dicFiles = new ArrayList<File>();
		collectDicFiles(new File(path), dicFiles);
		if(dicFiles.isEmpty()){
			log.info("警告:"+path+"-该目录没有任何 .dic 结尾的文件");
			return;
		}
		for(File dicFile : dicFiles){
			try {
				fillFromStream(new FileInputStream(dicFile), target, failureMessage);
			} catch (FileNotFoundException e) {
				log.warn("Dictionary file cannot be opened: "+dicFile, e);
			}
		}
	}

	/**
	 * Recursively collects the {@code .dic} files located at or below {@code location}.
	 *
	 * @param location file or directory to inspect
	 * @param found receives the dictionary files
	 */
	private void collectDicFiles(File location, List<File> found){
		if(location.isDirectory()){
			File[] children = location.listFiles();
			// listFiles() returns null when the directory cannot be read.
			if(children == null){
				log.warn("Dictionary directory cannot be listed: "+location);
				return;
			}
			for(File child : children){
				collectDicFiles(child, found);
			}
		}else if(location.isFile() && location.getName().endsWith(DIC_SUFFIX)){
			found.add(location);
		}
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy