
// org.wltea.analyzer.dic.Dictionary Maven / Gradle / Ivy
// The newest version!
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益([email protected])提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.dic;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.log4j.Logger;
import com.google.common.collect.Lists;
import com.ld.zxw.config.LuceneDataSource;
import com.ld.zxw.config.LucenePlusConfig;
/**
 * Dictionary manager, implemented as a lazily created singleton.
 *
 * <p>Holds three in-memory dictionaries: the main dictionary (built-in words
 * plus user extension words), the stop-word dictionary and the quantifier
 * dictionary. Call {@link #initial(LucenePlusConfig)} once before using
 * {@link #getSingleton()}.
 */
public class Dictionary {

    private static final Logger log = Logger.getLogger(Dictionary.class);

    /** Character encoding of every dictionary file. */
    private static final String DICT_CHARSET = "UTF-8";

    /** Buffer size, in chars, used when reading dictionary files. */
    private static final int READ_BUFFER_SIZE = 512;

    /** File-name suffix an extension dictionary file must have. */
    private static final String DICT_SUFFIX = ".dic";

    /**
     * The singleton instance. Declared volatile so that the double-checked
     * locking in {@link #initial(LucenePlusConfig)} never exposes a partially
     * constructed instance to another thread.
     */
    private static volatile Dictionary singleton;

    /** Main dictionary. */
    private DictSegment mainDict;

    /** Stop-word dictionary. */
    private DictSegment stopWordDict;

    /** Quantifier dictionary. */
    private DictSegment quantifierDict;

    /**
     * Loads every dictionary. Only reachable through
     * {@link #initial(LucenePlusConfig)}.
     *
     * @param lucenePlusConfig configuration supplying the extension and
     *        stop-word dictionary directories; may be null, in which case no
     *        user dictionaries are loaded in static mode
     */
    private Dictionary(LucenePlusConfig lucenePlusConfig) {
        this.loadMainDict();
        // Load either the dynamic or the static user dictionaries.
        if (LuceneDataSource.build().DynamicDictionary) {
            log.info("Dictionary 加载动态词典");
            loadDynamicDictionary("ext");
            loadDynamicDictionary("stopword");
        } else {
            log.info("Dictionary 加载静态词典");
            String extWordPath = null;
            String stopWordPath = null;
            if (lucenePlusConfig != null) {
                extWordPath = lucenePlusConfig.getExtWordPath();
                stopWordPath = lucenePlusConfig.getStopWordPath();
            }
            this.loadExtDict(extWordPath);
            this.loadStopWordDict(stopWordPath);
        }
        // Load quantifiers.
        this.loadQuantifierDict();
    }

    /**
     * Initializes the dictionaries.
     *
     * <p>Dictionaries are only loaded when this method is first called, so
     * calling it during application start-up avoids paying the loading cost
     * on the first segmentation request. Once the singleton exists, later
     * calls return it unchanged and their argument is ignored.
     *
     * @param lucenePlusConfig configuration used for the first initialization
     * @return the singleton dictionary
     */
    public static Dictionary initial(LucenePlusConfig lucenePlusConfig) {
        if (singleton == null) {
            synchronized (Dictionary.class) {
                if (singleton == null) {
                    singleton = new Dictionary(lucenePlusConfig);
                }
            }
        }
        return singleton;
    }

    /**
     * Returns the singleton dictionary.
     *
     * @return the singleton dictionary
     * @throws IllegalStateException if {@link #initial(LucenePlusConfig)} has
     *         not been called yet
     */
    public static Dictionary getSingleton() {
        if (singleton == null) {
            throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
        }
        return singleton;
    }

    /**
     * Adds a batch of words to the main dictionary.
     *
     * <p>Each word is trimmed and lower-cased first; null and blank entries
     * are skipped, as the built-in dictionary loaders do.
     *
     * @param words words to add; null is treated as empty
     */
    public void addWords(Collection<String> words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            String normalized = normalize(word);
            if (normalized != null) {
                mainDict.fillSegment(normalized.toCharArray());
            }
        }
    }

    /**
     * Disables (masks) a batch of words in the main dictionary.
     *
     * <p>Each word is trimmed and lower-cased first; null and blank entries
     * are skipped.
     *
     * @param words words to disable; null is treated as empty
     */
    public void disableWords(Collection<String> words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            String normalized = normalize(word);
            if (normalized != null) {
                mainDict.disableSegment(normalized.toCharArray());
            }
        }
    }

    /**
     * Matches a character array against the main dictionary.
     *
     * @param charArray characters to match
     * @return the match result
     */
    public Hit matchInMainDict(char[] charArray) {
        return mainDict.match(charArray);
    }

    /**
     * Matches a slice of a character array against the main dictionary.
     *
     * @param charArray characters to match
     * @param begin index of the first character of the slice
     * @param length number of characters in the slice
     * @return the match result
     */
    public Hit matchInMainDict(char[] charArray, int begin, int length) {
        return mainDict.match(charArray, begin, length);
    }

    /**
     * Matches a slice of a character array against the quantifier dictionary.
     *
     * @param charArray characters to match
     * @param begin index of the first character of the slice
     * @param length number of characters in the slice
     * @return the match result
     */
    public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
        return quantifierDict.match(charArray, begin, length);
    }

    /**
     * Continues matching one character further, starting from the dictionary
     * segment stored in an earlier hit.
     *
     * @param charArray characters to match
     * @param currentIndex index of the character to match next
     * @param matchedHit the earlier hit to continue from
     * @return the match result
     */
    public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
        DictSegment segment = matchedHit.getMatchedDictSegment();
        return segment.match(charArray, currentIndex, 1, matchedHit);
    }

    /**
     * Tells whether a slice of a character array is a stop word.
     *
     * @param charArray characters to test
     * @param begin index of the first character of the slice
     * @param length number of characters in the slice
     * @return true if the slice matches an entry of the stop-word dictionary
     */
    public boolean isStopWord(char[] charArray, int begin, int length) {
        return stopWordDict.match(charArray, begin, length).isMatch();
    }

    /**
     * Creates the main dictionary and fills it from the built-in
     * {@code main2012.dic} resource.
     *
     * @throws RuntimeException if the resource cannot be found
     */
    private void loadMainDict() {
        mainDict = new DictSegment((char)0);
        InputStream is = getDic("main2012.dic");
        if (is == null) {
            throw new RuntimeException("Main Dictionary not found!!!");
        }
        loadWords(is, mainDict, "Main Dictionary loading exception.");
    }

    /**
     * Loads the dynamic dictionary of the given type.
     *
     * <p>The earlier implementation, which read entries from the Redis list
     * {@code IkThesaurus_LD}, has been disabled, so no words are loaded. The
     * stop-word dictionary is still created when missing, because otherwise
     * {@link #isStopWord(char[], int, int)} would fail with a
     * NullPointerException in dynamic mode.
     *
     * @param type dictionary type, {@code "ext"} or {@code "stopword"}
     */
    public void loadDynamicDictionary(String type) {
        if ("stopword".equals(type) && stopWordDict == null) {
            stopWordDict = new DictSegment((char)0);
        }
        log.warn("Dynamic dictionary loading is disabled; no words loaded for type: " + type);
    }

    /**
     * Loads the user extension dictionaries into the main dictionary.
     *
     * @param path directory searched recursively for {@code .dic} files, or
     *        null when no extension dictionary is configured
     */
    private void loadExtDict(String path) {
        if (path == null) {
            // No extension dictionary configured.
            log.info("警告:没有设置扩展启动词库");
            return;
        }
        loadDictDirectory(path, mainDict, "Extension Dictionary loading exception.");
    }

    /**
     * Creates the stop-word dictionary and fills it from the user stop-word
     * dictionaries.
     *
     * @param path directory searched recursively for {@code .dic} files, or
     *        null when no stop-word dictionary is configured
     */
    private void loadStopWordDict(String path) {
        stopWordDict = new DictSegment((char)0);
        if (path == null) {
            log.info("警告:没有设置扩展停用词库");
            return;
        }
        loadDictDirectory(path, stopWordDict, "Extension Stop word Dictionary loading exception.");
    }

    /**
     * Creates the quantifier dictionary and fills it from the built-in
     * {@code quantifier.dic} resource.
     *
     * @throws RuntimeException if the resource cannot be found
     */
    private void loadQuantifierDict() {
        quantifierDict = new DictSegment((char)0);
        InputStream is = getDic("quantifier.dic");
        if (is == null) {
            throw new RuntimeException("Quantifier Dictionary not found!!!");
        }
        loadWords(is, quantifierDict, "Quantifier Dictionary loading exception.");
    }

    /**
     * Opens a built-in dictionary from the {@code /dic/} classpath folder.
     *
     * @param fileName resource file name inside {@code /dic/}
     * @return the opened stream, or null if the resource does not exist
     */
    public InputStream getDic(String fileName) {
        InputStream in = null;
        try {
            in = this.getClass().getResourceAsStream("/dic/" + fileName);
        } catch (Exception e) {
            log.error("找不到基础词库", e);
        }
        log.info("加载内置词库:" + fileName);
        return in;
    }

    /**
     * Recursively collects an open stream for every {@code .dic} file found
     * under a directory.
     *
     * <p>The caller owns the streams appended to {@code list} and must close
     * them. Nothing is appended when {@code path} is null, does not exist, is
     * not a directory, or cannot be listed.
     *
     * @param list receives the opened streams
     * @param path directory to scan
     */
    public void getExtend(List<InputStream> list, String path) {
        if (path == null) {
            return;
        }
        File dir = new File(path);
        if (!dir.exists()) {
            return;
        }
        // listFiles() returns null for a regular file or on an I/O error.
        File[] files = dir.listFiles();
        if (files == null) {
            log.warn("Cannot list dictionary directory: " + path);
            return;
        }
        for (File file : files) {
            if (file.isDirectory()) {
                getExtend(list, file.getAbsolutePath());
                continue;
            }
            try {
                if (file.exists() && file.getName().endsWith(DICT_SUFFIX)) {
                    list.add(new FileInputStream(file));
                }
            } catch (FileNotFoundException e) {
                log.error("Cannot open dictionary file: " + file.getAbsolutePath(), e);
            }
            log.info("文件名:" + file.getName());
        }
    }

    /** Ad-hoc suffix check left in by the original author; prints {@code false}. */
    public static void main(String[] args) {
        String jkds = "djksjd.jdsjd.dic";
        System.out.println(jkds.endsWith(".dics"));
    }

    /**
     * Loads every {@code .dic} file under a directory into a dictionary.
     *
     * @param path directory to scan
     * @param target dictionary that receives the words
     * @param errorMessage message logged when reading a file fails
     */
    private void loadDictDirectory(String path, DictSegment target, String errorMessage) {
        List<InputStream> streams = Lists.newArrayList();
        getExtend(streams, path);
        if (streams.isEmpty()) {
            log.info("警告:" + path + "-该目录没有任何 .dic 结尾的文件");
            return;
        }
        try {
            for (InputStream is : streams) {
                if (is != null) {
                    loadWords(is, target, errorMessage);
                }
            }
        } finally {
            // All streams were opened up front; make sure none stays open if
            // loading is aborted. Closing an already closed stream is a no-op.
            for (InputStream is : streams) {
                closeQuietly(is);
            }
        }
    }

    /**
     * Reads a dictionary stream line by line, adds every non-blank line to a
     * dictionary, and closes the stream.
     *
     * @param is stream to read; always closed before returning
     * @param target dictionary that receives the words
     * @param errorMessage message logged when reading fails
     */
    private void loadWords(InputStream is, DictSegment target, String errorMessage) {
        try {
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(is, DICT_CHARSET), READ_BUFFER_SIZE);
            String line;
            while ((line = reader.readLine()) != null) {
                String word = normalize(line);
                if (word != null) {
                    target.fillSegment(word.toCharArray());
                }
            }
        } catch (IOException ioe) {
            log.error(errorMessage, ioe);
        } finally {
            closeQuietly(is);
        }
    }

    /**
     * Trims and lower-cases a raw dictionary entry.
     *
     * @param raw raw entry, may be null
     * @return the normalized word, or null if the entry is null or blank
     */
    private static String normalize(String raw) {
        if (raw == null) {
            return null;
        }
        String trimmed = raw.trim();
        if (trimmed.length() == 0) {
            return null;
        }
        return trimmed.toLowerCase();
    }

    /** Closes a stream, logging instead of propagating any failure. */
    private static void closeQuietly(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException e) {
            log.error("Failed to close dictionary stream", e);
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy