All Downloads are FREE. Search and download functionalities are using the official Maven repository.
Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.wltea.analyzer.dic.Dictionary Maven / Gradle / Ivy
/*
* IK 中文分词 版本 8.1.1
* IK Analyzer release 8.1.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益([email protected] )提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
* 8.1.1版本 由 Magese ([email protected] ) 更新
* release 8.1.1 update by Magese([email protected] )
*
*/
package org.wltea.analyzer.dic;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
/**
* 词典管理类,单例模式
*/
@SuppressWarnings("unused")
public class Dictionary {
/*
* 词典单子实例
*/
private static Dictionary singleton;
/*
* 主词典对象
*/
private DictSegment _MainDict;
/*
* 停止词词典
*/
private DictSegment _StopWordDict;
/*
* 量词词典
*/
private DictSegment _QuantifierDict;
/**
* 配置对象
*/
private Configuration cfg;
/**
* 私有构造方法,阻止外部直接实例化本类
*
* @param cfg ik分词器配置实例
*/
private Dictionary(Configuration cfg) {
this.cfg = cfg;
this.loadMainDict();
this.loadStopWordDict();
this.loadQuantifierDict();
}
/**
* 词典初始化
* 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
* 只有当Dictionary类被实际调用时,才会开始载入词典,
* 这将延长首次分词操作的时间
* 该方法提供了一个在应用加载阶段就初始化字典的手段
*/
public static void initial(Configuration cfg) {
if (singleton == null) {
synchronized (Dictionary.class) {
if (singleton == null) {
singleton = new Dictionary(cfg);
}
}
}
}
/**
* 获取词典单子实例
*
* @return Dictionary 单例对象
*/
public static Dictionary getSingleton() {
if (singleton == null) {
throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
}
return singleton;
}
/**
* 重新更新词典
* 由于停用词等不经常变也不建议常增加,故这里只修改动态扩展词库
*
* @param inputStreamReaderList 词典文件IO流集合
*/
public static void reloadDic(List inputStreamReaderList) {
// 如果本类单例尚未实例化,则先进行初始化操作
if (singleton == null) {
Configuration cfg = DefaultConfig.getInstance();
initial(cfg);
}
// 对词典流集合进行循环读取,将读取到的词语加载到主词典中
for (Reader in : inputStreamReaderList) {
try {
LineNumberReader br = new LineNumberReader(in);
String theWord;
while ((theWord = br.readLine()) != null) {
if (theWord.trim().length() == 0 || theWord.trim().charAt(0) == '#') continue;
singleton._MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} catch (IOException ioe) {
System.err.println("Other Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
if (in != null) {
in.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
/**
* 批量加载新词条
*
* @param words 词条列表
*/
public void addWords(Collection words) {
if (words != null) {
for (String word : words) {
if (word != null) {
// 批量加载词条到主内存词典中
singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
* 批量移除(屏蔽)词条
*/
public void disableWords(Collection words) {
if (words != null) {
for (String word : words) {
if (word != null) {
// 批量屏蔽词条
singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
* 检索匹配主词典
*
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray) {
return singleton._MainDict.match(charArray);
}
/**
* 检索匹配主词典
*
* @return Hit 匹配结果描述
*/
public Hit matchInMainDict(char[] charArray, int begin, int length) {
return singleton._MainDict.match(charArray, begin, length);
}
/**
* 检索匹配量词词典
*
* @return Hit 匹配结果描述
*/
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
return singleton._QuantifierDict.match(charArray, begin, length);
}
/**
* 从已匹配的Hit中直接取出DictSegment,继续向下匹配
*
* @return Hit
*/
public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
DictSegment ds = matchedHit.getMatchedDictSegment();
return ds.match(charArray, currentIndex, 1, matchedHit);
}
/**
* 判断是否是停止词
*
* @return boolean
*/
public boolean isStopWord(char[] charArray, int begin, int length) {
return singleton._StopWordDict.match(charArray, begin, length).isMatch();
}
/**
* 加载主词典及扩展词典
*/
private void loadMainDict() {
// 建立一个主词典实例
_MainDict = new DictSegment((char) 0);
// 读取主词典文件
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary());
if (is == null) {
throw new RuntimeException("Main Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Main Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
// 加载扩展词典
this.loadExtDict();
}
/**
* 加载用户配置的扩展词典到主词库表
*/
private void loadExtDict() {
// 加载扩展词典配置
List extDictFiles = cfg.getExtDictionarys();
if (extDictFiles != null) {
InputStream is;
for (String extDictName : extDictFiles) {
// 读取扩展词典文件
System.out.println("Load extended dictionary:" + extDictName);
is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
// 如果找不到扩展的字典,则忽略
if (is == null) {
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// 加载扩展词典数据到主内存词典中
// System.out.println(theWord);
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Extension Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
/**
* 加载用户扩展的停止词词典
*/
private void loadStopWordDict() {
// 建立一个主词典实例
_StopWordDict = new DictSegment((char) 0);
// 加载扩展停止词典
List extStopWordDictFiles = cfg.getExtStopWordDictionarys();
if (extStopWordDictFiles != null) {
InputStream is;
for (String extStopWordDictName : extStopWordDictFiles) {
System.out.println("Load stopwords dictionary:" + extStopWordDictName);
// 读取扩展词典文件
is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
// 如果找不到扩展的字典,则忽略
if (is == null) {
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
// System.out.println(theWord);
// 加载扩展停止词典数据到内存中
_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Extension Stop word Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
/**
* 加载量词词典
*/
private void loadQuantifierDict() {
// 建立一个量词典实例
_QuantifierDict = new DictSegment((char) 0);
// 读取量词词典文件
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
if (is == null) {
throw new RuntimeException("Quantifier Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
} catch (IOException ioe) {
System.err.println("Quantifier Dictionary loading exception.");
ioe.printStackTrace();
} finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}