
// org.tinygroup.chinese.fileProcessor.ChineseContainer (Maven / Gradle / Ivy)
// The newest version!
package org.tinygroup.chinese.fileProcessor;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.tiny.seg.exception.DictLoadRuntimeException;
import org.tinygroup.chinese.Character;
import org.tinygroup.chinese.Word;
import org.tinygroup.chinese.WordParserManager;
import org.tinygroup.chinese.parsermanager.WordParserManagerImpl;
import org.tinygroup.commons.tools.StringUtil;
import org.tinygroup.vfs.FileObject;
import org.tinygroup.xmlparser.node.XmlNode;
import org.tinygroup.xmlparser.parser.XmlStringParser;
public class ChineseContainer {
private static final int BOM = 65279;
private static Map> wordMap = new HashMap>();
// private static PinYin pinYin = new PinYinImpl();
private static List characterList = new ArrayList();
private static Map> pinyinMap = new HashMap>();
private static Map wordManagerMap = new HashMap();
private static final String PRE_SCENE = "TINY_WORD_SCENE_";
private static final String TYPE_STRING = "string";
private static final String TYPE_REGEX = "regex";
private static final String DEFAULT_ALL = "DEFAULT_ALL";
// public static void loadDict(FileObject fileObject, String encode) {
// pinYin.loadPinFile(fileObject.getInputStream(), encode);
// }
public static void loadDict(List fileObjects, String encode) {
for (FileObject f : fileObjects) {
loadPinYinFile(f.getInputStream(), encode);
}
initCharacter();
}
private static void loadPinYinFile(InputStream inputStream, String encode) {
try {
// 载入数据
BufferedReader in = new BufferedReader(new InputStreamReader(
inputStream, encode));
String str;
while ((str = in.readLine()) != null) {
str = str.trim();
if (str.length() > 0 && !str.startsWith("#")) {
String[] s = str.split(" ");
if (s.length == 2) {
for (int i = 0; i < s[1].length(); i++) {
addPinYin(s[1].charAt(i), s[0]);
}
}
}
}
in.close();
} catch (Exception e) {
throw new DictLoadRuntimeException(e);
}
}
private static void addPinYin(char charAt, String string) {
if (pinyinMap.containsKey(charAt)) {
pinyinMap.get(charAt).add(string);
return;
}
List list = new ArrayList();
list.add(string);
pinyinMap.put(charAt, list);
}
private static void initCharacter(){
for (java.lang.Character ch : pinyinMap.keySet()) {
Character c = new Character(ch);
List spell = pinyinMap.get(ch);
String[] sepllArray = new String[spell.size()];
for(int i = 0 ; i < spell.size() ; i ++ ){
sepllArray[i] = spell.get(i);
}
c.setSpell(sepllArray);
characterList.add(c);
}
}
public static void loadWord(FileObject fileObject, String encode) {
try {
InputStream inputStream = fileObject.getInputStream();
String absolutePath = fileObject.getAbsolutePath();
List wordList = wordMap.get(absolutePath);
if (wordList == null) {
wordList = new ArrayList();
wordMap.put(absolutePath, wordList);
}
XmlStringParser parser = new XmlStringParser();
BufferedReader in = new BufferedReader(new InputStreamReader(
inputStream, encode));
String str;
while ((str = in.readLine()) != null) {
boolean notBomChar = !(str.length() == 1 && (int) str.charAt(0) == BOM);
boolean validLine = str.length() > 0 && !str.startsWith("#");
if (notBomChar && validLine) {
addWord(wordList, parser.parse(str).getRoot());
}
}
in.close();
} catch (Exception e) {
throw new DictLoadRuntimeException(e);
}
}
private static void addWord(List list, XmlNode root) {
if (root == null) {
return;
}
Word word = new Word();
setProperties(root, word);
list.add(word);
}
private static void setProperties(XmlNode node, Word word) {
if (node != null) {
String wordString = node.getAttribute("word");
word.setWord(wordString);
String str = node.getAttribute("partOfSpeech");
if (!StringUtil.isBlank(str)) {
word.setPartOfSpeech(str.trim().split(","));
}
str = node.getAttribute("pinyin");
if (!StringUtil.isBlank(str)) {
int length = str.trim().length();
int[] array = new int[length];
for (int i = 0; i < length; i++) {
array[i] = Integer.parseInt(str.substring(i, i + 1));
}
word.setSpell(array);
}
str = node.getAttribute("antonym");
if (!StringUtil.isBlank(str)) {
word.setAntonym(str.trim().split(","));
}
str = node.getAttribute("thesaurus");
if (!StringUtil.isBlank(str)) {
word.setSynonyms(str.trim().split(","));
}
str = node.getAttribute("weighing");
if (!StringUtil.isBlank(str)) {
word.setWeight(Integer.parseInt(str));
}
}
}
public static WordParserManager getWordParserManager(String names) {
if (wordManagerMap.containsKey(names)) {
return wordManagerMap.get(names);
}
if (StringUtil.isBlank(names)) {
return getFullManager();
}
return getManagerByNames(names);
}
private static WordParserManager getManagerByNames(String names) {
WordParserManager manager = createWordParserManager();
String[] nameArray = names.split(",");
for (String name : nameArray) {
String fullName = name + ChineseWordFileProcessor.EXT_FILE_NAME;
if (!wordMap.containsKey(fullName)) {
throw new RuntimeException("词库文件" + fullName + "不存在");
}
manager.addWord(wordMap.get(fullName));
}
wordManagerMap.put(names, manager);
return manager;
}
private static WordParserManager getFullManager() {
if (!wordManagerMap.containsKey("")) {
WordParserManager manager = createWordParserManager();
for (String name : wordMap.keySet()) {
manager.addWord(wordMap.get(name));
}
wordManagerMap.put("", manager);
}
return wordManagerMap.get("");
}
public static void RegScene(String scene, String wordNames, String type) {
WordParserManager manager = null;
if (StringUtil.isBlank(wordNames)) { // 如果字符串为空,则添加所有
manager = getFullManager();
} else if (TYPE_STRING.equals(type)) {// 字符串类型
manager = getWordParserManager(wordNames);
} else if (TYPE_REGEX.equals(type)) {// 正则表达式类型
manager = getManagerByRegex(wordNames);
} else {
throw new RuntimeException("类型" + type + "错误,不存在该类型");
}
wordManagerMap.put(getSceneKey(scene), manager);
}
private static WordParserManager getManagerByRegex(String wordNames) {
WordParserManager manager = createWordParserManager();
Pattern p = Pattern.compile(wordNames);
for (String name : wordMap.keySet()) {
Matcher matcher = p.matcher(name);
if (matcher.find()) {
manager.addWord(wordMap.get(name));
}
}
return manager;
}
private static String getSceneKey(String scene) {
if (StringUtil.isBlank(scene)) {
return PRE_SCENE + DEFAULT_ALL;
}
return PRE_SCENE + scene;
}
private static WordParserManager createWordParserManager() {
WordParserManager manager = new WordParserManagerImpl();
addPinYin(manager);
return manager;
}
private static void addPinYin(WordParserManager manager) {
for(Character c : characterList){
manager.addCharacter(c);
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy