All downloads are free. The search and download features use the official Maven repository.

org.tiny.seg.impl.ChineseParserImpl Maven / Gradle / Ivy

The newest version!
package org.tiny.seg.impl;

import org.apache.commons.lang.CharUtils;
import org.tiny.seg.ChineseParser;
import org.tiny.seg.FoundEvent;
import org.tiny.seg.exception.DictLoadRuntimeException;
import org.tinygroup.binarytree.AVLTree;
import org.tinygroup.binarytree.impl.AVLTreeImpl;
import org.tinygroup.xmlparser.node.XmlNode;
import org.tinygroup.xmlparser.parser.XmlStringParser;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//TODO 增加多种情景词库加载方式,通过情景词库,可以提升匹配的正确率
//TODO 重构实现
/**
 * 中文分词
 * 
 * @author luoguo
 * 
 */
public class ChineseParserImpl implements ChineseParser {
	private static final int BOM = 65279;
	private static final int MODE_MAX = 1;
	private static final int MODE_MIN = -1;
	private static Pattern pattern = Pattern
			.compile("[a-z|A-Z]+|[0-9]+|[\u4e00-\u9fa5]+");
	
	private AVLTree letterIndex = new AVLTreeImpl();// 词库索引
	private FoundEvent findEvent = null;

	public ChineseParserImpl() {

	}
	
	public AVLTree getLetterIndex() {
		return letterIndex;
	}

	public void setLetterIndex(AVLTree letterIndex) {
		this.letterIndex = letterIndex;
	}



	public  void loadDict(InputStream inputStream,
			String encode) {
		try {
			XmlStringParser parser = new XmlStringParser();
			BufferedReader in = new BufferedReader(new InputStreamReader(
					inputStream, encode));
			String str;
			while ((str = in.readLine()) != null) {
				boolean notBomChar = !(str.length() == 1 && (int) str.charAt(0) == BOM);
				boolean validLine = str.length() > 0 && !str.startsWith("#");
				if (notBomChar && validLine) {
					addWord(parser.parse(str).getRoot());
				}
			}
			in.close();
		} catch (Exception e) {
			throw new DictLoadRuntimeException(e);
		}
	}

	private  void addWord(XmlNode node) {
		addWord(null, node.getAttribute("word"), 0, node);
	}

	/**
	 * 添加一个字
	 * 
	 * @param wordNode
	 */
	private  void addWord(WordDescription workdesc, String word,
			int index, XmlNode wordNode) {
		char c = 0;
		if (index < word.length()) {
			c = word.charAt(index);
		} else if (index == word.length()) {
			c = 0;
		} else {
			return;
		}
		WordDescription wordDescription = new WordDescription(c);
		if (c == 0) {
			// 添加词组相关信息
			wordDescription.setProperties(wordNode);
		}
		WordDescription newWordDescription = null;
		if (workdesc == null) {
			newWordDescription = letterIndex.contains(wordDescription);
		} else {
			if (workdesc.getWord() == null) {
				AVLTree newWord = new AVLTreeImpl();
				workdesc.setWord(newWord);
			} else {
				newWordDescription = workdesc.getWord().contains(
						wordDescription);
			}
		}
		if (newWordDescription != null) {
			wordDescription = newWordDescription;
		} else {
			if (workdesc == null) {
				letterIndex.add(wordDescription);
			} else {
				workdesc.getWord().add(wordDescription);
			}
		}
		addWord(wordDescription, word, index + 1, wordNode);
	}

	public void segmentWordMax(String content, List result) {
		segmentWord(content, result, MODE_MAX);
	}

	private void segmentWord(String content, List result, int mode) {
		Matcher matcher = pattern.matcher(content);
		int start = 0;
		while (matcher.find(start)) {
			String str = matcher.group();
			char c = str.charAt(0);
			if (CharUtils.isAsciiAlphanumeric(c)) {
				addToken(str, result);
			} else {
				// 分解中文
				int s = parseWord(str, 0, result, mode);
				while (s < str.length()) {
					s = parseWord(str, s, result, mode);
				}
			}
			start = matcher.end();
		}
	}

	private int parseWord(String str, int s, List result, int mode) {
		if (mode == MODE_MAX) {
			return parseWordMax(str, s, result);
		} else {
			return parseWordMin(str, s, result);
		}
	}

	public void segmentWordMin(String content, List result) {
		segmentWord(content, result, MODE_MIN);
	}

	private int parseWord(String str, int s, Map ret, int mode) {
		if (mode == MODE_MAX) {
			return parseWordMax(str, s, ret);
		} else {
			return parseWordMin(str, s, ret);
		}
	}

	public void segmentWordMax(String content, Map ret,
			int mode) {
		Matcher matcher = pattern.matcher(content);
		int start = 0;
		while (matcher.find(start)) {
			String str = matcher.group();
			char c = str.charAt(0);
			if (CharUtils.isAsciiAlphanumeric(c)) {
				addToken(str, ret);
			} else {
				// 分解中文
				int s = parseWord(str, 0, ret, mode);
				while (s < str.length()) {
					s = parseWord(str, s, ret, mode);
				}
			}
			start = matcher.end();
		}
	}

	public void segmentWordMax(String content, Map ret) {
		segmentWordMax(content, ret, MODE_MAX);
	}

	public void segmentWordMin(String content, Map ret) {
		segmentWordMax(content, ret, MODE_MIN);
	}

	/**
	 * 给找到的标记添加记数
	 * 
	 * @param token
	 * @param ret
	 */
	private void addTokenToMap(String token, Map ret) {
		if (findEvent != null) {
			findEvent.process(token);
		}
		Integer i = ret.get(token);
		if (i == null) {
			i = 1;
		} else {
			i++;
		}
		ret.put(token, i);
	}

	@SuppressWarnings("unchecked")
	private  void addToken(String token, T ret) {
		if (ret instanceof Map) {
			Map map = (Map) ret;
			addTokenToMap(token, map);
		} else if (ret instanceof List) {
			List list = (List) ret;
			addTokenToList(token, list);
		}

	}

	private void addTokenToList(String token, List ret) {
		if (findEvent != null) {
			findEvent.process(token);
		}

		ret.add(token);
	}

	/**
	 * 分词 
* a.实现了顺序分词
* b.实现了如果一次分解有多个词出现,者增加到结果列表中
* c.姓名识别 * * @param str * 要分的句子 * @param start * 开始的位置 * @param ret * 结果存放 * @return 结束的位置 */ private int parseWordMax(String str, int start, T ret) { char c = str.charAt(start); int end = start + 1; WordDescription find = new WordDescription(c); WordDescription endChar = new WordDescription('\0'); WordDescription locate = letterIndex.contains(find); int ct = 0; while (locate != null) { ct++; WordDescription kk = locate.getWord().contains(endChar); if (kk != null) { // 增加动态检查,如果其它的方案,比当前方案更优,则取新方案 if (end - start >= 2) { for (int index = start + 1; index <= end; index++) { WordDescription wd = findOneWord(str, index); if (wd != null && wd.getWeighing() > kk.getWeighing()) {// 如果没有找到词且找到的词比前一个大 parseWordMax(str.substring(start, index), 0, ret);// 前面的字再进行处理 return index; } } } end = start + ct; } if (start + ct == str.length()) { break; } else { c = str.charAt(start + ct); find.setChar(c); locate = locate.getWord().contains(find); } } // 如果是单字 addToken(str.substring(start, end), ret); return end; } private int parseWordMin(String str, int start, T ret) { char c = str.charAt(start); int end = start + 1; WordDescription find = new WordDescription(c); WordDescription endChar = new WordDescription('\0'); WordDescription locate = letterIndex.contains(find); int ct = 0; while (locate != null) { ct++; WordDescription kk = locate.getWord().contains(endChar); if (kk != null) { // 增加动态检查,如果其它的方案,比当前方案更优,则取新方案 if (end - start >= 2) { for (int index = start + 1; index <= end; index++) { WordDescription wd = findOneWord(str, index); if (wd != null && wd.getWeighing() > kk.getWeighing()) {// 如果没有找到词且找到的词比前一个大 parseWordMin(str.substring(start, index), 0, ret);// 前面的字再进行处理 return index; } } } end = start + ct; addToken(str.substring(start, end), ret); return end; } if (start + ct == str.length()) { break; } else { c = str.charAt(start + ct); find.setChar(c); locate = locate.getWord().contains(find); } } // 如果是单字 if (end - start == 1) { addToken(str.substring(start, end), ret); } 
return end; } /** * 只查找一个词 * * @param str * @param start * @return */ private WordDescription findOneWord(String str, int start) { char c = str.charAt(start); WordDescription find = new WordDescription(c); WordDescription endChar = new WordDescription('\0'); WordDescription locate = letterIndex.contains(find); WordDescription kk = null; int ct = 0; while (locate != null) { ct++; kk = locate.getWord().contains(endChar); if (start + ct == str.length()) { break; } else { c = str.charAt(start + ct); find.setChar(c); locate = locate.getWord().contains(find); } } return kk; } public void setFoundEvent(FoundEvent event) { this.findEvent = event; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy