com.chenlb.mmseg4j.MMSeg Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mmseg4j-core Show documentation
MMSEG cor for java chinese analyzer
The newest version!
package com.chenlb.mmseg4j;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.Queue;

/**
 * Reader 流的分词(有字母,数字等), 析出中文(其实是 CJK)成句子 {@link Sentence} 再对 mmseg 算法分词.

 * 
 * 非线程安全
 * @author chenlb 2009-9-20下午10:41:41
 */
public class MMSeg {
	
	private PushbackReader reader;
	private Seg seg;
	
	private StringBuilder bufSentence = new StringBuilder(256);
	private Sentence currentSentence;
	private Queue bufWord;	// word 缓存, 因为有 chunk 分析三个以上.
	
	public MMSeg(Reader input, Seg seg) {
		this.seg = seg;
		
		reset(input);
	}

	private int readedIdx = 0;
	
	public void reset(Reader input) {
		this.reader = new PushbackReader(new BufferedReader(input), 20);
		currentSentence = null;
		bufWord = new LinkedList();
		bufSentence.setLength(0);
		readedIdx = -1;
	}
	
	private int readNext() throws IOException {
		int d = reader.read();
		if(d > -1) {
			readedIdx++;
			d = Character.toLowerCase(d);
		}
		return d;
	}
	
	private void pushBack(int data) throws IOException {
		readedIdx--;
		reader.unread(data);
	}

	
	public Word next() throws IOException {
		//先从缓存中取
		Word word = bufWord.poll();;
		if(word == null) {
			bufSentence.setLength(0);

			int data = -1;
			boolean read = true;
			while(read && (data=readNext()) != -1) {
				read = false;	//默认一次可以读出同一类字符,就可以分词内容
				int type = Character.getType(data);
				String wordType = Word.TYPE_WORD;
				switch(type) {
				case Character.UPPERCASE_LETTER:
				case Character.LOWERCASE_LETTER:
				case Character.TITLECASE_LETTER:
				case Character.MODIFIER_LETTER:
					/*
					 * 1. 0x410-0x44f -> А-я	//俄文
					 * 2. 0x391-0x3a9 -> Α-Ω	//希腊大写
					 * 3. 0x3b1-0x3c9 -> α-ω	//希腊小写
					 */
					data = toAscii(data);
					NationLetter nl = getNation(data);
					if(nl == NationLetter.UNKNOW) {
						read = true;
						break;
					}
					wordType = Word.TYPE_LETTER;
					bufSentence.appendCodePoint(data);
					switch(nl) {
					case EN:
						//字母后面的数字,如: VH049PA
						ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();
						readChars(bufSentence, rcad);
						if(rcad.hasDigit()) {
							wordType = Word.TYPE_LETTER_OR_DIGIT;
						}
						//only english
						//readChars(bufSentence, new ReadCharByAscii());
						break;
					case RA:
						readChars(bufSentence, new ReadCharByRussia());
						break;
					case GE:
						readChars(bufSentence, new ReadCharByGreece());
						break;
					}
					bufWord.add(createWord(bufSentence, wordType));

					bufSentence.setLength(0);

					break;
				case Character.OTHER_LETTER:
					/*
					 * 1. 0x3041-0x30f6 -> ぁ-ヶ	//日文(平|片)假名
					 * 2. 0x3105-0x3129 -> ㄅ-ㄩ	//注意符号
					 */
					bufSentence.appendCodePoint(data);
					readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));

					currentSentence = createSentence(bufSentence);

					bufSentence.setLength(0);

					break;
				case Character.DECIMAL_DIGIT_NUMBER:
					bufSentence.appendCodePoint(toAscii(data));
					readChars(bufSentence, new ReadCharDigit());	//读后面的数字, AsciiLetterOr
					wordType = Word.TYPE_DIGIT;
					int d = readNext();
					if(d > -1) {
						if(seg.isUnit(d)) {	//单位,如时间
							bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT));	//先把数字添加(独立)

							bufSentence.setLength(0);

							bufSentence.appendCodePoint(d);
							wordType = Word.TYPE_WORD;	//单位是 word
						} else {	//后面可能是字母和数字
							pushBack(d);
							if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) {	//如果有字母或数字都会连在一起.
								wordType = Word.TYPE_DIGIT_OR_LETTER;
							}
						}
					}

					bufWord.add(createWord(bufSentence, wordType));


					bufSentence.setLength(0);	//缓存的字符清除

					break;
				case Character.LETTER_NUMBER:
					// ⅠⅡⅢ 单分
					bufSentence.appendCodePoint(data);
					readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));

					int startIdx = startIdx(bufSentence);
					for(int i=0; i=65296 && codePoint<=65305)	//０-９
				|| (codePoint>=65313 && codePoint<=65338)	//Ａ-Ｚ
				|| (codePoint>=65345 && codePoint<=65370)	//ａ-ｚ
				) {	
			codePoint -= 65248;
		}
		return codePoint;
	}
	
	private static boolean isAsciiLetter(int codePoint) {
		return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
	}
	
	private static boolean isRussiaLetter(int codePoint) {
		return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё';
	}
	
	private static boolean isGreeceLetter(int codePoint) {
		return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
	}
	/**
	 * EN -> 英语
	 * RA -> 俄语
	 * GE -> 希腊
	 * 
	 */
	private static enum NationLetter {EN, RA, GE, UNKNOW};
	
	private NationLetter getNation(int codePoint) {
		if(isAsciiLetter(codePoint)) {
			return NationLetter.EN;
		}
		if(isRussiaLetter(codePoint)) {
			return NationLetter.RA;
		}
		if(isGreeceLetter(codePoint)) {
			return NationLetter.GE;
		}
		return NationLetter.UNKNOW;
	}
	
	@SuppressWarnings("unused")
	private static boolean isCJK(int type) {
		return type == Character.OTHER_LETTER;
	}
	private static boolean isDigit(int type) {
		return type == Character.DECIMAL_DIGIT_NUMBER;
	}
	@SuppressWarnings("unused")
	private static boolean isLetter(int type) {
		return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
	}
}