com.chenlb.mmseg4j.MMSeg Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mmseg4j-core Show documentation
Show all versions of mmseg4j-core Show documentation
MMSEG core for Java Chinese analyzer
The newest version!
package com.chenlb.mmseg4j;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.Queue;
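/**
* Tokenizes a Reader stream (which may contain letters, digits, etc.): extracts the Chinese (in fact CJK) text into a sentence {@link Sentence}, then segments it with the mmseg algorithm.
*
* Not thread-safe.
* @author chenlb 2009-9-20下午10:41:41
*/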
public class MMSeg {
/** Pushback-capable view of the current input; rebuilt by every {@link #reset(Reader)}. */
private PushbackReader reader;
/** Segmenter handed to the constructor. */
private Seg seg;
/** Scratch buffer (initial capacity 256) for the run of characters currently being collected. */
private StringBuilder bufSentence = new StringBuilder(256);
/** Sentence currently being segmented; cleared by {@link #reset(Reader)}. */
private Sentence currentSentence;
/**
 * Cache of words that are ready to be handed out by {@link #next()}.
 * Per the original note it exists because chunk analysis works on three or more words.
 * Typed as {@code Queue<Word>} so that {@code poll()} yields a {@code Word} without a cast.
 */
private Queue<Word> bufWord;
/**
 * Creates a tokenizer over {@code input}.
 *
 * @param input character source to tokenize
 * @param seg   segmenter to use
 */
public MMSeg(Reader input, Seg seg) {
    this.seg = seg;
    reset(input);
}
/**
 * Index of the most recently read character; {@link #reset(Reader)} sets it to -1,
 * each successful read increments it and each push-back decrements it.
 */
private int readedIdx = 0;
/**
 * Re-targets this tokenizer at a new input and discards all buffered state.
 *
 * @param input new character source; it is wrapped in a {@link BufferedReader} and a
 *              {@link PushbackReader} with a 20-character pushback buffer
 */
public void reset(Reader input) {
    this.reader = new PushbackReader(new BufferedReader(input), 20);
    currentSentence = null;
    bufWord = new LinkedList<Word>();
    bufSentence.setLength(0);
    readedIdx = -1;
}
/**
 * Reads the next character from the underlying reader.
 *
 * @return the lower-cased character, or the reader's negative end-of-stream value unchanged
 * @throws IOException if the underlying reader fails
 */
private int readNext() throws IOException {
    final int raw = reader.read();
    if (raw < 0) {
        // End of stream: the position counter is left untouched.
        return raw;
    }
    readedIdx++;
    return Character.toLowerCase(raw);
}
/**
 * Returns one character to the reader so the next read yields it again.
 *
 * @param data the character to push back
 * @throws IOException if the reader rejects the push-back (e.g. its pushback buffer is full)
 */
private void pushBack(int data) throws IOException {
    // Unread first: if it throws, the position counter must still match what was consumed.
    reader.unread(data);
    readedIdx--;
}
public Word next() throws IOException {
//先从缓存中取
Word word = bufWord.poll();;
if(word == null) {
bufSentence.setLength(0);
int data = -1;
boolean read = true;
while(read && (data=readNext()) != -1) {
read = false; //默认一次可以读出同一类字符,就可以分词内容
int type = Character.getType(data);
String wordType = Word.TYPE_WORD;
switch(type) {
case Character.UPPERCASE_LETTER:
case Character.LOWERCASE_LETTER:
case Character.TITLECASE_LETTER:
case Character.MODIFIER_LETTER:
/*
* 1. 0x410-0x44f -> А-я //俄文
* 2. 0x391-0x3a9 -> Α-Ω //希腊大写
* 3. 0x3b1-0x3c9 -> α-ω //希腊小写
*/
data = toAscii(data);
NationLetter nl = getNation(data);
if(nl == NationLetter.UNKNOW) {
read = true;
break;
}
wordType = Word.TYPE_LETTER;
bufSentence.appendCodePoint(data);
switch(nl) {
case EN:
//字母后面的数字,如: VH049PA
ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();
readChars(bufSentence, rcad);
if(rcad.hasDigit()) {
wordType = Word.TYPE_LETTER_OR_DIGIT;
}
//only english
//readChars(bufSentence, new ReadCharByAscii());
break;
case RA:
readChars(bufSentence, new ReadCharByRussia());
break;
case GE:
readChars(bufSentence, new ReadCharByGreece());
break;
}
bufWord.add(createWord(bufSentence, wordType));
bufSentence.setLength(0);
break;
case Character.OTHER_LETTER:
/*
* 1. 0x3041-0x30f6 -> ぁ-ヶ //日文(平|片)假名
* 2. 0x3105-0x3129 -> ㄅ-ㄩ //注意符号
*/
bufSentence.appendCodePoint(data);
readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));
currentSentence = createSentence(bufSentence);
bufSentence.setLength(0);
break;
case Character.DECIMAL_DIGIT_NUMBER:
bufSentence.appendCodePoint(toAscii(data));
readChars(bufSentence, new ReadCharDigit()); //读后面的数字, AsciiLetterOr
wordType = Word.TYPE_DIGIT;
int d = readNext();
if(d > -1) {
if(seg.isUnit(d)) { //单位,如时间
bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT)); //先把数字添加(独立)
bufSentence.setLength(0);
bufSentence.appendCodePoint(d);
wordType = Word.TYPE_WORD; //单位是 word
} else { //后面可能是字母和数字
pushBack(d);
if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) { //如果有字母或数字都会连在一起.
wordType = Word.TYPE_DIGIT_OR_LETTER;
}
}
}
bufWord.add(createWord(bufSentence, wordType));
bufSentence.setLength(0); //缓存的字符清除
break;
case Character.LETTER_NUMBER:
// ⅠⅡⅢ 单分
bufSentence.appendCodePoint(data);
readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));
int startIdx = startIdx(bufSentence);
for(int i=0; i=65296 && codePoint<=65305) //0-9
|| (codePoint>=65313 && codePoint<=65338) //A-Z
|| (codePoint>=65345 && codePoint<=65370) //a-z
) {
codePoint -= 65248;
}
return codePoint;
}
/**
 * Tells whether a code point is a basic Latin letter.
 *
 * @param codePoint code point to test
 * @return {@code true} for 'a'-'z' and 'A'-'Z', {@code false} otherwise
 */
private static boolean isAsciiLetter(int codePoint) {
    if (codePoint >= 'a' && codePoint <= 'z') {
        return true;
    }
    return codePoint >= 'A' && codePoint <= 'Z';
}
/**
 * Tells whether a code point is a Russian (Cyrillic) letter.
 *
 * @param codePoint code point to test
 * @return {@code true} for U+0410..U+044F, U+0401 and U+0451
 */
private static boolean isRussiaLetter(int codePoint) {
    // Contiguous run from CYRILLIC CAPITAL A (U+0410) to CYRILLIC SMALL YA (U+044F).
    if (codePoint >= '\u0410' && codePoint <= '\u044F') {
        return true;
    }
    // Capital and small IO sit outside that run.
    return codePoint == '\u0401' || codePoint == '\u0451';
}
/**
 * Tells whether a code point falls in the Greek capital or small letter range.
 *
 * @param codePoint code point to test
 * @return {@code true} for U+0391..U+03A9 and U+03B1..U+03C9
 */
private static boolean isGreeceLetter(int codePoint) {
    // Capitals: GREEK CAPITAL ALPHA (U+0391) to GREEK CAPITAL OMEGA (U+03A9).
    final boolean capital = codePoint >= '\u0391' && codePoint <= '\u03A9';
    if (capital) {
        return true;
    }
    // Smalls: GREEK SMALL ALPHA (U+03B1) to GREEK SMALL OMEGA (U+03C9).
    return codePoint >= '\u03B1' && codePoint <= '\u03C9';
}
/**
 * Script family of a letter, as returned by getNation.
 * EN -> English (ASCII letters)
 * RA -> Russian
 * GE -> Greek
 * UNKNOW -> none of the above
 */
private static enum NationLetter {EN, RA, GE, UNKNOW};
/**
 * Classifies a letter code point by script family.
 *
 * @param codePoint code point to classify
 * @return EN for ASCII letters, RA for Russian letters, GE for Greek letters,
 *         otherwise UNKNOW; the checks are tried in that order
 */
private NationLetter getNation(int codePoint) {
    final NationLetter nation;
    if (isAsciiLetter(codePoint)) {
        nation = NationLetter.EN;
    } else if (isRussiaLetter(codePoint)) {
        nation = NationLetter.RA;
    } else if (isGreeceLetter(codePoint)) {
        nation = NationLetter.GE;
    } else {
        nation = NationLetter.UNKNOW;
    }
    return nation;
}
/**
 * Tells whether a general category value is the one this class treats as CJK text.
 *
 * @param type a general category value as returned by {@link Character#getType(int)}
 * @return {@code true} only for {@link Character#OTHER_LETTER}
 */
@SuppressWarnings("unused")
private static boolean isCJK(int type) {
    final boolean otherLetter = (Character.OTHER_LETTER == type);
    return otherLetter;
}
/**
 * Tells whether a general category value denotes a decimal digit.
 *
 * @param type a general category value as returned by {@link Character#getType(int)}
 * @return {@code true} only for {@link Character#DECIMAL_DIGIT_NUMBER}
 */
private static boolean isDigit(int type) {
    final boolean decimalDigit = (Character.DECIMAL_DIGIT_NUMBER == type);
    return decimalDigit;
}
/**
 * Tells whether a general category value is one of the cased or modifier letter categories.
 *
 * @param type a general category value as returned by {@link Character#getType(int)}
 * @return {@code true} for UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER and
 *         MODIFIER_LETTER; {@code false} for everything else, including OTHER_LETTER
 */
@SuppressWarnings("unused")
private static boolean isLetter(int type) {
    // These four constants are the consecutive values UPPERCASE_LETTER..MODIFIER_LETTER.
    switch (type) {
        case Character.UPPERCASE_LETTER:
        case Character.LOWERCASE_LETTER:
        case Character.TITLECASE_LETTER:
        case Character.MODIFIER_LETTER:
            return true;
        default:
            return false;
    }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy