All downloads are free. The search and download features use the official Maven repository.

com.chenlb.mmseg4j.MMSeg Maven / Gradle / Ivy

The newest version!
package com.chenlb.mmseg4j;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.Queue;

/**
 * Tokenizes a {@link Reader} stream (which may contain letters, digits, etc.):
 * Chinese (in fact CJK) text is extracted into a {@link Sentence} and then
 * segmented with the mmseg algorithm.
 *
 * Not thread-safe.
 *
 * NOTE(review): this copy of the file appears to be damaged by text extraction.
 * Part of the class body is missing (see the note inside {@code next()}), and
 * generic type arguments may have been stripped. Confirm against the upstream
 * source before compiling.
 *
 * @author chenlb 2009-9-20 10:41:41 PM
 */
public class MMSeg {

    /** Input wrapped so that characters can be pushed back after look-ahead. */
    private PushbackReader reader;
    /** Segmenter; the visible code calls {@code seg.isUnit(int)} on it. */
    private Seg seg;

    /** Scratch buffer collecting the run of characters currently being read. */
    private StringBuilder bufSentence = new StringBuilder(256);
    /** Sentence built from a run of OTHER_LETTER characters, or null. */
    private Sentence currentSentence;
    // NOTE(review): raw Queue here, yet poll() is assigned to Word below; the
    // element type argument was presumably lost in extraction -- confirm.
    private Queue bufWord; // word cache, because chunk analysis involves three or more words.

    /**
     * Creates a tokenizer over {@code input} using the given segmenter.
     *
     * @param input the character stream to tokenize
     * @param seg   the segmenter to use
     */
    public MMSeg(Reader input, Seg seg) {
        this.seg = seg;
        reset(input);
    }

    /** Index of the most recently read character; -1 right after {@link #reset(Reader)}. */
    private int readedIdx = 0;

    /**
     * Re-targets this tokenizer at a new input and clears all buffered state.
     * The input is wrapped in a {@link BufferedReader} and a
     * {@link PushbackReader} with a push-back buffer of 20 characters.
     *
     * @param input the new character stream
     */
    public void reset(Reader input) {
        this.reader = new PushbackReader(new BufferedReader(input), 20);
        currentSentence = null;
        bufWord = new LinkedList();
        bufSentence.setLength(0);
        readedIdx = -1;
    }

    /**
     * Reads one character, advancing {@code readedIdx} and lower-casing the
     * value when the stream is not exhausted.
     *
     * @return the lower-cased character, or -1 at end of stream
     * @throws IOException if the underlying reader fails
     */
    private int readNext() throws IOException {
        int d = reader.read();
        if(d > -1) {
            readedIdx++;
            d = Character.toLowerCase(d);
        }
        return d;
    }

    /**
     * Pushes one character back onto the reader and rewinds {@code readedIdx}.
     *
     * @param data the character to unread
     * @throws IOException if the underlying reader fails
     */
    private void pushBack(int data) throws IOException {
        readedIdx--;
        reader.unread(data);
    }

    /**
     * Produces the next word. The word cache is polled first; when it is empty,
     * characters are read and dispatched on {@link Character#getType(int)}.
     *
     * NOTE(review): the end of this method is missing from this copy, so the
     * return behaviour cannot be confirmed from here.
     *
     * @throws IOException if the underlying reader fails
     */
    public Word next() throws IOException {
        // take from the cache first
        Word word = bufWord.poll();;
        if(word == null) {
            bufSentence.setLength(0);
            int data = -1;
            boolean read = true;
            while(read && (data=readNext()) != -1) {
                read = false; // by default one pass reads a run of same-type characters, which is enough to segment
                int type = Character.getType(data);
                String wordType = Word.TYPE_WORD;
                switch(type) {
                case Character.UPPERCASE_LETTER:
                case Character.LOWERCASE_LETTER:
                case Character.TITLECASE_LETTER:
                case Character.MODIFIER_LETTER:
                    /*
                     * 1. 0x410-0x44f -> А-я // Russian
                     * 2. 0x391-0x3a9 -> Α-Ω // Greek uppercase
                     * 3. 0x3b1-0x3c9 -> α-ω // Greek lowercase
                     */
                    data = toAscii(data);
                    NationLetter nl = getNation(data);
                    if(nl == NationLetter.UNKNOW) {
                        // letter of an unrecognised alphabet: skip it and keep reading
                        read = true;
                        break;
                    }
                    wordType = Word.TYPE_LETTER;
                    bufSentence.appendCodePoint(data);
                    switch(nl) {
                    case EN:
                        // digits following letters, e.g. VH049PA
                        ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();
                        readChars(bufSentence, rcad);
                        if(rcad.hasDigit()) {
                            wordType = Word.TYPE_LETTER_OR_DIGIT;
                        }
                        // only english
                        // readChars(bufSentence, new ReadCharByAscii());
                        break;
                    case RA:
                        readChars(bufSentence, new ReadCharByRussia());
                        break;
                    case GE:
                        readChars(bufSentence, new ReadCharByGreece());
                        break;
                    }
                    bufWord.add(createWord(bufSentence, wordType));
                    bufSentence.setLength(0);
                    break;
                case Character.OTHER_LETTER:
                    /*
                     * 1. 0x3041-0x30f6 -> ぁ-ヶ // Japanese hiragana / katakana
                     * 2. 0x3105-0x3129 -> ㄅ-ㄩ // zhuyin (Bopomofo) symbols
                     */
                    bufSentence.appendCodePoint(data);
                    readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));
                    currentSentence = createSentence(bufSentence);
                    bufSentence.setLength(0);
                    break;
                case Character.DECIMAL_DIGIT_NUMBER:
                    bufSentence.appendCodePoint(toAscii(data));
                    readChars(bufSentence, new ReadCharDigit()); // read the digits that follow, AsciiLetterOr
                    wordType = Word.TYPE_DIGIT;
                    int d = readNext();
                    if(d > -1) {
                        if(seg.isUnit(d)) { // a unit, e.g. of time
                            bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT)); // add the number first (as a standalone word)
                            bufSentence.setLength(0);
                            bufSentence.appendCodePoint(d);
                            wordType = Word.TYPE_WORD; // the unit is a word
                        } else { // may be followed by letters and digits
                            pushBack(d);
                            if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) { // any letters or digits are joined together.
                                wordType = Word.TYPE_DIGIT_OR_LETTER;
                            }
                        }
                    }
                    bufWord.add(createWord(bufSentence, wordType));
                    bufSentence.setLength(0); // clear the buffered characters
                    break;
                case Character.LETTER_NUMBER:
                    // ⅠⅡⅢ are split individually
                    bufSentence.appendCodePoint(data);
                    readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));
                    int startIdx = startIdx(bufSentence);
                    // NOTE(review): the text is truncated on the next line. Everything
                    // between "i" and "=65296" is missing -- presumably the rest of
                    // next(), the helpers it calls (readChars, createWord,
                    // createSentence, startIdx, the ReadChar* classes) and the header
                    // of toAscii. The tokens below are kept verbatim and will not
                    // compile until the gap is restored from the upstream file.
                    for(int i=0; i=65296 && codePoint<=65305) // full-width 0-9
                            || (codePoint>=65313 && codePoint<=65338) // full-width A-Z
                            || (codePoint>=65345 && codePoint<=65370) // full-width a-z
                            ) {
                        // shift the full-width form down to its ASCII counterpart
                        codePoint -= 65248;
                    }
                    return codePoint;
                }

    /**
     * Tells whether the code point is an ASCII letter (A-Z or a-z).
     */
    private static boolean isAsciiLetter(int codePoint) {
        return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
    }

    /**
     * Tells whether the code point is a Russian letter: the range 'А'..'я',
     * plus 'Ё' and 'ё', which are tested separately.
     */
    private static boolean isRussiaLetter(int codePoint) {
        return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё';
    }

    /**
     * Tells whether the code point is a Greek letter: 'Α'..'Ω' or 'α'..'ω'.
     */
    private static boolean isGreeceLetter(int codePoint) {
        return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
    }

    /**
     * EN -> English
     * RA -> Russian
     * GE -> Greek
     * UNKNOW -> none of the above
     */
    private static enum NationLetter {EN, RA, GE, UNKNOW};

    /**
     * Classifies a letter code point by alphabet, testing ASCII, then Russian,
     * then Greek; anything else is {@code UNKNOW}.
     */
    private NationLetter getNation(int codePoint) {
        if(isAsciiLetter(codePoint)) {
            return NationLetter.EN;
        }
        if(isRussiaLetter(codePoint)) {
            return NationLetter.RA;
        }
        if(isGreeceLetter(codePoint)) {
            return NationLetter.GE;
        }
        return NationLetter.UNKNOW;
    }

    /**
     * Tells whether a {@link Character#getType(int)} value is OTHER_LETTER,
     * which this class treats as CJK.
     */
    @SuppressWarnings("unused")
    private static boolean isCJK(int type) {
        return type == Character.OTHER_LETTER;
    }

    /**
     * Tells whether a {@link Character#getType(int)} value is DECIMAL_DIGIT_NUMBER.
     */
    private static boolean isDigit(int type) {
        return type == Character.DECIMAL_DIGIT_NUMBER;
    }

    /**
     * Tells whether a {@link Character#getType(int)} value lies between
     * UPPERCASE_LETTER and MODIFIER_LETTER inclusive.
     */
    @SuppressWarnings("unused")
    private static boolean isLetter(int type) {
        return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy