// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.tinygroup.chineseanalyzer.ChineseTokenStream Maven / Gradle / Ivy
//
// The newest version!
package org.tinygroup.chineseanalyzer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.tiny.seg.ChineseParser;
import org.tinygroup.beancontainer.BeanContainerFactory;

/**
 * 基于TinySeg的Lucene的令牌流扩展
 * 
 * @author yancheng11334
 * 
 */
public class ChineseTokenStream extends Tokenizer {

	private ChineseParserWrapper wrapper = null;
	
	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
	
	protected ChineseTokenStream(Reader input) {
		super(input);
		wrapper = new ChineseParserWrapper(input);
	}

	public void reset() throws IOException {
		super.reset();
		wrapper.reset();
	}

	public final boolean incrementToken() throws IOException {
		clearAttributes();
		ChineseWord word = wrapper.next();
		if(word!=null){
			termAtt.copyBuffer(word.getSource(), word.getSourceOffset(), word.getLength());
			offsetAtt.setOffset(word.getStartOffset(), word.getEndOffset());
			typeAtt.setType("word");
			return true;
		}else{
			end();
			return false;
		}
	}
	
	class ChineseParserWrapper {
		private BufferedReader reader = null;
		private String buffer = null;
		private ChineseParser parser = null;

		private int index =  0; 
		private int offset = 0;
		private Queue words =null;
		
		public ChineseParserWrapper(Reader input){
			reader = new BufferedReader(input);
			parser = BeanContainerFactory.getBeanContainer(
					this.getClass().getClassLoader()).getBean(
					ChineseParser.CHINESE_PARSER_BEAN_NAME);
		}
		
	    private void init() throws IOException{
	    	words = new LinkedList();
	    	while ((buffer = reader.readLine()) != null) {
				List result = new ArrayList();
				parser.segmentWordMax(buffer, result);
				int sourceOffset = 0;
				char[] source = buffer.toCharArray();
				for(String s:result){
					int length =s.toCharArray().length;
					ChineseWord word = new ChineseWord(source,sourceOffset,offset,length,index);
					words.add(word);
					index++;
					sourceOffset += length;
					offset += length;
				}
			}
	    }
		
		public void reset() throws IOException{
			if (reader.markSupported()) {
				reader.mark(0);
			}
			reader.reset();
			buffer = null;
			index = 0;
			offset = 0;
			words = null;
		}
		
		public ChineseWord next() throws IOException{
			if(words==null){
			   init();
			}
			return words.poll();
		}
	}
	
	class ChineseWord {
		
		public ChineseWord(char[] source, int sourceOffset, int startOffset,int length,
				int index) {
			super();
			this.source = source;
			this.sourceOffset = sourceOffset;
			this.startOffset = startOffset;
			this.length = length;
			this.index = index;
		}

		private char[] source;
		private int sourceOffset;
		private int startOffset;
		private int length;
		private int index;
		/**
		 * 返回词所在字符串
		 * @return
		 */
		public char[] getSource(){
			return source;
		}
		
		/**
		 * 返回词相对字符串的偏移量
		 * @return
		 */
		public int getSourceOffset(){
			return sourceOffset;
		}
		
		/**
		 * 获得词的长度
		 * @return
		 */
		public int getLength(){
			return length;
		}
		
		/**
		 * 获得词相对全文的起始位置
		 * @return
		 */
		public int getStartOffset() {
			return startOffset;
		}
		
		/**
		 * 获得词相对全文的结束位置
		 * @return
		 */
		public int getEndOffset() {
			return startOffset+getLength();
		}
		
		/**
		 * 得到索引号
		 * @return
		 */
		public int getIndex() {
			return index;
		}
	}

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy