/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.paoding.analysis.analyzer;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.Beef;
import net.paoding.analysis.knife.Collector;
import net.paoding.analysis.knife.Knife;
import net.paoding.analysis.knife.Paoding;
import net.paoding.analysis.knife.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* PaodingTokenizer is the TokenStream implementation built on the "Paoding"
* (庖丁解牛, "a skilled butcher dissecting an ox") framework; it is used by
* PaodingAnalyzer.
*
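* <p>A minimal usage sketch. Obtaining a configured {@link Knife} is an
* assumption here: the example uses the {@code PaodingMaker} factory found in
* most Paoding versions, but the exact setup API may vary:</p>
*
* <pre>{@code
* Knife knife = net.paoding.analysis.knife.PaodingMaker.make(); // assumed factory
* PaodingTokenizer tokenizer = new PaodingTokenizer(
*         new StringReader("庖丁解牛"), knife, new MostWordsTokenCollector());
* CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
* OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);
* tokenizer.reset();
* while (tokenizer.incrementToken()) {
*     System.out.println(term.toString()
*             + " [" + offsets.startOffset() + "," + offsets.endOffset() + ")");
* }
* tokenizer.end();
* tokenizer.close();
* }</pre>
*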
* @author Zhiliang Wang [[email protected]]
*
* @see Beef
* @see Knife
* @see Paoding
* @see Tokenizer
* @see PaodingAnalyzer
*
* @see Collector
* @see TokenCollector
* @see net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector
* @see MostWordsTokenCollector
*
* @since 1.0
*/
public final class PaodingTokenizer extends Tokenizer implements Collector {
// -------------------------------------------------
/**
* Total number of characters read from {@link #input} so far.
*/
private int inputLength;
/**
* Size of the internal read {@link #buffer}.
*/
private static final int bufferLength = 128;
/**
* Receives text characters read from {@link #input}.
*
* @see #incrementToken()
*/
protected final char[] buffer = new char[bufferLength];
/**
* Offset of {@link #buffer}[0] within {@link #input}.
*
* @see #collect(String, int, int)
* @see #incrementToken()
*/
private int offset;
/**
* The "beef": a reusable view over {@link #buffer} that the {@link Knife}
* dissects.
*/
private final Beef beef = new Beef(buffer, 0, 0);
/**
* Dissection cursor returned by {@link Knife#dissect(Collector, Beef, int)};
* a negative value -n means the characters from buffer index n onward have
* not been dissected yet and are carried over to the next buffer refill.
*/
private int dissected;
/**
* The knife that dissects the text characters in {@link #beef}; supplied by
* PaodingAnalyzer.
*
* @see #incrementToken()
*/
private Knife knife;
/**
* Collects all the words produced while a sentence is being segmented.
*/
private TokenCollector tokenCollector;
/**
* Iterator over the collected tokens; {@link #incrementToken()} reads the
* Token objects from it in order.
*
* @see #tokenCollector
* @see #incrementToken()
*/
private Iterator<Token> tokenIterator;
private CharTermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute positionIncrementAttribute;
// -------------------------------------------------
/**
* @param input
*            the reader supplying the text to tokenize
* @param knife
*            the knife that dissects the text, supplied by PaodingAnalyzer
* @param tokenCollector
*            the collector that gathers the resulting tokens
*/
public PaodingTokenizer(Reader input, Knife knife, TokenCollector tokenCollector) {
super(input); // Tokenizer already stores the reader in this.input
this.knife = knife;
this.tokenCollector = tokenCollector;
init();
}
private void init() {
termAtt = addAttribute(CharTermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
}
// -------------------------------------------------
public TokenCollector getTokenCollector() {
return tokenCollector;
}
public void setTokenCollector(TokenCollector tokenCollector) {
this.tokenCollector = tokenCollector;
}
// -------------------------------------------------
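/**
* {@link Collector} callback invoked by the knife for each word it carves out
* of {@link #beef}. The offsets passed in are relative to the current buffer;
* adding {@link #offset} translates them into absolute positions within the
* whole input.
*/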
public void collect(String word, int offset, int end) {
tokenCollector.collect(word, this.offset + offset, this.offset + end);
}
// -------------------------------------------------
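/**
* @return the total number of characters read from {@link #input} so far
*/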
public int getInputLength() {
return inputLength;
}
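/**
* Standard TokenStream contract: while the collector has no more tokens,
* refills {@link #buffer} from {@link #input} (carrying over any
* not-yet-dissected tail) and asks the {@link Knife} to dissect it; then
* exposes the next collected token through the term, offset and
* position-increment attributes.
*
* @return true if a token was produced; false once the input and all
*         collected tokens are exhausted
*/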
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
// All tokens from tokenIterator have been consumed; pull more characters from the reader
while (tokenIterator == null || !tokenIterator.hasNext()) {
int read = 0;
// Characters still left in the buffer before refilling from the reader;
// a negative value means no refill is needed right now
int remaining = -1;
if (dissected >= beef.length()) {
remaining = 0;
} else if (dissected < 0) {
remaining = bufferLength + dissected;
}
if (remaining >= 0) {
if (remaining > 0) {
// Move the not-yet-dissected tail of the buffer back to the front
System.arraycopy(buffer, -dissected, buffer, 0, remaining);
}
read = input.read(buffer, remaining, bufferLength - remaining);
// read is -1 at end of stream; only count characters actually read
if (read > 0) {
inputLength += read;
}
int charCount = remaining + read;
if (charCount < 0) {
// The reader is exhausted; per the incrementToken() contract, report end of stream
return false;
}
if (charCount < bufferLength) {
// Append a 0 sentinel so the knife can tell that the text ends here
buffer[charCount++] = 0;
}
// Build the "beef" and let the knife dissect it
beef.set(0, charCount);
offset += Math.abs(dissected);
dissected = 0;
}
dissected = knife.dissect(this, beef, dissected);
tokenIterator = tokenCollector.iterator();
}
if (tokenIterator.hasNext()) {
// Emit the next Token from tokenIterator
Token token = tokenIterator.next();
termAtt.setEmpty();
termAtt.append(token.charSequence());
offsetAtt.setOffset(correctOffset(token.startOffset()), correctOffset(token.endOffset()));
// Each emitted token advances exactly one position
positionIncrementAttribute.setPositionIncrement(1);
return true;
}
return false;
}
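/**
* Clears all per-stream state (offsets, character count, dissection cursor,
* collected tokens) so the tokenizer can be reused after a new reader is
* supplied.
*/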
@Override
public void reset() throws IOException {
super.reset();
offset = 0;
inputLength = 0;
dissected = 0; // reset the dissection cursor so the tokenizer can be safely reused
tokenCollector.clear();
tokenIterator = null;
}
}