net.paoding.analysis.analyzer.PaodingTokenizer

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.analyzer;

import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.Beef;
import net.paoding.analysis.knife.Collector;
import net.paoding.analysis.knife.Knife;
import net.paoding.analysis.knife.Paoding;
import net.paoding.analysis.knife.Token;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * PaodingTokenizer is a TokenStream implementation built on the "Paoding"
 * (庖丁解牛) Chinese word-segmentation framework; it is used by PaodingAnalyzer.
 *
 * @author Zhiliang Wang [[email protected]]
 *
 * @see Beef
 * @see Knife
 * @see Paoding
 * @see Tokenizer
 * @see PaodingAnalyzer
 * @see Collector
 * @see TokenCollector
 * @see net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector
 * @see MostWordsTokenCollector
 *
 * @since 1.0
 */
public final class PaodingTokenizer extends Tokenizer implements Collector {

    // -------------------------------------------------

    /**
     * Total number of characters read from {@link #input} so far.
     */
    private int inputLength;

    /**
     * Size of the internal read buffer.
     */
    private static final int bufferLength = 128;

    /**
     * Receives text characters from {@link #input}.
     *
     * @see #incrementToken()
     */
    protected final char[] buffer = new char[bufferLength];

    /**
     * Offset of {@link #buffer}[0] within {@link #input}.
     *
     * @see #collect(String, int, int)
     * @see #incrementToken()
     */
    private int offset;

    /**
     * The "beef" view over {@link #buffer} that knives operate on.
     */
    private final Beef beef = new Beef(buffer, 0, 0);

    /**
     * Position up to which {@link #beef} has been dissected; a negative value
     * means the last knife call needs more characters before it can continue.
     */
    private int dissected;

    /**
     * Dissects the text characters held in beef; supplied by PaodingAnalyzer.
     *
     * @see #incrementToken()
     */
    private Knife knife;

    /**
     * Collects all the words produced while segmenting the current sentence.
     */
    private TokenCollector tokenCollector;

    /**
     * Iterator used by {@link #incrementToken()} to read the collected Token
     * objects in order.
     *
     * @see #tokenCollector
     * @see #incrementToken()
     */
    private Iterator<Token> tokenIterator;

    private CharTermAttribute termAtt;
    private OffsetAttribute offsetAtt;
    private PositionIncrementAttribute positionIncrementAttribute;

    // -------------------------------------------------

    /**
     * @param input          the character stream to tokenize
     * @param knife          the knife (or composite knife) that segments the text
     * @param tokenCollector the strategy for collecting segmented words
     */
    public PaodingTokenizer(Reader input, Knife knife, TokenCollector tokenCollector) {
        super(input);
        this.input = input;
        this.knife = knife;
        this.tokenCollector = tokenCollector;
        init();
    }

    private void init() {
        termAtt = addAttribute(CharTermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
        positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    }

    // -------------------------------------------------

    public TokenCollector getTokenCollector() {
        return tokenCollector;
    }

    public void setTokenCollector(TokenCollector tokenCollector) {
        this.tokenCollector = tokenCollector;
    }

    // -------------------------------------------------

    public void collect(String word, int offset, int end) {
        tokenCollector.collect(word, this.offset + offset, this.offset + end);
    }

    // -------------------------------------------------

    public int getInputLength() {
        return inputLength;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        // The collected tokens are exhausted; pull more characters from the reader.
        while (tokenIterator == null || !tokenIterator.hasNext()) {
            int read = 0;
            // Characters left in the buffer before refilling from the reader;
            // a negative value means no refill is needed yet.
            int remaining = -1;
            if (dissected >= beef.length()) {
                remaining = 0;
            } else if (dissected < 0) {
                remaining = bufferLength + dissected;
            }
            if (remaining >= 0) {
                if (remaining > 0) {
                    // Move the undissected tail to the front of the buffer.
                    System.arraycopy(buffer, -dissected, buffer, 0, remaining);
                }
                read = input.read(buffer, remaining, bufferLength - remaining);
                inputLength += read;
                int charCount = remaining + read;
                if (charCount < 0) {
                    // The reader is exhausted; signal end of stream.
                    return false;
                }
                if (charCount < bufferLength) {
                    buffer[charCount++] = 0;
                }
                // Rebuild the "beef" and let the knife dissect it.
                beef.set(0, charCount);
                offset += Math.abs(dissected);
                dissected = 0;
            }
            dissected = knife.dissect(this, beef, dissected);
            tokenIterator = tokenCollector.iterator();
        }
        if (tokenIterator.hasNext()) {
            // Return the next collected Token.
            Token token = tokenIterator.next();
            termAtt.setEmpty();
            termAtt.append(token.charSequence());
            offsetAtt.setOffset(correctOffset(token.startOffset()),
                    correctOffset(token.endOffset()));
            // Position increment is set to the token's end offset here;
            // typical Lucene tokenizers use 1.
            positionIncrementAttribute.setPositionIncrement(token.endOffset());
            return true;
        }
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        offset = 0;
        inputLength = 0;
        tokenCollector.clear();
        tokenIterator = null;
    }
}
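
For reference, the sketch below shows one way to drive this tokenizer directly through Lucene's TokenStream lifecycle (reset, incrementToken, end, close). It is a minimal, illustrative example rather than part of the library source: it assumes PaodingMaker.make() is available to build the composite Knife (which in turn requires a dictionary home configured via the paoding-dic-home property), and the class name PaodingTokenizerDemo and the sample sentence are invented for the demonstration.

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingTokenizer;
import net.paoding.analysis.analyzer.TokenCollector;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.Knife;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class PaodingTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // PaodingMaker.make() assembles the composite Paoding knife from the
        // dictionaries configured via the paoding-dic-home property.
        Knife knife = PaodingMaker.make();
        TokenCollector collector = new MostWordsTokenCollector();

        PaodingTokenizer tokenizer = new PaodingTokenizer(
                new StringReader("围墙里面的人想出去"), knife, collector);
        CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);

        // Standard Lucene TokenStream lifecycle: reset, iterate, end, close.
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.printf("%s [%d,%d)%n",
                    term, offsets.startOffset(), offsets.endOffset());
        }
        tokenizer.end();
        tokenizer.close();
    }
}

MostWordsTokenCollector keeps every word of the most-words segmentation, while the MaxWordLengthTokenCollector referenced in the Javadoc above keeps only maximal-length matches; either can be passed as the third constructor argument. In practice, most applications use PaodingAnalyzer, which constructs a PaodingTokenizer like this internally.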




