package org.lionsoul.jcseg.analyzer;
import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.lionsoul.jcseg.tokenizer.core.ADictionary;
import org.lionsoul.jcseg.tokenizer.core.ISegment;
import org.lionsoul.jcseg.tokenizer.core.IWord;
import org.lionsoul.jcseg.tokenizer.core.JcsegException;
import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
import org.lionsoul.jcseg.tokenizer.core.SegmentFactory;
/**
 * Jcseg tokenizer for Lucene 5.1.0 and later.
 *
 * Relevant notes from the {@link org.apache.lucene.analysis.Tokenizer} documentation:
 * a Tokenizer is a TokenStream whose input is a Reader. It is an abstract class,
 * so subclasses must override {@link #incrementToken()}, and any override of
 * {@link #incrementToken()} must call {@link #clearAttributes()} before setting
 * attributes.
 *
 * Lucene calls Tokenizer#setReader(Reader) to set the pending input
 * ({@code inputPending}); the inherited {@code input} field only becomes
 * readable after {@link #reset()} has been invoked, which is why the segmenter
 * is bound to it again in {@link #reset()}.
 *
 * @author chenxin
 */
public class JcsegTokenizer extends Tokenizer
{
    // the jcseg segmenter that produces the tokens
    private final ISegment segmentor;
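    // Lucene token attributes populated for each emitted word: term text,
    // start/end character offsets, and token type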
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    /**
     * Field-level offset tracker for multi-valued fields,
     * such as array fields in Elasticsearch.
     */
private int fieldOffset = 0;
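    /**
     * Create a tokenizer backed by a jcseg segmenter.
     *
     * @param mode   jcseg segmentation mode constant
     * @param config jcseg task configuration
     * @param dic    dictionary instance shared by the segmenter
     */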
public JcsegTokenizer(
int mode,
JcsegTaskConfig config,
ADictionary dic ) throws JcsegException, IOException
{
segmentor = SegmentFactory.createJcseg(mode, new Object[]{config, dic});
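        // bind the segmenter to the inherited input reader; the reader is not
        // readable until Lucene has called setReader() and reset(), so the
        // binding is repeated in this class's reset() override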
segmentor.reset(input);
}
@Override
    public final boolean incrementToken() throws IOException
{
IWord word = segmentor.next();
if ( word == null ) {
            // end of this value: remember where the last token ended so that
            // end() can report a consistent final offset
            fieldOffset = offsetAtt.endOffset();
return false;
}
        // per the Tokenizer contract, clear all attributes before populating them
        clearAttributes();
        termAtt.setEmpty().append(word.getValue());
offsetAtt.setOffset(
correctOffset(fieldOffset + word.getPosition()),
correctOffset(fieldOffset + word.getPosition() + word.getLength())
);
typeAtt.setType("word");
return true;
}
@Override
public void end() throws IOException
{
super.end();
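        // report where the last token of this value ended as the final offset,
        // then reset the tracker so the next value starts from zero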
offsetAtt.setOffset(fieldOffset, fieldOffset);
fieldOffset = 0; // reset the field-level offset
}
@Override
public void reset() throws IOException
{
super.reset();
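        // the inherited input reader is usable now; bind the segmenter to it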
segmentor.reset(input);
}
}
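
/*
 * Usage sketch (not part of the original source): one common way to wrap
 * JcsegTokenizer in a Lucene 5.x Analyzer. The class name JcsegAnalyzerSketch
 * is made up for illustration, and the jcseg bootstrap calls mentioned below
 * (JcsegTaskConfig's no-arg constructor, JcsegTaskConfig.COMPLEX_MODE,
 * DictionaryFactory.createSingletonDictionary) reflect the jcseg 2.x API as
 * best understood here; verify them against the jcseg version on your classpath.
 */
class JcsegAnalyzerSketch extends org.apache.lucene.analysis.Analyzer
{
    private final JcsegTaskConfig config;
    private final ADictionary dic;

    JcsegAnalyzerSketch(JcsegTaskConfig config, ADictionary dic)
    {
        this.config = config;
        this.dic = dic;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName)
    {
        try {
            // COMPLEX_MODE is one of jcseg's segmentation modes; pick whichever
            // mode matches your indexing needs
            final Tokenizer tokenizer = new JcsegTokenizer(
                    JcsegTaskConfig.COMPLEX_MODE, config, dic);
            return new TokenStreamComponents(tokenizer);
        } catch (JcsegException | IOException e) {
            // createComponents cannot declare checked exceptions, so wrap them
            throw new RuntimeException("failed to create JcsegTokenizer", e);
        }
    }
}

/*
 * Typical construction and direct consumption, following the contract in the
 * class Javadoc (setReader, reset, incrementToken until false, end, close).
 * Again a hedged sketch against a jcseg 2.x-style API:
 *
 *   JcsegTaskConfig config = new JcsegTaskConfig();
 *   ADictionary dic = DictionaryFactory.createSingletonDictionary(config);
 *
 *   try (Tokenizer t = new JcsegTokenizer(JcsegTaskConfig.COMPLEX_MODE, config, dic)) {
 *       t.setReader(new java.io.StringReader("研究生命起源"));
 *       CharTermAttribute term = t.getAttribute(CharTermAttribute.class);
 *       t.reset();
 *       while (t.incrementToken()) {
 *           System.out.println(term.toString());
 *       }
 *       t.end();
 *   }
 */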