All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.lionsoul.jcseg.analyzer.JcsegTokenizer Maven / Gradle / Ivy

There is a newer version: 2.6.3
Show newest version
package org.lionsoul.jcseg.analyzer;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.lionsoul.jcseg.tokenizer.core.ADictionary;
import org.lionsoul.jcseg.tokenizer.core.ISegment;
import org.lionsoul.jcseg.tokenizer.core.IWord;
import org.lionsoul.jcseg.tokenizer.core.JcsegException;
import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
import org.lionsoul.jcseg.tokenizer.core.SegmentFactory;


/**
 * 

 * Here is the documentation from {@link org.apache.lucene.analysis.Tokenizer}:
 * A Tokenizer is a TokenStream whose input is a Reader.

* *

* This is an abstract class; subclasses must override {@link #incrementToken()} *

* *

* NOTE: Subclasses overriding {@link #incrementToken()} must * call {@link #clearAttributes()} before setting attributes *

* *

 * Lucene invokes Tokenizer#setReader(Reader input) to set inputPending;
 * after reset() has been invoked, the inherited input field is available.

* *

 * Jcseg tokenizer for Lucene 5.1.0 and later.

* * @author chenxin */ public class JcsegTokenizer extends Tokenizer { // The default jcseg segmentor private ISegment segmentor; private final CharTermAttributeImpl termAtt = (CharTermAttributeImpl)addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); /** * field level offset tracker for multiple-value field * like the Array field in Elasticseach */ private int fieldOffset = 0; public JcsegTokenizer( int mode, JcsegTaskConfig config, ADictionary dic ) throws JcsegException, IOException { segmentor = SegmentFactory.createJcseg(mode, new Object[]{config, dic}); segmentor.reset(input); } @Override final public boolean incrementToken() throws IOException { IWord word = segmentor.next(); if ( word == null ) { fieldOffset = offsetAtt.endOffset(); /// System.out.println("set fieldOffset=" + fieldOffset); return false; } clearAttributes(); //char[] token = word.getValue().toCharArray(); //termAtt.copyBuffer(token, 0, token.length); termAtt.clear(); termAtt.append(word.getValue()); offsetAtt.setOffset( correctOffset(fieldOffset + word.getPosition()), correctOffset(fieldOffset + word.getPosition() + word.getLength()) ); typeAtt.setType("word"); return true; } @Override public void end() throws IOException { super.end(); offsetAtt.setOffset(fieldOffset, fieldOffset); fieldOffset = 0; // reset the field-level offset } @Override public void reset() throws IOException { super.reset(); segmentor.reset(input); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy