org.apache.lucene.analysis.opennlp.OpenNLPTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-opennlp Show documentation
Lucene OpenNLP integration
There is a newer version: 8.11.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.opennlp;

import java.io.IOException;

import opennlp.tools.util.Span;

import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;

/**
 * Run OpenNLP SentenceDetector and Tokenizer.
 * The last token in each sentence is marked by setting the {@link #EOS_FLAG_BIT} in the FlagsAttribute;
 * following filters can use this information to apply operations to tokens one sentence at a time.
 */
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
  public static int EOS_FLAG_BIT = 1;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private Span[] termSpans = null;
  private int termNum = 0;
  private int sentenceStart = 0;

  private NLPSentenceDetectorOp sentenceOp = null;
  private NLPTokenizerOp tokenizerOp = null;

  public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) throws IOException {
    super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
    if (sentenceOp == null || tokenizerOp == null) {
      throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
    }
    this.sentenceOp = sentenceOp;
    this.tokenizerOp = tokenizerOp;
  }

  @Override
  public void close() throws IOException {
    super.close();
    termSpans = null;
    termNum = sentenceStart = 0;
  };

  @Override
  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
    this.sentenceStart = sentenceStart;
    String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
    termSpans = tokenizerOp.getTerms(sentenceText);
    termNum = 0;
  }

  @Override
  protected boolean incrementWord() {
    if (termSpans == null || termNum == termSpans.length) {
      return false;
    }
    clearAttributes();
    Span term = termSpans[termNum];
    termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
    offsetAtt.setOffset(correctOffset(offset + sentenceStart + term.getStart()),
                        correctOffset(offset + sentenceStart + term.getEnd()));
    if (termNum == termSpans.length - 1) {
      flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
    }
    ++termNum;
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    termSpans = null;
    termNum = sentenceStart = 0;
  }
}