org.apache.lucene.analysis.opennlp.OpenNLPSentenceBreakIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-opennlp Show documentation
Lucene OpenNLP integration
There is a newer version: 8.11.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.opennlp;

import java.text.BreakIterator;
import java.text.CharacterIterator;

import opennlp.tools.util.Span;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.util.CharArrayIterator;

/**
 * A {@link BreakIterator} that splits sentences using an OpenNLP sentence chunking model.
 */
public final class OpenNLPSentenceBreakIterator extends BreakIterator {

  private CharacterIterator text;
  private int currentSentence;
  private int[] sentenceStarts;
  private NLPSentenceDetectorOp sentenceOp;

  public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp) {
    this.sentenceOp = sentenceOp;
  }

  @Override
  public int current() {
    return text.getIndex();
  }

  @Override
  public int first() {
    currentSentence = 0;
    text.setIndex(text.getBeginIndex());
    return current();
  }

  @Override
  public int last() {
    if (sentenceStarts.length > 0) {
      currentSentence = sentenceStarts.length - 1;
      text.setIndex(text.getEndIndex());
    } else { // there are no sentences; both the first and last positions are the begin index
      currentSentence = 0;
      text.setIndex(text.getBeginIndex());
    }
    return current();
  }

  @Override
  public int next() {
    if (text.getIndex() == text.getEndIndex() || 0 == sentenceStarts.length) {
      return DONE;
    } else if (currentSentence < sentenceStarts.length - 1) {
      text.setIndex(sentenceStarts[++currentSentence]);
      return current();
    } else {
      return last();
    }
  }

  @Override
  public int following(int pos) {
    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
      throw new IllegalArgumentException("offset out of bounds");
    } else if (0 == sentenceStarts.length) {
      text.setIndex(text.getBeginIndex());
      return DONE;
    } else if (pos >= sentenceStarts[sentenceStarts.length - 1]) {
      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
      // https://bugs.openjdk.java.net/browse/JDK-8015110
      text.setIndex(text.getEndIndex());
      currentSentence = sentenceStarts.length - 1;
      return DONE;
    } else { // there are at least two sentences
      currentSentence = (sentenceStarts.length - 1) / 2; // start search from the middle
      moveToSentenceAt(pos, 0, sentenceStarts.length - 2);
      text.setIndex(sentenceStarts[++currentSentence]);
      return current();
    }
  }

  /** Binary search over sentences */
  private void moveToSentenceAt(int pos, int minSentence, int maxSentence) {
    if (minSentence != maxSentence) {
      if (pos < sentenceStarts[currentSentence]) {
        int newMaxSentence = currentSentence - 1;
        currentSentence = minSentence + (currentSentence - minSentence) / 2;
        moveToSentenceAt(pos, minSentence, newMaxSentence);
      } else if (pos >= sentenceStarts[currentSentence + 1]) {
        int newMinSentence = currentSentence + 1;
        currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
        moveToSentenceAt(pos, newMinSentence, maxSentence);
      }
    } else {
      assert currentSentence == minSentence;
      assert pos >= sentenceStarts[currentSentence];
      assert (currentSentence == sentenceStarts.length - 1 && pos <= text.getEndIndex())
          || pos < sentenceStarts[currentSentence + 1];
    }
    // we have arrived - nothing to do
  }

  @Override
  public int previous() {
    if (text.getIndex() == text.getBeginIndex()) {
      return DONE;
    } else {
      if (0 == sentenceStarts.length) {
        text.setIndex(text.getBeginIndex());
        return DONE;
      }
      if (text.getIndex() == text.getEndIndex()) {
        text.setIndex(sentenceStarts[currentSentence]);
      } else {
        text.setIndex(sentenceStarts[--currentSentence]);
      }
      return current();
    }
  }

  @Override
  public int preceding(int pos) {
    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
      throw new IllegalArgumentException("offset out of bounds");
    } else if (0 == sentenceStarts.length) {
      text.setIndex(text.getBeginIndex());
      currentSentence = 0;
      return DONE;
    } else if (pos < sentenceStarts[0]) {
      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
      // https://bugs.openjdk.java.net/browse/JDK-8015110
      text.setIndex(text.getBeginIndex());
      currentSentence = 0;
      return DONE;
    } else {
      currentSentence = sentenceStarts.length / 2; // start search from the middle
      moveToSentenceAt(pos, 0, sentenceStarts.length - 1);
      if (0 == currentSentence) {
        text.setIndex(text.getBeginIndex());
        return DONE;
      } else {
        text.setIndex(sentenceStarts[--currentSentence]);
        return current();
      }
    }
  }

  @Override
  public int next(int n) {
    currentSentence += n;
    if (n < 0) {
      if (text.getIndex() == text.getEndIndex()) {
        ++currentSentence;
      }
      if (currentSentence < 0) {
        currentSentence = 0;
        text.setIndex(text.getBeginIndex());
        return DONE;
      } else {
        text.setIndex(sentenceStarts[currentSentence]);
      }
    } else if (n > 0) {
      if (currentSentence >= sentenceStarts.length) {
        currentSentence = sentenceStarts.length - 1;
        text.setIndex(text.getEndIndex());
        return DONE;
      } else {
        text.setIndex(sentenceStarts[currentSentence]);
      }
    }
    return current();
  }

  @Override
  public CharacterIterator getText() {
    return text;
  }

  @Override
  public void setText(CharacterIterator newText) {
    text = newText;
    text.setIndex(text.getBeginIndex());
    currentSentence = 0;
    Span[] spans = sentenceOp.splitSentences(characterIteratorToString());
    sentenceStarts = new int[spans.length];
    for (int i = 0; i < spans.length; ++i) {
      // Adjust start positions to match those of the passed-in CharacterIterator
      sentenceStarts[i] = spans[i].getStart() + text.getBeginIndex();
    }
  }

  private String characterIteratorToString() {
    String fullText;
    if (text instanceof CharArrayIterator) {
      CharArrayIterator charArrayIterator = (CharArrayIterator)text;
      fullText = new String(charArrayIterator.getText(), charArrayIterator.getStart(), charArrayIterator.getLength());
    } else {
      // TODO: is there a better way to extract full text from arbitrary CharacterIterators?
      StringBuilder builder = new StringBuilder();
      for (char ch = text.first(); ch != CharacterIterator.DONE; ch = text.next()) {
        builder.append(ch);
      }
      fullText = builder.toString();
      text.setIndex(text.getBeginIndex());
    }
    return fullText;
  }
}