opennlp.tools.sentdetect.SentenceDetectorME Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.sentdetect;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ml.ArrayMath;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.sentdetect.lang.Factory;
import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;
import opennlp.tools.util.TrainingParameters;

/**
 * A sentence detector for splitting up raw text into sentences.
 * 
 * A maximum entropy model is used to evaluate end-of-sentence characters in a
 * string to determine if they signify the end of a sentence.
 */
public class SentenceDetectorME implements SentenceDetector {

  /**
   * Constant indicates a sentence split.
   */
  public static final String SPLIT = "s";

  /**
   * Constant indicates no sentence split.
   */
  public static final String NO_SPLIT = "n";

  /**
   * The maximum entropy model to use to evaluate contexts.
   */
  private final MaxentModel model;

  /**
   * The feature context generator.
   */
  private final SDContextGenerator cgen;

  /**
   * The {@link EndOfSentenceScanner} to use when scanning for end of sentence offsets.
   */
  private final EndOfSentenceScanner scanner;

  /**
   * The list of probabilities associated with each decision.
   */
  private final List sentProbs = new ArrayList<>();

  /**
   * The {@link Dictionary abbreviation dictionary} if available (may be {@code null}).
   */
  private final Dictionary abbDict;

  protected final boolean useTokenEnd;

  /**
   * Initializes the sentence detector by downloading a default model.
   * @param language The language of the sentence detector.
   * @throws IOException Thrown if the model cannot be downloaded or saved.
   */
  public SentenceDetectorME(String language) throws IOException {
    this(DownloadUtil.downloadModel(language,
            DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class));
  }

  /**
   * Initializes the current instance.
   *
   * @param model the {@link SentenceModel}
   */
  public SentenceDetectorME(SentenceModel model) {
    this(model, model.getAbbreviations());
  }

  /**
   * Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}.
   *
   * @param model The {@link SentenceModel} to be used.
   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
   */
  public SentenceDetectorME(SentenceModel model, Dictionary abbDict) {
    this.model = model.getMaxentModel();
    this.abbDict = abbDict;
    SentenceDetectorFactory sdFactory = model.getFactory();
    cgen = sdFactory.getSDContextGenerator();
    scanner = sdFactory.getEndOfSentenceScanner();
    useTokenEnd = sdFactory.isUseTokenEnd();
  }

  /**
   * @deprecated Use a {@link SentenceDetectorFactory} to extend
   *             SentenceDetector functionality.
   */
  @Deprecated
  public SentenceDetectorME(SentenceModel model, Factory factory) {
    this.model = model.getMaxentModel();
    // if the model has custom EOS characters set, use this to get the context
    // generator and the EOS scanner; otherwise use language-specific defaults
    char[] customEOSCharacters = model.getEosCharacters();
    if (customEOSCharacters == null) {
      cgen = factory.createSentenceContextGenerator(model.getLanguage(),
          getAbbreviations(model.getAbbreviations()));
      scanner = factory.createEndOfSentenceScanner(model.getLanguage());
    } else {
      cgen = factory.createSentenceContextGenerator(
          getAbbreviations(model.getAbbreviations()), customEOSCharacters);
      scanner = factory.createEndOfSentenceScanner(customEOSCharacters);
    }
    abbDict = model.getAbbreviations();
    useTokenEnd = model.useTokenEnd();
  }

  private static Set getAbbreviations(Dictionary abbreviations) {
    if (abbreviations == null) {
      return Collections.emptySet();
    }
    return abbreviations.asStringSet();
  }

  /**
   * Detects sentences in given input {@link CharSequence}..
   *
   * @param s  The {@link CharSequence}. to be processed.
   *
   * @return   A string array containing individual sentences as elements.
   */
  @Override
  public String[] sentDetect(CharSequence s) {
    Span[] spans = sentPosDetect(s);
    String[] sentences;
    if (spans.length != 0) {
      sentences = new String[spans.length];
      for (int si = 0; si < spans.length; si++) {
        sentences[si] = spans[si].getCoveredText(s).toString();
      }
    }
    else {
      sentences = new String[] {};
    }
    return sentences;
  }

  private int getFirstWS(CharSequence s, int pos) {
    while (pos < s.length() && !StringUtil.isWhitespace(s.charAt(pos)))
      pos++;
    return pos;
  }

  private int getFirstNonWS(CharSequence s, int pos) {
    while (pos < s.length() && StringUtil.isWhitespace(s.charAt(pos)))
      pos++;
    return pos;
  }

  /**
   * Detects the position of the first words of sentences in a {@link CharSequence}.
   *
   * @param s  The {@link CharSequence} to be processed.
   * @return   An {@link Span span array} containing the positions of the end index of
   *           every sentence.
   *
   */
  @Override
  public Span[] sentPosDetect(CharSequence s) {
    sentProbs.clear();
    List enders = scanner.getPositions(s);
    List positions = new ArrayList<>(enders.size());

    for (int i = 0, end = enders.size(), index = 0; i < end; i++) {
      int cint = enders.get(i);
      // skip over the leading parts of non-token final delimiters
      int fws = getFirstWS(s,cint + 1);
      if (i + 1 < end && enders.get(i + 1) < fws) {
        continue;
      }
      if (positions.size() > 0 && cint < positions.get(positions.size() - 1)) continue;

      double[] probs = model.eval(cgen.getContext(s, cint));
      String bestOutcome = model.getBestOutcome(probs);

      if (bestOutcome.equals(SPLIT) && isAcceptableBreak(s, index, cint)) {
        if (index != cint) {
          if (useTokenEnd) {
            positions.add(getFirstNonWS(s, getFirstWS(s,cint + 1)));
          }
          else {
            positions.add(getFirstNonWS(s, cint + 1));
          }
          sentProbs.add(probs[model.getIndex(bestOutcome)]);
        }

        index = cint + 1;
      }
    }

    int[] starts = ArrayMath.toIntArray(positions);

    // string does not contain sentence end positions
    if (starts.length == 0) {

      // remove leading and trailing whitespace
      int start = 0;
      int end = s.length();

      while (start < s.length() && StringUtil.isWhitespace(s.charAt(start)))
        start++;

      while (end > 0 && StringUtil.isWhitespace(s.charAt(end - 1)))
        end--;

      if (end - start > 0) {
        sentProbs.add(1d);
        return new Span[] {new Span(start, end)};
      }
      else
        return new Span[0];
    }

    // Convert the sentence end indexes to spans

    boolean leftover = starts[starts.length - 1] != s.length();
    Span[] spans = new Span[leftover ? starts.length + 1 : starts.length];

    for (int si = 0; si < starts.length; si++) {
      int start;

      if (si == 0) {
        start = 0;
      }
      else {
        start = starts[si - 1];
      }

      // A span might contain only white spaces, in this case the length of
      // the span will be zero after trimming and should be ignored.
      Span span = new Span(start, starts[si]).trim(s);
      if (span.length() > 0) {
        spans[si] = span;
      }
      else {
        sentProbs.remove(si);
      }
    }

    if (leftover) {
      Span span = new Span(starts[starts.length - 1], s.length()).trim(s);
      if (span.length() > 0) {
        spans[spans.length - 1] = span;
        sentProbs.add(1d);
      }
    }
    /*
     * set the prob for each span
     */
    for (int i = 0; i < spans.length; i++) {
      double prob = sentProbs.get(i);
      spans[i] = new Span(spans[i], prob);

    }

    return spans;
  }

  /**
   * Returns the probabilities associated with the most recent
   * calls to {@link SentenceDetectorME#sentDetect(CharSequence)}.
   *
   * @return The probability for each sentence returned for the most recent
   *     call to {@link SentenceDetectorME#sentDetect(CharSequence)}.
   *     If not applicable, an empty array is returned.
   */
  public double[] getSentenceProbabilities() {
    return ArrayMath.toDoubleArray(sentProbs);
  }

  /**
   * Allows subclasses to check an overzealous (read: poorly
   * trained) model from flagging obvious non-breaks as breaks based
   * on some boolean determination of a break's acceptability.
   *
   * 
Note: The implementation always returns {@code true} if no
   * abbreviation dictionary is available for the underlying model.
   *
   * @param s the {@link CharSequence} in which the break occurred.
   * @param fromIndex the start of the segment currently being evaluated.
   * @param candidateIndex the index of the candidate sentence ending.
   * @return {@code true} if the break is acceptable, {@code false} otherwise.
   */
  protected boolean isAcceptableBreak(CharSequence s, int fromIndex, int candidateIndex) {
    if (abbDict == null)
      return true;

    for (StringList abb : abbDict) {
      String token = abb.getToken(0);
      int tokenLength = token.length();
      int tokenPosition = s.toString().indexOf(token, fromIndex);
      if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex)
        continue;

      return false;
    }
    return true;
  }

  /**
   * Starts a training of a {@link SentenceModel} with the given parameters.
   *
   * @param languageCode The ISO language code to train the model. Must not be {@code null}.
   * @param samples The {@link ObjectStream} of {@link SentenceSample} used as input for training.
   * @param sdFactory The {@link SentenceDetectorFactory} for creating related objects as defined
   *                  via {@code mlParams}.
   * @param mlParams The {@link TrainingParameters} for the context of the training process.
   *
   * @return A valid, trained {@link SentenceModel} instance.
   * @throws IOException Thrown if IO errors occurred.
   */
  public static SentenceModel train(String languageCode,
      ObjectStream samples, SentenceDetectorFactory sdFactory,
      TrainingParameters mlParams) throws IOException {

    Map manifestInfoEntries = new HashMap<>();

    // TODO: Fix the EventStream to throw exceptions when training goes wrong
    ObjectStream eventStream = new SDEventStream(samples,
        sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner());

    EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);
    MaxentModel sentModel = trainer.train(eventStream);

    return new SentenceModel(languageCode, sentModel, manifestInfoEntries, sdFactory);
  }

}