opennlp.tools.languagemodel.NGramLanguageModel

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package opennlp.tools.languagemodel;

import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.ngram.NGramModel;
import opennlp.tools.ngram.NGramUtils;
import opennlp.tools.util.StringList;

/**
 * A {@link opennlp.tools.languagemodel.LanguageModel} based on a {@link opennlp.tools.ngram.NGramModel}
 * that uses Laplace smoothing to estimate the probabilities of the n-grams.
 * See also {@link NGramUtils#calculateLaplaceSmoothingProbability(
 * opennlp.tools.util.StringList, Iterable, int, Double)}.
 */
public class NGramLanguageModel extends NGramModel implements LanguageModel {

  private static final int DEFAULT_N = 3;
  private static final double DEFAULT_K = 1d;

  private final int n;
  private final double k;

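  /**
   * Creates a trigram language model (n = 3) with the default Laplace
   * smoothing constant k = 1.
   */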
  public NGramLanguageModel() {
    this(DEFAULT_N, DEFAULT_K);
  }

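  /**
   * Creates a language model over n-grams of the given size, with the default
   * Laplace smoothing constant k = 1.
   */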
  public NGramLanguageModel(int n) {
    this(n, DEFAULT_K);
  }

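  /**
   * Creates a trigram language model (n = 3) with the given Laplace smoothing
   * constant.
   */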
  public NGramLanguageModel(double k) {
    this(DEFAULT_N, k);
  }

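  /**
   * Creates a language model over n-grams of the given size, using k as the
   * additive (Laplace) smoothing constant.
   */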
  public NGramLanguageModel(int n, double k) {
    this.n = n;
    this.k = k;
  }

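  /**
   * Creates a trigram language model (n = 3, k = 1) whose n-gram counts are
   * read from the given stream by the inherited {@link NGramModel} constructor.
   */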
  public NGramLanguageModel(InputStream in) throws IOException {
    this(in, DEFAULT_N, DEFAULT_K);
  }

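  /**
   * As {@link #NGramLanguageModel(InputStream)}, but with the given Laplace
   * smoothing constant.
   */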
  public NGramLanguageModel(InputStream in, double k) throws IOException {
    this(in, DEFAULT_N, k);
  }

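  /**
   * As {@link #NGramLanguageModel(InputStream)}, but with the given n-gram size.
   */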
  public NGramLanguageModel(InputStream in, int n) throws IOException {
    this(in, n, DEFAULT_K);
  }

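  /**
   * Creates a language model whose n-gram counts are read from the given
   * stream, with the given n-gram size and Laplace smoothing constant.
   */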
  public NGramLanguageModel(InputStream in, int n, double k) throws IOException {
    super(in);
    this.n = n;
    this.k = k;
  }

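  /**
   * Calculates the probability of the given sample as the product of the
   * probabilities of its n-grams. Models holding more than 1,000,000 n-grams
   * are scored with stupid backoff, smaller models with Laplace smoothing.
   */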
  @Override
  public double calculateProbability(StringList sample) {
    double probability = 0d;
    if (size() > 0) {
      for (StringList ngram : NGramUtils.getNGrams(sample, n)) {
        StringList nMinusOneToken = NGramUtils.getNMinusOneTokenFirst(ngram);
        if (size() > 1000000) {
          // use stupid backoff
          probability += Math.log(getStupidBackoffProbability(ngram, nMinusOneToken));
        } else {
          // use laplace smoothing
          probability += Math.log(getLaplaceSmoothingProbability(ngram, nMinusOneToken));
        }
      }
      if (Double.isNaN(probability)) {
        probability = 0d;
      } else if (probability != 0) {
        probability = Math.exp(probability);
      }

    }
    return probability;
  }

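  /**
   * Scores every n-gram stored in the model as a continuation of the given
   * tokens and returns the most probable one, or {@code null} if the model
   * is empty.
   */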
  @Override
  public StringList predictNextTokens(StringList tokens) {
    double maxProb = Double.NEGATIVE_INFINITY;
    StringList token = null;

    for (StringList ngram : this) {
      String[] sequence = new String[ngram.size() + tokens.size()];
      for (int i = 0; i < tokens.size(); i++) {
        sequence[i] = tokens.getToken(i);
      }
      for (int i = 0; i < ngram.size(); i++) {
        sequence[i + tokens.size()] = ngram.getToken(i);
      }
      StringList sample = new StringList(sequence);
      double v = calculateProbability(sample);
      if (v > maxProb) {
        maxProb = v;
        token = ngram;
      }
    }

    return token;
  }

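  /**
   * Additive (Laplace) smoothing: (count(ngram) + k) / (count(context) + k * size()),
   * where the context is the n-gram without its last token and size() is the
   * number of n-grams stored in the model.
   */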
  private double getLaplaceSmoothingProbability(StringList ngram, StringList nMinusOneToken) {
    return (getCount(ngram) + k) / (getCount(nMinusOneToken) + k * size());
  }

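  /**
   * Stupid backoff: uses the relative frequency of the n-gram when it has been
   * seen, otherwise recursively backs off to the last n-1 tokens, discounted by
   * a fixed factor of 0.4.
   */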
  private double getStupidBackoffProbability(StringList ngram, StringList nMinusOneToken) {
    int count = getCount(ngram);
    if (nMinusOneToken == null || nMinusOneToken.size() == 0) {
      // no context available: relative frequency of the n-gram among all stored n-grams
      return (double) count / size();
    } else if (count > 0) {
      return ((double) count) / ((double) getCount(nMinusOneToken)); // maximum likelihood probability
    } else {
      StringList nextNgram = NGramUtils.getNMinusOneTokenLast(ngram);
      return 0.4d * getStupidBackoffProbability(nextNgram, NGramUtils.getNMinusOneTokenFirst(nextNgram));
    }
  }

}
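
A minimal usage sketch, not part of the source above: it assumes the add(StringList, int, int) overload inherited from NGramModel for populating the model, and the example class name is purely illustrative.

import opennlp.tools.languagemodel.NGramLanguageModel;
import opennlp.tools.util.StringList;

public class NGramLanguageModelExample {

  public static void main(String[] args) {
    // Trigram model with the default Laplace smoothing constant k = 1.
    NGramLanguageModel lm = new NGramLanguageModel(3);

    // Populate the model with 1- to 3-grams extracted from a few token sequences
    // (add(StringList, minLength, maxLength) is inherited from NGramModel).
    lm.add(new StringList("the", "quick", "brown", "fox"), 1, 3);
    lm.add(new StringList("the", "brown", "dog"), 1, 3);

    // Probability of a token sequence under Laplace smoothing.
    double p = lm.calculateProbability(new StringList("the", "brown", "fox"));

    // Most probable continuation of the given context, or null for an empty model.
    StringList next = lm.predictNextTokens(new StringList("the", "brown"));

    System.out.println("p = " + p + ", next = " + next);
  }
}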