edu.stanford.nlp.coref.statistical.StatisticalCorefAlgorithm
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.coref.statistical;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import edu.stanford.nlp.coref.CorefAlgorithm;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.RuntimeInterruptedException;
/**
* Does best-first coreference resolution by linking each mention to its highest scoring candidate
* antecedent if that score is above a threshold. The model is described in
*
* Kevin Clark and Christopher D. Manning. 2015.
* Entity-Centric Coreference Resolution with Model Stacking.
* In Association for Computational Linguistics.
*
* See {@link StatisticalCorefTrainer} for training a new model.
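*
* A minimal usage sketch (hypothetical property values; see {@link CorefProperties}
* and {@link StatisticalCorefProperties} for the actual keys and defaults):
* <pre>
* Properties props = new Properties();
* props.setProperty("coref.algorithm", "statistical");
* Dictionaries dictionaries = new Dictionaries(props);
* CorefAlgorithm coref = new StatisticalCorefAlgorithm(props, dictionaries);
* coref.runCoref(document); // document: a coref-annotated Document
* </pre>
*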
* @author Kevin Clark
*/
public class StatisticalCorefAlgorithm implements CorefAlgorithm {
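// Score threshold for merging a mention pair, keyed on whether each mention
// in the pair is pronominal (see makeThresholds below).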
private final Map<Pair<Boolean, Boolean>, Double> thresholds;
private final FeatureExtractor extractor;
private final PairwiseModel classifier;
private final int maxMentionDistance;
private final int maxMentionDistanceWithStringMatch;
public StatisticalCorefAlgorithm(Properties props, Dictionaries dictionaries) {
this(props, dictionaries,
StatisticalCorefProperties.wordCountsPath(props),
StatisticalCorefProperties.rankingModelPath(props),
CorefProperties.maxMentionDistance(props),
CorefProperties.maxMentionDistanceWithStringMatch(props),
StatisticalCorefProperties.pairwiseScoreThresholds(props));
}
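// Convenience constructor: applies a single score threshold to all four
// (pronominal?, pronominal?) mention-pair combinations.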
public StatisticalCorefAlgorithm(Properties props, Dictionaries dictionaries, String wordCountsFile,
String modelFile, int maxMentionDistance, int maxMentionDistanceWithStringMatch,
double threshold) {
this(props, dictionaries, wordCountsFile, modelFile, maxMentionDistance,
maxMentionDistanceWithStringMatch, new double[] {threshold, threshold, threshold,
threshold});
}
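// Full constructor: loads the feature extractor's word counts and the trained
// pairwise model from the given file paths.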
public StatisticalCorefAlgorithm(Properties props, Dictionaries dictionaries, String wordCountsFile,
String modelPath, int maxMentionDistance, int maxMentionDistanceWithStringMatch,
double[] thresholds) {
extractor = new FeatureExtractor(props, dictionaries, null, wordCountsFile);
classifier = PairwiseModel.newBuilder("classifier",
MetaFeatureExtractor.newBuilder().build()).modelPath(modelPath).build();
this.maxMentionDistance = maxMentionDistance;
this.maxMentionDistanceWithStringMatch = maxMentionDistanceWithStringMatch;
this.thresholds = makeThresholds(thresholds);
}
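// Expands the four-element threshold array into a map keyed on
// (first mention is pronominal?, second mention is pronominal?).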
private static Map<Pair<Boolean, Boolean>, Double> makeThresholds(double[] thresholds) {
Map<Pair<Boolean, Boolean>, Double> thresholdsMap = new HashMap<>();
thresholdsMap.put(new Pair<>(true, true), thresholds[0]);
thresholdsMap.put(new Pair<>(true, false), thresholds[1]);
thresholdsMap.put(new Pair<>(false, true), thresholds[2]);
thresholdsMap.put(new Pair<>(false, false), thresholds[3]);
return thresholdsMap;
}
@Override
public void runCoref(Document document) {
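// Compresses the sparse feature vectors extracted for each mention pair.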
Compressor<String> compressor = new Compressor<>();
if (Thread.interrupted()) { // Allow interrupting
throw new RuntimeInterruptedException();
}
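// Build candidate mention pairs (antecedent id, anaphor id), keeping only
// pairs that pass the distance-based heuristic filter.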
Map<Pair<Integer, Integer>, Boolean> pairs = new HashMap<>();
for (Map.Entry<Integer, List<Integer>> e : CorefUtils.heuristicFilter(
CorefUtils.getSortedMentions(document),
maxMentionDistance, maxMentionDistanceWithStringMatch).entrySet()) {
for (int m1 : e.getValue()) {
pairs.put(new Pair<>(m1, e.getKey()), true);
}
}
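// Extract features for all candidate pairs, then score each pair with the
// trained pairwise model.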
DocumentExamples examples = extractor.extract(0, document, pairs, compressor);
Counter<Pair<Integer, Integer>> pairwiseScores = new ClassicCounter<>();
for (Example mentionPair : examples.examples) {
if (Thread.interrupted()) { // Allow interrupting
throw new RuntimeInterruptedException();
}
pairwiseScores.incrementCount(new Pair<>(mentionPair.mentionId1, mentionPair.mentionId2),
classifier.predict(mentionPair, examples.mentionFeatures, compressor));
}
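// Sort pairs by descending score so each anaphor is considered with its
// best-scoring candidate antecedent first.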
List<Pair<Integer, Integer>> mentionPairs = new ArrayList<>(pairwiseScores.keySet());
Collections.sort(mentionPairs, (p1, p2) -> {
double diff = pairwiseScores.getCount(p2) - pairwiseScores.getCount(p1);
return diff == 0 ? 0 : (int) Math.signum(diff);
});
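// Best-first linking: only the highest-scoring pair for each anaphor is
// considered; clusters are merged when the score clears the threshold for
// this pair's mention-type combination.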
Set<Integer> seenAnaphors = new HashSet<>();
for (Pair<Integer, Integer> pair : mentionPairs) {
if (seenAnaphors.contains(pair.second)) {
continue;
}
if (Thread.interrupted()) { // Allow interrupting
throw new RuntimeInterruptedException();
}
seenAnaphors.add(pair.second);
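// The merge threshold depends on whether each mention in the pair is pronominal.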
MentionType mt1 = document.predictedMentionsByID.get(pair.first).mentionType;
MentionType mt2 = document.predictedMentionsByID.get(pair.second).mentionType;
if (pairwiseScores.getCount(pair) > thresholds.get(new Pair<>(mt1 == MentionType.PRONOMINAL,
mt2 == MentionType.PRONOMINAL))) {
CorefUtils.mergeCoreferenceClusters(pair, document);
}
}
}
}