
edu.stanford.nlp.naturalli.OpenIE


Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
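For orientation before the source itself, here is a minimal sketch of how this class is typically used as a pipeline annotator. The class name OpenIEDemo and the example sentence are illustrative, not part of this file; the annotator names and annotation keys (openie, natlog, NaturalLogicAnnotations.RelationTriplesAnnotation) are the ones this class registers, and the models jar must be on the classpath.

import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.naturalli.NaturalLogicAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.Properties;

public class OpenIEDemo {
  public static void main(String[] args) {
    // natlog must run before openie; depparse supplies the dependency tree it consumes.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // Annotate an example document (hypothetical input).
    Annotation doc = new Annotation("Obama was born in Hawaii.");
    pipeline.annotate(doc);

    // Each sentence carries its extracted (subject; relation; object) triples.
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (RelationTriple triple : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
        System.out.println(triple.confidenceGloss() + "\t("
            + triple.subjectGloss() + "; " + triple.relationGloss() + "; " + triple.objectGloss() + ")");
      }
    }
  }
}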

package edu.stanford.nlp.naturalli; 
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.*;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;

/**
 * <p>
 * An OpenIE system based on valid Natural Logic deletions of a sentence.
 * The system is described in:
 * </p>
 *
 * <pre>
 *   "Leveraging Linguistic Structure For Open Domain Information Extraction." Gabor Angeli, Melvin Johnson Premkumar, Christopher Manning. ACL 2015.
 * </pre>
 *
 * <p>
 * The paper can be found at http://nlp.stanford.edu/pubs/2015angeli-openie.pdf.
 * </p>
 *
 * <p>
 * Documentation on the system can be found on the project homepage,
 * or the CoreNLP annotator documentation page.
 * The simplest invocation of the system would be something like:
 * </p>
 *
 * <pre>
 * java -mx1g -cp stanford-openie.jar:stanford-openie-models.jar edu.stanford.nlp.naturalli.OpenIE
 * </pre>
 *
 * <p>
 * Note that this class serves both as an entry point for the OpenIE system and as a CoreNLP annotator
 * which can be plugged into the CoreNLP pipeline (or any other annotation pipeline).
 * </p>
 *
 * @see OpenIE#annotate(Annotation)
 * @see OpenIE#main(String[])
 *
 * @author Gabor Angeli
 */
//
// TODO(gabor): handle things like "One example of chemical energy is that found in the food that we eat ."
//
@SuppressWarnings({"FieldCanBeLocal", "UnusedDeclaration"})
public class OpenIE implements Annotator {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(OpenIE.class);

  private enum OutputFormat { REVERB, OLLIE, DEFAULT, QA_SRL }

  /**
   * A pattern for rewriting "NN_1 is a JJ NN_2" --> "NN_1 is JJ"
   */
  private static SemgrexPattern adjectivePattern = SemgrexPattern.compile("{}=obj >nsubj {}=subj >cop {}=be >det {word:/an?/} >amod {}=adj ?>/prep_.*/=prep {}=pobj");

  //
  // Static Options (for running standalone)
  //

  @ArgumentParser.Option(name="format", gloss="The format to output the triples in.")
  private static OutputFormat FORMAT = OutputFormat.DEFAULT;

  @ArgumentParser.Option(name="filelist", gloss="The files to annotate, as a list of files one per line.")
  private static File FILELIST = null;

  @ArgumentParser.Option(name="output", gloss="The file to write the extractions to.")
  private static PrintStream OUTPUT = System.out;

  //
  // Annotator Options (for running in the pipeline)
  //

  @ArgumentParser.Option(name="splitter.model", gloss="The location of the clause splitting model.")
  private String splitterModel = DefaultPaths.DEFAULT_OPENIE_CLAUSE_SEARCHER;

  @ArgumentParser.Option(name="splitter.nomodel", gloss="If true, don't load a clause splitter model. This is primarily useful for training.")
  private boolean noModel = false;

  @ArgumentParser.Option(name="splitter.threshold", gloss="The minimum threshold for accepting a clause.")
  private double splitterThreshold = 0.1;

  @ArgumentParser.Option(name="splitter.disable", gloss="If true, don't run the sentence splitter")
  private boolean splitterDisable = false;

  @ArgumentParser.Option(name="max_entailments_per_clause", gloss="The maximum number of entailments allowed per sentence of input.")
  private int entailmentsPerSentence = 1000;

  @ArgumentParser.Option(name="ignore_affinity", gloss="If true, don't use the affinity models for dobj and pp attachment.")
  private boolean ignoreAffinity = false;

  @ArgumentParser.Option(name="affinity_models", gloss="The directory (or classpath directory) containing the affinity models for pp/obj attachments.")
  private String affinityModels = DefaultPaths.DEFAULT_NATURALLI_AFFINITIES;

  @ArgumentParser.Option(name="affinity_probability_cap", gloss="The affinity to consider 1.0")
  private double affinityProbabilityCap = 1.0 / 3.0;

  @ArgumentParser.Option(name="triple.strict", gloss="If true, only generate triples if the entire fragment has been consumed.")
  private boolean consumeAll = true;

  @ArgumentParser.Option(name="triple.all_nominals", gloss="If true, generate not only named entity nominal relations.")
  private boolean allNominals = false;

  @ArgumentParser.Option(name="resolve_coref", gloss="If true, resolve pronouns to their canonical mention")
  private boolean resolveCoref = false;

  @ArgumentParser.Option(name="strip_entailments", gloss="If true, don't keep the entailed sentences annotations around.")
  private boolean stripEntailments = false;

  /**
   * The natural logic weights loaded from the models file.
   * This is primarily the prepositional attachment statistics.
   */
  private final NaturalLogicWeights weights;

  /**
   * The clause splitter model, if one is to be used.
   * This component splits a sentence into a set of entailed clauses, but does not yet
   * maximally shorten them.
   * This is the implementation of stage 1 of the OpenIE pipeline.
   */
  public final Optional<ClauseSplitter> clauseSplitter;

  /**
   * The forward entailer model, running a search from clauses to maximally shortened clauses.
   * This is the implementation of stage 2 of the OpenIE pipeline.
   */
  public final ForwardEntailer forwardEntailer;

  /**
   * The relation triple segmenter, which converts a maximally shortened clause into an OpenIE
   * extraction triple.
   * This is the implementation of stage 3 of the OpenIE pipeline.
   */
  public RelationTripleSegmenter segmenter;

  /** Create a new OpenIE system, with default properties */
  @SuppressWarnings("UnusedDeclaration")
  public OpenIE() {
    this(new Properties());
  }

  /**
   * Create a new OpenIE system, based on the given properties.
   * @param props The properties to parametrize the system with.
   */
  public OpenIE(Properties props) {
    // Fill the properties
    ArgumentParser.fillOptions(this, props);
    Properties withoutOpenIEPrefix = new Properties();
    Enumeration<Object> keys = props.keys();
    while (keys.hasMoreElements()) {
      String key = keys.nextElement().toString();
      withoutOpenIEPrefix.setProperty(key.replace("openie.", ""), props.getProperty(key));
    }
    ArgumentParser.fillOptions(this, withoutOpenIEPrefix);

    // Create the clause splitter
    try {
      if (splitterDisable) {
        clauseSplitter = Optional.empty();
      } else {
        if (noModel) {
          log.info("Not loading a splitter model");
          clauseSplitter = Optional.of(ClauseSplitterSearchProblem::new);
        } else {
          clauseSplitter = Optional.of(ClauseSplitter.load(splitterModel));
        }
      }
    } catch (IOException e) {
      //throw new RuntimeIOException("Could not load clause splitter model at " + splitterModel + ": " + e.getClass() + ": " + e.getMessage());
      throw new RuntimeIOException("Could not load clause splitter model at " + splitterModel, e);
    }

    // Create the forward entailer
    try {
      this.weights = ignoreAffinity ? new NaturalLogicWeights(affinityProbabilityCap) : new NaturalLogicWeights(affinityModels, affinityProbabilityCap);
    } catch (IOException e) {
      throw new RuntimeIOException("Could not load affinity model at " + affinityModels + ": " + e.getMessage());
    }
    forwardEntailer = new ForwardEntailer(entailmentsPerSentence, weights);

    // Create the relation segmenter
    segmenter = new RelationTripleSegmenter(allNominals);
  }

  /**
   * Find the clauses in a sentence, where the sentence is expressed as a dependency tree.
   *
   * @param tree The dependency tree representation of the sentence.
   * @param assumedTruth The assumed truth of the sentence. This is almost always true, unless you are
   *                     doing some more nuanced reasoning.
   *
   * @return A set of clauses extracted from the sentence. This includes the original sentence.
   */
  @SuppressWarnings("unchecked")
  public List<SentenceFragment> clausesInSentence(SemanticGraph tree, boolean assumedTruth) {
    if (clauseSplitter.isPresent()) {
      return clauseSplitter.get().apply(tree, assumedTruth).topClauses(splitterThreshold, 32);
    } else {
      return Collections.emptyList();
    }
  }

  /**
   * Find the clauses in a sentence.
   * This runs the clause splitting component of the OpenIE system only.
   *
   * @see OpenIE#clausesInSentence(SemanticGraph, boolean)
   *
   * @param sentence The raw sentence to extract clauses from.
   *
   * @return A set of clauses extracted from the sentence. This includes the original sentence.
   */
  public List<SentenceFragment> clausesInSentence(CoreMap sentence) {
    return clausesInSentence(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class), true);
  }

  /**
   * Returns all of the entailed shortened clauses (as per natural logic) from the given clause.
   * This runs the forward entailment component of the OpenIE system only.
   * It is usually chained together with the clause splitting component: {@link OpenIE#clausesInSentence(CoreMap)}.
   *
   * @param clause The premise clause, as a sentence fragment in itself.
   *
   * @return A list of entailed clauses.
   */
  @SuppressWarnings("unchecked")
  public List<SentenceFragment> entailmentsFromClause(SentenceFragment clause) {
    if (clause.parseTree.isEmpty()) {
      return Collections.emptyList();
    } else {
      // Get the forward entailments
      List<SentenceFragment> list = new ArrayList<>();
      if (entailmentsPerSentence > 0) {
        list.addAll(forwardEntailer.apply(clause.parseTree, true).search()
            .stream().map(x -> x.changeScore(x.score * clause.score)).collect(Collectors.toList()));
      }
      list.add(clause);

      // A special case for adjective entailments
      List<SentenceFragment> adjFragments = new ArrayList<>();
      SemgrexMatcher matcher = adjectivePattern.matcher(clause.parseTree);
      OUTER: while (matcher.find()) {
        // (get nodes)
        IndexedWord subj = matcher.getNode("subj");
        IndexedWord be = matcher.getNode("be");
        IndexedWord adj = matcher.getNode("adj");
        IndexedWord obj = matcher.getNode("obj");
        IndexedWord pobj = matcher.getNode("pobj");
        String prep = matcher.getRelnString("prep");
        // (if the adjective, or any earlier adjective, is privative, then all bets are off)
        for (SemanticGraphEdge edge : clause.parseTree.outgoingEdgeIterable(obj)) {
          if ("amod".equals(edge.getRelation().toString()) && edge.getDependent().index() <= adj.index() &&
              Util.PRIVATIVE_ADJECTIVES.contains(edge.getDependent().word().toLowerCase())) {
            continue OUTER;
          }
        }
        // (create the core tree)
        SemanticGraph tree = new SemanticGraph();
        tree.addRoot(adj);
        tree.addVertex(subj);
        tree.addVertex(be);
        tree.addEdge(adj, be, GrammaticalRelation.valueOf(Language.English, "cop"), Double.NEGATIVE_INFINITY, false);
        tree.addEdge(adj, subj, GrammaticalRelation.valueOf(Language.English, "nsubj"), Double.NEGATIVE_INFINITY, false);
        // (add pp attachment, if it existed)
        if (pobj != null) {
          assert prep != null;
          tree.addEdge(adj, pobj, GrammaticalRelation.valueOf(Language.English, prep), Double.NEGATIVE_INFINITY, false);
        }
        // (check for monotonicity)
        if (adj.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards() &&
            be.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards()) {
          // (add tree)
          adjFragments.add(new SentenceFragment(tree, clause.assumedTruth, false));
        }
      }
      list.addAll(adjFragments);
      return list;
    }
  }

  /**
   * Returns all the maximally shortened entailed fragments (as per natural logic)
   * from the given collection of clauses.
   *
   * @param clauses The clauses to shorten further.
   *
   * @return A set of sentence fragments corresponding to the maximally shortened entailed clauses.
   */
  public Set<SentenceFragment> entailmentsFromClauses(Collection<SentenceFragment> clauses) {
    Set<SentenceFragment> entailments = new HashSet<>();
    for (SentenceFragment clause : clauses) {
      entailments.addAll(entailmentsFromClause(clause));
    }
    return entailments;
  }

  /**
   * Returns the possible relation triple in this sentence fragment.
   *
   * @see OpenIE#relationInFragment(SentenceFragment, CoreMap)
   */
  public Optional<RelationTriple> relationInFragment(SentenceFragment fragment) {
    return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
  }

  /**
   * Returns the possible relation triple in this set of sentence fragments.
   *
   * @see OpenIE#relationsInFragments(Collection, CoreMap)
   */
  public List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments) {
    return fragments.stream().map(this::relationInFragment).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
  }

  /**
   * Returns the possible relation triple in this sentence fragment.
   *
   * @param fragment The sentence fragment to try to extract relations from.
   * @param sentence The containing sentence for the fragment.
   *
   * @return A relation triple if we could find one; otherwise, {@link Optional#empty()}.
   */
  private Optional<RelationTriple> relationInFragment(SentenceFragment fragment, CoreMap sentence) {
    return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
  }

  /**
   * Returns a list of OpenIE relations from the given set of sentence fragments.
   *
   * @param fragments The sentence fragments to extract relations from.
   * @param sentence The containing sentence that these fragments were extracted from.
   *
   * @return A list of OpenIE triples, corresponding to all the triples that could be extracted from the given fragments.
   */
  private List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments, CoreMap sentence) {
    return fragments.stream().map(x -> relationInFragment(x, sentence)).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
  }

  /**
   * Extract the relations in this clause.
   *
   * @see OpenIE#entailmentsFromClause(SentenceFragment)
   * @see OpenIE#relationsInFragments(Collection)
   */
  public List<RelationTriple> relationsInClause(SentenceFragment clause) {
    return relationsInFragments(entailmentsFromClause(clause));
  }

  /**
   * Extract the relations in this sentence.
   *
   * @see OpenIE#clausesInSentence(CoreMap)
   * @see OpenIE#entailmentsFromClause(SentenceFragment)
   * @see OpenIE#relationsInFragments(Collection)
   */
  public List<RelationTriple> relationsInSentence(CoreMap sentence) {
    return relationsInFragments(entailmentsFromClauses(clausesInSentence(sentence)));
  }

  /**
   * Create a copy of the passed parse tree, canonicalizing pronominal nodes with their canonical mention.
   * Canonical mentions are tied together with the compound dependency arc; otherwise, the structure of
   * the tree remains unchanged.
   *
   * @param parse The original dependency parse of the sentence.
   * @param canonicalMentionMap The map from tokens to their canonical mentions.
   *
   * @return A copy of the passed parse tree, with pronouns replaced by their canonical mention.
   */
  private static SemanticGraph canonicalizeCoref(SemanticGraph parse, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
    parse = new SemanticGraph(parse);
    for (IndexedWord node : new HashSet<>(parse.vertexSet())) {  // copy the vertex set to prevent ConcurrentModificationExceptions
      if (node.tag() != null && node.tag().startsWith("PRP")) {
        List<CoreLabel> canonicalMention = canonicalMentionMap.get(node.backingLabel());
        if (canonicalMention != null) {
          // Case: this node is a pronoun with a valid antecedent.
          // 1. Save the attaching edges
          List<SemanticGraphEdge> incomingEdges = parse.incomingEdgeList(node);
          List<SemanticGraphEdge> outgoingEdges = parse.outgoingEdgeList(node);

          // 2. Remove the node
          parse.removeVertex(node);

          // 3. Add the new head word
          IndexedWord headWord = new IndexedWord(canonicalMention.get(canonicalMention.size() - 1));
          headWord.setPseudoPosition(node.pseudoPosition());
          parse.addVertex(headWord);
          for (SemanticGraphEdge edge : incomingEdges) {
            parse.addEdge(edge.getGovernor(), headWord, edge.getRelation(), edge.getWeight(), edge.isExtra());
          }
          for (SemanticGraphEdge edge : outgoingEdges) {
            parse.addEdge(headWord, edge.getDependent(), edge.getRelation(), edge.getWeight(), edge.isExtra());
          }

          // 4. Add other words
          double pseudoPosition = headWord.pseudoPosition() - 1e-3;
          for (int i = canonicalMention.size() - 2; i >= 0; --i) {
            // Create the node
            IndexedWord dependent = new IndexedWord(canonicalMention.get(i));
            // Set its pseudo position appropriately
            dependent.setPseudoPosition(pseudoPosition);
            pseudoPosition -= 1e-3;
            // Add the node to the graph
            parse.addVertex(dependent);
            parse.addEdge(headWord, dependent, UniversalEnglishGrammaticalRelations.COMPOUND_MODIFIER, 1.0, false);
          }
        }
      }
    }
    return parse;
  }

  /**
   * <p>
   * Annotate a single sentence.
   * </p>
   *
   * <p>
   * This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
   * and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
   * </p>
   */
  @SuppressWarnings("unchecked")
  public void annotateSentence(CoreMap sentence, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens.size() < 2) {
      // Short sentence. Skip annotating it.
      sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, Collections.emptyList());
      if (!stripEntailments) {
        sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, Collections.emptySet());
      }
    } else {
      // Get the dependency tree
      SemanticGraph parse = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
      if (parse == null) {
        parse = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      }
      if (parse == null) {
        throw new IllegalStateException("Cannot run OpenIE without a parse tree!");
      }
      // Clean the tree
      parse = new SemanticGraph(parse);
      Util.cleanTree(parse);

      // Resolve Coreference
      SemanticGraph canonicalizedParse = parse;
      if (resolveCoref && !canonicalMentionMap.isEmpty()) {
        canonicalizedParse = canonicalizeCoref(parse, canonicalMentionMap);
      }

      // Run OpenIE
      // (clauses)
      List<SentenceFragment> clauses = clausesInSentence(canonicalizedParse, true);  // note: uses coref-canonicalized parse
      // (entailment)
      Set<SentenceFragment> fragments = entailmentsFromClauses(clauses);
      // (segment)
      List<RelationTriple> extractions = segmenter.extract(parse, tokens);  // note: uses non-coref-canonicalized parse!
      extractions.addAll(relationsInFragments(fragments, sentence));

      // Set the annotations
      sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, fragments);
      sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class,
          new ArrayList<>(new HashSet<>(extractions)));  // uniq the extractions
      if (stripEntailments) {
        sentence.remove(NaturalLogicAnnotations.EntailedSentencesAnnotation.class);
      }
    }
  }

  /**
   * {@inheritDoc}
   *
   * <p>
   * This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
   * and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
   * </p>
   */
  @Override
  public void annotate(Annotation annotation) {
    // Accumulate Coref data
    Map<Integer, CorefChain> corefChains;
    Map<CoreLabel, List<CoreLabel>> canonicalMentionMap = new IdentityHashMap<>();
    if (resolveCoref && (corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class)) != null) {
      for (CorefChain chain : corefChains.values()) {
        // Make sure it's a real chain and not a singleton
        if (chain.getMentionsInTextualOrder().size() < 2) {
          continue;
        }
        // Metadata
        List<CoreLabel> canonicalMention = null;
        double canonicalMentionScore = Double.NEGATIVE_INFINITY;
        Set<CoreLabel> tokensToMark = new HashSet<>();
        List<CorefChain.CorefMention> mentions = chain.getMentionsInTextualOrder();

        // Iterate over mentions
        for (int i = 0; i < mentions.size(); ++i) {
          // Get some data on this mention
          Pair<List<CoreLabel>, Double> info = grokCorefMention(annotation, mentions.get(i));
          // Figure out if it should be the canonical mention
          double score = info.second + ((double) i) / ((double) mentions.size()) + (mentions.get(i) == chain.getRepresentativeMention() ? 1.0 : 0.0);
          if (canonicalMention == null || score > canonicalMentionScore) {
            canonicalMention = info.first;
            canonicalMentionScore = score;
          }
          // Register the participating tokens
          if (info.first.size() == 1) {  // Only mark single-node tokens!
            tokensToMark.addAll(info.first);
          }
        }

        // Mark the tokens as coreferent
        assert canonicalMention != null;
        for (CoreLabel token : tokensToMark) {
          List<CoreLabel> existingMention = canonicalMentionMap.get(token);
          if (existingMention == null || existingMention.isEmpty() ||
              "O".equals(existingMention.get(0).ner())) {  // Don't clobber existing good mentions
            canonicalMentionMap.put(token, canonicalMention);
          }
        }
      }
    }

    // Annotate each sentence
    annotation.get(CoreAnnotations.SentencesAnnotation.class).forEach(x -> this.annotateSentence(x, canonicalMentionMap));
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        NaturalLogicAnnotations.RelationTriplesAnnotation.class,
        NaturalLogicAnnotations.EntailedSentencesAnnotation.class
    )));
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.SentenceIndexAnnotation.class,
        CoreAnnotations.PartOfSpeechAnnotation.class,
        CoreAnnotations.LemmaAnnotation.class,
        NaturalLogicAnnotations.PolarityAnnotation.class,
        SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class
        //CoreAnnotations.OriginalTextAnnotation.class
    ));
    if (resolveCoref) {
      requirements.add(edu.stanford.nlp.coref.CorefCoreAnnotations.CorefChainAnnotation.class);
    }
    return Collections.unmodifiableSet(requirements);
  }

  /**
   * A utility to get useful information out of a CorefMention. In particular, it returns the CoreLabels which are
   * associated with this mention, and it returns a score for how much we think this mention should be the canonical
   * mention.
   *
   * @param doc The document this mention is referenced into.
   * @param mention The mention itself.
   * @return A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.
   */
  private static Pair<List<CoreLabel>, Double> grokCorefMention(Annotation doc, CorefChain.CorefMention mention) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.SentencesAnnotation.class).get(mention.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> mentionAsTokens = tokens.subList(mention.startIndex - 1, mention.endIndex - 1);
    // Try to assess this mention's NER type
    Counter<String> nerVotes = new ClassicCounter<>();
    mentionAsTokens.stream().filter(token -> token.ner() != null && !"O".equals(token.ner())).forEach(token -> nerVotes.incrementCount(token.ner()));
    String ner = Counters.argmax(nerVotes, (o1, o2) -> o1 == null ? 0 : o1.compareTo(o2));
    double nerCount = nerVotes.getCount(ner);
    double nerScore = nerCount * nerCount / ((double) mentionAsTokens.size());
    // Return
    return Pair.makePair(mentionAsTokens, nerScore);
  }

  /**
   * Prints an OpenIE triple to a String, according to the output format requested in
   * the annotator.
   *
   * @param extraction The triple to write.
   * @param docid The document ID (for the ReVerb format)
   * @param sentence The sentence the triple was extracted from (for the ReVerb format)
   *
   * @return A String representation of the triple.
   */
  public static String tripleToString(RelationTriple extraction, String docid, CoreMap sentence) {
    switch (FORMAT) {
      case REVERB:
        return extraction.toReverbString(docid, sentence);
      case OLLIE:
        return extraction.confidenceGloss() + ": (" + extraction.subjectGloss() + "; " + extraction.relationGloss() + "; " + extraction.objectGloss() + ")";
      case DEFAULT:
        return extraction.toString();
      case QA_SRL:
        return extraction.toQaSrlString(sentence);
      default:
        throw new IllegalStateException("Format is not implemented: " + FORMAT);
    }
  }

  /**
   * Process a single file or line of standard in.
   * @param pipeline The annotation pipeline to run the lines of the input through.
   * @param docid The docid of the document we are extracting.
   * @param document the document to annotate.
   */
  @SuppressWarnings("SynchronizeOnNonFinalField")
  private static void processDocument(AnnotationPipeline pipeline, String docid, String document) {
    // Error checks
    if (document.trim().equals("")) {
      return;
    }

    // Annotate the document
    Annotation ann = new Annotation(document);
    pipeline.annotate(ann);

    // Get the extractions
    boolean empty = true;
    synchronized (OUTPUT) {
      for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (RelationTriple extraction : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
          // Print the extractions
          OUTPUT.println(tripleToString(extraction, docid, sentence));
          empty = false;
        }
      }
    }
    if (empty) {
      log.info("No extractions in: " + ("stdin".equals(docid) ? document : docid));
    }
  }

  /**
   * An entry method for annotating standard in with OpenIE extractions.
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    // Parse the arguments
    Properties props = StringUtils.argsToProperties(args, new HashMap<String, Integer>() {{
      put("openie.resolve_coref", 0);
      put("resolve_coref", 0);
      put("openie.splitter.nomodel", 0);
      put("splitter.nomodel", 0);
      put("openie.splitter.disable", 0);
      put("splitter.disable", 0);
      put("openie.ignore_affinity", 0);
      put("splitter.ignore_affinity", 0);
      put("openie.triple.strict", 0);
      put("splitter.triple.strict", 0);
      put("openie.triple.all_nominals", 0);
      put("splitter.triple.all_nominals", 0);
    }});
    ArgumentParser.fillOptions(new Class[]{OpenIE.class, ArgumentParser.class}, props);
    AtomicInteger exceptionCount = new AtomicInteger(0);
    ExecutorService exec = Executors.newFixedThreadPool(ArgumentParser.threads);

    // Parse the files to process
    String[] filesToProcess;
    if (FILELIST != null) {
      filesToProcess = IOUtils.linesFromFile(FILELIST.getPath()).stream()
          .map(String::trim)
          .map(path -> path.replaceAll("^~", "$HOME"))
          .map(path -> new File(path).exists() ? path : StringUtils.expandEnvironmentVariables(path))
          .toArray(String[]::new);
    } else if (!"".equals(props.getProperty("", ""))) {
      filesToProcess = props.getProperty("", "").split("\\s+");
    } else {
      filesToProcess = new String[0];
    }

    // Tweak the arguments
    if ("".equals(props.getProperty("annotators", ""))) {
      if (!"false".equalsIgnoreCase(props.getProperty("resolve_coref", props.getProperty("openie.resolve_coref", "false")))) {
        props.setProperty("coref.md.type", "dep");  // so we don't need the `parse` annotator
        props.setProperty("coref.mode", "statistical");  // explicitly ask for scoref
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,ner,mention,coref,natlog,openie");
      } else {
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
      }
    }
    if ("".equals(props.getProperty("depparse.extradependencies", ""))) {
      props.setProperty("depparse.extradependencies", "ref_only_uncollapsed");
    }
    if ("".equals(props.getProperty("parse.extradependencies", ""))) {
      props.setProperty("parse.extradependencies", "ref_only_uncollapsed");
    }
    if ("".equals(props.getProperty("tokenize.class", ""))) {
      props.setProperty("tokenize.class", "PTBTokenizer");
    }
    if ("".equals(props.getProperty("tokenize.language", ""))) {
      props.setProperty("tokenize.language", "en");
    }
    // Tweak properties for console mode.
    // In particular, in this mode we can assume every line of standard in is a new sentence.
    if (filesToProcess.length == 0 && "".equals(props.getProperty("ssplit.isOneSentence", ""))) {
      props.setProperty("ssplit.isOneSentence", "true");
    }
    // Some error checks on the arguments
    if (!props.getProperty("annotators").toLowerCase().contains("openie")) {
      log.error("If you specify custom annotators, you must at least include 'openie'");
      System.exit(1);
    }
    // Copy properties that are missing the 'openie' prefix
    new HashSet<>(props.keySet()).stream().filter(key -> !key.toString().startsWith("openie."))
        .forEach(key -> props.setProperty("openie." + key.toString(), props.getProperty(key.toString())));

    // Create the pipeline
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // Run OpenIE
    if (filesToProcess.length == 0) {
      // Running from stdin; one document per line.
      log.info("Processing from stdin. Enter one sentence per line.");
Enter one sentence per line."); Scanner scanner = new Scanner(System.in); String line; try { line = scanner.nextLine(); } catch (NoSuchElementException e) { log.info("No lines found on standard in"); return; } while (line != null) { processDocument(pipeline, "stdin", line); try { line = scanner.nextLine(); } catch (NoSuchElementException e) { return; } } } else { // Running from file parameters. // Make sure we can read all the files in the queue. // This will prevent a nasty surprise 10 hours into a running job... for (String file : filesToProcess) { if (!new File(file).exists() || !new File(file).canRead()) { log.error("Cannot read file (or file does not exist: '" + file + "'"); } } // Actually process the files. for (String file : filesToProcess) { log.info("Processing file: " + file); if (ArgumentParser.threads > 1) { // Multi-threaded: submit a job to run final String fileToSubmit = file; exec.submit(() -> { try { processDocument(pipeline, file, IOUtils.slurpFile(new File(fileToSubmit))); } catch (Throwable t) { t.printStackTrace(); exceptionCount.incrementAndGet(); } }); } else { // Single-threaded: just run the job processDocument(pipeline, file, IOUtils.slurpFile(new File(file))); } } } // Exit exec.shutdown(); log.info("All files have been queued; awaiting termination..."); exec.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); log.info("DONE processing files. " + exceptionCount.get() + " exceptions encountered."); System.exit(exceptionCount.get()); } }