
edu.stanford.nlp.naturalli.OpenIE


Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
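For orientation before the source itself, here is a minimal sketch of how this class is typically used as a pipeline annotator. The class name OpenIEDemo and the example sentence are illustrative, not part of this file; the annotator names and annotation keys (openie, natlog, NaturalLogicAnnotations.RelationTriplesAnnotation) are the ones this class registers, and the models jar must be on the classpath.

import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.naturalli.NaturalLogicAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.Properties;

public class OpenIEDemo {
  public static void main(String[] args) {
    // natlog must run before openie; depparse supplies the dependency tree it consumes.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // Annotate an example document (hypothetical input).
    Annotation doc = new Annotation("Obama was born in Hawaii.");
    pipeline.annotate(doc);

    // Each sentence carries its extracted (subject; relation; object) triples.
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (RelationTriple triple : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
        System.out.println(triple.confidenceGloss() + "\t("
            + triple.subjectGloss() + "; " + triple.relationGloss() + "; " + triple.objectGloss() + ")");
      }
    }
  }
}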

package edu.stanford.nlp.naturalli; 
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.*;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;

/**
 * <p>
 * An OpenIE system based on valid Natural Logic deletions of a sentence.
 * The system is described in:
 * </p>
 *
 * <pre>
 *   "Leveraging Linguistic Structure For Open Domain Information Extraction." Gabor Angeli, Melvin Johnson Premkumar, Christopher Manning. ACL 2015.
 * </pre>
 *
 * <p>
 * The paper can be found at http://nlp.stanford.edu/pubs/2015angeli-openie.pdf.
 * </p>
 *
 * <p>
 * Documentation on the system can be found on the project homepage,
 * or the CoreNLP annotator documentation page.
 * The simplest invocation of the system would be something like:
 * </p>
 *
 * <pre>
 * java -mx1g -cp stanford-openie.jar:stanford-openie-models.jar edu.stanford.nlp.naturalli.OpenIE
 * </pre>
 *
 * <p>
 * Note that this class serves both as an entry point for the OpenIE system and as a CoreNLP annotator
 * which can be plugged into the CoreNLP pipeline (or any other annotation pipeline).
 * </p>
 *
 * @see OpenIE#annotate(Annotation)
 * @see OpenIE#main(String[])
 *
 * @author Gabor Angeli
 */
//
// TODO(gabor): handle things like "One example of chemical energy is that found in the food that we eat ."
//
@SuppressWarnings({"FieldCanBeLocal", "UnusedDeclaration"})
public class OpenIE implements Annotator {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(OpenIE.class);

  private enum OutputFormat { REVERB, OLLIE, DEFAULT, QA_SRL }

  /**
   * A pattern for rewriting "NN_1 is a JJ NN_2" --> "NN_1 is JJ"
   */
  private static SemgrexPattern adjectivePattern = SemgrexPattern.compile("{}=obj >nsubj {}=subj >cop {}=be >det {word:/an?/} >amod {}=adj ?>/prep_.*/=prep {}=pobj");

  //
  // Static Options (for running standalone)
  //

  @ArgumentParser.Option(name="format", gloss="The format to output the triples in.")
  private static OutputFormat FORMAT = OutputFormat.DEFAULT;

  @ArgumentParser.Option(name="filelist", gloss="The files to annotate, as a list of files one per line.")
  private static File FILELIST = null;

  @ArgumentParser.Option(name="output", gloss="The file to write the extractions to.")
  private static PrintStream OUTPUT = System.out;

  //
  // Annotator Options (for running in the pipeline)
  //

  @ArgumentParser.Option(name="splitter.model", gloss="The location of the clause splitting model.")
  private String splitterModel = DefaultPaths.DEFAULT_OPENIE_CLAUSE_SEARCHER;

  @ArgumentParser.Option(name="splitter.nomodel", gloss="If true, don't load a clause splitter model. This is primarily useful for training.")
  private boolean noModel = false;

  @ArgumentParser.Option(name="splitter.threshold", gloss="The minimum threshold for accepting a clause.")
  private double splitterThreshold = 0.1;

  @ArgumentParser.Option(name="splitter.disable", gloss="If true, don't run the sentence splitter")
  private boolean splitterDisable = false;

  @ArgumentParser.Option(name="max_entailments_per_clause", gloss="The maximum number of entailments allowed per sentence of input.")
  private int entailmentsPerSentence = 1000;

  @ArgumentParser.Option(name="ignore_affinity", gloss="If true, don't use the affinity models for dobj and pp attachment.")
  private boolean ignoreAffinity = false;

  @ArgumentParser.Option(name="affinity_models", gloss="The directory (or classpath directory) containing the affinity models for pp/obj attachments.")
  private String affinityModels = DefaultPaths.DEFAULT_NATURALLI_AFFINITIES;

  @ArgumentParser.Option(name="affinity_probability_cap", gloss="The affinity to consider 1.0")
  private double affinityProbabilityCap = 1.0 / 3.0;

  @ArgumentParser.Option(name="triple.strict", gloss="If true, only generate triples if the entire fragment has been consumed.")
  private boolean consumeAll = true;

  @ArgumentParser.Option(name="triple.all_nominals", gloss="If true, generate not only named entity nominal relations.")
  private boolean allNominals = false;

  @ArgumentParser.Option(name="resolve_coref", gloss="If true, resolve pronouns to their canonical mention")
  private boolean resolveCoref = false;

  @ArgumentParser.Option(name="strip_entailments", gloss="If true, don't keep the entailed sentences annotations around.")
  private boolean stripEntailments = false;

  /**
   * The natural logic weights loaded from the models file.
   * This is primarily the prepositional attachment statistics.
   */
  private final NaturalLogicWeights weights;

  /**
   * The clause splitter model, if one is to be used.
   * This component splits a sentence into a set of entailed clauses, but does not yet
   * maximally shorten them.
   * This is the implementation of stage 1 of the OpenIE pipeline.
   */
  public final Optional<ClauseSplitter> clauseSplitter;

  /**
   * The forward entailer model, running a search from clauses to maximally shortened clauses.
   * This is the implementation of stage 2 of the OpenIE pipeline.
   */
  public final ForwardEntailer forwardEntailer;

  /**
   * The relation triple segmenter, which converts a maximally shortened clause into an OpenIE
   * extraction triple.
   * This is the implementation of stage 3 of the OpenIE pipeline.
   */
  public RelationTripleSegmenter segmenter;

  /** Create a new OpenIE system, with default properties */
  @SuppressWarnings("UnusedDeclaration")
  public OpenIE() {
    this(new Properties());
  }

  /**
   * Create a new OpenIE system, based on the given properties.
   * @param props The properties to parametrize the system with.
   */
  public OpenIE(Properties props) {
    // Fill the properties
    ArgumentParser.fillOptions(this, props);
    Properties withoutOpenIEPrefix = new Properties();
    Enumeration<Object> keys = props.keys();
    while (keys.hasMoreElements()) {
      String key = keys.nextElement().toString();
      withoutOpenIEPrefix.setProperty(key.replace("openie.", ""), props.getProperty(key));
    }
    ArgumentParser.fillOptions(this, withoutOpenIEPrefix);

    // Create the clause splitter
    try {
      if (splitterDisable) {
        clauseSplitter = Optional.empty();
      } else {
        if (noModel) {
          log.info("Not loading a splitter model");
          clauseSplitter = Optional.of(ClauseSplitterSearchProblem::new);
        } else {
          clauseSplitter = Optional.of(ClauseSplitter.load(splitterModel));
        }
      }
    } catch (IOException e) {
      //throw new RuntimeIOException("Could not load clause splitter model at " + splitterModel + ": " + e.getClass() + ": " + e.getMessage());
      throw new RuntimeIOException("Could not load clause splitter model at " + splitterModel, e);
    }

    // Create the forward entailer
    try {
      this.weights = ignoreAffinity ? new NaturalLogicWeights(affinityProbabilityCap) : new NaturalLogicWeights(affinityModels, affinityProbabilityCap);
    } catch (IOException e) {
      throw new RuntimeIOException("Could not load affinity model at " + affinityModels + ": " + e.getMessage());
    }
    forwardEntailer = new ForwardEntailer(entailmentsPerSentence, weights);

    // Create the relation segmenter
    segmenter = new RelationTripleSegmenter(allNominals);
  }

  /**
   * Find the clauses in a sentence, where the sentence is expressed as a dependency tree.
   *
   * @param tree The dependency tree representation of the sentence.
   * @param assumedTruth The assumed truth of the sentence. This is almost always true, unless you are
   *                     doing some more nuanced reasoning.
   *
   * @return A set of clauses extracted from the sentence. This includes the original sentence.
   */
  @SuppressWarnings("unchecked")
  public List<SentenceFragment> clausesInSentence(SemanticGraph tree, boolean assumedTruth) {
    if (clauseSplitter.isPresent()) {
      return clauseSplitter.get().apply(tree, assumedTruth).topClauses(splitterThreshold, 32);
    } else {
      return Collections.emptyList();
    }
  }

  /**
   * Find the clauses in a sentence.
   * This runs the clause splitting component of the OpenIE system only.
   *
   * @see OpenIE#clausesInSentence(SemanticGraph, boolean)
   *
   * @param sentence The raw sentence to extract clauses from.
   *
   * @return A set of clauses extracted from the sentence. This includes the original sentence.
   */
  public List<SentenceFragment> clausesInSentence(CoreMap sentence) {
    return clausesInSentence(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class), true);
  }

  /**
   * Returns all of the entailed shortened clauses (as per natural logic) from the given clause.
   * This runs the forward entailment component of the OpenIE system only.
   * It is usually chained together with the clause splitting component: {@link OpenIE#clausesInSentence(CoreMap)}.
   *
   * @param clause The premise clause, as a sentence fragment in itself.
   *
   * @return A list of entailed clauses.
   */
  @SuppressWarnings("unchecked")
  public List<SentenceFragment> entailmentsFromClause(SentenceFragment clause) {
    if (clause.parseTree.isEmpty()) {
      return Collections.emptyList();
    } else {
      // Get the forward entailments
      List<SentenceFragment> list = new ArrayList<>();
      if (entailmentsPerSentence > 0) {
        list.addAll(forwardEntailer.apply(clause.parseTree, true).search()
            .stream().map(x -> x.changeScore(x.score * clause.score)).collect(Collectors.toList()));
      }
      list.add(clause);

      // A special case for adjective entailments
      List<SentenceFragment> adjFragments = new ArrayList<>();
      SemgrexMatcher matcher = adjectivePattern.matcher(clause.parseTree);
      OUTER: while (matcher.find()) {
        // (get nodes)
        IndexedWord subj = matcher.getNode("subj");
        IndexedWord be = matcher.getNode("be");
        IndexedWord adj = matcher.getNode("adj");
        IndexedWord obj = matcher.getNode("obj");
        IndexedWord pobj = matcher.getNode("pobj");
        String prep = matcher.getRelnString("prep");
        // (if the adjective, or any earlier adjective, is privative, then all bets are off)
        for (SemanticGraphEdge edge : clause.parseTree.outgoingEdgeIterable(obj)) {
          if ("amod".equals(edge.getRelation().toString()) && edge.getDependent().index() <= adj.index() &&
              Util.PRIVATIVE_ADJECTIVES.contains(edge.getDependent().word().toLowerCase())) {
            continue OUTER;
          }
        }
        // (create the core tree)
        SemanticGraph tree = new SemanticGraph();
        tree.addRoot(adj);
        tree.addVertex(subj);
        tree.addVertex(be);
        tree.addEdge(adj, be, GrammaticalRelation.valueOf(Language.English, "cop"), Double.NEGATIVE_INFINITY, false);
        tree.addEdge(adj, subj, GrammaticalRelation.valueOf(Language.English, "nsubj"), Double.NEGATIVE_INFINITY, false);
        // (add pp attachment, if it existed)
        if (pobj != null) {
          assert prep != null;
          tree.addEdge(adj, pobj, GrammaticalRelation.valueOf(Language.English, prep), Double.NEGATIVE_INFINITY, false);
        }
        // (check for monotonicity)
        if (adj.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards() &&
            be.get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards()) {
          // (add tree)
          adjFragments.add(new SentenceFragment(tree, clause.assumedTruth, false));
        }
      }
      list.addAll(adjFragments);
      return list;
    }
  }

  /**
   * Returns all the maximally shortened entailed fragments (as per natural logic)
   * from the given collection of clauses.
   *
   * @param clauses The clauses to shorten further.
   *
   * @return A set of sentence fragments corresponding to the maximally shortened entailed clauses.
   */
  public Set<SentenceFragment> entailmentsFromClauses(Collection<SentenceFragment> clauses) {
    Set<SentenceFragment> entailments = new HashSet<>();
    for (SentenceFragment clause : clauses) {
      entailments.addAll(entailmentsFromClause(clause));
    }
    return entailments;
  }

  /**
   * Returns the possible relation triple in this sentence fragment.
   *
   * @see OpenIE#relationInFragment(SentenceFragment, CoreMap)
   */
  public Optional<RelationTriple> relationInFragment(SentenceFragment fragment) {
    return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
  }

  /**
   * Returns the possible relation triple in this set of sentence fragments.
   *
   * @see OpenIE#relationsInFragments(Collection, CoreMap)
   */
  public List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments) {
    return fragments.stream().map(this::relationInFragment).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
  }

  /**
   * Returns the possible relation triple in this sentence fragment.
   *
   * @param fragment The sentence fragment to try to extract relations from.
   * @param sentence The containing sentence for the fragment.
   *
   * @return A relation triple if we could find one; otherwise, {@link Optional#empty()}.
   */
  private Optional<RelationTriple> relationInFragment(SentenceFragment fragment, CoreMap sentence) {
    return segmenter.segment(fragment.parseTree, Optional.of(fragment.score), consumeAll);
  }

  /**
   * Returns a list of OpenIE relations from the given set of sentence fragments.
   *
   * @param fragments The sentence fragments to extract relations from.
   * @param sentence The containing sentence that these fragments were extracted from.
   *
   * @return A list of OpenIE triples, corresponding to all the triples that could be extracted from the given fragments.
   */
  private List<RelationTriple> relationsInFragments(Collection<SentenceFragment> fragments, CoreMap sentence) {
    return fragments.stream().map(x -> relationInFragment(x, sentence)).filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
  }

  /**
   * Extract the relations in this clause.
   *
   * @see OpenIE#entailmentsFromClause(SentenceFragment)
   * @see OpenIE#relationsInFragments(Collection)
   */
  public List<RelationTriple> relationsInClause(SentenceFragment clause) {
    return relationsInFragments(entailmentsFromClause(clause));
  }

  /**
   * Extract the relations in this sentence.
   *
   * @see OpenIE#clausesInSentence(CoreMap)
   * @see OpenIE#entailmentsFromClause(SentenceFragment)
   * @see OpenIE#relationsInFragments(Collection)
   */
  public List<RelationTriple> relationsInSentence(CoreMap sentence) {
    return relationsInFragments(entailmentsFromClauses(clausesInSentence(sentence)));
  }

  /**
   * Create a copy of the passed parse tree, canonicalizing pronominal nodes with their canonical mention.
   * Canonical mentions are tied together with the compound dependency arc; otherwise, the structure of
   * the tree remains unchanged.
   *
   * @param parse The original dependency parse of the sentence.
   * @param canonicalMentionMap The map from tokens to their canonical mentions.
   *
   * @return A copy of the passed parse tree, with pronouns replaced by their canonical mention.
   */
  private static SemanticGraph canonicalizeCoref(SemanticGraph parse, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
    parse = new SemanticGraph(parse);
    for (IndexedWord node : new HashSet<>(parse.vertexSet())) {  // copy the vertex set to prevent ConcurrentModificationExceptions
      if (node.tag() != null && node.tag().startsWith("PRP")) {
        List<CoreLabel> canonicalMention = canonicalMentionMap.get(node.backingLabel());
        if (canonicalMention != null) {
          // Case: this node is a pronoun with a valid antecedent.
          // 1. Save the attaching edges
          List<SemanticGraphEdge> incomingEdges = parse.incomingEdgeList(node);
          List<SemanticGraphEdge> outgoingEdges = parse.outgoingEdgeList(node);

          // 2. Remove the node
          parse.removeVertex(node);

          // 3. Add the new head word
          IndexedWord headWord = new IndexedWord(canonicalMention.get(canonicalMention.size() - 1));
          headWord.setPseudoPosition(node.pseudoPosition());
          parse.addVertex(headWord);
          for (SemanticGraphEdge edge : incomingEdges) {
            parse.addEdge(edge.getGovernor(), headWord, edge.getRelation(), edge.getWeight(), edge.isExtra());
          }
          for (SemanticGraphEdge edge : outgoingEdges) {
            parse.addEdge(headWord, edge.getDependent(), edge.getRelation(), edge.getWeight(), edge.isExtra());
          }

          // 4. Add other words
          double pseudoPosition = headWord.pseudoPosition() - 1e-3;
          for (int i = canonicalMention.size() - 2; i >= 0; --i) {
            // Create the node
            IndexedWord dependent = new IndexedWord(canonicalMention.get(i));
            // Set its pseudo position appropriately
            dependent.setPseudoPosition(pseudoPosition);
            pseudoPosition -= 1e-3;
            // Add the node to the graph
            parse.addVertex(dependent);
            parse.addEdge(headWord, dependent, UniversalEnglishGrammaticalRelations.COMPOUND_MODIFIER, 1.0, false);
          }
        }
      }
    }
    return parse;
  }

  /**
   * <p>
   * Annotate a single sentence.
   * </p>
   *
   * <p>
   * This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
   * and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
   * </p>
   */
  @SuppressWarnings("unchecked")
  public void annotateSentence(CoreMap sentence, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens.size() < 2) {
      // Short sentence. Skip annotating it.
      sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, Collections.emptyList());
      if (!stripEntailments) {
        sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, Collections.emptySet());
      }
    } else {
      // Get the dependency tree
      SemanticGraph parse = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
      if (parse == null) {
        parse = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      }
      if (parse == null) {
        throw new IllegalStateException("Cannot run OpenIE without a parse tree!");
      }
      // Clean the tree
      parse = new SemanticGraph(parse);
      Util.cleanTree(parse);

      // Resolve Coreference
      SemanticGraph canonicalizedParse = parse;
      if (resolveCoref && !canonicalMentionMap.isEmpty()) {
        canonicalizedParse = canonicalizeCoref(parse, canonicalMentionMap);
      }

      // Run OpenIE
      // (clauses)
      List<SentenceFragment> clauses = clausesInSentence(canonicalizedParse, true);  // note: uses coref-canonicalized parse
      // (entailment)
      Set<SentenceFragment> fragments = entailmentsFromClauses(clauses);
      // (segment)
      List<RelationTriple> extractions = segmenter.extract(parse, tokens);  // note: uses non-coref-canonicalized parse!
      extractions.addAll(relationsInFragments(fragments, sentence));

      // Set the annotations
      sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, fragments);
      sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class,
          new ArrayList<>(new HashSet<>(extractions)));  // uniq the extractions
      if (stripEntailments) {
        sentence.remove(NaturalLogicAnnotations.EntailedSentencesAnnotation.class);
      }
    }
  }

  /**
   * {@inheritDoc}
   *
   * <p>
   * This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
   * and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
   * </p>
   */
  @Override
  public void annotate(Annotation annotation) {
    // Accumulate Coref data
    Map<Integer, CorefChain> corefChains;
    Map<CoreLabel, List<CoreLabel>> canonicalMentionMap = new IdentityHashMap<>();
    if (resolveCoref && (corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class)) != null) {
      for (CorefChain chain : corefChains.values()) {
        // Make sure it's a real chain and not a singleton
        if (chain.getMentionsInTextualOrder().size() < 2) {
          continue;
        }
        // Metadata
        List<CoreLabel> canonicalMention = null;
        double canonicalMentionScore = Double.NEGATIVE_INFINITY;
        Set<CoreLabel> tokensToMark = new HashSet<>();
        List<CorefChain.CorefMention> mentions = chain.getMentionsInTextualOrder();

        // Iterate over mentions
        for (int i = 0; i < mentions.size(); ++i) {
          // Get some data on this mention
          Pair<List<CoreLabel>, Double> info = grokCorefMention(annotation, mentions.get(i));
          // Figure out if it should be the canonical mention
          double score = info.second + ((double) i) / ((double) mentions.size()) + (mentions.get(i) == chain.getRepresentativeMention() ? 1.0 : 0.0);
          if (canonicalMention == null || score > canonicalMentionScore) {
            canonicalMention = info.first;
            canonicalMentionScore = score;
          }
          // Register the participating tokens
          if (info.first.size() == 1) {  // Only mark single-node tokens!
            tokensToMark.addAll(info.first);
          }
        }

        // Mark the tokens as coreferent
        assert canonicalMention != null;
        for (CoreLabel token : tokensToMark) {
          List<CoreLabel> existingMention = canonicalMentionMap.get(token);
          if (existingMention == null || existingMention.isEmpty() ||
              "O".equals(existingMention.get(0).ner())) {  // Don't clobber existing good mentions
            canonicalMentionMap.put(token, canonicalMention);
          }
        }
      }
    }

    // Annotate each sentence
    annotation.get(CoreAnnotations.SentencesAnnotation.class).forEach(x -> this.annotateSentence(x, canonicalMentionMap));
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        NaturalLogicAnnotations.RelationTriplesAnnotation.class,
        NaturalLogicAnnotations.EntailedSentencesAnnotation.class
    )));
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.SentenceIndexAnnotation.class,
        CoreAnnotations.PartOfSpeechAnnotation.class,
        CoreAnnotations.LemmaAnnotation.class,
        NaturalLogicAnnotations.PolarityAnnotation.class,
        SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class
        //CoreAnnotations.OriginalTextAnnotation.class
    ));
    if (resolveCoref) {
      requirements.add(edu.stanford.nlp.coref.CorefCoreAnnotations.CorefChainAnnotation.class);
    }
    return Collections.unmodifiableSet(requirements);
  }

  /**
   * A utility to get useful information out of a CorefMention. In particular, it returns the CoreLabels which are
   * associated with this mention, and it returns a score for how much we think this mention should be the canonical
   * mention.
   *
   * @param doc The document this mention is referenced into.
   * @param mention The mention itself.
   * @return A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.
   */
  private static Pair<List<CoreLabel>, Double> grokCorefMention(Annotation doc, CorefChain.CorefMention mention) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.SentencesAnnotation.class).get(mention.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> mentionAsTokens = tokens.subList(mention.startIndex - 1, mention.endIndex - 1);
    // Try to assess this mention's NER type
    Counter<String> nerVotes = new ClassicCounter<>();
    mentionAsTokens.stream().filter(token -> token.ner() != null && !"O".equals(token.ner())).forEach(token -> nerVotes.incrementCount(token.ner()));
    String ner = Counters.argmax(nerVotes, (o1, o2) -> o1 == null ? 0 : o1.compareTo(o2));
    double nerCount = nerVotes.getCount(ner);
    double nerScore = nerCount * nerCount / ((double) mentionAsTokens.size());
    // Return
    return Pair.makePair(mentionAsTokens, nerScore);
  }

  /**
   * Prints an OpenIE triple to a String, according to the output format requested in
   * the annotator.
   *
   * @param extraction The triple to write.
   * @param docid The document ID (for the ReVerb format)
   * @param sentence The sentence the triple was extracted from (for the ReVerb format)
   *
   * @return A String representation of the triple.
   */
  public static String tripleToString(RelationTriple extraction, String docid, CoreMap sentence) {
    switch (FORMAT) {
      case REVERB:
        return extraction.toReverbString(docid, sentence);
      case OLLIE:
        return extraction.confidenceGloss() + ": (" + extraction.subjectGloss() + "; " + extraction.relationGloss() + "; " + extraction.objectGloss() + ")";
      case DEFAULT:
        return extraction.toString();
      case QA_SRL:
        return extraction.toQaSrlString(sentence);
      default:
        throw new IllegalStateException("Format is not implemented: " + FORMAT);
    }
  }

  /**
   * Process a single file or line of standard in.
   * @param pipeline The annotation pipeline to run the lines of the input through.
   * @param docid The docid of the document we are extracting.
   * @param document the document to annotate.
   */
  @SuppressWarnings("SynchronizeOnNonFinalField")
  private static void processDocument(AnnotationPipeline pipeline, String docid, String document) {
    // Error checks
    if (document.trim().equals("")) {
      return;
    }

    // Annotate the document
    Annotation ann = new Annotation(document);
    pipeline.annotate(ann);

    // Get the extractions
    boolean empty = true;
    synchronized (OUTPUT) {
      for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (RelationTriple extraction : sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class)) {
          // Print the extractions
          OUTPUT.println(tripleToString(extraction, docid, sentence));
          empty = false;
        }
      }
    }
    if (empty) {
      log.info("No extractions in: " + ("stdin".equals(docid) ? document : docid));
    }
  }

  /**
   * An entry method for annotating standard in with OpenIE extractions.
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    // Parse the arguments
    Properties props = StringUtils.argsToProperties(args, new HashMap<String, Integer>() {{
      put("openie.resolve_coref", 0);
      put("resolve_coref", 0);
      put("openie.splitter.nomodel", 0);
      put("splitter.nomodel", 0);
      put("openie.splitter.disable", 0);
      put("splitter.disable", 0);
      put("openie.ignore_affinity", 0);
      put("splitter.ignore_affinity", 0);
      put("openie.triple.strict", 0);
      put("splitter.triple.strict", 0);
      put("openie.triple.all_nominals", 0);
      put("splitter.triple.all_nominals", 0);
    }});
    ArgumentParser.fillOptions(new Class[]{OpenIE.class, ArgumentParser.class}, props);
    AtomicInteger exceptionCount = new AtomicInteger(0);
    ExecutorService exec = Executors.newFixedThreadPool(ArgumentParser.threads);

    // Parse the files to process
    String[] filesToProcess;
    if (FILELIST != null) {
      filesToProcess = IOUtils.linesFromFile(FILELIST.getPath()).stream()
          .map(String::trim)
          .map(path -> path.replaceAll("^~", "$HOME"))
          .map(path -> new File(path).exists() ? path : StringUtils.expandEnvironmentVariables(path))
          .toArray(String[]::new);
    } else if (!"".equals(props.getProperty("", ""))) {
      filesToProcess = props.getProperty("", "").split("\\s+");
    } else {
      filesToProcess = new String[0];
    }

    // Tweak the arguments
    if ("".equals(props.getProperty("annotators", ""))) {
      if (!"false".equalsIgnoreCase(props.getProperty("resolve_coref", props.getProperty("openie.resolve_coref", "false")))) {
        props.setProperty("coref.md.type", "dep");  // so we don't need the `parse` annotator
        props.setProperty("coref.mode", "statistical");  // explicitly ask for scoref
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,ner,mention,coref,natlog,openie");
      } else {
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
      }
    }
    if ("".equals(props.getProperty("depparse.extradependencies", ""))) {
      props.setProperty("depparse.extradependencies", "ref_only_uncollapsed");
    }
    if ("".equals(props.getProperty("parse.extradependencies", ""))) {
      props.setProperty("parse.extradependencies", "ref_only_uncollapsed");
    }
    if ("".equals(props.getProperty("tokenize.class", ""))) {
      props.setProperty("tokenize.class", "PTBTokenizer");
    }
    if ("".equals(props.getProperty("tokenize.language", ""))) {
      props.setProperty("tokenize.language", "en");
    }
    // Tweak properties for console mode.
    // In particular, in this mode we can assume every line of standard in is a new sentence.
    if (filesToProcess.length == 0 && "".equals(props.getProperty("ssplit.isOneSentence", ""))) {
      props.setProperty("ssplit.isOneSentence", "true");
    }
    // Some error checks on the arguments
    if (!props.getProperty("annotators").toLowerCase().contains("openie")) {
      log.error("If you specify custom annotators, you must at least include 'openie'");
      System.exit(1);
    }
    // Copy properties that are missing the 'openie' prefix
    new HashSet<>(props.keySet()).stream().filter(key -> !key.toString().startsWith("openie."))
        .forEach(key -> props.setProperty("openie." + key.toString(), props.getProperty(key.toString())));

    // Create the pipeline
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // Run OpenIE
    if (filesToProcess.length == 0) {
      // Running from stdin; one document per line.
      log.info("Processing from stdin. Enter one sentence per line.");
Enter one sentence per line."); Scanner scanner = new Scanner(System.in); String line; try { line = scanner.nextLine(); } catch (NoSuchElementException e) { log.info("No lines found on standard in"); return; } while (line != null) { processDocument(pipeline, "stdin", line); try { line = scanner.nextLine(); } catch (NoSuchElementException e) { return; } } } else { // Running from file parameters. // Make sure we can read all the files in the queue. // This will prevent a nasty surprise 10 hours into a running job... for (String file : filesToProcess) { if (!new File(file).exists() || !new File(file).canRead()) { log.error("Cannot read file (or file does not exist: '" + file + "'"); } } // Actually process the files. for (String file : filesToProcess) { log.info("Processing file: " + file); if (ArgumentParser.threads > 1) { // Multi-threaded: submit a job to run final String fileToSubmit = file; exec.submit(() -> { try { processDocument(pipeline, file, IOUtils.slurpFile(new File(fileToSubmit))); } catch (Throwable t) { t.printStackTrace(); exceptionCount.incrementAndGet(); } }); } else { // Single-threaded: just run the job processDocument(pipeline, file, IOUtils.slurpFile(new File(file))); } } } // Exit exec.shutdown(); log.info("All files have been queued; awaiting termination..."); exec.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); log.info("DONE processing files. " + exceptionCount.get() + " exceptions encountered."); System.exit(exceptionCount.get()); } }