package edu.stanford.nlp.naturalli;

import edu.stanford.nlp.classify.Classifier;
import edu.stanford.nlp.classify.GeneralDataset;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasIndex;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationPipeline;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.util.*;

import java.text.DecimalFormat;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import static edu.stanford.nlp.util.logging.Redwood.log;

/**
 * Utility functions for the natural logic ({@code naturalli}) package:
 * guessing and extracting coherent NER spans, re-annotating single sentences,
 * and cleaning dependency graphs into well-formed trees.
 *
 * @author Gabor Angeli
 */
public class Util {

  /**
   * Guess the NER tag of a span of tokens by majority vote.
   *
   * @param tokens The tokens of the sentence.
   * @param span The span of tokens to guess a single NER tag for.
   * @return The most common NER tag in the span, ignoring "O" and untagged
   *         tokens, if it accounts for at least half the span; "O" otherwise.
   */
  public static String guessNER(List<CoreLabel> tokens, Span span) {
    Counter<String> nerGuesses = new ClassicCounter<>();
    for (int i : span) {
      nerGuesses.incrementCount(tokens.get(i).ner());
    }
    nerGuesses.remove("O");
    nerGuesses.remove(null);
    if (nerGuesses.size() > 0 && Counters.max(nerGuesses) >= span.size() / 2) {
      return Counters.argmax(nerGuesses);
    } else {
      return "O";
    }
  }
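
  /*
   * A worked example of the majority vote above (tokens and tags are
   * illustrative): for "Barack Obama visited Paris" tagged
   * [PERSON, PERSON, O, LOCATION], guessNER(tokens, new Span(0, 3)) tallies
   * {PERSON: 2} once "O" and null are dropped; 2 >= 3 / 2 under integer
   * division, so "PERSON" is returned.
   */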

  /**
   * Guess the NER tag of a whole list of tokens.
   *
   * @param tokens The tokens to guess a collective NER tag for.
   * @return The NER tag of the tokens, as determined by
   *         {@link #guessNER(List, Span)} over the full span.
   */
  public static String guessNER(List<CoreLabel> tokens) {
    return guessNER(tokens, new Span(0, tokens.size()));
  }

  /**
   * Returns a coherent NER span from a list of tokens.
   *
   * @param tokens The tokens of the entire sentence.
   * @param seed The seed span of the intended NER span that should be expanded.
   * @return A 0 indexed span corresponding to a coherent NER chunk from the given seed.
   */
  public static Span extractNER(List<CoreLabel> tokens, Span seed) {
    // Error checks
    if (seed == null) {
      return new Span(0, 1);
    }
    if (seed.start() < 0 || seed.end() < 0) {
      return new Span(0, 0);
    }
    if (seed.start() >= tokens.size() || seed.end() > tokens.size()) {
      return new Span(tokens.size(), tokens.size());
    }
    if (tokens.get(seed.start()).ner() == null) {
      return seed;
    }
    if (seed.start() < 0 || seed.end() > tokens.size()) {
      return Span.fromValues(Math.max(0, seed.start()), Math.min(tokens.size(), seed.end()));
    }

    // Find the span's beginning
    int begin = seed.start();
    while (begin < seed.end() - 1 && "O".equals(tokens.get(begin).ner())) {
      begin += 1;
    }
    String beginNER = tokens.get(begin).ner();
    if (!"O".equals(beginNER)) {
      while (begin > 0 && tokens.get(begin - 1).ner().equals(beginNER)) {
        begin -= 1;
      }
    } else {
      begin = seed.start();
    }
    // Find the span's end
    int end = seed.end() - 1;
    while (end > begin && "O".equals(tokens.get(end).ner())) {
      end -= 1;
    }
    String endNER = tokens.get(end).ner();
    if (!"O".equals(endNER)) {
      while (end < tokens.size() - 1 && tokens.get(end + 1).ner().equals(endNER)) {
        end += 1;
      }
    } else {
      end = seed.end() - 1;
    }
    // Check that the NER of the beginning and end are the same
    if (beginNER.equals(endNER)) {
      return Span.fromValues(begin, end + 1);
    } else {
      String bestNER = guessNER(tokens, Span.fromValues(begin, end + 1));
      if (beginNER.equals(bestNER)) {
        return extractNER(tokens, Span.fromValues(begin, begin + 1));
      } else if (endNER.equals(bestNER)){
        return extractNER(tokens, Span.fromValues(end, end + 1));
      } else {
        // Something super funky is going on...
        return Span.fromValues(begin, end + 1);
      }
    }
  }
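
  /*
   * A worked example of the expansion above (tokens and tags are
   * illustrative): for "President Barack Obama spoke" tagged
   * [TITLE, PERSON, PERSON, O], the seed new Span(2, 3) ("Obama") grows
   * leftward over the adjacent PERSON tag, stops at TITLE, and cannot grow
   * rightward past "O", yielding Span(1, 3), i.e. "Barack Obama".
   */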

  /**
   * Run an annotation pipeline over a single sentence, in place.
   *
   * @param sentence The sentence to annotate.
   * @param pipeline The pipeline to run over the sentence.
   */
  public static void annotate(CoreMap sentence, AnnotationPipeline pipeline) {
    Annotation ann = new Annotation(StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), " "));
    ann.set(CoreAnnotations.TokensAnnotation.class, sentence.get(CoreAnnotations.TokensAnnotation.class));
    ann.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
    pipeline.annotate(ann);
  }
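
  /*
   * A usage sketch for annotate (the pipeline construction below is an
   * assumption, not part of this file; StanfordCoreNLP is one concrete
   * AnnotationPipeline):
   *
   *   Properties props = new Properties();
   *   props.setProperty("annotators", "pos,lemma,depparse");
   *   props.setProperty("enforceRequirements", "false");  // tokens are supplied directly
   *   AnnotationPipeline pipeline = new StanfordCoreNLP(props);
   *   Util.annotate(sentence, pipeline);
   *
   * Because the wrapping Annotation shares the sentence's token list, the new
   * annotations are visible on the original sentence afterwards.
   */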

  /**
   * Fix some bizarre peculiarities with certain trees.
   * So far, these include:
   * <ul>
   *   <li>Sometimes there's a node from a word to itself. This seems wrong.</li>
   * </ul>
   *
   * @param tree The tree to clean (in place!).
   * @return A list of extra edges, which are valid but were removed.
   */
  public static List<SemanticGraphEdge> cleanTree(SemanticGraph tree) {
    //    assert !isCyclic(tree);

    // Clean nodes
    List<IndexedWord> toDelete = new ArrayList<>();
    for (IndexedWord vertex : tree.vertexSet()) {
      // Clean punctuation
      if (vertex.tag() == null) { continue; }
      char tag = vertex.backingLabel().tag().charAt(0);
      if (tag == '.' || tag == ',' || tag == '(' || tag == ')' || tag == ':') {
        if (!tree.outgoingEdgeIterator(vertex).hasNext()) {  // This should really never happen, but it does.
          toDelete.add(vertex);
        }
      }
    }
    toDelete.forEach(tree::removeVertex);

    // Clean edges
    Iterator<SemanticGraphEdge> iter = tree.edgeIterable().iterator();
    List<Triple<IndexedWord, IndexedWord, SemanticGraphEdge>> toAdd = new ArrayList<>();
    toDelete.clear();
    while (iter.hasNext()) {
      SemanticGraphEdge edge = iter.next();
      if (edge.getDependent().index() == edge.getGovernor().index()) {
        // Clean up copy-edges
        if (edge.getDependent().isCopy(edge.getGovernor())) {
          for (SemanticGraphEdge toCopy : tree.outgoingEdgeIterable(edge.getDependent())) {
            toAdd.add(Triple.makeTriple(edge.getGovernor(), toCopy.getDependent(), toCopy));
          }
          toDelete.add(edge.getDependent());
        }
        if (edge.getGovernor().isCopy(edge.getDependent())) {
          for (SemanticGraphEdge toCopy : tree.outgoingEdgeIterable(edge.getGovernor())) {
            toAdd.add(Triple.makeTriple(edge.getDependent(), toCopy.getDependent(), toCopy));
          }
          toDelete.add(edge.getGovernor());
        }
        // Clean self-edges
        iter.remove();
      } else if (edge.getRelation().toString().equals("punct")) {
        // Clean punctuation (again)
        if (!tree.outgoingEdgeIterator(edge.getDependent()).hasNext()) {  // This should really never happen, but it does.
          iter.remove();
        }
      }
    }
    // (add edges we wanted to add)
    toDelete.forEach(tree::removeVertex);
    for (Triple<IndexedWord, IndexedWord, SemanticGraphEdge> edge : toAdd) {
      tree.addEdge(edge.first, edge.second,
          edge.third.getRelation(), edge.third.getWeight(), edge.third.isExtra());
    }
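
    // The self-edge cleanup above handles "copy" vertices: IndexedWords that
    // share a sentence index with another token (these arise, for instance,
    // when conjoined dependents are propagated). A copy's children are
    // rerouted to its twin, the self-referential edge is removed, and the copy
    // vertex itself is deleted.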
    // Handle extra edges.
    // Two cases:
    // (1) the extra edge is a subj/obj edge and the main edge is a conj:.*
    //     in this case, keep the extra
    // (2) otherwise, delete the extra
    List<SemanticGraphEdge> extraEdges = new ArrayList<>();
    for (SemanticGraphEdge edge : tree.edgeIterable()) {
      if (edge.isExtra()) {
        List<SemanticGraphEdge> incomingEdges = tree.incomingEdgeList(edge.getDependent());
        SemanticGraphEdge toKeep = null;
        for (SemanticGraphEdge candidate : incomingEdges) {
          if (toKeep == null) {
            toKeep = candidate;
          } else if (toKeep.getRelation().toString().startsWith("conj") &&
                     candidate.getRelation().toString().matches(".subj.*|.obj.*")) {
            toKeep = candidate;
          } else if (!candidate.isExtra() &&
                     !(candidate.getRelation().toString().startsWith("conj") &&
                       toKeep.getRelation().toString().matches(".subj.*|.obj.*"))) {
            toKeep = candidate;
          }
        }
        for (SemanticGraphEdge candidate : incomingEdges) {
          if (candidate != toKeep) {
            extraEdges.add(candidate);
          }
        }
      }
    }
    extraEdges.forEach(tree::removeEdge);

    // Add apposition edges (simple coref)
    for (SemanticGraphEdge extraEdge : new ArrayList<>(extraEdges)) {  // note[gabor] prevent concurrent modification exception
      for (SemanticGraphEdge candidateAppos : tree.incomingEdgeIterable(extraEdge.getDependent())) {
        if (candidateAppos.getRelation().toString().equals("appos")) {
          extraEdges.add(new SemanticGraphEdge(extraEdge.getGovernor(), candidateAppos.getGovernor(),
              extraEdge.getRelation(), extraEdge.getWeight(), extraEdge.isExtra()));
        }
      }
      for (SemanticGraphEdge candidateAppos : tree.outgoingEdgeIterable(extraEdge.getDependent())) {
        if (candidateAppos.getRelation().toString().equals("appos")) {
          extraEdges.add(new SemanticGraphEdge(extraEdge.getGovernor(), candidateAppos.getDependent(),
              extraEdge.getRelation(), extraEdge.getWeight(), extraEdge.isExtra()));
        }
      }
    }

    // Brute force ensure tree
    // Remove incoming edges from roots
    List<SemanticGraphEdge> rootIncomingEdges = new ArrayList<>();
    for (IndexedWord root : tree.getRoots()) {
      for (SemanticGraphEdge incomingEdge : tree.incomingEdgeIterable(root)) {
        rootIncomingEdges.add(incomingEdge);
      }
    }
    rootIncomingEdges.forEach(tree::removeEdge);
    // Loop until it becomes a tree.
    boolean changed = true;
    while (changed) {  // I just want trees to be trees; is that so much to ask!?
      changed = false;
      List<IndexedWord> danglingNodes = new ArrayList<>();
      List<SemanticGraphEdge> invalidEdges = new ArrayList<>();

      for (IndexedWord vertex : tree.vertexSet()) {
        // Collect statistics
        Iterator<SemanticGraphEdge> incomingIter = tree.incomingEdgeIterator(vertex);
        boolean hasIncoming = incomingIter.hasNext();
        boolean hasMultipleIncoming = false;
        if (hasIncoming) {
          incomingIter.next();
          hasMultipleIncoming = incomingIter.hasNext();
        }
        // Register actions
        if (!hasIncoming && !tree.getRoots().contains(vertex)) {
          danglingNodes.add(vertex);
        } else {
          if (hasMultipleIncoming) {
            for (SemanticGraphEdge edge : new IterableIterator<>(incomingIter)) {
              invalidEdges.add(edge);
            }
          }
        }
      }

      // Perform actions
      for (IndexedWord vertex : danglingNodes) {
        tree.removeVertex(vertex);
        changed = true;
      }
      for (SemanticGraphEdge edge : invalidEdges) {
        tree.removeEdge(edge);
        changed = true;
      }
    }
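
    // The loop above terminates because each pass only removes material:
    // non-root vertices with no incoming edge are dropped, and every incoming
    // edge after a vertex's first is deleted, until each vertex either is a
    // root or has exactly one parent.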
    // Edge case: remove duplicate dobj to "that."
    // This is a common parse error.
    for (IndexedWord vertex : tree.vertexSet()) {
      SemanticGraphEdge thatEdge = null;
      int dobjCount = 0;
      for (SemanticGraphEdge edge : tree.outgoingEdgeIterable(vertex)) {
        if ("that".equalsIgnoreCase(edge.getDependent().word())) {
          thatEdge = edge;
        }
        if ("dobj".equals(edge.getRelation().toString())) {
          dobjCount += 1;
        }
      }
      if (dobjCount > 1 && thatEdge != null) {
        // Case: there are two dobj edges, one of which goes to the word "that"
        // Action: rewrite the dobj edge to "that" to be a "mark" edge.
        tree.removeEdge(thatEdge);
        tree.addEdge(thatEdge.getGovernor(), thatEdge.getDependent(),
            GrammaticalRelation.valueOf(thatEdge.getRelation().getLanguage(), "mark"),
            thatEdge.getWeight(), thatEdge.isExtra());
      }
    }

    // Return
    assert isTree(tree);
    return extraEdges;
  }

  /**
   * Strip away case edges, if the incoming edge is a preposition.
   * This replicates the behavior of the old Stanford dependencies on universal dependencies.
   *
   * @param tree The tree to modify in place.
   */
  public static void stripPrepCases(SemanticGraph tree) {
    // Find incoming case edges that have an 'nmod' incoming edge
    List<SemanticGraphEdge> toClean = new ArrayList<>();
    for (SemanticGraphEdge edge : tree.edgeIterable()) {
      if ("case".equals(edge.getRelation().toString())) {
        boolean isPrepTarget = false;
        for (SemanticGraphEdge incoming : tree.incomingEdgeIterable(edge.getGovernor())) {
          if ("nmod".equals(incoming.getRelation().getShortName())) {
            isPrepTarget = true;
            break;
          }
        }
        if (isPrepTarget && !tree.outgoingEdgeIterator(edge.getDependent()).hasNext()) {
          toClean.add(edge);
        }
      }
    }

    // Delete these edges
    for (SemanticGraphEdge edge : toClean) {
      tree.removeEdge(edge);
      tree.removeVertex(edge.getDependent());
      assert isTree(tree);
    }
  }

  /**
   * Determine if a tree is cyclic.
   *
   * @param tree The tree to check.
   * @return True if the tree has at least one cycle in it.
   */
  public static boolean isCyclic(SemanticGraph tree) {
    for (IndexedWord vertex : tree.vertexSet()) {
      if (tree.getRoots().contains(vertex)) {
        continue;
      }
      IndexedWord node = tree.incomingEdgeIterator(vertex).next().getGovernor();
      Set<IndexedWord> seen = new HashSet<>();
      seen.add(vertex);
      while (node != null) {
        if (seen.contains(node)) {
          return true;
        }
        seen.add(node);
        if (tree.incomingEdgeIterator(node).hasNext()) {
          node = tree.incomingEdgeIterator(node).next().getGovernor();
        } else {
          node = null;
        }
      }
    }
    return false;
  }
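
  // Note that isCyclic follows only the first incoming edge of each vertex,
  // so it assumes each vertex has at most one parent (as enforced by
  // cleanTree); on an arbitrary graph, a cycle reachable only through a
  // vertex's other incoming edges would be missed.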
  /**
   * A little utility function to make sure a SemanticGraph is a tree.
   *
   * @param tree The tree to check.
   * @return True if this {@link edu.stanford.nlp.semgraph.SemanticGraph} is a tree (versus a DAG, or Graph).
   */
  public static boolean isTree(SemanticGraph tree) {
    for (IndexedWord vertex : tree.vertexSet()) {
      // Check one and only one incoming edge
      if (tree.getRoots().contains(vertex)) {
        if (tree.incomingEdgeIterator(vertex).hasNext()) {
          return false;
        }
      } else {
        Iterator<SemanticGraphEdge> iter = tree.incomingEdgeIterator(vertex);
        if (!iter.hasNext()) {
          return false;
        }
        iter.next();
        if (iter.hasNext()) {
          return false;
        }
      }
      // Check incoming and outgoing edges match
      for (SemanticGraphEdge edge : tree.outgoingEdgeIterable(vertex)) {
        boolean foundReverse = false;
        for (SemanticGraphEdge reverse : tree.incomingEdgeIterable(edge.getDependent())) {
          if (reverse == edge) { foundReverse = true; }
        }
        if (!foundReverse) {
          return false;
        }
      }
      for (SemanticGraphEdge edge : tree.incomingEdgeIterable(vertex)) {
        boolean foundReverse = false;
        for (SemanticGraphEdge reverse : tree.outgoingEdgeIterable(edge.getGovernor())) {
          if (reverse == edge) { foundReverse = true; }
        }
        if (!foundReverse) {
          return false;
        }
      }
    }

    // Check for cycles
    if (isCyclic(tree)) {
      return false;
    }

    // Check topological sort -- sometimes fails?
    //    try {
    //      tree.topologicalSort();
    //    } catch (Exception e) {
    //      e.printStackTrace();
    //      return false;
    //    }
    return true;
  }

  /**
   * Returns true if the given two spans denote the same consistent NER chunk. That is, if we call
   * {@link Util#extractNER(List, Span)} on these two spans, they would return the same span.
   *
   * @param tokens The tokens in the sentence.
   * @param a The first span.
   * @param b The second span.
   * @param parse The parse tree to traverse looking for coreference chains to exploit.
   *
   * @return True if these two spans contain exactly the same NER.
   */
  public static boolean nerOverlap(List<CoreLabel> tokens, Span a, Span b, Optional<SemanticGraph> parse) {
    Span nerA = extractNER(tokens, a);
    Span nerB = extractNER(tokens, b);
    return nerA.equals(nerB);
  }

  /** @see Util#nerOverlap(List, Span, Span, Optional) */
  public static boolean nerOverlap(List<CoreLabel> tokens, Span a, Span b) {
    return nerOverlap(tokens, a, b, Optional.empty());
  }
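
  /*
   * A worked example of nerOverlap (tokens and tags are illustrative): for
   * "Barack Obama" tagged [PERSON, PERSON], both new Span(0, 1) and
   * new Span(1, 2) expand via extractNER to the full chunk [0, 2), so
   * nerOverlap returns true. The parse argument is accepted for the
   * coreference use case described in the Javadoc, but the current
   * implementation does not use it.
   */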
  /**
   * A helper function for dumping the accuracy of the trained classifier.
   *
   * @param classifier The classifier to evaluate.
   * @param dataset The dataset to evaluate the classifier on.
   */
  public static void dumpAccuracy(Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
                                  GeneralDataset<ClauseSplitter.ClauseClassifierLabel, String> dataset) {
    DecimalFormat df = new DecimalFormat("0.00%");
    log("size: " + dataset.size());
    log("split count: " + StreamSupport.stream(dataset.spliterator(), false)
        .filter(x -> x.label() == ClauseSplitter.ClauseClassifierLabel.CLAUSE_SPLIT).collect(Collectors.toList()).size());
    log("interm count: " + StreamSupport.stream(dataset.spliterator(), false)
        .filter(x -> x.label() == ClauseSplitter.ClauseClassifierLabel.CLAUSE_INTERM).collect(Collectors.toList()).size());
    Pair<Double, Double> pr = classifier.evaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.CLAUSE_SPLIT);
    log("p (split): " + df.format(pr.first));
    log("r (split): " + df.format(pr.second));
    log("f1 (split): " + df.format(2 * pr.first * pr.second / (pr.first + pr.second)));
    pr = classifier.evaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.CLAUSE_INTERM);
    log("p (interm): " + df.format(pr.first));
    log("r (interm): " + df.format(pr.second));
    log("f1 (interm): " + df.format(2 * pr.first * pr.second / (pr.first + pr.second)));
  }

  /**
   * The dictionary of privative adjectives, as per http://hci.stanford.edu/cstr/reports/2014-04.pdf
   */
  public static final Set<String> PRIVATIVE_ADJECTIVES = Collections.unmodifiableSet(new HashSet<String>(){{
    add("believed");
    add("debatable");
    add("disputed");
    add("dubious");
    add("hypothetical");
    add("impossible");
    add("improbable");
    add("plausible");
    add("putative");
    add("questionable");
    add("so called");
    add("supposed");
    add("suspicious");
    add("theoretical");
    add("uncertain");
    add("unlikely");
    add("would - be");
    add("apparent");
    add("arguable");
    add("assumed");
    add("likely");
    add("ostensible");
    add("possible");
    add("potential");
    add("predicted");
    add("presumed");
    add("probable");
    add("seeming");
    add("anti");
    add("fake");
    add("fictional");
    add("fictitious");
    add("imaginary");
    add("mythical");
    add("phony");
    add("false");
    add("artificial");
    add("erroneous");
    add("mistaken");
    add("mock");
    add("pseudo");
    add("simulated");
    add("spurious");
    add("deputy");
    add("faulty");
    add("virtual");
    add("doubtful");
    add("erstwhile");
    add("ex");
    add("expected");
    add("former");
    add("future");
    add("onetime");
    add("past");
    add("proposed");
  }});

  /**
   * Construct the spanning span of the given list of tokens.
   *
   * @param tokens The tokens that should define the span.
   * @return A span (0-indexed) that covers all of the tokens.
   */
  public static Span tokensToSpan(List<? extends HasIndex> tokens) {
    int min = Integer.MAX_VALUE;
    int max = Integer.MIN_VALUE;
    for (HasIndex token : tokens) {
      min = Math.min(token.index() - 1, min);
      max = Math.max(token.index(), max);
    }
    if (min < 0 || max == Integer.MAX_VALUE) {
      throw new IllegalArgumentException("Could not compute span from tokens!");
    } else if (min >= max) {
      throw new IllegalStateException("Either logic is broken or Gabor can't code.");
    } else {
      return new Span(min, max);
    }
  }

}



