edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.ie.machinereading.structure; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.Set;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;

/**
 * Utilities to manipulate Annotations storing datasets or sentences with Machine Reading info
 * @author Mihai
 *
 */
public class AnnotationUtils  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(AnnotationUtils.class);
  private AnnotationUtils() {} // only static methods

  /**
   * Given a list of sentences (as CoreMaps), wrap it in a new Annotation.
   */
  public static Annotation createDataset(List sentences) {
    Annotation dataset = new Annotation("");
    addSentences(dataset,sentences);
    return dataset;
  }

  /**
   * Randomized shuffle of all sentences int this dataset
   * @param dataset
   */
  public static void shuffleSentences(CoreMap dataset) {
    List sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);

    // we use a constant seed for replicability of experiments
    Collections.shuffle(sentences, new Random(0));

    dataset.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  }

  /**
   * Converts the labels of all entity mentions in this dataset to sequences of CoreLabels
   * @param dataset
   * @param annotationsToSkip
   * @param useSubTypes
   */
  public static List> entityMentionsToCoreLabels(CoreMap dataset, Set annotationsToSkip, boolean useSubTypes, boolean useBIO) {
    List> retVal = new ArrayList<>();
    List sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
      List labeledSentence = sentenceEntityMentionsToCoreLabels(sentence, true, annotationsToSkip, null, useSubTypes, useBIO);
      assert(labeledSentence != null);
      retVal.add(labeledSentence);
    }

    return retVal;
  }

  /**
   * Converts the labels of all entity mentions in this sentence to sequences of CoreLabels
   * @param sentence
   * @param addAnswerAnnotation
   * @param annotationsToSkip
   * @param useSubTypes
   */
  public static List sentenceEntityMentionsToCoreLabels(
      CoreMap sentence,
      boolean addAnswerAnnotation,
      Set annotationsToSkip,
      Set mentionTypesToUse,
      boolean useSubTypes,
      boolean useBIO) {
    /*
    Tree completeTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
    if(completeTree == null){
      throw new RuntimeException("ERROR: TreeAnnotation MUST be set before calling this method!");
    }
    */

    //
    // Set TextAnnotation and PartOfSpeechAnnotation (using the parser data)
    //
    /*
    List labels = new ArrayList();
    List tokenList = completeTree.getLeaves();
    for (Tree tree : tokenList) {
      Word word = new Word(tree.label());
      CoreLabel label = new CoreLabel();
      label.set(CoreAnnotations.TextAnnotation.class, word.value());
      if (addAnswerAnnotation) {
        label.set(CoreAnnotations.AnswerAnnotation.class,
            SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
      }
      label.set(CoreAnnotations.PartOfSpeechAnnotation.class, tree.parent(completeTree).label().value());
      labels.add(label);
    }
    */
    // use the token CoreLabels not the parser data => more robust
    List labels = new ArrayList<>();
    for(CoreLabel l: sentence.get(CoreAnnotations.TokensAnnotation.class)){
      CoreLabel nl = new CoreLabel(l);
      if (addAnswerAnnotation) {
        nl.set(CoreAnnotations.AnswerAnnotation.class, SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
      }
      labels.add(nl);
    }

    // Add AnswerAnnotation from the types of the entity mentions
    if (addAnswerAnnotation) {
      List entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
      if(entities != null){
        for (EntityMention entity : entities) {
          // is this a type that we should skip?
          if(annotationsToSkip != null && annotationsToSkip.contains(entity.getType())) continue;
          // is this a valid mention type?
          if(mentionTypesToUse != null && ! mentionTypesToUse.contains(entity.getMentionType())) continue;

          // ignore entities without head span
          if(entity.getHead() != null){
            for(int i = entity.getHeadTokenStart(); i < entity.getHeadTokenEnd(); i ++){
              String tag = entity.getType();
              if(useSubTypes && entity.getSubType() != null) tag += "-" + entity.getSubType();
              if(useBIO){
                if(i == entity.getHeadTokenStart()) tag = "B-" + tag;
                else tag = "I-" + tag;
              }
              labels.get(i).set(CoreAnnotations.AnswerAnnotation.class, tag);
            }
          }
        }
      }
    }

    /*
    // Displaying the CoreLabels generated for this sentence
    log.info("sentence to core labels:");
    for(CoreLabel l: labels){
      log.info(" " + l.word() + "/" + l.getString(CoreAnnotations.PartOfSpeechAnnotation.class));
      String tag = l.getString(CoreAnnotations.AnswerAnnotation.class);
      if(tag != null && ! tag.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)){
        log.info("/" + tag);
      }
    }
    log.info();
    */

    return labels;
  }

  public static CoreMap getSentence(CoreMap dataset, int i) {
    return dataset.get(CoreAnnotations.SentencesAnnotation.class).get(i);
  }

  public static int sentenceCount(CoreMap dataset) {
    List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    if(sents != null) return sents.size();
    return 0;
  }

  public static void addSentence(CoreMap dataset, CoreMap sentence) {
    List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    if(sents == null){
      sents = new ArrayList<>();
      dataset.set(CoreAnnotations.SentencesAnnotation.class, sents);
    }
    sents.add(sentence);
  }

  public static void addSentences(CoreMap dataset, List sentences) {
    List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    if(sents == null){
      sents = new ArrayList<>();
      dataset.set(CoreAnnotations.SentencesAnnotation.class, sents);
    }
    for(CoreMap sentence: sentences){
      sents.add(sentence);
    }
  }

  /**
   * Creates a deep copy of the given dataset with new lists for all mentions (entity, relation, event)
   * @param dataset
   */
  public static Annotation deepMentionCopy(CoreMap dataset) {
    Annotation newDataset = new Annotation("");

    List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    List newSents = new ArrayList<>();
    if(sents != null){
      for(CoreMap sent: sents){
        if(! (sent instanceof Annotation)){
          throw new RuntimeException("ERROR: Sentences must instantiate Annotation!");
        }
        CoreMap newSent = sentenceDeepMentionCopy((Annotation) sent);
        newSents.add(newSent);
      }
    }

    addSentences(newDataset, newSents);
    return newDataset;
  }

  /**
   * Deep copy of the sentence: we create new entity/relation/event lists here.
   * However,  we do not deep copy the ExtractionObjects themselves!
   * @param sentence
   */
  public static Annotation sentenceDeepMentionCopy(Annotation sentence) {
    Annotation newSent = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));

    newSent.set(CoreAnnotations.TokensAnnotation.class, sentence.get(CoreAnnotations.TokensAnnotation.class));
    newSent.set(TreeCoreAnnotations.TreeAnnotation.class, sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    newSent.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class));
    newSent.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class));
    newSent.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class));
    newSent.set(CoreAnnotations.DocIDAnnotation.class, sentence.get(CoreAnnotations.DocIDAnnotation.class));

    // deep copy of all mentions lists
    List ents = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(ents != null) newSent.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, new ArrayList<>(ents));
    List rels = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(rels != null) newSent.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, new ArrayList<>(rels));
    List evs = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
    if(evs != null) newSent.set(MachineReadingAnnotations.EventMentionsAnnotation.class, new ArrayList<>(evs));

    return newSent;
  }

  /**
   * Return the relation that holds between the given entities.
   * Return a relation of type UNRELATED if this sentence contains no relation between the entities.
   */
  public static RelationMention getRelation(RelationMentionFactory factory, CoreMap sentence, ExtractionObject ... args) {
    return getRelations(factory, sentence, args).get(0);
  }

  /**
   * Return all the relations that holds between the given entities.
   * Returns a list containing a relation of type UNRELATED if this sentence contains no relation between the entities.
   */
  public static List getRelations(RelationMentionFactory factory, CoreMap sentence, ExtractionObject... args) {
    List relationMentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    List matchingRelationMentions = new ArrayList<>();
    if (relationMentions != null) {
      for (RelationMention rel : relationMentions) {
        if (rel.argsMatch(args)) {
          matchingRelationMentions.add(rel);
        }
      }
    }
    if (matchingRelationMentions.size() == 0) {
      matchingRelationMentions.add(RelationMention.createUnrelatedRelation(factory, args));
    }
    return matchingRelationMentions;
  }

  /**
   * Get list of all relations and non-relations between EntityMentions in this sentence
   * Use with care. This is an expensive call due to getAllUnrelatedRelations, which creates all non-existing relations between all entity mentions
   */
  public static List getAllRelations(RelationMentionFactory factory, CoreMap sentence, boolean createUnrelatedRelations) {
    List relationMentions = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    List allRelations = new ArrayList<>();
    if(relationMentions != null) allRelations.addAll(relationMentions);
    if(createUnrelatedRelations){
      allRelations.addAll(getAllUnrelatedRelations(factory, sentence, true));
    }
    return allRelations;
  }

  public static List getAllUnrelatedRelations(RelationMentionFactory factory, CoreMap sentence, boolean checkExisting) {

    List relationMentions = (checkExisting ? sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class) : null);
    List entityMentions = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    List nonRelations = new ArrayList<>();

    //
    // scan all possible arguments
    //
    if(entityMentions != null){
      for(int i = 0; i < entityMentions.size(); i ++){
        for(int j = 0; j < entityMentions.size(); j ++){
          if(i == j) continue;
          EntityMention arg1 = entityMentions.get(i);
          EntityMention arg2 = entityMentions.get(j);
          boolean match = false;
          if(relationMentions != null){
            for (RelationMention rel : relationMentions) {
              if (rel.argsMatch(arg1, arg2)) {
                match = true;
                break;
              }
            }
          }
          if (match == false) {
          	nonRelations.add(RelationMention.createUnrelatedRelation(factory, arg1,arg2));
          }
        }
      }
    }

    return nonRelations;
  }

  public static void addEntityMention(CoreMap sentence, EntityMention arg) {
    List l = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, l);
    }
    l.add(arg);
  }

  public static void addEntityMentions(CoreMap sentence, Collection args) {
    List l = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, l);
    }
    l.addAll(args);
  }

  public List getEntityMentions(CoreMap sent) {
    return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class));
  }

  public static void addRelationMention(CoreMap sentence, RelationMention arg) {
    List l = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, l);
    }
    l.add(arg);
  }

  public static void addRelationMentions(CoreMap sentence, Collection args) {
    List l = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, l);
    }
    l.addAll(args);
  }

  public List getRelationMentions(CoreMap sent) {
    return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class));
  }

  public static void addEventMention(CoreMap sentence, EventMention arg) {
    List l = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, l);
    }
    l.add(arg);
  }

  public static void addEventMentions(CoreMap sentence, Collection args) {
    List l = sentence.get(MachineReadingAnnotations.EventMentionsAnnotation.class);
    if(l == null){
      l = new ArrayList<>();
      sentence.set(MachineReadingAnnotations.EventMentionsAnnotation.class, l);
    }
    l.addAll(args);
  }

  public List getEventMentions(CoreMap sent) {
    return Collections.unmodifiableList(sent.get(MachineReadingAnnotations.EventMentionsAnnotation.class));
  }

	/**
   * Prepare a string for printing in a spreadsheet for Mechanical Turk input.
   * @param s String to be formatted
   * @return String string enclosed in quotes with other quotes escaped, and with better formatting for readability by Turkers.
   */
  public static String prettify(String s) {
    if (s==null) return "";
    return s.replace(
        " ,",",").replace(
            " .",".").replace(
                " :",":").replace(
                    "( ","(").replace(
                        "[ ","[").replace(
                            " )",")").replace(
                                " ]","]").replace(
                                    " - ","-").replace(
                                        " '","'").replace(
                                            "-LRB- ","(").replace(
                                                " -RRB-",")").replace(
                                                    "` ` ","\"").replace(
                                                        " ' '","\"").replace(
                                                            " COMMA",",");
  }

  /**
   * Fetches the sentence text in a given token span
   * @param span
   */
  public static String getTextContent(CoreMap sent, Span span) {
    List tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
    StringBuffer buf = new StringBuffer();
    assert(span != null);
    for(int i = span.start(); i < span.end(); i ++){
      if(i > span.start()) buf.append(" ");
      buf.append(tokens.get(i).word());
    }
    return buf.toString();
  }

  public static String sentenceToString(CoreMap sent) {
    StringBuilder sb = new StringBuilder(512);
    List tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
    sb.append("\"" + StringUtils.join(tokens, " ") + "\"");
    sb.append("\n");

    List relationMentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(relationMentions != null){
      for (RelationMention rel : relationMentions) {
        sb.append("\n");
        sb.append(rel);
      }
    }

    // TODO: add entity and event mentions

    return sb.toString();
  }

  public static String tokensAndNELabelsToString(CoreMap sentence) {
    StringBuffer os = new StringBuffer();
    List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if(tokens != null){
      boolean first = true;
      for(CoreLabel token: tokens) {
        if(! first) os.append(" ");
        os.append(token.word());
        if(token.ner() != null && ! token.ner().equals("O")){
          os.append("/" + token.ner());
        }
        first = false;
      }
    }
    return os.toString();
  }

  public static String datasetToString(CoreMap dataset){
    List sents = dataset.get(CoreAnnotations.SentencesAnnotation.class);
    StringBuffer b = new StringBuffer();
    if(sents != null){
      for(CoreMap sent: sents){
        b.append(sentenceToString(sent));
      }
    }
    return b.toString();
  }

  /*
  public static List wordsToCoreLabels(List words) {
    List labels = new ArrayList();
    for(Word word: words){
      CoreLabel l = new CoreLabel();
      l.setWord(word.word());
      l.set(CoreAnnotations.TextAnnotation.class, word.word());
      l.setBeginPosition(word.beginPosition());
      l.setEndPosition(word.endPosition());
      labels.add(l);
    }
    return labels;
  }
  */

  public static String tokensToString(List tokens) {
    StringBuffer os = new StringBuffer();
    boolean first = true;
    for(CoreLabel t: tokens){
      if(! first) os.append(" ");
      os.append(t.word() + "{" + t.beginPosition() + ", " + t.endPosition() + "}");
      first = false;
    }
    return os.toString();
  }

  /*
  public static boolean sentenceContainsSpan(CoreMap sentence, Span span) {
    List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    int sentenceStart = tokens.get(0).beginPosition();
    int sentenceEnd = tokens.get(tokens.size() - 1).endPosition();
    return sentenceStart <= span.start() && sentenceEnd >= span.end();
  }
  */

  /*
   * Shift the character offsets of all tokens by offset.
   */
  public static void updateOffsets(List tokens, int offset) {
    for(Word l: tokens) {
      l.setBeginPosition(l.beginPosition() + offset);
      l.setEndPosition(l.endPosition() + offset);
    }
  }

  /*
   * Shift the character offsets of all tokens by offset.
   */
  public static void updateOffsetsInCoreLabels(List tokens, int offset) {
    for(CoreLabel l: tokens) {
      l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, l.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) + offset);
      l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, l.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) + offset);
    }
  }

  /**
   * Process string to be a cell in Excel file.
   * Escape any quotes in the string and enclose the whole string with quotes.
   */
  public static String excelify(String s) {
    return '"'+s.replace("\"","\"\"")+'"';
  }

  public static List readSentencesFromFile(String path) throws IOException, ClassNotFoundException {
    Annotation doc = (Annotation) IOUtils.readObjectFromFile(path);
    return doc.get(CoreAnnotations.SentencesAnnotation.class);

  }

}