package edu.stanford.nlp.ie.machinereading.domains.ace; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import edu.stanford.nlp.ie.machinereading.GenericDataSetReader;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceCharSeq;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceDocument;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntity;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEventMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMention;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceRelationMentionArgument;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken;
import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.EventMention;
import edu.stanford.nlp.ie.machinereading.structure.ExtractionObject;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;

/**
 *
 * Simple wrapper that converts Mihai's ACE reader output into ie.machinereading.structure objects.
 *
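 * <p>
 * A minimal usage sketch, mirroring the {@code main} method below (the corpus path is illustrative):
 * <pre>
 *   StanfordCoreNLP pipeline = new StanfordCoreNLP(new Properties(), false);
 *   AceReader reader = new AceReader(pipeline, false);
 *   reader.setLoggerLevel(Level.INFO);
 *   Annotation corpus = reader.parse("/path/to/ACE2005");
 * </pre>
 *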
 * @author David McClosky
 *
 */
public class AceReader extends GenericDataSetReader  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AceReader.class);

  private final Counter<String> entityCounts;
  private final Counter<String> adjacentEntityMentions;
  private final Counter<String> relationCounts;
  private final Counter<String> nameRelationCounts;
  private final Counter<String> eventCounts;
  private final Counter<String> mentionTypeCounts;
  private final String aceVersion;
  private static final boolean VERBOSE = false;

  /**
   * Make an AceReader.
   */
  public AceReader() {
    this(null, true);
  }

  public AceReader(StanfordCoreNLP processor, boolean preprocess) {
    this(processor, preprocess, "ACE2005");
  }

  public AceReader(StanfordCoreNLP processor, boolean preprocess, String version) {
    super(processor, preprocess, false, true);

    entityCounts = new ClassicCounter<>();
    adjacentEntityMentions = new ClassicCounter<>();
    nameRelationCounts = new ClassicCounter<>();
    relationCounts = new ClassicCounter<>();
    eventCounts = new ClassicCounter<>();
    mentionTypeCounts = new ClassicCounter<>();

    logger = Logger.getLogger(AceReader.class.getName());
    // run quietly by default
    logger.setLevel(Level.SEVERE);

    aceVersion = version;
  }

  /**
   * Reads in ACE*.apf.xml files and converts them to RelationSentence objects.
   * Note that you probably should call parse() instead.
   *
   * Currently, this ignores document boundaries (the list returned will include
   * sentences from all documents).
   *
   * @param path directory containing ACE files to read (e.g.
   *          "/home/mcclosky/scr/data/ACE2005/english_test"). This can also be
   *          the path to a single file.
   * @return list of RelationSentence objects
   */
  @Override
  public Annotation read(String path) throws IOException, SAXException, ParserConfigurationException {
    List<CoreMap> allSentences = new ArrayList<>();
    File basePath = new File(path);
    assert basePath.exists();
    Annotation corpus = new Annotation("");

    if (basePath.isDirectory()) {
      for (File aceFile : IOUtils.iterFilesRecursive(basePath, ".apf.xml")) {
        if (aceFile.getName().endsWith(".UPC1.apf.xml")) {
          continue;
        }
        allSentences.addAll(readDocument(aceFile, corpus));
      }
    } else {
      // in case it's a file
      allSentences.addAll(readDocument(basePath, corpus));
    }

    AnnotationUtils.addSentences(corpus, allSentences);

    // quick stats
    if (VERBOSE) {
      printCounter(entityCounts, "entity mention");
      printCounter(relationCounts, "relation mention");
      printCounter(eventCounts, "event mention");
    }


    for(CoreMap sent: allSentences){
      // check for entity mentions of the same type that are adjacent
      countAdjacentMentions(sent);
      // count relations between two proper nouns
      countNameRelations(sent);
      // count types of mentions
      countMentionTypes(sent);
    }
    if (VERBOSE) {
      printCounter(adjacentEntityMentions, "adjacent entity mention");
      printCounter(nameRelationCounts, "name relation mention");
      printCounter(mentionTypeCounts, "mention type counts");
    }

    return corpus;
  }

  private void countMentionTypes(CoreMap sent) {
    List<EntityMention> mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(mentions != null){
      for(EntityMention m: mentions){
        mentionTypeCounts.incrementCount(m.getMentionType());
      }
    }
  }

  private void countNameRelations(CoreMap sent) {
    List<RelationMention> mentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    if(mentions != null){
      for(RelationMention m: mentions) {
        List<EntityMention> args = m.getEntityMentionArgs();
        if(args.size() == 2 && args.get(0).getMentionType().equals("NAM") && args.get(1).getMentionType().equals("NAM")){
          nameRelationCounts.incrementCount(m.getType() + "." + m.getSubType());
        }
      }
    }
  }

  private void countAdjacentMentions(CoreMap sent) {
    List<EntityMention> mentions = sent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    if(mentions != null){
      for(EntityMention m1: mentions){
        for(EntityMention m2: mentions){
          if(m1 == m2) continue;
          if(m1.getHeadTokenEnd() == m2.getHeadTokenStart() && m1.getType().equals(m2.getType())){
            adjacentEntityMentions.incrementCount(m1.getType());
          }
        }
      }
    }
  }

  // todo: Change to use a counters print method (get sorting for free!)
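  //   A sketch of that alternative, assuming Counters.toSortedString(Counter, int, String, String)
  //   from edu.stanford.nlp.stats.Counters (illustrative only, not wired in here):
  //     logger.info(h + " counts:\n" + Counters.toSortedString(c, c.size(), "\t%s: %.1f", "\n"));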
  private void printCounter(Counter<String> c, String h) {
    StringBuilder b = new StringBuilder();
    b.append(h).append(" counts:\n");
    Set<String> keys = c.keySet();
    for(String k: keys){
      b.append("\t").append(k).append(": ").append(c.getCount(k)).append("\n");
    }
    logger.info(b.toString());
  }

  /**
   * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
   * objects. However, you probably should call parse() instead.
   *
   * @param file A file object of an ACE file
   * @return list of RelationSentence objects
   */
  private List<CoreMap> readDocument(File file, Annotation corpus) throws IOException, SAXException,
      ParserConfigurationException {
    // remove the extension to make it into a prefix
    String aceFilename = file.getAbsolutePath().replace(".apf.xml", "");
    List<CoreMap> sentencesFromFile = readDocument(aceFilename, corpus);
    return sentencesFromFile;
  }

  /**
   * Reads in a single ACE*.apf.xml file and converts it to RelationSentence
   * objects. However, you probably should call parse() instead.
   *
   * @param prefix prefix of the ACE filename to read (e.g.
   *          "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"),
   *          without the ".apf.xml" extension
   * @return list of RelationSentence objects
   */
  private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOException, SAXException,
      ParserConfigurationException {
    logger.info("Reading document: " + prefix);
    List<CoreMap> results = new ArrayList<>();
    AceDocument aceDocument;
    if(aceVersion.equals("ACE2004")){
      aceDocument = AceDocument.parseDocument(prefix, false, aceVersion);
    } else {
      aceDocument = AceDocument.parseDocument(prefix, false);
    }
    String docId = aceDocument.getId();

    // map entity mention ID strings to their EntityMention counterparts
    Map<String, EntityMention> entityMentionMap = Generics.newHashMap();

    /*
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
      List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
      StringBuffer b = new StringBuffer();
      for(AceToken t: tokens) b.append(t.getLiteral() + " " );
      logger.info("SENTENCE: " + b.toString());
    }
    */

    int tokenOffset = 0;
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
      List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);

      List<CoreLabel> words = new ArrayList<>();
      StringBuilder textContent = new StringBuilder();
      for(int i = 0; i < tokens.size(); i++){
        CoreLabel l = new CoreLabel();
        l.setWord(tokens.get(i).getLiteral());
        l.set(CoreAnnotations.ValueAnnotation.class, l.word());
        l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
        l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
        words.add(l);
        if(i > 0) textContent.append(" ");
        textContent.append(tokens.get(i).getLiteral());
      }

      // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
      if (words.size() == 1) {
        String word = words.get(0).word();
        if (word.startsWith("<") && word.endsWith(">")) {
          tokenOffset += tokens.size();
          continue;
        }
      }

      CoreMap sentence = new Annotation(textContent.toString());
      sentence.set(CoreAnnotations.DocIDAnnotation.class, docId);
      sentence.set(CoreAnnotations.TokensAnnotation.class, words);
      logger.info("Reading sentence: \"" + textContent + "\"");

      List<AceEntityMention> entityMentions = aceDocument.getEntityMentions(sentenceIndex);
      List<AceRelationMention> relationMentions = aceDocument.getRelationMentions(sentenceIndex);
      List<AceEventMention> eventMentions = aceDocument.getEventMentions(sentenceIndex);

      // convert entity mentions
      for (AceEntityMention aceEntityMention : entityMentions) {
        String corefID="";
        for(String entityID : aceDocument.getKeySetEntities()){
          AceEntity e = aceDocument.getEntity(entityID);
          if(e.getMentions().contains(aceEntityMention)){
            corefID = entityID;
            break;
          }
        }
        EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
//        EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
        entityCounts.incrementCount(convertedMention.getType());
        logger.info("CONVERTED MENTION HEAD SPAN: " + convertedMention.getHead());
        logger.info("CONVERTED ENTITY MENTION: " + convertedMention);
        AnnotationUtils.addEntityMention(sentence, convertedMention);
        entityMentionMap.put(aceEntityMention.getId(), convertedMention);

        // TODO: make Entity objects as needed
      }

      // convert relation mentions
      for (AceRelationMention aceRelationMention : relationMentions) {
        RelationMention convertedMention = convertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
        if(convertedMention != null){
          relationCounts.incrementCount(convertedMention.getType());
          logger.info("CONVERTED RELATION MENTION: " + convertedMention);
          AnnotationUtils.addRelationMention(sentence, convertedMention);
        }

        // TODO: make Relation objects
      }

      // convert EventMentions
      for(AceEventMention aceEventMention: eventMentions){
        EventMention convertedMention = convertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
        if(convertedMention != null){
          eventCounts.incrementCount(convertedMention.getType());
          logger.info("CONVERTED EVENT MENTION: " + convertedMention);
          AnnotationUtils.addEventMention(sentence, convertedMention);
        }

        // TODO: make Event objects
      }

      results.add(sentence);
      tokenOffset += tokens.size();
    }
    return results;
  }

  private EventMention convertAceEventMention(
      AceEventMention aceEventMention, String docId,
      CoreMap sentence, Map<String, EntityMention> entityMap,
      int tokenOffset) {
    Set<String> roleSet = aceEventMention.getRoles();
    List<String> roles = new ArrayList<>();
    for(String role: roleSet) roles.add(role);
    List<ExtractionObject> convertedArgs = new ArrayList<>();

    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for(String role: roles){
      AceEntityMention arg = aceEventMention.getArg(role);
      ExtractionObject o = entityMap.get(arg.getId());
      if(o == null){
        logger.severe("READER ERROR: Failed to find event argument with id " + arg.getId());
        logger.severe("This happens because a few event mentions illegally span multiple sentences. Will ignore this mention.");
        return null;
      }
      convertedArgs.add(o);
      if(o.getExtentTokenStart() < left) left = o.getExtentTokenStart();
      if(o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd();
    }

    AceCharSeq anchor = aceEventMention.getAnchor();
    ExtractionObject anchorObject = new ExtractionObject(
        aceEventMention.getId() + "-anchor",
        sentence,
        new Span(anchor.getTokenStart() - tokenOffset, anchor.getTokenEnd() + 1 - tokenOffset),
        "ANCHOR",
        null);

    EventMention em = new EventMention(
        aceEventMention.getId(),
        sentence,
        new Span(left, right),
        aceEventMention.getParent().getType(),
        aceEventMention.getParent().getSubtype(),
        anchorObject,
        convertedArgs,
        roles);
    return em;
  }

  private RelationMention convertAceRelationMention(AceRelationMention aceRelationMention, String docId,
      CoreMap sentence, Map<String, EntityMention> entityMap) {
    List<AceRelationMentionArgument> args = Arrays.asList(aceRelationMention.getArgs());
    List<ExtractionObject> convertedArgs = new ArrayList<>();
    List<String> argNames = new ArrayList<>();

    // the arguments are already stored in semantic order. Make sure we preserve the same ordering!
    int left = Integer.MAX_VALUE;
    int right = Integer.MIN_VALUE;
    for (AceRelationMentionArgument arg : args) {
      ExtractionObject o = entityMap.get(arg.getContent().getId());
      if(o == null){
        logger.severe("READER ERROR: Failed to find relation argument with id " + arg.getContent().getId());
        logger.severe("This happens because a few relation mentions illegally span multiple sentences. Will ignore this mention.");
        return null;
      }
      convertedArgs.add(o);
      argNames.add(arg.getRole());
      if(o.getExtentTokenStart() < left) left = o.getExtentTokenStart();
      if(o.getExtentTokenEnd() > right) right = o.getExtentTokenEnd();
    }

    if(argNames.size() != 2 || ! argNames.get(0).equalsIgnoreCase("arg-1") || ! argNames.get(1).equalsIgnoreCase("arg-2")){
      logger.severe("READER ERROR: Invalid succession of arguments in relation mention: " + argNames);
      logger.severe("ACE relations must have two arguments. Will ignore this mention.");
      return null;
    }

    RelationMention relation = new RelationMention(
        aceRelationMention.getId(),
        sentence,
        new Span(left, right),
        aceRelationMention.getParent().getType(),
        aceRelationMention.getParent().getSubtype(),
        convertedArgs,
        null);
    return relation;
  }

  /**
   * Convert an {@link AceEntityMention} to an {@link EntityMention}.
   *
   * @param entityMention {@link AceEntityMention} to convert
   * @param docId ID of the document containing this entity mention
   * @param sentence The sentence (a CoreMap) that contains this entity mention
   * @param tokenOffset An offset used to convert token positions from document-absolute to
   *                    sentence-relative (the ace.reader stores absolute token offsets from the
   *                    beginning of the document, but we need token offsets from the beginning
   *                    of the sentence, so we adjust by tokenOffset).
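   *                    For illustration: if the sentence begins at document token 37 and a span
   *                    covers document tokens 40..42 (inclusive), the sentence-relative span is
   *                    [3, 6) after subtracting tokenOffset and adding 1 for the exclusive end.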
   * @return entity as an {@link EntityMention}
   */
  private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset) {
    //log.info("TYPE is " + entityMention.getParent().getType());
    //log.info("SUBTYPE is " + entityMention.getParent().getSubtype());
    //log.info("LDCTYPE is " + entityMention.getLdctype());

    AceCharSeq ext = entityMention.getExtent();
    AceCharSeq head = entityMention.getHead();

    int extStart = ext.getTokenStart() - tokenOffset;
    int extEnd = ext.getTokenEnd() - tokenOffset + 1;
    if (extStart < 0) {
      logger.severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
      extStart = 0;
    }
    if (extEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
      logger.severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
      extEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
    }

    int headStart = head.getTokenStart() - tokenOffset;
    int headEnd = head.getTokenEnd() - tokenOffset + 1;
    if (headStart < 0) {
      logger.severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
      headStart = 0;
    }
    if(headEnd > sentence.get(CoreAnnotations.TokensAnnotation.class).size()){
      logger.severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.getId() + " in document " + docId + " in sentence " + sentence);
      logger.severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
      headEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).size();
    }

    // must adjust due to possible incorrect EOS detection
    if(headStart < extStart){
      headStart = extStart;
    }
    if(headEnd > extEnd){
      headEnd = extEnd;
    }
    assert(headStart < headEnd);

    // note: the ace.reader stores absolute token offset from the beginning of the document, but
    //       we need token offsets from the beginning of the sentence => adjust by tokenOffset
    // note: in ace.reader the end token position is inclusive, but
    //       in our setup the end token position is exclusive => add 1 to end
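    //       i.e., spanStart = getTokenStart() - tokenOffset and spanEnd = getTokenEnd() - tokenOffset + 1,
    //       matching the extStart/extEnd and headStart/headEnd computations above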
    EntityMention converted = new EntityMention(
        entityMention.getId(),
        sentence,
        new Span(extStart, extEnd),
        new Span(headStart, headEnd),
        entityMention.getParent().getType(),
        entityMention.getParent().getSubtype(),
        entityMention.getLdctype());
    return converted;
  }

  private EntityMention convertAceEntityMention(AceEntityMention entityMention, String docId, CoreMap sentence, int tokenOffset, String corefID) {
    EntityMention converted = convertAceEntityMention(entityMention, docId, sentence, tokenOffset);
    converted.setCorefID(corefID);
    return converted;
  }

  // simple testing code
  public static void main(String[] args) throws IOException {
    Properties props = StringUtils.argsToProperties(args);
    AceReader r = new AceReader(new StanfordCoreNLP(props, false), false);
    r.setLoggerLevel(Level.INFO);
    r.parse("/scr/nlp/data/ACE2005/");
    // Annotation a = r.parse("/user/mengqiu/scr/twitter/nlp/corpus_prep/standalone/ar/data");
    // BasicEntityExtractor.saveCoNLLFiles("/tmp/conll", a, false, false);
    log.info("done");
  }

}



