/*
 * edu.stanford.nlp.coref.hybrid.HybridCorefSystem (from the stanford-corenlp artifact; Maven / Gradle / Ivy)
 *
 * Stanford CoreNLP provides a set of natural language analysis tools which can take raw English
 * language text input and give the base forms of words, their parts of speech, whether they are
 * names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the
 * structure of sentences in terms of phrases and word dependencies, and indicate which noun
 * phrases refer to the same entities. It provides the foundational building blocks for
 * higher-level text understanding applications.
 *
 * Note: there is a newer version of this artifact: 4.5.7
 */
package edu.stanford.nlp.coref.hybrid;

import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Logger;

import edu.stanford.nlp.coref.CorefAlgorithm;
import edu.stanford.nlp.coref.CorefPrinter;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.CorefScorer;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.DocumentMaker;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.hybrid.sieve.Sieve;
import edu.stanford.nlp.coref.hybrid.sieve.Sieve.ClassifierType;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.logging.RedwoodConfiguration;

public class HybridCorefSystem implements CorefAlgorithm {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(HybridCorefSystem.class);

  public Properties props;
  public List sieves;
  public Dictionaries dictionaries;
  public DocumentMaker docMaker = null;

  public HybridCorefSystem(Properties props, Dictionaries dictionaries) throws Exception {
    this.props = props;
    this.dictionaries = dictionaries;
    sieves = Sieve.loadSieves(props);

    // set semantics loading
    for(Sieve sieve : sieves) {
      if(sieve.classifierType == ClassifierType.RULE) continue;
      if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) {
        props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true");
      }
    }
  }

  public HybridCorefSystem(Properties props) throws Exception {
    this.props = props;
    sieves = Sieve.loadSieves(props);

    // set semantics loading
    for(Sieve sieve : sieves) {
      if(sieve.classifierType == ClassifierType.RULE) continue;
      if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) {
        props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true");
      }
    }
    dictionaries = new Dictionaries(props);

    docMaker = new DocumentMaker(props, dictionaries);
  }

  public Dictionaries dictionaries() { return dictionaries; }


  public static void runCoref(String[] args) throws Exception {
      runCoref(StringUtils.argsToProperties(args));
  }

  public static void runCoref(Properties props) throws Exception {
   /*
    * property, environment setting
    */
    Redwood.hideChannelsEverywhere(
            "debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres",
            "debug-featureselection", "debug-md"
            );
    int nThreads = HybridCorefProperties.getThreadCounts(props);
    String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");

    Logger logger = Logger.getLogger(HybridCorefSystem.class.getName());

    // set log file path
    if(props.containsKey(HybridCorefProperties.LOG_PROP)){
      File logFile = new File(props.getProperty(HybridCorefProperties.LOG_PROP));
      RedwoodConfiguration.current().handlers(
      RedwoodConfiguration.Handlers.file(logFile)).apply();
      Redwood.log("Starting coref log");
    }

    log.info(props.toString());

    if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage();

    HybridCorefSystem cs = new HybridCorefSystem(props);

    /*
       output setting
    */
    // prepare conll output
    String goldOutput = null;
    String beforeCorefOutput = null;
    String afterCorefOutput = null;
    PrintWriter writerGold = null;
    PrintWriter writerBeforeCoref = null;
    PrintWriter writerAfterCoref = null;
    if (HybridCorefProperties.doScore(props)) {
      String pathOutput = CorefProperties.conllOutputPath(props);
      (new File(pathOutput)).mkdir();
      goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt";
      beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
      afterCorefOutput = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
      writerGold = new PrintWriter(new FileOutputStream(goldOutput));
      writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
      writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
    }

    // run coref
    MulticoreWrapper, StringBuilder[]> wrapper = new MulticoreWrapper<>(
            nThreads, new ThreadsafeProcessor, StringBuilder[]>() {
      @Override
      public StringBuilder[] process(Pair input) {
        try {
          Document document = input.first;
          HybridCorefSystem cs = input.second;

          StringBuilder[] outputs = new StringBuilder[4];    // conll output and logs

          cs.coref(document, outputs);

          return outputs;

        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      @Override
      public ThreadsafeProcessor, StringBuilder[]> newInstance() {
        return this;
      }
    });

    Date startTime = null;
    if(HybridCorefProperties.checkTime(props)) {
      startTime = new Date();
      System.err.printf("END-TO-END COREF Start time: %s\n", startTime);
    }

    // run processes
    int docCnt = 0;
    while (true) {
      Document document = cs.docMaker.nextDoc();
      if (document == null) break;
      wrapper.put(Pair.makePair(document, cs));
      docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
    }

    // Finished reading the input. Wait for jobs to finish
    wrapper.join();
    docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
    IOUtils.closeIgnoringExceptions(writerGold);
    IOUtils.closeIgnoringExceptions(writerBeforeCoref);
    IOUtils.closeIgnoringExceptions(writerAfterCoref);

    if(HybridCorefProperties.checkTime(props)) {
      System.err.printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
//      System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
    }
    if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage();

    // scoring
    if (HybridCorefProperties.doScore(props)) {
      String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
      CorefScorer.printScoreSummary(summary, logger, false);

      summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
      CorefScorer.printScoreSummary(summary, logger, true);
      CorefScorer.printFinalConllScore(summary);
    }
  }

  /**
   *  Write output of coref system in conll format, and log.
   */
  private static int logOutput(MulticoreWrapper, StringBuilder[]> wrapper,
                               PrintWriter writerGold,
                               PrintWriter writerBeforeCoref,
                               PrintWriter writerAfterCoref,
                               int docCnt) {
    while (wrapper.peek()) {
      StringBuilder[] output = wrapper.poll();
      writerGold.print(output[0]);
      writerBeforeCoref.print(output[1]);
      writerAfterCoref.print(output[2]);
      if (output[3].length() > 0) {
        log.info(output[3]);
      }
      if ((++docCnt) % 10 == 0) log.info(docCnt + " document(s) processed");
    }
    return docCnt;
  }

  @Override
  public void runCoref(Document document) {
    try {
      coref(document);
    } catch (Exception e) {
      throw new RuntimeException("Error running hybrid coref system", e);
    }
  }

  /**
   * main entry of coreference system.
   *
   * @param document Input document for coref format (Annotation and optional information)
   * @param output For output of coref system (conll format and log. list size should be 4.)
   * @return Map of coref chain ID and corresponding chain
   * @throws Exception
   */
  public Map coref(Document document, StringBuilder[] output) throws Exception {
    if(HybridCorefProperties.printMDLog(props)) {
      Redwood.log(HybridCorefPrinter.printMentionDetectionLog(document));
    }

    if(HybridCorefProperties.doScore(props)) {
      output[0] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, true));  // gold
      output[1] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, false)); // before coref
    }
    output[3] = new StringBuilder();  // log from sieves

    for(Sieve sieve : sieves){
      CorefUtils.checkForInterrupt();
      output[3].append(sieve.resolveMention(document, dictionaries, props));
    }

    // post processing
    if(HybridCorefProperties.doPostProcessing(props)) postProcessing(document);

    if(HybridCorefProperties.doScore(props)) {

      output[2] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, false, true)); // after coref
    }

    return makeCorefOutput(document);
  }

  /**
   * main entry of coreference system.
   *
   * @param document Input document for coref format (Annotation and optional information)
   * @return Map of coref chain ID and corresponding chain
   * @throws Exception
   */
  public Map coref(Document document) throws Exception {
    return coref(document, new StringBuilder[4]);
  }

  /**
   * main entry of coreference system.
   *
   * @param anno Input annotation.
   * @return Map of coref chain ID and corresponding chain
   * @throws Exception
   */
  public Map coref(Annotation anno) throws Exception {
    return coref(docMaker.makeDocument(anno));
  }

  /** Extract final coreference output from coreference document format. */
  private static Map makeCorefOutput(Document document) {
    Map result = Generics.newHashMap();
    for(CorefCluster c : document.corefClusters.values()) {
      result.put(c.clusterID, new CorefChain(c, document.positions));
    }
    return result;
  }

  /** Remove singletons, appositive, predicate nominatives, relative pronouns. */
  private static void postProcessing(Document document) {
    Set removeSet = Generics.newHashSet();
    Set removeClusterSet = Generics.newHashSet();

    for(CorefCluster c : document.corefClusters.values()){
      Set removeMentions = Generics.newHashSet();
      for(Mention m : c.getCorefMentions()) {
        if(HybridCorefProperties.REMOVE_APPOSITION_PREDICATENOMINATIVES
            && ((m.appositions!=null && m.appositions.size() > 0)
                || (m.predicateNominatives!=null && m.predicateNominatives.size() > 0)
                || (m.relativePronouns!=null && m.relativePronouns.size() > 0))){
          removeMentions.add(m);
          removeSet.add(m);
          m.corefClusterID = m.mentionID;
        }
      }

      c.corefMentions.removeAll(removeMentions);
      if(HybridCorefProperties.REMOVE_SINGLETONS && c.getCorefMentions().size()==1) {
        removeClusterSet.add(c.clusterID);
      }
    }
    for (int removeId : removeClusterSet){
      document.corefClusters.remove(removeId);
    }
    for(Mention m : removeSet){
      document.positions.remove(m);
    }
  }

  private static void checkMemoryUsage() {
    Runtime runtime = Runtime.getRuntime();
    runtime.gc();
    long memory = runtime.totalMemory() - runtime.freeMemory();
    log.info("USED MEMORY (bytes): " + memory);
  }

  public static void main(String[] args) throws Exception {
    Date startTime = new Date();
    System.err.printf("Start time: %s\n", startTime);
    runCoref(args);
    System.err.printf("Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
  }
}