/*
 * edu.stanford.nlp.coref.hybrid.HybridCorefSystem (Maven / Gradle / Ivy)
 *
 * Go to download
 * Show more of this group. Show more artifacts with this name.
 * Show all versions of stanford-corenlp. Show documentation.
 *
 * Stanford CoreNLP provides a set of natural language analysis tools which can take raw English
 * language text input and give the base forms of words, their parts of speech, and whether they
 * are names of companies, people, etc.; normalize dates, times, and numeric quantities; mark up
 * the structure of sentences in terms of phrases and word dependencies; and indicate which noun
 * phrases refer to the same entities. It provides the foundational building blocks for
 * higher-level text understanding applications.
 */
package edu.stanford.nlp.coref.hybrid;
import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Logger;
import edu.stanford.nlp.coref.CorefAlgorithm;
import edu.stanford.nlp.coref.CorefPrinter;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.CorefScorer;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.DocumentMaker;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.hybrid.sieve.Sieve;
import edu.stanford.nlp.coref.hybrid.sieve.Sieve.ClassifierType;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.logging.RedwoodConfiguration;
public class HybridCorefSystem implements CorefAlgorithm {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(HybridCorefSystem.class);
public Properties props;
public List sieves;
public Dictionaries dictionaries;
public DocumentMaker docMaker = null;
public HybridCorefSystem(Properties props, Dictionaries dictionaries) throws Exception {
this.props = props;
this.dictionaries = dictionaries;
sieves = Sieve.loadSieves(props);
// set semantics loading
for(Sieve sieve : sieves) {
if(sieve.classifierType == ClassifierType.RULE) continue;
if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) {
props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true");
}
}
}
public HybridCorefSystem(Properties props) throws Exception {
this.props = props;
sieves = Sieve.loadSieves(props);
// set semantics loading
for(Sieve sieve : sieves) {
if(sieve.classifierType == ClassifierType.RULE) continue;
if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) {
props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true");
}
}
dictionaries = new Dictionaries(props);
docMaker = new DocumentMaker(props, dictionaries);
}
public Dictionaries dictionaries() { return dictionaries; }
public static void runCoref(String[] args) throws Exception {
runCoref(StringUtils.argsToProperties(args));
}
public static void runCoref(Properties props) throws Exception {
/*
* property, environment setting
*/
Redwood.hideChannelsEverywhere(
"debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres",
"debug-featureselection", "debug-md"
);
int nThreads = HybridCorefProperties.getThreadCounts(props);
String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
Logger logger = Logger.getLogger(HybridCorefSystem.class.getName());
// set log file path
if(props.containsKey(HybridCorefProperties.LOG_PROP)){
File logFile = new File(props.getProperty(HybridCorefProperties.LOG_PROP));
RedwoodConfiguration.current().handlers(
RedwoodConfiguration.Handlers.file(logFile)).apply();
Redwood.log("Starting coref log");
}
log.info(props.toString());
if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage();
HybridCorefSystem cs = new HybridCorefSystem(props);
/*
output setting
*/
// prepare conll output
String goldOutput = null;
String beforeCorefOutput = null;
String afterCorefOutput = null;
PrintWriter writerGold = null;
PrintWriter writerBeforeCoref = null;
PrintWriter writerAfterCoref = null;
if (HybridCorefProperties.doScore(props)) {
String pathOutput = CorefProperties.conllOutputPath(props);
(new File(pathOutput)).mkdir();
goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt";
beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
afterCorefOutput = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
writerGold = new PrintWriter(new FileOutputStream(goldOutput));
writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
}
// run coref
MulticoreWrapper, StringBuilder[]> wrapper = new MulticoreWrapper<>(
nThreads, new ThreadsafeProcessor, StringBuilder[]>() {
@Override
public StringBuilder[] process(Pair input) {
try {
Document document = input.first;
HybridCorefSystem cs = input.second;
StringBuilder[] outputs = new StringBuilder[4]; // conll output and logs
cs.coref(document, outputs);
return outputs;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
public ThreadsafeProcessor, StringBuilder[]> newInstance() {
return this;
}
});
Date startTime = null;
if(HybridCorefProperties.checkTime(props)) {
startTime = new Date();
System.err.printf("END-TO-END COREF Start time: %s\n", startTime);
}
// run processes
int docCnt = 0;
while (true) {
Document document = cs.docMaker.nextDoc();
if (document == null) break;
wrapper.put(Pair.makePair(document, cs));
docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
}
// Finished reading the input. Wait for jobs to finish
wrapper.join();
docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
IOUtils.closeIgnoringExceptions(writerGold);
IOUtils.closeIgnoringExceptions(writerBeforeCoref);
IOUtils.closeIgnoringExceptions(writerAfterCoref);
if(HybridCorefProperties.checkTime(props)) {
System.err.printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
// System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
}
if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage();
// scoring
if (HybridCorefProperties.doScore(props)) {
String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
CorefScorer.printScoreSummary(summary, logger, false);
summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
CorefScorer.printScoreSummary(summary, logger, true);
CorefScorer.printFinalConllScore(summary);
}
}
/**
* Write output of coref system in conll format, and log.
*/
private static int logOutput(MulticoreWrapper, StringBuilder[]> wrapper,
PrintWriter writerGold,
PrintWriter writerBeforeCoref,
PrintWriter writerAfterCoref,
int docCnt) {
while (wrapper.peek()) {
StringBuilder[] output = wrapper.poll();
writerGold.print(output[0]);
writerBeforeCoref.print(output[1]);
writerAfterCoref.print(output[2]);
if (output[3].length() > 0) {
log.info(output[3]);
}
if ((++docCnt) % 10 == 0) log.info(docCnt + " document(s) processed");
}
return docCnt;
}
@Override
public void runCoref(Document document) {
try {
coref(document);
} catch (Exception e) {
throw new RuntimeException("Error running hybrid coref system", e);
}
}
/**
* main entry of coreference system.
*
* @param document Input document for coref format (Annotation and optional information)
* @param output For output of coref system (conll format and log. list size should be 4.)
* @return Map of coref chain ID and corresponding chain
* @throws Exception
*/
public Map coref(Document document, StringBuilder[] output) throws Exception {
if(HybridCorefProperties.printMDLog(props)) {
Redwood.log(HybridCorefPrinter.printMentionDetectionLog(document));
}
if(HybridCorefProperties.doScore(props)) {
output[0] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, true)); // gold
output[1] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, false)); // before coref
}
output[3] = new StringBuilder(); // log from sieves
for(Sieve sieve : sieves){
CorefUtils.checkForInterrupt();
output[3].append(sieve.resolveMention(document, dictionaries, props));
}
// post processing
if(HybridCorefProperties.doPostProcessing(props)) postProcessing(document);
if(HybridCorefProperties.doScore(props)) {
output[2] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, false, true)); // after coref
}
return makeCorefOutput(document);
}
/**
* main entry of coreference system.
*
* @param document Input document for coref format (Annotation and optional information)
* @return Map of coref chain ID and corresponding chain
* @throws Exception
*/
public Map coref(Document document) throws Exception {
return coref(document, new StringBuilder[4]);
}
/**
* main entry of coreference system.
*
* @param anno Input annotation.
* @return Map of coref chain ID and corresponding chain
* @throws Exception
*/
public Map coref(Annotation anno) throws Exception {
return coref(docMaker.makeDocument(anno));
}
/** Extract final coreference output from coreference document format. */
private static Map makeCorefOutput(Document document) {
Map result = Generics.newHashMap();
for(CorefCluster c : document.corefClusters.values()) {
result.put(c.clusterID, new CorefChain(c, document.positions));
}
return result;
}
/** Remove singletons, appositive, predicate nominatives, relative pronouns. */
private static void postProcessing(Document document) {
Set removeSet = Generics.newHashSet();
Set removeClusterSet = Generics.newHashSet();
for(CorefCluster c : document.corefClusters.values()){
Set removeMentions = Generics.newHashSet();
for(Mention m : c.getCorefMentions()) {
if(HybridCorefProperties.REMOVE_APPOSITION_PREDICATENOMINATIVES
&& ((m.appositions!=null && m.appositions.size() > 0)
|| (m.predicateNominatives!=null && m.predicateNominatives.size() > 0)
|| (m.relativePronouns!=null && m.relativePronouns.size() > 0))){
removeMentions.add(m);
removeSet.add(m);
m.corefClusterID = m.mentionID;
}
}
c.corefMentions.removeAll(removeMentions);
if(HybridCorefProperties.REMOVE_SINGLETONS && c.getCorefMentions().size()==1) {
removeClusterSet.add(c.clusterID);
}
}
for (int removeId : removeClusterSet){
document.corefClusters.remove(removeId);
}
for(Mention m : removeSet){
document.positions.remove(m);
}
}
private static void checkMemoryUsage() {
Runtime runtime = Runtime.getRuntime();
runtime.gc();
long memory = runtime.totalMemory() - runtime.freeMemory();
log.info("USED MEMORY (bytes): " + memory);
}
public static void main(String[] args) throws Exception {
Date startTime = new Date();
System.err.printf("Start time: %s\n", startTime);
runCoref(args);
System.err.printf("Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
}
}