All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.time.SUTimeMain Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.time;
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.ReaderInputStream;
import edu.stanford.nlp.io.TeeStream;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.stats.PrecisionRecallStats;
import edu.stanford.nlp.util.*;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

import java.io.*;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.logging.LogManager;
import java.util.regex.Pattern;


/**
 * Main program for testing SUTime.
 * 
 * Processing a text string:
 * -in.type TEXT
 * -date YYYY-MM-dd
 * -i <text>
 * -o <output file>
 * 
 * Processing a text file:
 * -in.type TEXTFILE
 * -date YYYY-MM-dd
 * -i input.txt
 * -o <output file>
 * 
 * Running on Timebank
 * -in.type TIMEBANK_CSV
 * -i timebank.csv
 * -tempeval2.dct dct.txt
 * -o <output directory>
 * -eval <evaluation script>
 * 
 * Evaluating on Tempeval2
 * -in.type TEMPEVAL2
 * -i <directory with english data>
 * -o <output directory>
 * -eval <evaluation script>
 * -tempeval2.dct dct file (with document creation times)
 *
 * TEMPEVAL2 (download from http://timeml.org/site/timebank/timebank.html)
 * Evaluation is token based.
 *
 * TRAINING (english):
 *
 * GUTIME:
 * precision   0.88
 * recall      0.71
 * f1-measure  0.79
 * accuracy    0.98
 * attribute type       0.92
 * attribute value      0.31   // LOW SCORE here is due to difference in format (no -,: in date)
 *
 * After fixing some formats for GUTIME:
 *   (GUTIME syntax is inconsistent at times (1991W 8WE, 19980212EV)
 * attribute value      0.67
 *
 * SUTIME:
 * Default: sutime.teRelHeurLevel=NONE, restrictToTimex3=false
 * precision   0.873
 * recall      0.897
 * f1-measure  0.885
 * accuracy    0.991
 *
 *                                P      R    F1
 * attribute type       0.918 | 0.751 0.802 0.776
 * attribute value      0.762 | 0.623 0.665 0.644
 *
 *                                        P      R    F1
 * mention attribute type       0.900 | 0.780 0.833 0.805
 * mention attribute value      0.742 | 0.643 0.687 0.664
 *
 * sutime.teRelHeurLevel=MORE, restrictToTimex3=true
 * precision   0.876
 * recall      0.889
 * f1-measure  0.882
 * accuracy    0.991
 *                                P      R    F1
 * attribute type       0.918 | 0.744 0.798 0.770
 * attribute value      0.776 | 0.629 0.675 0.651
 *
 *                                        P      R    F1
 * mention attribute type       0.901 | 0.780 0.836 0.807
 * mention attribute value      0.750 | 0.649 0.696 0.672
 *
 * ------------------------------------------------------------------------------
 * TEST (english):
 *
 * GUTIME:
 * precision   0.89
 * recall      0.79
 * f1-measure  0.84
 * accuracy    0.99
 *
 * attribute type       0.95
 * attribute value      0.68
 *
 * SUTIME:
 * Default: sutime.teRelHeurLevel=NONE, restrictToTimex3=false
 * precision   0.878
 * recall      0.963
 * f1-measure  0.918
 * accuracy    0.996
 *
 *                                P      R    F1
 * attribute type       0.953 | 0.820 0.904 0.860
 * attribute value      0.791 | 0.680 0.750 0.713
 *
 *                                        P      R    F1
 * mention attribute type       0.954 | 0.837 0.923 0.878
 * mention attribute value      0.781 | 0.686 0.756 0.720
 *
 * sutime.teRelHeurLevel=MORE, restrictToTimex3=true
 * precision   0.881
 * recall      0.963
 * f1-measure  0.920
 * accuracy    0.995
 *                                P      R    F1
 * attribute type       0.959 | 0.821 0.910 0.863
 * attribute value      0.818 | 0.699 0.776 0.736
 *
 *                                        P      R    F1
 * mention attribute type       0.961 | 0.844 0.936 0.888
 * mention attribute value      0.803 | 0.705 0.782 0.742
 *
 * 
* @author Angel Chang */ public class SUTimeMain { /** A logger for this class */ private static final Redwood.RedwoodChannels log = Redwood.channels(SUTimeMain.class); protected static String PYTHON = null; private SUTimeMain() {} // static class /* * Other Time corpora: (see also http://timeml.org/site/timebank/timebank.html) * LDC2006T08 TimeBank 1.2 (Uses TIMEX3) * LDC2005T07 ACE Time Normalization (TERN) 2004 English Training Data v 1.0 (Uses TIMEX2) * GUTime achieved .85, .78, and .82 F-measure for timex2, text, and val fields * LDC2010T18 ACE Time Normalization (TERN) 2004 English Evaluation Data V1.0 */ //////////////////////////////////////////////////////////////////////////////////////// private static class EvalStats { PrecisionRecallStats prStats = new PrecisionRecallStats(); // PrecisionRecallStats tokenPrStats = new PrecisionRecallStats(); PrecisionRecallStats valPrStats = new PrecisionRecallStats(); PrecisionRecallStats estPrStats = new PrecisionRecallStats(); } private static class TimebankTimex { String timexId; String timexVal; String timexOrigVal; String timexStr; int tid; private TimebankTimex(String timexId, String timexVal, String timexOrigVal, String timexStr) { this.timexId = timexId; this.timexVal = timexVal; this.timexOrigVal = timexOrigVal; this.timexStr = timexStr; if (timexId != null && timexId.length() > 0) { tid = Integer.parseInt(timexId); } } } private static class TimebankSent { boolean initialized = false; String docId; @SuppressWarnings("unused") String docFilename; String docPubDate; String sentId; String text; List timexes = new ArrayList<>(); List origItems = new ArrayList<>(); public boolean add(String item) { String[] fields = item.split("\\s*\\|\\s*", 9); String docId = fields[0]; String docFilename = fields[1]; String docPubDate = fields[2]; String sentId = fields[3]; String sent = fields[8]; if (initialized) { // check compatibility; if (!docId.equals(this.docId) || !sentId.equals(this.sentId)) { return false; } } else { 
this.docId = docId; this.docFilename = docFilename; this.docPubDate = docPubDate; this.sentId = sentId; this.text = sent; initialized = true; } origItems.add(item); String timexId = fields[4]; String timexVal = fields[5]; String timexOrigVal = fields[6]; String timexStr = fields[7]; if (timexId != null && timexId.length() > 0) { timexes.add(new TimebankTimex(timexId, timexVal, timexOrigVal, timexStr)); } return true; } } //Overall: PrecisionRecallStats[tp=877,fp=199,fn=386,p=0.82 (877/1076),r=0.69 (877/1263),f1=0.75] //Value: PrecisionRecallStats[tp=229,fp=199,fn=1034,p=0.54 (229/428),r=0.18 (229/1263),f1=0.27] // Process one item from timebank CSV file private static void processTimebankCsvSent(AnnotationPipeline pipeline, TimebankSent sent, PrintWriter pw, EvalStats evalStats) { if (sent != null) { Collections.sort(sent.timexes, (o1, o2) -> { if (o1.tid == o2.tid) { return 0; } else return (o1.tid < o2.tid)? -1:1; }); pw.println(); for (String item:sent.origItems) { pw.println("PROC |" + item); } Annotation annotation = new Annotation(sent.text); annotation.set(CoreAnnotations.DocDateAnnotation.class, sent.docPubDate); pipeline.annotate(annotation); List timexes = annotation.get(TimeAnnotations.TimexAnnotations.class); int i = 0; for (CoreMap t:timexes) { String[] newFields; if (sent.timexes.size() > i) { String res; TimebankTimex goldTimex = sent.timexes.get(i); Timex guessTimex = t.get(TimeAnnotations.TimexAnnotation.class); String s1 = goldTimex.timexStr.replaceAll("\\s+", ""); String s2 = guessTimex.text().replaceAll("\\s+", ""); if (s1.equals(s2)) { evalStats.estPrStats.incrementTP(); res = "OK"; } else { evalStats.estPrStats.incrementFP(); evalStats.estPrStats.incrementFN(); res = "BAD"; } newFields = new String[] { res, goldTimex.timexId, goldTimex.timexVal, goldTimex.timexOrigVal, goldTimex.timexStr, t.get(TimeAnnotations.TimexAnnotation.class).toString() }; i++; } else { newFields = new String[] { "NONE" , 
t.get(TimeAnnotations.TimexAnnotation.class).toString()}; evalStats.estPrStats.incrementFP(); } pw.println("GOT | "+ StringUtils.join(newFields, "|")); } for (; i < sent.timexes.size(); i++) { evalStats.estPrStats.incrementFN(); } i = 0; int lastIndex = 0; for (TimebankTimex goldTimex:sent.timexes) { int index = sent.text.indexOf(goldTimex.timexStr, lastIndex); int endIndex = index + goldTimex.timexStr.length(); boolean found = false; for (; i < timexes.size(); i++) { CoreMap t = timexes.get(i); if (t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) >= endIndex) { break; } else { if (t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) >= index) { found = true; evalStats.prStats.incrementTP(); if (goldTimex.timexOrigVal.equals(t.get(TimeAnnotations.TimexAnnotation.class).value())) { evalStats.valPrStats.incrementTP(); } else { evalStats.valPrStats.incrementFN(); } } else { evalStats.prStats.incrementFP(); evalStats.valPrStats.incrementFP(); } } } if (!found) { evalStats.prStats.incrementFN(); evalStats.valPrStats.incrementFN(); } lastIndex = endIndex; } for (; i < timexes.size(); i++) { evalStats.prStats.incrementFP(); evalStats.valPrStats.incrementFP(); } } } // Process CSV file with just timebank sentences with time expressions public static void processTimebankCsv(AnnotationPipeline pipeline, String in, String out, String eval) throws IOException { BufferedReader br = IOUtils.getBufferedFileReader(in); PrintWriter pw = (out != null)? 
IOUtils.getPrintWriter(out):new PrintWriter(System.out); String line; // boolean dataStarted = false; boolean dataStarted = true; TimebankSent sent = new TimebankSent(); String item = null; EvalStats evalStats = new EvalStats(); line = br.readLine(); // Skip first line while ((line = br.readLine()) != null) { if (line.trim().length() == 0) continue; if (dataStarted) { if (line.contains("|")) { if (item != null) { boolean addOld = sent.add(item); if (!addOld) { processTimebankCsvSent(pipeline, sent, pw, evalStats); sent = new TimebankSent(); sent.add(item); } } item = line; } else { item += " " + line; } } else { if (line.matches("#+ BEGIN DATA #+")) { dataStarted = true; } } } if (item != null) { boolean addOld = sent.add(item); if (!addOld) { processTimebankCsvSent(pipeline, sent, pw, evalStats); sent = new TimebankSent(); sent.add(item); } processTimebankCsvSent(pipeline, sent, pw, evalStats); } br.close(); if (out != null) { pw.close(); } System.out.println("Estimate: " + evalStats.estPrStats.toString(2)); System.out.println("Overall: " + evalStats.prStats.toString(2)); System.out.println("Value: " + evalStats.valPrStats.toString(2)); } private static String joinWordTags(List l, String glue, int start, int end) { return StringUtils.join(l, glue, in -> in.get(CoreAnnotations.TextAnnotation.class) + '/' + in.get(CoreAnnotations.PartOfSpeechAnnotation.class), start, end); } private static void processTempEval2Doc(AnnotationPipeline pipeline, Annotation docAnnotation, Map> timexMap, PrintWriter extPw, PrintWriter attrPw, PrintWriter debugPw, PrintWriter attrDebugPwGold, PrintWriter attrDebugPw) { pipeline.annotate(docAnnotation); String docId = docAnnotation.get(CoreAnnotations.DocIDAnnotation.class); String docDate = docAnnotation.get(CoreAnnotations.DocDateAnnotation.class); List sents = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class); if (timexMap != null) { List golds = updateTimexText(timexMap, docAnnotation); if (attrDebugPwGold != null && golds 
!= null) { for (TimexAttributes g:golds) { String[] newFields = { docId, docDate, String.valueOf(g.sentIndex), String.valueOf(g.tokenStart), String.valueOf(g.tokenEnd), /*g.tid, */ g.type, g.value, g.text, g.context }; attrDebugPwGold.println(StringUtils.join(newFields, "\t")); } } } if (attrDebugPw != null) { for (CoreMap sent:sents) { List timexes = sent.get(TimeAnnotations.TimexAnnotations.class); if (timexes != null) { for (CoreMap t:timexes) { Timex timex = t.get(TimeAnnotations.TimexAnnotation.class); int sentIndex = sent.get(CoreAnnotations.SentenceIndexAnnotation.class); int sentTokenStart = sent.get(CoreAnnotations.TokenBeginAnnotation.class); int tokenStart; int tokenEnd; if (t.containsKey(CoreAnnotations.TokenBeginAnnotation.class)) { tokenStart = t.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenStart; tokenEnd = t.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart; } else { CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation, t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); tokenStart = cm.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenStart; tokenEnd = cm.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart; } String context = joinWordTags(sent.get(CoreAnnotations.TokensAnnotation.class), " ", tokenStart-3, tokenEnd+3); String[] newFields = { docId, docDate, String.valueOf(sentIndex), String.valueOf(tokenStart), String.valueOf(tokenEnd), /*timex.tid(), */ timex.timexType(), timex.value(), timex.text(), context}; attrDebugPw.println(StringUtils.join(newFields, "\t")); } } } } if (debugPw != null) { List timexes = docAnnotation.get(TimeAnnotations.TimexAnnotations.class); for (CoreMap t:timexes) { String[] newFields = { docId, docDate, t.get(TimeAnnotations.TimexAnnotation.class).toString() }; debugPw.println("GOT | "+ StringUtils.join(newFields, "|")); } } if (extPw != null || attrPw != null) { for (CoreMap 
sent:sents) { int sentTokenBegin = sent.get(CoreAnnotations.TokenBeginAnnotation.class); for (CoreMap t:sent.get(TimeAnnotations.TimexAnnotations.class)) { Timex tmx = t.get(TimeAnnotations.TimexAnnotation.class); List tokens = t.get(CoreAnnotations.TokensAnnotation.class); int tokenIndex = 0; if (tokens == null) { CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation, t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); tokens = cm.get(CoreAnnotations.TokensAnnotation.class); tokenIndex = cm.get(CoreAnnotations.TokenBeginAnnotation.class); } else { tokenIndex = t.get(CoreAnnotations.TokenBeginAnnotation.class); } tokenIndex = tokenIndex - sentTokenBegin; String sentenceIndex = String.valueOf(sent.get(CoreAnnotations.SentenceIndexAnnotation.class)); int tokenCount = 0; for (@SuppressWarnings("unused") CoreLabel token:tokens) { String[] extFields = { docId, sentenceIndex, String.valueOf(tokenIndex), "timex3", tmx.tid(), "1"}; String extString = StringUtils.join(extFields, "\t"); if (extPw != null) extPw.println(extString); if (attrPw != null /* && tokenCount == 0 */) { String[] attrFields = { "type", tmx.timexType(), }; attrPw.println(extString + "\t" + StringUtils.join(attrFields, "\t")); if (tmx.value() != null) { String val = tmx.value(); // Fix up expression values (needed for GUTime) if (useGUTime) { if ("TIME".equals(tmx.timexType())) { if (val.matches("T\\d{4}")) { val = "T" + val.substring(1,3) + ":" + val.substring(3,5); } } else if ("DATE".equals(tmx.timexType())) { if (val.matches("\\d{8}T.*")) { val = val.substring(0,4) + "-" + val.substring(4,6) + "-" + val.substring(6); } else if (val.matches("\\d{8}")) { val = val.substring(0,4) + "-" + val.substring(4,6) + "-" + val.substring(6,8); } else if (val.matches("\\d\\d\\d\\d..")) { val = val.substring(0,4) + "-" + val.substring(4,6); } else if (val.matches("[0-9X]{4}W[0-9X]{2}.*")) { if (val.length() > 7) { val = 
val.substring(0,4) + "-" + val.substring(4,7) + "-" + val.substring(7); } else { val = val.substring(0,4) + "-" + val.substring(4,7); } } } } /*else { // SUTIME if ("DATE".equals(tmx.timexType())) { if (val.matches("\\d\\d\\dX")) { val = val.substring(0,3); // Convert 199X to 199 } } } */ attrFields[0] = "value"; attrFields[1] = val; attrPw.println(extString + "\t" + StringUtils.join(attrFields, "\t")); } } tokenIndex++; tokenCount++; } } } } } private static CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory(); private static CoreMap wordsToSentence(List sentWords) { String sentText = StringUtils.join(sentWords, " "); Annotation sentence = new Annotation(sentText); List tokens = new ArrayList<>(sentWords.size()); for (String text:sentWords) { CoreLabel token = tokenFactory.makeToken(); token.set(CoreAnnotations.TextAnnotation.class, text); tokens.add(token); } sentence.set(CoreAnnotations.TokensAnnotation.class, tokens); return sentence; } public static Annotation sentencesToDocument(String documentID, String docDate, List sentences) { String docText = ChunkAnnotationUtils.getTokenText(sentences, CoreAnnotations.TextAnnotation.class); Annotation document = new Annotation(docText); document.set(CoreAnnotations.DocIDAnnotation.class, documentID); document.set(CoreAnnotations.DocDateAnnotation.class, docDate); document.set(CoreAnnotations.SentencesAnnotation.class, sentences); // Accumulate docTokens and label sentence with overall token begin/end, and sentence index annotations List docTokens = new ArrayList<>(); int sentenceIndex = 0; int tokenBegin = 0; for (CoreMap sentenceAnnotation:sentences) { List sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class); docTokens.addAll(sentenceTokens); int tokenEnd = tokenBegin + sentenceTokens.size(); sentenceAnnotation.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin); sentenceAnnotation.set(CoreAnnotations.TokenEndAnnotation.class, tokenEnd); 
sentenceAnnotation.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex); sentenceIndex++; tokenBegin = tokenEnd; } document.set(CoreAnnotations.TokensAnnotation.class, docTokens); // Put in character offsets int i = 0; for (CoreLabel token:docTokens) { String tokenText = token.get(CoreAnnotations.TextAnnotation.class); token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i); i+=tokenText.length(); token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, i); i++; // Skip space } for (CoreMap sentenceAnnotation:sentences) { List sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class); sentenceAnnotation.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); sentenceAnnotation.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, sentenceTokens.get(sentenceTokens.size()-1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); } return document; } private static class TimexAttributes { public String tid; public int sentIndex; public int tokenStart; public int tokenEnd; public String text; public String type; public String value; public String context; public TimexAttributes(String tid, int sentIndex, int tokenIndex) { this.tid = tid; this.sentIndex = sentIndex; this.tokenStart = tokenIndex; this.tokenEnd = tokenIndex + 1; } } private static TimexAttributes findTimex(Map> timexMap, String docId, String tid) { // Find entry List list = timexMap.get(docId); for (TimexAttributes timex:list) { if (timex.tid.equals(tid)) { return timex; } } return null; } private static List updateTimexText(Map> timexMap, Annotation docAnnotation) { // Find entry String docId = docAnnotation.get(CoreAnnotations.DocIDAnnotation.class); List sents = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class); List list = timexMap.get(docId); if (list != null) { for (TimexAttributes timex:list) { CoreMap sent = sents.get(timex.sentIndex); List tokens = 
sent.get(CoreAnnotations.TokensAnnotation.class); timex.text = StringUtils.joinWords(tokens, " ", timex.tokenStart, timex.tokenEnd); timex.context = joinWordTags(tokens, " ", timex.tokenStart - 3, timex.tokenEnd + 3); /* StringBuilder sb = new StringBuilder(""); for (int i = timex.tokenStart; i < timex.tokenEnd; i++) { if (sb.length() > 0) { sb.append(" "); } sb.append(tokens.get(i).word()); } timex.text = sb.toString(); // Get context sb.setLength(0); int c1 = Math.max(0, timex.tokenStart - 3); int c2 = Math.min(tokens.size(), timex.tokenEnd + 3); for (int i = c1; i < c2; i++) { if (sb.length() > 0) { sb.append(" "); } sb.append(tokens.get(i).word()); } timex.context = sb.toString(); */ } return list; } return null; } private static Map> readTimexAttrExts(String extentsFile, String attrsFile) throws IOException { Map> timexMap = Generics.newHashMap(); BufferedReader extBr = IOUtils.getBufferedFileReader(extentsFile); String line; String lastDocId = null; TimexAttributes lastTimex = null; while ((line = extBr.readLine()) != null) { if (line.trim().isEmpty()) continue; // Simple tab delimited file String[] fields = line.split("\t"); String docName = fields[0]; int sentNo = Integer.parseInt(fields[1]); int tokenNo = Integer.parseInt(fields[2]); String tid = fields[4]; if (lastDocId != null && lastDocId.equals(docName) && lastTimex != null && lastTimex.tid.equals(tid)) { // Expand previous assert(lastTimex.sentIndex == sentNo); lastTimex.tokenEnd = tokenNo + 1; } else { lastDocId = docName; lastTimex = new TimexAttributes(tid, sentNo, tokenNo); List list = timexMap.get(docName); if (list == null) { timexMap.put(docName, list = new ArrayList<>()); } list.add(lastTimex); } } extBr.close(); BufferedReader attrBr = IOUtils.getBufferedFileReader(attrsFile); while ((line = attrBr.readLine()) != null) { if (line.trim().length() == 0) continue; // Simple tab delimited file String[] fields = line.split("\t"); String docName = fields[0]; int sentNo = 
Integer.parseInt(fields[1]); int tokenNo = Integer.parseInt(fields[2]); String tid = fields[4]; String attrname = fields[6]; String attrvalue = fields[7]; // Find entry TimexAttributes timex = findTimex(timexMap, docName, tid); assert(timex.sentIndex == sentNo); assert(timex.tokenStart <= tokenNo && timex.tokenEnd > tokenNo); switch (attrname) { case "type": assert (timex.type == null || timex.type.equals(attrvalue)); timex.type = attrvalue; break; case "value": assert (timex.value == null || timex.value.equals(attrvalue)); timex.value = attrvalue; break; default: throw new RuntimeException("Error processing " + attrsFile + ":" + "Unknown attribute " + attrname + ": from line " + line); } } attrBr.close(); return timexMap; } public static void processTempEval2Tab(AnnotationPipeline pipeline, String in, String out, Map docDates) throws IOException { Map> timexMap = readTimexAttrExts(in + "/timex-extents.tab", in + "/timex-attributes.tab"); BufferedReader br = IOUtils.getBufferedFileReader(in + "/base-segmentation.tab"); PrintWriter debugPw = IOUtils.getPrintWriter(out + "/timex-debug.out"); PrintWriter attrPw = IOUtils.getPrintWriter(out + "/timex-attrs.res.tab"); PrintWriter extPw = IOUtils.getPrintWriter(out + "/timex-extents.res.tab"); PrintWriter attrDebugPwGold = IOUtils.getPrintWriter(out + "/timex-attrs.debug.gold.tab"); PrintWriter attrDebugPw = IOUtils.getPrintWriter(out + "/timex-attrs.debug.res.tab"); String line; String curDocName = null; int curSentNo = -1; List tokens = null; List sentences = null; while ((line = br.readLine()) != null) { if (line.trim().length() == 0) continue; // Simple tab delimited file String[] fields = line.split("\t"); String docName = fields[0]; int sentNo = Integer.parseInt(fields[1]); //int tokenNo = Integer.parseInt(fields[2]); String tokenText = fields[3]; // Create little annotation with sentences and tokens if (!docName.equals(curDocName)) { if (curDocName != null) { // Process document CoreMap lastSentence = 
wordsToSentence(tokens); sentences.add(lastSentence); Annotation docAnnotation = sentencesToDocument(curDocName, docDates.get(curDocName), sentences); processTempEval2Doc(pipeline, docAnnotation, timexMap, extPw, attrPw, debugPw, attrDebugPwGold, attrDebugPw); curDocName = null; } // New doc tokens = new ArrayList<>(); sentences = new ArrayList<>(); } else if (curSentNo != sentNo) { CoreMap lastSentence = wordsToSentence(tokens); sentences.add(lastSentence); tokens = new ArrayList<>(); } tokens.add(tokenText); curDocName = docName; curSentNo = sentNo; } if (curDocName != null) { // Process document CoreMap lastSentence = wordsToSentence(tokens); sentences.add(lastSentence); Annotation docAnnotation = sentencesToDocument(curDocName, docDates.get(curDocName), sentences); processTempEval2Doc(pipeline, docAnnotation, timexMap, extPw, attrPw, debugPw, attrDebugPwGold, attrDebugPw); curDocName = null; } br.close(); extPw.close(); attrPw.close(); debugPw.close(); attrDebugPwGold.close(); attrDebugPw.close(); } public static void processTempEval2(AnnotationPipeline pipeline, String in, String out, String eval, String dct) throws IOException, ParseException { Map docDates = (dct != null)? 
IOUtils.readMap(dct):IOUtils.readMap(in + "/dct.txt"); if (requiredDocDateFormat != null) { // convert from yyyyMMdd to requiredDocDateFormat DateFormat defaultFormatter = new SimpleDateFormat("yyyyMMdd"); DateFormat requiredFormatter = new SimpleDateFormat(requiredDocDateFormat); for (Map.Entry docDateEntry : docDates.entrySet()) { Date date = defaultFormatter.parse(docDateEntry.getValue()); docDates.put(docDateEntry.getKey(), requiredFormatter.format(date)); } } processTempEval2Tab(pipeline, in, out, docDates); if (eval != null) { List command = new ArrayList<>(); if (PYTHON != null) { command.add(PYTHON); } command.add(eval); command.add(in + "/base-segmentation.tab"); command.add(in + "/timex-extents.tab"); command.add(out + "/timex-extents.res.tab"); command.add(in + "/timex-attributes.tab"); command.add(out + "/timex-attrs.res.tab"); ProcessBuilder pb = new ProcessBuilder(command); FileOutputStream evalFileOutput = new FileOutputStream(out + "/scores.txt"); Writer output = new OutputStreamWriter( new TeeStream(System.out, evalFileOutput)); SystemUtils.run(pb, output, null); evalFileOutput.close(); } } public static void processTempEval3(AnnotationPipeline pipeline, String in, String out, String evalCmd) throws Exception { // Process files File inFile = new File(in); if (inFile.isDirectory()) { // input is a directory - process files in directory Pattern teinputPattern = Pattern.compile("\\.(TE3input|tml)$"); Iterable files = IOUtils.iterFilesRecursive(inFile, teinputPattern); File outDir = new File(out); outDir.mkdirs(); for (File file: files) { String inputFilename = file.getAbsolutePath(); String outputFilename = inputFilename.replace(in, out).replace(".TE3input", ""); if (!outputFilename.equalsIgnoreCase(inputFilename)) { //System.out.println(inputFilename + " => " + outputFilename); processTempEval3File(pipeline, inputFilename, outputFilename); } else { log.info("ABORTING: Input file and output is the same - " + inputFilename); System.exit(-1); } } } else 
{ // input is a file - process file processTempEval3File(pipeline, in, out); } // Evaluate if (evalCmd != null) { // TODO: apply eval command } } public static void processTempEval3File(AnnotationPipeline pipeline, String in, String out) throws Exception { // Process one tempeval file Document doc = edu.stanford.nlp.util.XMLUtils.readDocumentFromFile(in); Node timemlNode = XMLUtils.getNode(doc, "TimeML"); Node docIdNode = XMLUtils.getNode(timemlNode, "DOCID"); Node dctNode = XMLUtils.getNode(timemlNode, "DCT"); Node dctTimexNode = XMLUtils.getNode(dctNode, "TIMEX3"); Node titleNode = XMLUtils.getNode(timemlNode, "TITLE"); Node extraInfoNode = XMLUtils.getNode(timemlNode, "EXTRA_INFO"); Node textNode = XMLUtils.getNode(timemlNode, "TEXT"); String date = XMLUtils.getAttributeValue(dctTimexNode, "value"); String text = textNode.getTextContent(); Annotation annotation = textToAnnotation(pipeline, text, date); Element annotatedTextElem = annotationToTmlTextElement(annotation); Document annotatedDoc = XMLUtils.createDocument(); Node newTimemlNode = annotatedDoc.importNode(timemlNode, false); if(docIdNode != null){ newTimemlNode.appendChild(annotatedDoc.importNode(docIdNode, true)); } newTimemlNode.appendChild(annotatedDoc.importNode(dctNode, true)); if (titleNode != null) { newTimemlNode.appendChild(annotatedDoc.importNode(titleNode, true)); } if (extraInfoNode != null) { newTimemlNode.appendChild(annotatedDoc.importNode(extraInfoNode, true)); } newTimemlNode.appendChild(annotatedDoc.adoptNode(annotatedTextElem)); annotatedDoc.appendChild(newTimemlNode); PrintWriter pw = (out != null)? 
IOUtils.getPrintWriter(out):new PrintWriter(System.out);
    String string = XMLUtils.documentToString(annotatedDoc);
    pw.println(string);
    pw.flush();
    if (out != null) pw.close();
  }

  // Date-format pattern the document date must satisfy (only set when HeidelTime is selected).
  private static String requiredDocDateFormat;
  // True when the GUTime annotator was selected via -timeAnnotator gutime.
  private static boolean useGUTime = false;

  /**
   * Builds the annotation pipeline used by every processing mode.
   *
   * @param props    runtime properties; "timeAnnotator" selects "sutime" (default),
   *                 "gutime", or "heideltime"
   * @param tokenize whether to prepend tokenization and sentence splitting
   *                 (TempEval-2 input is pre-tokenized, so that mode passes false)
   * @return the assembled pipeline
   * @throws Exception if an annotator fails to initialize
   */
  public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception {
//    useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false"));
    AnnotationPipeline pipeline = new AnnotationPipeline();
    if (tokenize) {
      pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
      pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
    }
    pipeline.addAnnotator(new POSTaggerAnnotator(false));
//    pipeline.addAnnotator(new NumberAnnotator(false));
//    pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));
    String timeAnnotator = props.getProperty("timeAnnotator", "sutime");
    switch (timeAnnotator) {
      case "gutime":
        useGUTime = true;
        pipeline.addAnnotator(new GUTimeAnnotator("gutime", props));
        break;
      case "heideltime":
        // HeidelTime requires ISO-formatted document dates.
        requiredDocDateFormat = "yyyy-MM-dd";
        pipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props));
        break;
      case "sutime":
        pipeline.addAnnotator(new TimeAnnotator("sutime", props));
        break;
      default:
        throw new IllegalArgumentException("Unknown timeAnnotator: " + timeAnnotator);
    }
    return pipeline;
  }

  /** Supported input formats, selected on the command line with {@code -in.type}. */
  enum InputType { TEXTFILE, TEXT, TIMEBANK_CSV, TEMPEVAL2, TEMPEVAL3 }

  /**
   * Configures java.util.logging so that SEVERE messages go to the console and
   * INFO-level edu.stanford.nlp messages go to {@code out}/err.log, creating the
   * output directory if needed.
   *
   * @param out output directory for the log file
   * @throws IOException if the logging configuration cannot be read
   */
  private static void configLogger(String out) throws IOException {
    File outDir = new File(out);
    if (!outDir.exists()) {
      outDir.mkdirs();
    }
    StringBuilder sb = new StringBuilder();
    sb.append("handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler\n");
    sb.append(".level=SEVERE\n");
    sb.append("edu.stanford.nlp.level=INFO\n");
    sb.append("java.util.logging.ConsoleHandler.level=SEVERE\n");
    sb.append("java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter\n");
    sb.append("java.util.logging.FileHandler.level=INFO\n");
    sb.append("java.util.logging.FileHandler.pattern=" + out + "/err.log" + "\n");
    LogManager.getLogManager().readConfiguration(new ReaderInputStream(new StringReader(sb.toString())));
  }

  /**
   * Converts timex annotations over {@code str} into DOM nodes: plain text is
   * emitted as text nodes and each timex as a TIMEX element. The annotations are
   * sorted (containing intervals first) and handed to
   * {@link #createTimexNodesPresorted}.
   *
   * @param str             the document text the offsets refer to
   * @param charBeginOffset character offset of {@code str} within the document; null means 0
   * @param timexAnns       timex annotations carrying character-offset and Timex values
   * @return DOM nodes covering all of {@code str}
   */
  private static List<Node> createTimexNodes(String str, Integer charBeginOffset, List<CoreMap> timexAnns) {
    List<ValuedInterval<CoreMap, Integer>> timexList = new ArrayList<>(timexAnns.size());
    for (CoreMap timexAnn : timexAnns) {
      timexList.add(new ValuedInterval<>(timexAnn,
          MatchedExpression.COREMAP_TO_CHAR_OFFSETS_INTERVAL_FUNC.apply(timexAnn)));
    }
    Collections.sort(timexList, HasInterval.CONTAINS_FIRST_ENDPOINTS_COMPARATOR);
    return createTimexNodesPresorted(str, charBeginOffset, timexList);
  }

  /**
   * Builds the DOM node list for an already-sorted timex list. A timex that starts
   * before the previous one ended is treated as nested: it is recorded against the
   * enclosing (most recently emitted) timex and later rendered recursively inside
   * that element, replacing its children.
   *
   * @param str             text span the offsets refer to
   * @param charBeginOffset offset of {@code str} within the document; null means 0
   * @param timexList       timexes sorted so containing intervals come first
   * @return text and TIMEX nodes covering {@code str}
   */
  private static List<Node> createTimexNodesPresorted(String str, Integer charBeginOffset,
                                                      List<ValuedInterval<CoreMap, Integer>> timexList) {
    if (charBeginOffset == null) charBeginOffset = 0;
    List<Node> nodes = new ArrayList<>();
    int previousEnd = 0;
    List<Element> timexElems = new ArrayList<>();
    List<ValuedInterval<CoreMap, Integer>> processed = new ArrayList<>();
    CollectionValuedMap<Integer, ValuedInterval<CoreMap, Integer>> unprocessed =
        new CollectionValuedMap<>(CollectionFactory.<ValuedInterval<CoreMap, Integer>>arrayListFactory());
    for (ValuedInterval<CoreMap, Integer> v : timexList) {
      CoreMap timexAnn = v.getValue();
      int begin = timexAnn.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) - charBeginOffset;
      int end = timexAnn.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) - charBeginOffset;
      if (begin >= previousEnd) {
        // Add text
        nodes.add(XMLUtils.createTextNode(str.substring(previousEnd, begin)));
        // Add timex
        Timex timex = timexAnn.get(TimeAnnotations.TimexAnnotation.class);
        Element timexElem = timex.toXmlElement();
        nodes.add(timexElem);
        previousEnd = end;
        // For handling nested timexes
        processed.add(v);
        timexElems.add(timexElem);
      } else {
        // Overlaps the previous timex: defer it, keyed by the index of its enclosing
        // timex in `processed`. NOTE(review): if the very first timex overlapped,
        // the key would be -1 and the lookup below would throw — presumably the sort
        // order guarantees the first interval never overlaps; verify upstream.
        unprocessed.add(processed.size() - 1, v);
      }
    }
    if (previousEnd < str.length()) {
      nodes.add(XMLUtils.createTextNode(str.substring(previousEnd)));
    }
    // Render each nested group recursively inside its enclosing TIMEX element.
    for (Integer i : unprocessed.keySet()) {
      ValuedInterval<CoreMap, Integer> v = processed.get(i);
      String elemStr = v.getValue().get(CoreAnnotations.TextAnnotation.class);
      int charStart = v.getValue().get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      List<Node> innerElems = createTimexNodesPresorted(elemStr, charStart,
          (List<ValuedInterval<CoreMap, Integer>>) unprocessed.get(i));
      Element timexElem = timexElems.get(i);
      XMLUtils.removeChildren(timexElem);
      for (Node n : innerElems) {
        timexElem.appendChild(n);
      }
    }
    return nodes;
  }

  /**
   * Annotates the contents of file {@code in} and writes the TimeML-style XML to
   * {@code out} (or stdout when {@code out} is null).
   *
   * @param pipeline the annotation pipeline
   * @param in       input text file
   * @param out      output file, or null for stdout
   * @param date     document date used to resolve relative time expressions
   * @throws IOException if the input cannot be read or the output opened
   */
  public static void processTextFile(AnnotationPipeline pipeline, String in, String out, String date) throws IOException {
    String text = IOUtils.slurpFile(in);
    PrintWriter pw = (out != null) ? IOUtils.getPrintWriter(out) : new PrintWriter(System.out);
    String string = textToAnnotatedXml(pipeline, text, date);
    pw.println(string);
    pw.flush();
    if (out != null) pw.close();
  }

  /**
   * Annotates the literal string {@code text} and writes the TimeML-style XML to
   * {@code out} (or stdout when {@code out} is null).
   *
   * @param pipeline the annotation pipeline
   * @param text     text to annotate
   * @param out      output file, or null for stdout
   * @param date     document date used to resolve relative time expressions
   * @throws IOException if the output cannot be opened
   */
  public static void processText(AnnotationPipeline pipeline, String text, String out, String date) throws IOException {
    PrintWriter pw = (out != null) ? IOUtils.getPrintWriter(out) : new PrintWriter(System.out);
    String string = textToAnnotatedXml(pipeline, text, date);
    pw.println(string);
    pw.flush();
    if (out != null) pw.close();
  }

  /** Annotates {@code text} (with document date {@code date}) and returns the XML as a string. */
  public static String textToAnnotatedXml(AnnotationPipeline pipeline, String text, String date) {
    Annotation annotation = textToAnnotation(pipeline, text, date);
    Document xmlDoc = annotationToXmlDocument(annotation);
    return XMLUtils.documentToString(xmlDoc);
  }

  /**
   * Wraps the annotation's text and timexes in a TimeML {@code <TEXT>} element.
   *
   * @param annotation annotation holding TextAnnotation, CharacterOffsetBeginAnnotation,
   *                   and TimexAnnotations values
   * @return the populated TEXT element
   */
  public static Element annotationToTmlTextElement(Annotation annotation) {
    List<CoreMap> timexAnnsAll = annotation.get(TimeAnnotations.TimexAnnotations.class);
    Element textElem = XMLUtils.createElement("TEXT");
    List<Node> timexNodes = createTimexNodes(
        annotation.get(CoreAnnotations.TextAnnotation.class),
        annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
        timexAnnsAll);
    for (Node node : timexNodes) {
      textElem.appendChild(node);
    }
    return textElem;
  }

  /**
   * Builds a {@code <DOC><DATE>…</DATE><TEXT>…</TEXT></DOC>} document from an
   * annotated {@link Annotation}.
   */
  public static Document annotationToXmlDocument(Annotation annotation) {
    Element dateElem = XMLUtils.createElement("DATE");
    dateElem.setTextContent(annotation.get(CoreAnnotations.DocDateAnnotation.class));
    Element textElem = annotationToTmlTextElement(annotation);
    Element docElem = XMLUtils.createElement("DOC");
    docElem.appendChild(dateElem);
    docElem.appendChild(textElem);
    // Create document and import elements into this document....
    Document doc = XMLUtils.createDocument();
    doc.appendChild(doc.importNode(docElem, true));
    return doc;
  }

  /**
   * Runs the pipeline over {@code text} with the document date set to {@code date}.
   *
   * @return the annotated Annotation
   */
  public static Annotation textToAnnotation(AnnotationPipeline pipeline, String text, String date) {
    Annotation annotation = new Annotation(text);
    annotation.set(CoreAnnotations.DocDateAnnotation.class, date);
    pipeline.annotate(annotation);
    return annotation;
  }

  /**
   * Command-line entry point; see the class Javadoc for the supported flags.
   * Dispatches on {@code -in.type} to the matching processing mode.
   */
  public static void main(String[] args) throws Exception {
    // Process arguments
    Properties props = StringUtils.argsToProperties(args);

    String in = props.getProperty("i");
    String date = props.getProperty("date");
    String dct = props.getProperty("tempeval2.dct");
    String out = props.getProperty("o");
    String inputTypeStr = props.getProperty("in.type", InputType.TEXT.name());
    String eval = props.getProperty("eval");
    PYTHON = props.getProperty("python", PYTHON);
    InputType inputType = InputType.valueOf(inputTypeStr);
    AnnotationPipeline pipeline;
    switch (inputType) {
      case TEXT:
        pipeline = getPipeline(props, true);
        processText(pipeline, in, out, date);
        break;
      case TEXTFILE:
        pipeline = getPipeline(props, true);
        processTextFile(pipeline, in, out, date);
        break;
      case TIMEBANK_CSV:
        configLogger(out);
        pipeline = getPipeline(props, true);
        processTimebankCsv(pipeline, in, out, eval);
        break;
      case TEMPEVAL2:
        configLogger(out);
        pipeline = getPipeline(props, false);
        processTempEval2(pipeline, in, out, eval, dct);
        break;
      case TEMPEVAL3:
        pipeline = getPipeline(props, true);
        processTempEval3(pipeline, in, out, eval);
        break;
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy