All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.time.SUTimeMain Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.time;
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.ReaderInputStream;
import edu.stanford.nlp.io.TeeStream;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.stats.PrecisionRecallStats;
import edu.stanford.nlp.util.*;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

import java.io.*;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.logging.LogManager;
import java.util.regex.Pattern;


/**
 * Main program for testing SUTime.
 * 
 * Processing a text string:
 * -in.type TEXT
 * -date YYYY-MM-dd
 * -i <text>
 * -o <output file>
 * 
 * Processing a text file:
 * -in.type TEXTFILE
 * -date YYYY-MM-dd
 * -i input.txt
 * -o <output file>
 * 
 * Running on Timebank
 * -in.type TIMEBANK_CSV
 * -i timebank.csv
 * -tempeval2.dct dct.txt
 * -o <output directory>
 * -eval <evaluation script>
 * 
 * Evaluating on Tempeval2
 * -in.type TEMPEVAL2
 * -i <directory with english data>
 * -o <output directory>
 * -eval <evaluation script>
 * -tempeval2.dct dct file (with document creation times)
 *
 * TEMPEVAL2 (download from http://timeml.org/site/timebank/timebank.html)
 * Evaluation is token based.
 *
 * TRAINING (english):
 *
 * GUTIME:
 * precision   0.88
 * recall      0.71
 * f1-measure  0.79
 * accuracy    0.98
 * attribute type       0.92
 * attribute value      0.31   // LOW SCORE here is due to difference in format (no -,: in date)
 *
 * After fixing some formats for GUTIME:
 *   (GUTIME syntax is inconsistent at times (1991W 8WE, 19980212EV)
 * attribute value      0.67
 *
 * SUTIME:
 * Default: sutime.teRelHeurLevel=NONE, restrictToTimex3=false
 * precision   0.873
 * recall      0.897
 * f1-measure  0.885
 * accuracy    0.991
 *
 *                                P      R    F1
 * attribute type       0.918 | 0.751 0.802 0.776
 * attribute value      0.762 | 0.623 0.665 0.644
 *
 *                                        P      R    F1
 * mention attribute type       0.900 | 0.780 0.833 0.805
 * mention attribute value      0.742 | 0.643 0.687 0.664
 *
 * sutime.teRelHeurLevel=MORE, restrictToTimex3=true
 * precision   0.876
 * recall      0.889
 * f1-measure  0.882
 * accuracy    0.991
 *                                P      R    F1
 * attribute type       0.918 | 0.744 0.798 0.770
 * attribute value      0.776 | 0.629 0.675 0.651
 *
 *                                        P      R    F1
 * mention attribute type       0.901 | 0.780 0.836 0.807
 * mention attribute value      0.750 | 0.649 0.696 0.672
 *
 * ------------------------------------------------------------------------------
 * TEST (english):
 *
 * GUTIME:
 * precision   0.89
 * recall      0.79
 * f1-measure  0.84
 * accuracy    0.99
 *
 * attribute type       0.95
 * attribute value      0.68
 *
 * SUTIME:
 * Default: sutime.teRelHeurLevel=NONE, restrictToTimex3=false
 * precision   0.878
 * recall      0.963
 * f1-measure  0.918
 * accuracy    0.996
 *
 *                                P      R    F1
 * attribute type       0.953 | 0.820 0.904 0.860
 * attribute value      0.791 | 0.680 0.750 0.713
 *
 *                                        P      R    F1
 * mention attribute type       0.954 | 0.837 0.923 0.878
 * mention attribute value      0.781 | 0.686 0.756 0.720
 *
 * sutime.teRelHeurLevel=MORE, restrictToTimex3=true
 * precision   0.881
 * recall      0.963
 * f1-measure  0.920
 * accuracy    0.995
 *                                P      R    F1
 * attribute type       0.959 | 0.821 0.910 0.863
 * attribute value      0.818 | 0.699 0.776 0.736
 *
 *                                        P      R    F1
 * mention attribute type       0.961 | 0.844 0.936 0.888
 * mention attribute value      0.803 | 0.705 0.782 0.742
 *
 * 
* @author Angel Chang */ public class SUTimeMain { /** A logger for this class */ private static final Redwood.RedwoodChannels log = Redwood.channels(SUTimeMain.class); protected static String PYTHON = null; private SUTimeMain() {} // static class /* * Other Time corpora: (see also http://timeml.org/site/timebank/timebank.html) * LDC2006T08 TimeBank 1.2 (Uses TIMEX3) * LDC2005T07 ACE Time Normalization (TERN) 2004 English Training Data v 1.0 (Uses TIMEX2) * GUTime achieved .85, .78, and .82 F-measure for timex2, text, and val fields * LDC2010T18 ACE Time Normalization (TERN) 2004 English Evaluation Data V1.0 */ //////////////////////////////////////////////////////////////////////////////////////// private static class EvalStats { PrecisionRecallStats prStats = new PrecisionRecallStats(); // PrecisionRecallStats tokenPrStats = new PrecisionRecallStats(); PrecisionRecallStats valPrStats = new PrecisionRecallStats(); PrecisionRecallStats estPrStats = new PrecisionRecallStats(); } private static class TimebankTimex { String timexId; String timexVal; String timexOrigVal; String timexStr; int tid; private TimebankTimex(String timexId, String timexVal, String timexOrigVal, String timexStr) { this.timexId = timexId; this.timexVal = timexVal; this.timexOrigVal = timexOrigVal; this.timexStr = timexStr; if (timexId != null && timexId.length() > 0) { tid = Integer.parseInt(timexId); } } } private static class TimebankSent { boolean initialized = false; String docId; @SuppressWarnings("unused") String docFilename; String docPubDate; String sentId; String text; List timexes = new ArrayList<>(); List origItems = new ArrayList<>(); public boolean add(String item) { String[] fields = item.split("\\s*\\|\\s*", 9); String docId = fields[0]; String docFilename = fields[1]; String docPubDate = fields[2]; String sentId = fields[3]; String sent = fields[8]; if (initialized) { // check compatibility; if (!docId.equals(this.docId) || !sentId.equals(this.sentId)) { return false; } } else { 
this.docId = docId; this.docFilename = docFilename; this.docPubDate = docPubDate; this.sentId = sentId; this.text = sent; initialized = true; } origItems.add(item); String timexId = fields[4]; String timexVal = fields[5]; String timexOrigVal = fields[6]; String timexStr = fields[7]; if (timexId != null && timexId.length() > 0) { timexes.add(new TimebankTimex(timexId, timexVal, timexOrigVal, timexStr)); } return true; } } //Overall: PrecisionRecallStats[tp=877,fp=199,fn=386,p=0.82 (877/1076),r=0.69 (877/1263),f1=0.75] //Value: PrecisionRecallStats[tp=229,fp=199,fn=1034,p=0.54 (229/428),r=0.18 (229/1263),f1=0.27] // Process one item from timebank CSV file private static void processTimebankCsvSent(AnnotationPipeline pipeline, TimebankSent sent, PrintWriter pw, EvalStats evalStats) { if (sent != null) { Collections.sort(sent.timexes, (o1, o2) -> { if (o1.tid == o2.tid) { return 0; } else return (o1.tid < o2.tid)? -1:1; }); pw.println(); for (String item:sent.origItems) { pw.println("PROC |" + item); } Annotation annotation = new Annotation(sent.text); annotation.set(CoreAnnotations.DocDateAnnotation.class, sent.docPubDate); pipeline.annotate(annotation); List timexes = annotation.get(TimeAnnotations.TimexAnnotations.class); int i = 0; for (CoreMap t:timexes) { String[] newFields; if (sent.timexes.size() > i) { String res; TimebankTimex goldTimex = sent.timexes.get(i); Timex guessTimex = t.get(TimeAnnotations.TimexAnnotation.class); String s1 = goldTimex.timexStr.replaceAll("\\s+", ""); String s2 = guessTimex.text().replaceAll("\\s+", ""); if (s1.equals(s2)) { evalStats.estPrStats.incrementTP(); res = "OK"; } else { evalStats.estPrStats.incrementFP(); evalStats.estPrStats.incrementFN(); res = "BAD"; } newFields = new String[] { res, goldTimex.timexId, goldTimex.timexVal, goldTimex.timexOrigVal, goldTimex.timexStr, t.get(TimeAnnotations.TimexAnnotation.class).toString() }; i++; } else { newFields = new String[] { "NONE" , 
t.get(TimeAnnotations.TimexAnnotation.class).toString()}; evalStats.estPrStats.incrementFP(); } pw.println("GOT | "+ StringUtils.join(newFields, "|")); } for (; i < sent.timexes.size(); i++) { evalStats.estPrStats.incrementFN(); } i = 0; int lastIndex = 0; for (TimebankTimex goldTimex:sent.timexes) { int index = sent.text.indexOf(goldTimex.timexStr, lastIndex); int endIndex = index + goldTimex.timexStr.length(); boolean found = false; for (; i < timexes.size(); i++) { CoreMap t = timexes.get(i); if (t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) >= endIndex) { break; } else { if (t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) >= index) { found = true; evalStats.prStats.incrementTP(); if (goldTimex.timexOrigVal.equals(t.get(TimeAnnotations.TimexAnnotation.class).value())) { evalStats.valPrStats.incrementTP(); } else { evalStats.valPrStats.incrementFN(); } } else { evalStats.prStats.incrementFP(); evalStats.valPrStats.incrementFP(); } } } if (!found) { evalStats.prStats.incrementFN(); evalStats.valPrStats.incrementFN(); } lastIndex = endIndex; } for (; i < timexes.size(); i++) { evalStats.prStats.incrementFP(); evalStats.valPrStats.incrementFP(); } } } // Process CSV file with just timebank sentences with time expressions public static void processTimebankCsv(AnnotationPipeline pipeline, String in, String out, String eval) throws IOException { BufferedReader br = IOUtils.getBufferedFileReader(in); PrintWriter pw = (out != null)? 
IOUtils.getPrintWriter(out):new PrintWriter(System.out); String line; // boolean dataStarted = false; boolean dataStarted = true; TimebankSent sent = new TimebankSent(); String item = null; EvalStats evalStats = new EvalStats(); line = br.readLine(); // Skip first line while ((line = br.readLine()) != null) { if (line.trim().length() == 0) continue; if (dataStarted) { if (line.contains("|")) { if (item != null) { boolean addOld = sent.add(item); if (!addOld) { processTimebankCsvSent(pipeline, sent, pw, evalStats); sent = new TimebankSent(); sent.add(item); } } item = line; } else { item += " " + line; } } else { if (line.matches("#+ BEGIN DATA #+")) { dataStarted = true; } } } if (item != null) { boolean addOld = sent.add(item); if (!addOld) { processTimebankCsvSent(pipeline, sent, pw, evalStats); sent = new TimebankSent(); sent.add(item); } processTimebankCsvSent(pipeline, sent, pw, evalStats); } br.close(); if (out != null) { pw.close(); } System.out.println("Estimate: " + evalStats.estPrStats.toString(2)); System.out.println("Overall: " + evalStats.prStats.toString(2)); System.out.println("Value: " + evalStats.valPrStats.toString(2)); } private static String joinWordTags(List l, String glue, int start, int end) { return StringUtils.join(l, glue, in -> in.get(CoreAnnotations.TextAnnotation.class) + '/' + in.get(CoreAnnotations.PartOfSpeechAnnotation.class), start, end); } private static void processTempEval2Doc(AnnotationPipeline pipeline, Annotation docAnnotation, Map> timexMap, PrintWriter extPw, PrintWriter attrPw, PrintWriter debugPw, PrintWriter attrDebugPwGold, PrintWriter attrDebugPw) { pipeline.annotate(docAnnotation); String docId = docAnnotation.get(CoreAnnotations.DocIDAnnotation.class); String docDate = docAnnotation.get(CoreAnnotations.DocDateAnnotation.class); List sents = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class); if (timexMap != null) { List golds = updateTimexText(timexMap, docAnnotation); if (attrDebugPwGold != null && golds 
!= null) { for (TimexAttributes g:golds) { String[] newFields = { docId, docDate, String.valueOf(g.sentIndex), String.valueOf(g.tokenStart), String.valueOf(g.tokenEnd), /*g.tid, */ g.type, g.value, g.text, g.context }; attrDebugPwGold.println(StringUtils.join(newFields, "\t")); } } } if (attrDebugPw != null) { for (CoreMap sent:sents) { List timexes = sent.get(TimeAnnotations.TimexAnnotations.class); if (timexes != null) { for (CoreMap t:timexes) { Timex timex = t.get(TimeAnnotations.TimexAnnotation.class); int sentIndex = sent.get(CoreAnnotations.SentenceIndexAnnotation.class); int sentTokenStart = sent.get(CoreAnnotations.TokenBeginAnnotation.class); int tokenStart; int tokenEnd; if (t.containsKey(CoreAnnotations.TokenBeginAnnotation.class)) { tokenStart = t.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenStart; tokenEnd = t.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart; } else { CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation, t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); tokenStart = cm.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenStart; tokenEnd = cm.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenStart; } String context = joinWordTags(sent.get(CoreAnnotations.TokensAnnotation.class), " ", tokenStart-3, tokenEnd+3); String[] newFields = { docId, docDate, String.valueOf(sentIndex), String.valueOf(tokenStart), String.valueOf(tokenEnd), /*timex.tid(), */ timex.timexType(), timex.value(), timex.text(), context}; attrDebugPw.println(StringUtils.join(newFields, "\t")); } } } } if (debugPw != null) { List timexes = docAnnotation.get(TimeAnnotations.TimexAnnotations.class); for (CoreMap t:timexes) { String[] newFields = { docId, docDate, t.get(TimeAnnotations.TimexAnnotation.class).toString() }; debugPw.println("GOT | "+ StringUtils.join(newFields, "|")); } } if (extPw != null || attrPw != null) { for (CoreMap 
sent:sents) { int sentTokenBegin = sent.get(CoreAnnotations.TokenBeginAnnotation.class); for (CoreMap t:sent.get(TimeAnnotations.TimexAnnotations.class)) { Timex tmx = t.get(TimeAnnotations.TimexAnnotation.class); List tokens = t.get(CoreAnnotations.TokensAnnotation.class); int tokenIndex = 0; if (tokens == null) { CoreMap cm = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(docAnnotation, t.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), t.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); tokens = cm.get(CoreAnnotations.TokensAnnotation.class); tokenIndex = cm.get(CoreAnnotations.TokenBeginAnnotation.class); } else { tokenIndex = t.get(CoreAnnotations.TokenBeginAnnotation.class); } tokenIndex = tokenIndex - sentTokenBegin; String sentenceIndex = String.valueOf(sent.get(CoreAnnotations.SentenceIndexAnnotation.class)); int tokenCount = 0; for (@SuppressWarnings("unused") CoreLabel token:tokens) { String[] extFields = { docId, sentenceIndex, String.valueOf(tokenIndex), "timex3", tmx.tid(), "1"}; String extString = StringUtils.join(extFields, "\t"); if (extPw != null) extPw.println(extString); if (attrPw != null /* && tokenCount == 0 */) { String[] attrFields = { "type", tmx.timexType(), }; attrPw.println(extString + "\t" + StringUtils.join(attrFields, "\t")); if (tmx.value() != null) { String val = tmx.value(); // Fix up expression values (needed for GUTime) if (useGUTime) { if ("TIME".equals(tmx.timexType())) { if (val.matches("T\\d{4}")) { val = "T" + val.substring(1,3) + ":" + val.substring(3,5); } } else if ("DATE".equals(tmx.timexType())) { if (val.matches("\\d{8}T.*")) { val = val.substring(0,4) + "-" + val.substring(4,6) + "-" + val.substring(6); } else if (val.matches("\\d{8}")) { val = val.substring(0,4) + "-" + val.substring(4,6) + "-" + val.substring(6,8); } else if (val.matches("\\d\\d\\d\\d..")) { val = val.substring(0,4) + "-" + val.substring(4,6); } else if (val.matches("[0-9X]{4}W[0-9X]{2}.*")) { if (val.length() > 7) { val = 
val.substring(0,4) + "-" + val.substring(4,7) + "-" + val.substring(7); } else { val = val.substring(0,4) + "-" + val.substring(4,7); } } } } /*else { // SUTIME if ("DATE".equals(tmx.timexType())) { if (val.matches("\\d\\d\\dX")) { val = val.substring(0,3); // Convert 199X to 199 } } } */ attrFields[0] = "value"; attrFields[1] = val; attrPw.println(extString + "\t" + StringUtils.join(attrFields, "\t")); } } tokenIndex++; tokenCount++; } } } } } private static CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory(); private static CoreMap wordsToSentence(List sentWords) { String sentText = StringUtils.join(sentWords, " "); Annotation sentence = new Annotation(sentText); List tokens = new ArrayList<>(sentWords.size()); for (String text:sentWords) { CoreLabel token = tokenFactory.makeToken(); token.set(CoreAnnotations.TextAnnotation.class, text); tokens.add(token); } sentence.set(CoreAnnotations.TokensAnnotation.class, tokens); return sentence; } public static Annotation sentencesToDocument(String documentID, String docDate, List sentences) { String docText = ChunkAnnotationUtils.getTokenText(sentences, CoreAnnotations.TextAnnotation.class); Annotation document = new Annotation(docText); document.set(CoreAnnotations.DocIDAnnotation.class, documentID); document.set(CoreAnnotations.DocDateAnnotation.class, docDate); document.set(CoreAnnotations.SentencesAnnotation.class, sentences); // Accumulate docTokens and label sentence with overall token begin/end, and sentence index annotations List docTokens = new ArrayList<>(); int sentenceIndex = 0; int tokenBegin = 0; for (CoreMap sentenceAnnotation:sentences) { List sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class); docTokens.addAll(sentenceTokens); int tokenEnd = tokenBegin + sentenceTokens.size(); sentenceAnnotation.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin); sentenceAnnotation.set(CoreAnnotations.TokenEndAnnotation.class, tokenEnd); 
sentenceAnnotation.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex); sentenceIndex++; tokenBegin = tokenEnd; } document.set(CoreAnnotations.TokensAnnotation.class, docTokens); // Put in character offsets int i = 0; for (CoreLabel token:docTokens) { String tokenText = token.get(CoreAnnotations.TextAnnotation.class); token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i); i+=tokenText.length(); token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, i); i++; // Skip space } for (CoreMap sentenceAnnotation:sentences) { List sentenceTokens = sentenceAnnotation.get(CoreAnnotations.TokensAnnotation.class); sentenceAnnotation.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); sentenceAnnotation.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, sentenceTokens.get(sentenceTokens.size()-1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); } return document; } private static class TimexAttributes { public String tid; public int sentIndex; public int tokenStart; public int tokenEnd; public String text; public String type; public String value; public String context; public TimexAttributes(String tid, int sentIndex, int tokenIndex) { this.tid = tid; this.sentIndex = sentIndex; this.tokenStart = tokenIndex; this.tokenEnd = tokenIndex + 1; } } private static TimexAttributes findTimex(Map> timexMap, String docId, String tid) { // Find entry List list = timexMap.get(docId); for (TimexAttributes timex:list) { if (timex.tid.equals(tid)) { return timex; } } return null; } private static List updateTimexText(Map> timexMap, Annotation docAnnotation) { // Find entry String docId = docAnnotation.get(CoreAnnotations.DocIDAnnotation.class); List sents = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class); List list = timexMap.get(docId); if (list != null) { for (TimexAttributes timex:list) { CoreMap sent = sents.get(timex.sentIndex); List tokens = 
sent.get(CoreAnnotations.TokensAnnotation.class); timex.text = StringUtils.joinWords(tokens, " ", timex.tokenStart, timex.tokenEnd); timex.context = joinWordTags(tokens, " ", timex.tokenStart - 3, timex.tokenEnd + 3); /* StringBuilder sb = new StringBuilder(""); for (int i = timex.tokenStart; i < timex.tokenEnd; i++) { if (sb.length() > 0) { sb.append(" "); } sb.append(tokens.get(i).word()); } timex.text = sb.toString(); // Get context sb.setLength(0); int c1 = Math.max(0, timex.tokenStart - 3); int c2 = Math.min(tokens.size(), timex.tokenEnd + 3); for (int i = c1; i < c2; i++) { if (sb.length() > 0) { sb.append(" "); } sb.append(tokens.get(i).word()); } timex.context = sb.toString(); */ } return list; } return null; } private static Map> readTimexAttrExts(String extentsFile, String attrsFile) throws IOException { Map> timexMap = Generics.newHashMap(); BufferedReader extBr = IOUtils.getBufferedFileReader(extentsFile); String line; String lastDocId = null; TimexAttributes lastTimex = null; while ((line = extBr.readLine()) != null) { if (line.trim().isEmpty()) continue; // Simple tab delimited file String[] fields = line.split("\t"); String docName = fields[0]; int sentNo = Integer.parseInt(fields[1]); int tokenNo = Integer.parseInt(fields[2]); String tid = fields[4]; if (lastDocId != null && lastDocId.equals(docName) && lastTimex != null && lastTimex.tid.equals(tid)) { // Expand previous assert(lastTimex.sentIndex == sentNo); lastTimex.tokenEnd = tokenNo + 1; } else { lastDocId = docName; lastTimex = new TimexAttributes(tid, sentNo, tokenNo); List list = timexMap.get(docName); if (list == null) { timexMap.put(docName, list = new ArrayList<>()); } list.add(lastTimex); } } extBr.close(); BufferedReader attrBr = IOUtils.getBufferedFileReader(attrsFile); while ((line = attrBr.readLine()) != null) { if (line.trim().length() == 0) continue; // Simple tab delimited file String[] fields = line.split("\t"); String docName = fields[0]; int sentNo = 
Integer.parseInt(fields[1]); int tokenNo = Integer.parseInt(fields[2]); String tid = fields[4]; String attrname = fields[6]; String attrvalue = fields[7]; // Find entry TimexAttributes timex = findTimex(timexMap, docName, tid); assert(timex.sentIndex == sentNo); assert(timex.tokenStart <= tokenNo && timex.tokenEnd > tokenNo); switch (attrname) { case "type": assert (timex.type == null || timex.type.equals(attrvalue)); timex.type = attrvalue; break; case "value": assert (timex.value == null || timex.value.equals(attrvalue)); timex.value = attrvalue; break; default: throw new RuntimeException("Error processing " + attrsFile + ":" + "Unknown attribute " + attrname + ": from line " + line); } } attrBr.close(); return timexMap; } public static void processTempEval2Tab(AnnotationPipeline pipeline, String in, String out, Map docDates) throws IOException { Map> timexMap = readTimexAttrExts(in + "/timex-extents.tab", in + "/timex-attributes.tab"); BufferedReader br = IOUtils.getBufferedFileReader(in + "/base-segmentation.tab"); PrintWriter debugPw = IOUtils.getPrintWriter(out + "/timex-debug.out"); PrintWriter attrPw = IOUtils.getPrintWriter(out + "/timex-attrs.res.tab"); PrintWriter extPw = IOUtils.getPrintWriter(out + "/timex-extents.res.tab"); PrintWriter attrDebugPwGold = IOUtils.getPrintWriter(out + "/timex-attrs.debug.gold.tab"); PrintWriter attrDebugPw = IOUtils.getPrintWriter(out + "/timex-attrs.debug.res.tab"); String line; String curDocName = null; int curSentNo = -1; List tokens = null; List sentences = null; while ((line = br.readLine()) != null) { if (line.trim().length() == 0) continue; // Simple tab delimited file String[] fields = line.split("\t"); String docName = fields[0]; int sentNo = Integer.parseInt(fields[1]); //int tokenNo = Integer.parseInt(fields[2]); String tokenText = fields[3]; // Create little annotation with sentences and tokens if (!docName.equals(curDocName)) { if (curDocName != null) { // Process document CoreMap lastSentence = 
wordsToSentence(tokens); sentences.add(lastSentence); Annotation docAnnotation = sentencesToDocument(curDocName, docDates.get(curDocName), sentences); processTempEval2Doc(pipeline, docAnnotation, timexMap, extPw, attrPw, debugPw, attrDebugPwGold, attrDebugPw); curDocName = null; } // New doc tokens = new ArrayList<>(); sentences = new ArrayList<>(); } else if (curSentNo != sentNo) { CoreMap lastSentence = wordsToSentence(tokens); sentences.add(lastSentence); tokens = new ArrayList<>(); } tokens.add(tokenText); curDocName = docName; curSentNo = sentNo; } if (curDocName != null) { // Process document CoreMap lastSentence = wordsToSentence(tokens); sentences.add(lastSentence); Annotation docAnnotation = sentencesToDocument(curDocName, docDates.get(curDocName), sentences); processTempEval2Doc(pipeline, docAnnotation, timexMap, extPw, attrPw, debugPw, attrDebugPwGold, attrDebugPw); curDocName = null; } br.close(); extPw.close(); attrPw.close(); debugPw.close(); attrDebugPwGold.close(); attrDebugPw.close(); } public static void processTempEval2(AnnotationPipeline pipeline, String in, String out, String eval, String dct) throws IOException, ParseException { Map docDates = (dct != null)? 
IOUtils.readMap(dct):IOUtils.readMap(in + "/dct.txt"); if (requiredDocDateFormat != null) { // convert from yyyyMMdd to requiredDocDateFormat DateFormat defaultFormatter = new SimpleDateFormat("yyyyMMdd"); DateFormat requiredFormatter = new SimpleDateFormat(requiredDocDateFormat); for (Map.Entry docDateEntry : docDates.entrySet()) { Date date = defaultFormatter.parse(docDateEntry.getValue()); docDates.put(docDateEntry.getKey(), requiredFormatter.format(date)); } } processTempEval2Tab(pipeline, in, out, docDates); if (eval != null) { List command = new ArrayList<>(); if (PYTHON != null) { command.add(PYTHON); } command.add(eval); command.add(in + "/base-segmentation.tab"); command.add(in + "/timex-extents.tab"); command.add(out + "/timex-extents.res.tab"); command.add(in + "/timex-attributes.tab"); command.add(out + "/timex-attrs.res.tab"); ProcessBuilder pb = new ProcessBuilder(command); FileOutputStream evalFileOutput = new FileOutputStream(out + "/scores.txt"); Writer output = new OutputStreamWriter( new TeeStream(System.out, evalFileOutput)); SystemUtils.run(pb, output, null); evalFileOutput.close(); } } public static void processTempEval3(AnnotationPipeline pipeline, String in, String out, String evalCmd) throws Exception { // Process files File inFile = new File(in); if (inFile.isDirectory()) { // input is a directory - process files in directory Pattern teinputPattern = Pattern.compile("\\.(TE3input|tml)$"); Iterable files = IOUtils.iterFilesRecursive(inFile, teinputPattern); File outDir = new File(out); outDir.mkdirs(); for (File file: files) { String inputFilename = file.getAbsolutePath(); String outputFilename = inputFilename.replace(in, out).replace(".TE3input", ""); if (!outputFilename.equalsIgnoreCase(inputFilename)) { //System.out.println(inputFilename + " => " + outputFilename); processTempEval3File(pipeline, inputFilename, outputFilename); } else { log.info("ABORTING: Input file and output is the same - " + inputFilename); System.exit(-1); } } } else 
{ // input is a file - process file processTempEval3File(pipeline, in, out); } // Evaluate if (evalCmd != null) { // TODO: apply eval command } } public static void processTempEval3File(AnnotationPipeline pipeline, String in, String out) throws Exception { // Process one tempeval file Document doc = edu.stanford.nlp.util.XMLUtils.readDocumentFromFile(in); Node timemlNode = XMLUtils.getNode(doc, "TimeML"); Node docIdNode = XMLUtils.getNode(timemlNode, "DOCID"); Node dctNode = XMLUtils.getNode(timemlNode, "DCT"); Node dctTimexNode = XMLUtils.getNode(dctNode, "TIMEX3"); Node titleNode = XMLUtils.getNode(timemlNode, "TITLE"); Node extraInfoNode = XMLUtils.getNode(timemlNode, "EXTRA_INFO"); Node textNode = XMLUtils.getNode(timemlNode, "TEXT"); String date = XMLUtils.getAttributeValue(dctTimexNode, "value"); String text = textNode.getTextContent(); Annotation annotation = textToAnnotation(pipeline, text, date); Element annotatedTextElem = annotationToTmlTextElement(annotation); Document annotatedDoc = XMLUtils.createDocument(); Node newTimemlNode = annotatedDoc.importNode(timemlNode, false); if(docIdNode != null){ newTimemlNode.appendChild(annotatedDoc.importNode(docIdNode, true)); } newTimemlNode.appendChild(annotatedDoc.importNode(dctNode, true)); if (titleNode != null) { newTimemlNode.appendChild(annotatedDoc.importNode(titleNode, true)); } if (extraInfoNode != null) { newTimemlNode.appendChild(annotatedDoc.importNode(extraInfoNode, true)); } newTimemlNode.appendChild(annotatedDoc.adoptNode(annotatedTextElem)); annotatedDoc.appendChild(newTimemlNode); PrintWriter pw = (out != null)? 
IOUtils.getPrintWriter(out):new PrintWriter(System.out);
    String string = XMLUtils.documentToString(annotatedDoc);
    pw.println(string);
    pw.flush();
    if (out != null) pw.close();
  }

  // Date-format pattern the document date must satisfy (only set when HeidelTime is selected).
  private static String requiredDocDateFormat;
  // True when the GUTime annotator was selected via -timeAnnotator gutime.
  private static boolean useGUTime = false;

  /**
   * Builds the annotation pipeline used by every processing mode.
   *
   * @param props    runtime properties; "timeAnnotator" selects "sutime" (default),
   *                 "gutime", or "heideltime"
   * @param tokenize whether to prepend tokenization and sentence splitting
   *                 (TempEval-2 input is pre-tokenized, so that mode passes false)
   * @return the assembled pipeline
   * @throws Exception if an annotator fails to initialize
   */
  public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception {
//    useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false"));
    AnnotationPipeline pipeline = new AnnotationPipeline();
    if (tokenize) {
      pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
      pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
    }
    pipeline.addAnnotator(new POSTaggerAnnotator(false));
//    pipeline.addAnnotator(new NumberAnnotator(false));
//    pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));
    String timeAnnotator = props.getProperty("timeAnnotator", "sutime");
    switch (timeAnnotator) {
      case "gutime":
        useGUTime = true;
        pipeline.addAnnotator(new GUTimeAnnotator("gutime", props));
        break;
      case "heideltime":
        // HeidelTime requires ISO-formatted document dates.
        requiredDocDateFormat = "yyyy-MM-dd";
        pipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props));
        break;
      case "sutime":
        pipeline.addAnnotator(new TimeAnnotator("sutime", props));
        break;
      default:
        throw new IllegalArgumentException("Unknown timeAnnotator: " + timeAnnotator);
    }
    return pipeline;
  }

  /** Supported input formats, selected on the command line with {@code -in.type}. */
  enum InputType { TEXTFILE, TEXT, TIMEBANK_CSV, TEMPEVAL2, TEMPEVAL3 }

  /**
   * Configures java.util.logging so that SEVERE messages go to the console and
   * INFO-level edu.stanford.nlp messages go to {@code out}/err.log, creating the
   * output directory if needed.
   *
   * @param out output directory for the log file
   * @throws IOException if the logging configuration cannot be read
   */
  private static void configLogger(String out) throws IOException {
    File outDir = new File(out);
    if (!outDir.exists()) {
      outDir.mkdirs();
    }
    StringBuilder sb = new StringBuilder();
    sb.append("handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler\n");
    sb.append(".level=SEVERE\n");
    sb.append("edu.stanford.nlp.level=INFO\n");
    sb.append("java.util.logging.ConsoleHandler.level=SEVERE\n");
    sb.append("java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter\n");
    sb.append("java.util.logging.FileHandler.level=INFO\n");
    sb.append("java.util.logging.FileHandler.pattern=" + out + "/err.log" + "\n");
    LogManager.getLogManager().readConfiguration(new ReaderInputStream(new StringReader(sb.toString())));
  }

  /**
   * Converts timex annotations over {@code str} into DOM nodes: plain text is
   * emitted as text nodes and each timex as a TIMEX element. The annotations are
   * sorted (containing intervals first) and handed to
   * {@link #createTimexNodesPresorted}.
   *
   * @param str             the document text the offsets refer to
   * @param charBeginOffset character offset of {@code str} within the document; null means 0
   * @param timexAnns       timex annotations carrying character-offset and Timex values
   * @return DOM nodes covering all of {@code str}
   */
  private static List<Node> createTimexNodes(String str, Integer charBeginOffset, List<CoreMap> timexAnns) {
    List<ValuedInterval<CoreMap, Integer>> timexList = new ArrayList<>(timexAnns.size());
    for (CoreMap timexAnn : timexAnns) {
      timexList.add(new ValuedInterval<>(timexAnn,
          MatchedExpression.COREMAP_TO_CHAR_OFFSETS_INTERVAL_FUNC.apply(timexAnn)));
    }
    Collections.sort(timexList, HasInterval.CONTAINS_FIRST_ENDPOINTS_COMPARATOR);
    return createTimexNodesPresorted(str, charBeginOffset, timexList);
  }

  /**
   * Builds the DOM node list for an already-sorted timex list. A timex that starts
   * before the previous one ended is treated as nested: it is recorded against the
   * enclosing (most recently emitted) timex and later rendered recursively inside
   * that element, replacing its children.
   *
   * @param str             text span the offsets refer to
   * @param charBeginOffset offset of {@code str} within the document; null means 0
   * @param timexList       timexes sorted so containing intervals come first
   * @return text and TIMEX nodes covering {@code str}
   */
  private static List<Node> createTimexNodesPresorted(String str, Integer charBeginOffset,
                                                      List<ValuedInterval<CoreMap, Integer>> timexList) {
    if (charBeginOffset == null) charBeginOffset = 0;
    List<Node> nodes = new ArrayList<>();
    int previousEnd = 0;
    List<Element> timexElems = new ArrayList<>();
    List<ValuedInterval<CoreMap, Integer>> processed = new ArrayList<>();
    CollectionValuedMap<Integer, ValuedInterval<CoreMap, Integer>> unprocessed =
        new CollectionValuedMap<>(CollectionFactory.<ValuedInterval<CoreMap, Integer>>arrayListFactory());
    for (ValuedInterval<CoreMap, Integer> v : timexList) {
      CoreMap timexAnn = v.getValue();
      int begin = timexAnn.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) - charBeginOffset;
      int end = timexAnn.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) - charBeginOffset;
      if (begin >= previousEnd) {
        // Add text
        nodes.add(XMLUtils.createTextNode(str.substring(previousEnd, begin)));
        // Add timex
        Timex timex = timexAnn.get(TimeAnnotations.TimexAnnotation.class);
        Element timexElem = timex.toXmlElement();
        nodes.add(timexElem);
        previousEnd = end;
        // For handling nested timexes
        processed.add(v);
        timexElems.add(timexElem);
      } else {
        // Overlaps the previous timex: defer it, keyed by the index of its enclosing
        // timex in `processed`. NOTE(review): if the very first timex overlapped,
        // the key would be -1 and the lookup below would throw — presumably the sort
        // order guarantees the first interval never overlaps; verify upstream.
        unprocessed.add(processed.size() - 1, v);
      }
    }
    if (previousEnd < str.length()) {
      nodes.add(XMLUtils.createTextNode(str.substring(previousEnd)));
    }
    // Render each nested group recursively inside its enclosing TIMEX element.
    for (Integer i : unprocessed.keySet()) {
      ValuedInterval<CoreMap, Integer> v = processed.get(i);
      String elemStr = v.getValue().get(CoreAnnotations.TextAnnotation.class);
      int charStart = v.getValue().get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      List<Node> innerElems = createTimexNodesPresorted(elemStr, charStart,
          (List<ValuedInterval<CoreMap, Integer>>) unprocessed.get(i));
      Element timexElem = timexElems.get(i);
      XMLUtils.removeChildren(timexElem);
      for (Node n : innerElems) {
        timexElem.appendChild(n);
      }
    }
    return nodes;
  }

  /**
   * Annotates the contents of file {@code in} and writes the TimeML-style XML to
   * {@code out} (or stdout when {@code out} is null).
   *
   * @param pipeline the annotation pipeline
   * @param in       input text file
   * @param out      output file, or null for stdout
   * @param date     document date used to resolve relative time expressions
   * @throws IOException if the input cannot be read or the output opened
   */
  public static void processTextFile(AnnotationPipeline pipeline, String in, String out, String date) throws IOException {
    String text = IOUtils.slurpFile(in);
    PrintWriter pw = (out != null) ? IOUtils.getPrintWriter(out) : new PrintWriter(System.out);
    String string = textToAnnotatedXml(pipeline, text, date);
    pw.println(string);
    pw.flush();
    if (out != null) pw.close();
  }

  /**
   * Annotates the literal string {@code text} and writes the TimeML-style XML to
   * {@code out} (or stdout when {@code out} is null).
   *
   * @param pipeline the annotation pipeline
   * @param text     text to annotate
   * @param out      output file, or null for stdout
   * @param date     document date used to resolve relative time expressions
   * @throws IOException if the output cannot be opened
   */
  public static void processText(AnnotationPipeline pipeline, String text, String out, String date) throws IOException {
    PrintWriter pw = (out != null) ? IOUtils.getPrintWriter(out) : new PrintWriter(System.out);
    String string = textToAnnotatedXml(pipeline, text, date);
    pw.println(string);
    pw.flush();
    if (out != null) pw.close();
  }

  /** Annotates {@code text} (with document date {@code date}) and returns the XML as a string. */
  public static String textToAnnotatedXml(AnnotationPipeline pipeline, String text, String date) {
    Annotation annotation = textToAnnotation(pipeline, text, date);
    Document xmlDoc = annotationToXmlDocument(annotation);
    return XMLUtils.documentToString(xmlDoc);
  }

  /**
   * Wraps the annotation's text and timexes in a TimeML {@code <TEXT>} element.
   *
   * @param annotation annotation holding TextAnnotation, CharacterOffsetBeginAnnotation,
   *                   and TimexAnnotations values
   * @return the populated TEXT element
   */
  public static Element annotationToTmlTextElement(Annotation annotation) {
    List<CoreMap> timexAnnsAll = annotation.get(TimeAnnotations.TimexAnnotations.class);
    Element textElem = XMLUtils.createElement("TEXT");
    List<Node> timexNodes = createTimexNodes(
        annotation.get(CoreAnnotations.TextAnnotation.class),
        annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
        timexAnnsAll);
    for (Node node : timexNodes) {
      textElem.appendChild(node);
    }
    return textElem;
  }

  /**
   * Builds a {@code <DOC><DATE>…</DATE><TEXT>…</TEXT></DOC>} document from an
   * annotated {@link Annotation}.
   */
  public static Document annotationToXmlDocument(Annotation annotation) {
    Element dateElem = XMLUtils.createElement("DATE");
    dateElem.setTextContent(annotation.get(CoreAnnotations.DocDateAnnotation.class));
    Element textElem = annotationToTmlTextElement(annotation);
    Element docElem = XMLUtils.createElement("DOC");
    docElem.appendChild(dateElem);
    docElem.appendChild(textElem);
    // Create document and import elements into this document....
    Document doc = XMLUtils.createDocument();
    doc.appendChild(doc.importNode(docElem, true));
    return doc;
  }

  /**
   * Runs the pipeline over {@code text} with the document date set to {@code date}.
   *
   * @return the annotated Annotation
   */
  public static Annotation textToAnnotation(AnnotationPipeline pipeline, String text, String date) {
    Annotation annotation = new Annotation(text);
    annotation.set(CoreAnnotations.DocDateAnnotation.class, date);
    pipeline.annotate(annotation);
    return annotation;
  }

  /**
   * Command-line entry point; see the class Javadoc for the supported flags.
   * Dispatches on {@code -in.type} to the matching processing mode.
   */
  public static void main(String[] args) throws Exception {
    // Process arguments
    Properties props = StringUtils.argsToProperties(args);

    String in = props.getProperty("i");
    String date = props.getProperty("date");
    String dct = props.getProperty("tempeval2.dct");
    String out = props.getProperty("o");
    String inputTypeStr = props.getProperty("in.type", InputType.TEXT.name());
    String eval = props.getProperty("eval");
    PYTHON = props.getProperty("python", PYTHON);
    InputType inputType = InputType.valueOf(inputTypeStr);
    AnnotationPipeline pipeline;
    switch (inputType) {
      case TEXT:
        pipeline = getPipeline(props, true);
        processText(pipeline, in, out, date);
        break;
      case TEXTFILE:
        pipeline = getPipeline(props, true);
        processTextFile(pipeline, in, out, date);
        break;
      case TIMEBANK_CSV:
        configLogger(out);
        pipeline = getPipeline(props, true);
        processTimebankCsv(pipeline, in, out, eval);
        break;
      case TEMPEVAL2:
        configLogger(out);
        pipeline = getPipeline(props, false);
        processTempEval2(pipeline, in, out, eval, dct);
        break;
      case TEMPEVAL3:
        pipeline = getPipeline(props, true);
        processTempEval3(pipeline, in, out, eval);
        break;
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy