All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.time.HeidelTimeAnnotator Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.time;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.*;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.*;
import java.util.regex.Pattern;


/**
 * Annotates text using HeidelTime.
 *
 * GUTIME/TimeML specifications can be found at:
 * 
 * http://www.timeml.org/site/tarsqi/modules/gutime/index.html.
 *
 * @author Gabor Angeli
 */
public class HeidelTimeAnnotator implements Annotator {

  // TODO HeidelTime doesn't actually run on the NLP machines :( (TreeTagger doesn't run.)
  // This could probably be fixed in newer HeidelTime versions, which even support using our tagger.

  private static final String BASE_PATH = "$NLP_DATA_HOME/packages/heideltime/";
  private static final String DEFAULT_PATH = DataFilePaths.convert(BASE_PATH);
  private final File heideltimePath;
  private final boolean outputResults;

  // if used in a pipeline or constructed with a Properties object,
  // this property tells the annotator where to find the script
  public static final String HEIDELTIME_PATH_PROPERTY = "heideltime.path";
  public static final String HEIDELTIME_OUTPUT_RESULTS = "heideltime.outputResults";

  public HeidelTimeAnnotator() {
    this(new File(System.getProperty("heideltime", DEFAULT_PATH)));
  }

  public HeidelTimeAnnotator(File heideltimePath) {
    this.heideltimePath = heideltimePath;
    this.outputResults = false;
  }

  public HeidelTimeAnnotator(String name, Properties props) {
    String path = props.getProperty(HEIDELTIME_PATH_PROPERTY,
            System.getProperty("heideltime",
                    DEFAULT_PATH));
    this.heideltimePath = new File(path);

    this.outputResults =
            Boolean.valueOf(props.getProperty(HEIDELTIME_OUTPUT_RESULTS, "false"));
  }

  @Override
  public void annotate(Annotation annotation) {
    try {
      this.annotate((CoreMap)annotation);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  public void annotate(CoreMap document) throws IOException {
    //--Create Input File
    //(create file)
    File inputFile = File.createTempFile("heideltime", ".input");
    //(write to file)
    PrintWriter inputWriter = new PrintWriter(inputFile);
    inputWriter.println(document.get(CoreAnnotations.TextAnnotation.class));
    inputWriter.close();

    //--Get Date
    //(error checks)
    if(!document.containsKey(CoreAnnotations.CalendarAnnotation.class) && !document.containsKey(CoreAnnotations.DocDateAnnotation.class)){
      throw new IllegalArgumentException("CoreMap must have either a Calendar or DocDate annotation"); //not strictly necessary, technically...
    }
    //(variables)
    Calendar dateCalendar = document.get(CoreAnnotations.CalendarAnnotation.class);
    String pubDate = null;
    if (dateCalendar != null) {
      //(case: calendar annotation)
      pubDate = String.format("%TF", dateCalendar);
    } else {
      //(case: docdateannotation)
      String s = document.get(CoreAnnotations.DocDateAnnotation.class);
      if (s != null) {
        pubDate = s;
      }
    }

    //--Build Command
    ArrayList args = new ArrayList<>();
    args.add("java");
    args.add("-jar"); args.add(this.heideltimePath.getPath() + "/heideltime.jar");
    args.add("-c"); args.add(this.heideltimePath.getPath()+"/config.props");
    args.add("-t"); args.add("NEWS");
    if(pubDate != null){
      args.add("-dct"); args.add(pubDate);
    }
    args.add(inputFile.getPath());
    // run HeidelTime on the input file
    ProcessBuilder process = new ProcessBuilder(args);

    StringWriter outputWriter = new StringWriter();
    SystemUtils.run(process, outputWriter, null);
    String output = outputWriter.getBuffer().toString();
    Pattern docClose = Pattern.compile(".*", Pattern.DOTALL);
    output = docClose.matcher(output).replaceAll("").replaceAll("",""); //TODO TimeML.dtd? FileNotFoundException if we leave it in
    Pattern badNestedTimex = Pattern.compile(Pattern.quote("IMEX3"));
    output = badNestedTimex.matcher(output).replaceAll("MEX3"));
    output = badNestedTimex2.matcher(output).replaceAll("\\n\\n","");

    // parse the HeidelTime output
    Element outputXML;
    try {
      outputXML = XMLUtils.parseElement(output);
    } catch (Exception ex) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s",
              ex, IOUtils.slurpFile(inputFile), output), ex);
    }
    inputFile.delete();

    // get Timex annotations
    List timexAnns = toTimexCoreMaps(outputXML, document);
    document.set(TimeAnnotations.TimexAnnotations.class, timexAnns);
    if (outputResults) {
      System.out.println(timexAnns);
    }

    // align Timex annotations to sentences
    int timexIndex = 0;
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {
      int sentBegin = beginOffset(sentence);
      int sentEnd = endOffset(sentence);

      // skip times before the sentence
      while (timexIndex < timexAnns.size() && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
        ++timexIndex;
      }

      // determine times within the sentence
      int sublistBegin = timexIndex;
      int sublistEnd = timexIndex;
      while (timexIndex < timexAnns.size() &&
              sentBegin <= beginOffset(timexAnns.get(timexIndex)) &&
              endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
        ++sublistEnd;
        ++timexIndex;
      }

      // set the sentence timexes
      sentence.set(TimeAnnotations.TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
    }

  }


  private static int beginOffset(CoreMap ann) {
    return ann.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  }

  private static int endOffset(CoreMap ann) {
    return ann.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  }

  private static List toTimexCoreMaps(Element docElem, CoreMap originalDocument) {
    //--Collect Token Offsets
    Map beginMap = Generics.newHashMap();
    Map endMap = Generics.newHashMap();
    boolean haveTokenOffsets = true;
    for(CoreMap sent : originalDocument.get(CoreAnnotations.SentencesAnnotation.class)){
      for(CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)){
        Integer tokBegin = token.get(CoreAnnotations.TokenBeginAnnotation.class);
        Integer tokEnd = token.get(CoreAnnotations.TokenEndAnnotation.class);
        if(tokBegin == null || tokEnd == null){ haveTokenOffsets = false; }
        int charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        beginMap.put(charBegin,tokBegin);
        endMap.put(charEnd,tokEnd);
      }
    }
    List timexMaps = new ArrayList<>();
    int offset = 0;
    NodeList docNodes = docElem.getChildNodes();
    for (int i = 0; i < docNodes.getLength(); i++) {
      Node content = docNodes.item(i);
      if (content instanceof Text) {
        Text text = (Text)content;
        offset += text.getWholeText().length();
      } else if (content instanceof Element) {
        Element child = (Element)content;
        if (child.getNodeName().equals("TIMEX3")) {
          Timex timex = new Timex(child);
          if (child.getChildNodes().getLength() != 1) {
            throw new RuntimeException("TIMEX3 should only contain text " + child);
          }
          String timexText = child.getTextContent();
          CoreMap timexMap = new ArrayCoreMap();
          timexMap.set(TimeAnnotations.TimexAnnotation.class, timex);
          timexMap.set(CoreAnnotations.TextAnnotation.class, timexText);
          int charBegin = offset;
          timexMap.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
          offset += timexText.length();
          timexMap.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
          int charEnd = offset;
          //(tokens)
          if(haveTokenOffsets){
            Integer tokBegin = beginMap.get(charBegin);
            int searchStep = 1;          //if no exact match, search around the character offset
            while(tokBegin == null){
              tokBegin = beginMap.get(charBegin - searchStep);
              if(tokBegin == null){
                tokBegin = beginMap.get(charBegin + searchStep);
              }
              searchStep += 1;
            }
            searchStep = 1;
            Integer tokEnd = endMap.get(charEnd);
            while(tokEnd == null){
              tokEnd = endMap.get(charEnd - searchStep);
              if(tokEnd == null){
                tokEnd = endMap.get(charEnd + searchStep);
              }
              searchStep += 1;
            }
            timexMap.set(CoreAnnotations.TokenBeginAnnotation.class, tokBegin);
            timexMap.set(CoreAnnotations.TokenEndAnnotation.class, tokEnd);
          }
          timexMaps.add(timexMap);
        } else {
          throw new RuntimeException("unexpected element " + child);
        }
      } else {
        throw new RuntimeException("unexpected content " + content);
      }
    }
    return timexMaps;
  }

  @Override
  public Set> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
    )));
  }

  @Override
  public Set> requirementsSatisfied() {
    return Collections.singleton(TimeAnnotations.TimexAnnotations.class);
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy