edu.stanford.nlp.util.TSVUtils Maven / Gradle / Ivy

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
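
As an illustrative sketch only (not part of this artifact's source, but using the standard CoreNLP API), such annotations are typically produced by configuring a StanfordCoreNLP pipeline and annotating raw text; the TSVUtils class below instead reconstructs the same kind of Annotation objects from pre-computed columns in a TSV export:

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

public class PipelineSketch {
  public static void main(String[] args) {
    // Standard annotator chain: tokenization, sentence splitting, POS, lemmas, NER, dependencies.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,depparse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // Annotate a small document; the result is the same Annotation type that
    // TSVUtils.parseSentence (below) builds directly from TSV columns.
    Annotation doc = new Annotation("Chess is not a predominantly physical sport.");
    pipeline.annotate(doc);
  }
}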

package edu.stanford.nlp.util;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.GrammaticalRelation;

import javax.json.*;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.regex.Pattern;

/**
 * A set of utilities for parsing TSV files into CoreMaps
 *
 * @author Gabor Angeli
 */
public class TSVUtils {

  static String unescapeSQL(String input) {
    // If the string is quoted
    if (input.startsWith("\"") && input.endsWith("\"")) {
      input = input.substring(1, input.length()-1);
    }
    return input.replace("\"\"","\"").replace("\\\\", "\\");
  }
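
  // Illustrative examples (not in the original source) of the escaping handled above,
  // assuming Postgres-style output where quotes are doubled inside quoted values:
  //   unescapeSQL("\"He said \"\"hi\"\"\"")  ->  He said "hi"   (outer quotes stripped, "" collapsed)
  //   unescapeSQL("a\\\\b")                  ->  a\b            (escaped backslash collapsed)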


  /**
   * Parse an SQL array.
   * @param array The array to parse.
   * @return The parsed array, as a list.
   */
  public static List<String> parseArray(String array) {
    array = unescapeSQL(array);
    if (array.startsWith("{") && array.endsWith("}")) array = array.substring(1, array.length()-1);
    char[] input = array.toCharArray();
    List<String> output = new ArrayList<>();
    StringBuilder elem = new StringBuilder();
    boolean inQuotes = false;
    boolean escaped = false;
    for (char c : input) {
      if (escaped) {
        elem.append(c);
        escaped = false;
      } else if (c == '"') {
        inQuotes = !inQuotes;
        escaped = false;
      } else if (c == '\\') {
        escaped = true;
      } else {
        if (inQuotes) {
          elem.append(c);
        } else if (c == ',') {
          output.add(elem.toString());
          elem.setLength(0);  // This is basically .clear()
        } else {
          elem.append(c);
        }
        escaped = false;
      }
    }
    if (elem.length() > 0) {
      output.add(elem.toString());
    }
    return output;
  }
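
  // Illustrative example (not in the original source): given a Postgres-style array
  // literal, elements are split on commas that fall outside double quotes, e.g.
  //   parseArray("{hello,\"wor\\\"ld\",\"a,b\"}")
  // returns the three elements  hello | wor"ld | a,b  (quotes and backslash escapes removed).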

  private static final Pattern newline = Pattern.compile("\\\\n");
  private static final Pattern tab = Pattern.compile("\\\\t");

  /**
   * Parse a CoNLL formatted tree into a SemanticGraph.
   * @param conll The CoNLL tree to parse.
   * @param tokens The tokens of the sentence, to form the backing labels of the tree.
   * @return A semantic graph of the sentence, according to the given tree.
   */
  public static SemanticGraph parseTree(String conll, List<CoreLabel> tokens) {
    SemanticGraph tree = new SemanticGraph();
    if (conll == null || conll.isEmpty()) {
      return tree;
    }
    String[] treeLines = newline.split(conll);
    IndexedWord[] vertices = new IndexedWord[tokens.size() + 2];
    // Add edges
    for (String line : treeLines) {
      // Parse row
      String[] fields = tab.split(line);
      int dependentIndex = Integer.parseInt(fields[0]);
      if (vertices[dependentIndex] == null) {
        if (dependentIndex > tokens.size()) {
          // Bizarre mismatch in sizes; the malt parser seems to do this often
          return new SemanticGraph();
        }
        vertices[dependentIndex] = new IndexedWord(tokens.get(dependentIndex - 1));
      }
      IndexedWord dependent = vertices[dependentIndex];
      int governorIndex = Integer.parseInt(fields[1]);
      if (governorIndex > tokens.size()) {
        // Bizarre mismatch in sizes; the malt parser seems to do this often
        return new SemanticGraph();
      }
      if (vertices[governorIndex] == null && governorIndex > 0) {
        vertices[governorIndex] = new IndexedWord(tokens.get(governorIndex - 1));
      }
      IndexedWord governor = vertices[governorIndex];
      String relation = fields[2];

      // Process row
      if (governorIndex == 0) {
        tree.addRoot(dependent);
      } else {
        tree.addVertex(dependent);
        if (!tree.containsVertex(governor)) {
          tree.addVertex(governor);
        }
        if (!"ref".equals(relation)) {
          tree.addEdge(governor, dependent, GrammaticalRelation.valueOf(Language.English, relation), Double.NEGATIVE_INFINITY, false);
        }
      }
    }
    return tree;
  }
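
  // Illustrative example (not in the original source): rows are separated by the literal
  // two-character sequence \n and fields by the literal \t (as they appear in an escaped
  // TSV dump), with columns (dependentIndex, governorIndex, relation). For the tokens
  // "Cats sleep", the string
  //   "1\\t2\\tnsubj\\n2\\t0\\troot"
  // yields a graph rooted at "sleep" with a single nsubj edge from "sleep" to "Cats".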

  /**
   * Parse a JSON formatted tree into a SemanticGraph.
   * @param jsonString The JSON string tree to parse, e.g:
   * "[{\"\"dependent\"\": 7, \"\"dep\"\": \"\"root\"\", \"\"governorgloss\"\": \"\"root\"\", \"\"governor\"\": 0, \"\"dependentgloss\"\": \"\"sport\"\"}, {\"\"dependent\"\": 1, \"\"dep\"\": \"\"nsubj\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"chess\"\"}, {\"\"dependent\"\": 2, \"\"dep\"\": \"\"cop\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"is\"\"}, {\"\"dependent\"\": 3, \"\"dep\"\": \"\"neg\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"not\"\"}, {\"\"dependent\"\": 4, \"\"dep\"\": \"\"det\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"a\"\"}, {\"\"dependent\"\": 5, \"\"dep\"\": \"\"advmod\"\", \"\"governorgloss\"\": \"\"physical\"\", \"\"governor\"\": 6, \"\"dependentgloss\"\": \"\"predominantly\"\"}, {\"\"dependent\"\": 6, \"\"dep\"\": \"\"amod\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"physical\"\"}, {\"\"dependent\"\": 9, \"\"dep\"\": \"\"advmod\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"yet\"\"}, {\"\"dependent\"\": 10, \"\"dep\"\": \"\"nsubj\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"neither\"\"}, {\"\"dependent\"\": 11, \"\"dep\"\": \"\"cop\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"are\"\"}, {\"\"dependent\"\": 12, \"\"dep\"\": \"\"parataxis\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"shooting\"\"}, {\"\"dependent\"\": 13, \"\"dep\"\": \"\"cc\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"and\"\"}, {\"\"dependent\"\": 14, \"\"dep\"\": \"\"parataxis\"\", \"\"governorgloss\"\": \"\"sport\"\", \"\"governor\"\": 7, \"\"dependentgloss\"\": \"\"curling\"\"}, {\"\"dependent\"\": 14, \"\"dep\"\": \"\"conj:and\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"curling\"\"}, {\"\"dependent\"\": 16, \"\"dep\"\": \"\"nsubjpass\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"which\"\"}, {\"\"dependent\"\": 18, \"\"dep\"\": \"\"case\"\", \"\"governorgloss\"\": \"\"fact\"\", \"\"governor\"\": 19, \"\"dependentgloss\"\": \"\"in\"\"}, {\"\"dependent\"\": 19, \"\"dep\"\": \"\"nmod:in\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"fact\"\"}, {\"\"dependent\"\": 21, \"\"dep\"\": \"\"aux\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"has\"\"}, {\"\"dependent\"\": 22, \"\"dep\"\": \"\"auxpass\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"been\"\"}, {\"\"dependent\"\": 23, \"\"dep\"\": \"\"dep\"\", \"\"governorgloss\"\": \"\"shooting\"\", \"\"governor\"\": 12, \"\"dependentgloss\"\": \"\"nicknamed\"\"}, {\"\"dependent\"\": 25, \"\"dep\"\": \"\"dobj\"\", \"\"governorgloss\"\": \"\"nicknamed\"\", \"\"governor\"\": 23, \"\"dependentgloss\"\": \"\"chess\"\"}, {\"\"dependent\"\": 26, \"\"dep\"\": \"\"case\"\", \"\"governorgloss\"\": \"\"ice\"\", \"\"governor\"\": 27, \"\"dependentgloss\"\": \"\"on\"\"}, {\"\"dependent\"\": 27, \"\"dep\"\": \"\"nmod:on\"\", \"\"governorgloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentgloss\"\": \"\"ice\"\"}, {\"\"dependent\"\": 29, 
\"\"dep\"\": \"\"amod\"\", \"\"governorgloss\"\": \"\"chess\"\", \"\"governor\"\": 25, \"\"dependentgloss\"\": \"\"5\"\"}]");
   * @param tokens The tokens of the sentence, to form the backing labels of the tree.
   * @return A semantic graph of the sentence, according to the given tree.
   */
  public static SemanticGraph parseJsonTree(String jsonString, List<CoreLabel> tokens) {
    // Escape quoted string parts
    JsonReader json = Json.createReader(new StringReader(jsonString));
    SemanticGraph tree = new SemanticGraph();
    JsonArray array = json.readArray();

    if (array == null || array.isEmpty()) {
      return tree;
    }

    IndexedWord[] vertices = new IndexedWord[tokens.size() + 2];
    // Add edges
    for(int i = 0; i < array.size(); i++) {
      JsonObject entry = array.getJsonObject(i);
      // Parse row
      int dependentIndex = entry.getInt("dependent");
      if (vertices[dependentIndex] == null) {
        if (dependentIndex > tokens.size()) {
          // Bizarre mismatch in sizes; the malt parser seems to do this often
          return new SemanticGraph();
        }
        vertices[dependentIndex] = new IndexedWord(tokens.get(dependentIndex - 1));
      }
      IndexedWord dependent = vertices[dependentIndex];
      int governorIndex = entry.getInt("governor");
      if (governorIndex > tokens.size()) {
        // Bizarre mismatch in sizes; the malt parser seems to do this often
        return new SemanticGraph();
      }
      if (vertices[governorIndex] == null && governorIndex > 0) {
        vertices[governorIndex] = new IndexedWord(tokens.get(governorIndex - 1));
      }
      IndexedWord governor = vertices[governorIndex];
      String relation = entry.getString("dep");

      // Process row
      if (governorIndex == 0) {
        tree.addRoot(dependent);
      } else {
        tree.addVertex(dependent);
        if (!tree.containsVertex(governor)) {
          tree.addVertex(governor);
        }
        if (!"ref".equals(relation)) {
          tree.addEdge(governor, dependent, GrammaticalRelation.valueOf(Language.English, relation), Double.NEGATIVE_INFINITY, false);
        }
      }
    }
    return tree;
  }

  /** Create an Annotation object (with a single sentence) from the given specification */
  private static Annotation parseSentence(Optional<String> docid, Optional<Integer> sentenceIndex, String gloss,
                                          Function<List<CoreLabel>, SemanticGraph> tree,
                                          Function<List<CoreLabel>, SemanticGraph> maltTree,
                                          List<String> words, List<String> lemmas, List<String> pos, List<String> ner,
                                          Optional<String> sentenceid) {
    // Error checks
    if (lemmas.size() != words.size()) {
      throw new IllegalArgumentException("Array lengths don't match: " + words.size() + " vs " + lemmas.size() + " (sentence " + sentenceid.orElse("???") +")");
    }
    if (pos.size() != words.size()) {
      throw new IllegalArgumentException("Array lengths don't match: " + words.size() + " vs " + pos.size() + " (sentence " + sentenceid.orElse("???") +")");
    }
    if (ner.size() != words.size()) {
      throw new IllegalArgumentException("Array lengths don't match: " + words.size() + " vs " + ner.size() + " (sentence " + sentenceid.orElse("???") +")");
    }

    // Create structure
    List<CoreLabel> tokens = new ArrayList<>(words.size());
    int beginChar = 0;
    for (int i = 0; i < words.size(); ++i) {
      CoreLabel token = new CoreLabel(12);
      token.setWord(words.get(i));
      token.setValue(words.get(i));
      token.setBeginPosition(beginChar);
      token.setEndPosition(beginChar + words.get(i).length());
      beginChar += words.get(i).length() + 1;
      token.setLemma(lemmas.get(i));
      token.setTag(pos.get(i));
      token.setNER(ner.get(i));
      token.set(CoreAnnotations.DocIDAnnotation.class, docid.orElse("???"));
      token.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1));
      token.set(CoreAnnotations.IndexAnnotation.class, i + 1);
      token.set(CoreAnnotations.TokenBeginAnnotation.class, i);
      token.set(CoreAnnotations.TokenEndAnnotation.class, i + 1);
      tokens.add(token);
    }
    gloss = gloss.replace("\\n", "\n").replace("\\t", "\t");
    CoreMap sentence = new ArrayCoreMap(16);
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    SemanticGraph graph = tree.apply(tokens);
    sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
    sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph);
    SemanticGraph maltGraph = maltTree.apply(tokens);
    sentence.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, maltGraph);
    sentence.set(CoreAnnotations.DocIDAnnotation.class, docid.orElse("???"));
    sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1));
    sentence.set(CoreAnnotations.TextAnnotation.class, gloss);
    sentence.set(CoreAnnotations.TokenBeginAnnotation.class, 0);
    sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokens.size());
    Annotation doc = new Annotation(gloss);
    doc.set(CoreAnnotations.TokensAnnotation.class, tokens);
    doc.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
    doc.set(CoreAnnotations.DocIDAnnotation.class, docid.orElse("???"));
    doc.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1));
    return doc;
  }

  /** Create an Annotation object (with a single sentence) from the given specification, as Postgres would output them */
  public static Annotation parseSentence(Optional<String> docid, Optional<String> sentenceIndex,
                                         String gloss, String dependencies, String maltDependencies,
                                         String words, String lemmas, String posTags, String nerTags,
                                         Optional<String> sentenceid) {
    return parseSentence(docid, sentenceIndex.map(Integer::parseInt), gloss,
        tokens -> parseTree(dependencies, tokens),
        tokens -> parseTree(maltDependencies, tokens),
        parseArray(words), parseArray(lemmas), parseArray(posTags), parseArray(nerTags), sentenceid);
  }


}
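
A minimal usage sketch (illustrative only; the column values are made-up stand-ins for what a Postgres TSV export of a parsed sentence might contain, in the array and escaped-CoNLL formats the methods above expect):

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.TSVUtils;
import java.util.Optional;

public class TSVUtilsExample {
  public static void main(String[] args) {
    // Hypothetical column values: arrays as {...}, dependency tree as
    // backslash-escaped CoNLL rows of dependent \t governor \t relation.
    String words  = "{Cats,sleep}";
    String lemmas = "{cat,sleep}";
    String pos    = "{NNS,VBP}";
    String ner    = "{O,O}";
    String deps   = "1\\t2\\tnsubj\\n2\\t0\\troot";

    Annotation doc = TSVUtils.parseSentence(
        Optional.of("doc-1"),      // document id
        Optional.of("0"),          // sentence index (parsed to an int internally)
        "Cats sleep",              // gloss (raw sentence text)
        deps, deps,                // dependencies and malt dependencies
        words, lemmas, pos, ner,   // token-aligned columns
        Optional.of("sent-1"));    // sentence id, used in error messages
  }
}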



