edu.stanford.nlp.trees.ud.CoNLLUDocumentReader — source listing from the stanford-parser artifact (Maven / Gradle / Ivy), newest version.
Artifact description: Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
(Show all versions of stanford-parser / show documentation / go to download.)
package edu.stanford.nlp.trees.ud;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;
import java.util.function.Function;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;
/**
 * Reader for CoNLL-U formatted dependency treebanks.
 *
 * @author Sebastian Schuster
 */
public class CoNLLUDocumentReader implements IteratorFromReaderFactory<SemanticGraph> {

  /**
   * Sentinel POS tag attached to comment lines ("# ...") so they can be told apart
   * from real token lines later on.  NOTE(review): the scraped source showed an
   * empty string here because the literal "<COMMENT>" had been eaten as an HTML
   * tag; an empty sentinel would also collide with any token whose POS field is
   * empty, so the non-empty sentinel is restored.
   */
  private static final String COMMENT_POS = "<COMMENT>";

  private static final long serialVersionUID = -7340310509954331983L;

  /** Underlying factory that splits the input into sentence chunks. */
  private final IteratorFromReaderFactory<SemanticGraph> ifrf;

  public CoNLLUDocumentReader() {
    // CoNLL-U sentences are separated by one or more blank lines.
    this.ifrf = DelimitRegExIterator.getFactory("\n(\\s*\n)+", new SentenceProcessor());
  }

  @Override
  public Iterator<SemanticGraph> getIterator(Reader r) {
    return ifrf.getIterator(r);
  }

  /** Orders words by sentence index ({@link IndexedWord} is {@code Comparable}). */
  private static final Comparator<IndexedWord> byIndex = Comparator.naturalOrder();

  /**
   * Comparator for putting multiword tokens before regular tokens.
   * Only ever used as a tie-breaker after {@link #byIndex}, so the asymmetric
   * result when both arguments are multiword tokens cannot arise in practice
   * (two multiword-token lines never share a start index).
   */
  private static final Comparator<IndexedWord> byType = (i1, i2) ->
      i1.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? -1 :
          i2.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? 1 : 0;

  /** Turns one blank-line-delimited sentence block into a {@link SemanticGraph}. */
  private static class SentenceProcessor implements Function<String, SemanticGraph> {

    /** Running line number across the whole document (comment lines included). */
    private int lineNumberCounter = 0;

    /**
     * Resolves a governor index and relation label into the governor word and
     * its {@link GrammaticalRelation}.
     *
     * @param govIdx       1-based index of the governor; 0 denotes the root
     * @param copyCount    copy count for empty ("copy") nodes such as "8.1", else 0
     * @param word         the dependent; used only to build the artificial ROOT node
     * @param relationName relation label from the DEPREL/DEPS column
     * @param sortedTokens basic tokens in index order, copy nodes appended at the end
     * @return pair of (governor, relation); the governor is {@code null} if no
     *         token with the requested index/copy count exists
     */
    private static Pair<IndexedWord, GrammaticalRelation> getGovAndReln(int govIdx,
                                                                        int copyCount,
                                                                        IndexedWord word,
                                                                        String relationName,
                                                                        List<IndexedWord> sortedTokens) {
      IndexedWord gov;
      GrammaticalRelation reln;
      if (relationName.equals("root")) {
        reln = GrammaticalRelation.ROOT;
      } else {
        reln = GrammaticalRelation.valueOf(Language.UniversalEnglish, relationName);
      }
      if (govIdx == 0) {
        /* Artificial root node with index 0. */
        gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
        gov.setValue("ROOT");
      } else {
        gov = SentenceProcessor.getToken(sortedTokens, govIdx, copyCount);
      }
      return Generics.newPair(gov, reln);
    }

    private static IndexedWord getToken(List<IndexedWord> sortedTokens, int index) {
      return SentenceProcessor.getToken(sortedTokens, index, 0);
    }

    /**
     * Linear search for the token with the given index and copy count.
     * Starts at list position {@code index - 1}: basic tokens are stored in
     * index order, and copy nodes follow them, so the target can never sit
     * earlier in the list.  Returns {@code null} if no such token exists.
     */
    private static IndexedWord getToken(List<IndexedWord> sortedTokens, int index, int copyCount) {
      int tokenLength = sortedTokens.size();
      for (int i = index - 1 ; i < tokenLength; i++) {
        IndexedWord token = sortedTokens.get(i);
        if (token.index() == index && token.copyCount() == copyCount) {
          return token;
        }
      }
      return null;
    }

    /**
     * Parses one sentence block (comment lines plus token lines) into a
     * {@link SemanticGraph}, attaching comments and line numbers.
     */
    @Override
    public SemanticGraph apply(String line) {
      if (line == null) return null;

      Function<String, IndexedWord> func = new WordProcessor();
      ObjectBank<IndexedWord> words = ObjectBank.getLineIterator(new StringReader(line), func);

      List<IndexedWord> wordList = new ArrayList<>(words);
      List<IndexedWord> sorted = new ArrayList<>(wordList.size());
      List<String> comments = new LinkedList<>();

      /* Increase the line number in case there are comments before the actual sentence
       * and add them to the list of comments. */
      wordList.stream().filter(w -> w.tag() != null && w.tag().equals(COMMENT_POS))
          .forEach(w -> {
            lineNumberCounter++;
            comments.add(w.word());
          });

      /* Sort token lines so that each multiword-token line directly precedes
       * the first word it spans. */
      wordList.stream().filter(w -> w.tag() == null || ! w.tag().equals(COMMENT_POS))
          .sorted(byIndex.thenComparing(byType))
          .forEach(sorted::add);

      /* Governor-lookup list: basic tokens in index order first, then soft
       * copies for the empty (copy) nodes. */
      List<IndexedWord> sortedTokens = new ArrayList<>(wordList.size());
      sorted.stream()
          .filter(w -> !w.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class))
          .filter(w -> w.copyCount() == 0)
          .forEach(sortedTokens::add);
      sorted.stream()
          .filter(w -> !w.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class))
          .filter(w -> w.copyCount() != 0)
          /* The basic token with index i sits at list position i-1. */
          .forEach(w -> sortedTokens.add(sortedTokens.get(w.index() - 1).makeSoftCopy(w.copyCount())));

      /* Construct a semantic graph. */
      List<TypedDependency> deps = new ArrayList<>(sorted.size());

      IntPair tokenSpan = null;
      String originalToken = null;
      for (IndexedWord word : sorted) {
        lineNumberCounter++;

        if (word.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
          /* Remember the span of the current multiword token; the words it
           * covers follow immediately in the sorted list. */
          tokenSpan = word.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
          originalToken = word.word();
        } else {
          /* Deal with multiword tokens. */
          if (tokenSpan != null && tokenSpan.getTarget() >= word.index()) {
            word.setOriginalText(originalToken);
            word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, tokenSpan);
          } else {
            tokenSpan = null;
            originalToken = null;
          }

          HashMap<String,String> extraDeps = word.get(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class);
          if (extraDeps.isEmpty()) {
            /* No enhanced dependencies: use the basic HEAD/DEPREL columns. */
            int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
            Pair<IndexedWord, GrammaticalRelation> govReln = getGovAndReln(govIdx, 0, word,
                word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class), sortedTokens);
            IndexedWord gov = govReln.first();
            GrammaticalRelation reln = govReln.second();
            TypedDependency dep = new TypedDependency(reln, gov, word);
            word.set(CoreAnnotations.LineNumberAnnotation.class, lineNumberCounter);
            deps.add(dep);
          } else {
            /* Enhanced dependencies from the DEPS column. */
            for (String extraGovIdxStr : extraDeps.keySet()) {
              if (extraGovIdxStr.contains(".")) {
                /* Governor is a copy node, e.g. "8.1". */
                String[] indexParts = extraGovIdxStr.split("\\.");
                int extraGovIdx = Integer.parseInt(indexParts[0]);
                int copyCount = Integer.parseInt(indexParts[1]);
                Pair<IndexedWord, GrammaticalRelation> govReln = getGovAndReln(extraGovIdx, copyCount, word,
                    extraDeps.get(extraGovIdxStr), sortedTokens);
                IndexedWord gov = govReln.first();
                GrammaticalRelation reln = govReln.second();
                TypedDependency dep = new TypedDependency(reln, gov, word);
                dep.setExtra();
                deps.add(dep);
              } else {
                int extraGovIdx = Integer.parseInt(extraGovIdxStr);
                int mainGovIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class) != null ?
                    word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class) : -1;
                Pair<IndexedWord, GrammaticalRelation> govReln = getGovAndReln(extraGovIdx, 0, word,
                    extraDeps.get(extraGovIdxStr), sortedTokens);
                IndexedWord gov = govReln.first();
                GrammaticalRelation reln = govReln.second();
                TypedDependency dep = new TypedDependency(reln, gov, word);
                /* Mark as extra only if it duplicates a basic dependency's governor. */
                if (extraGovIdx != mainGovIdx) {
                  dep.setExtra();
                }
                deps.add(dep);
              }
            }
          }
        }
      }
      lineNumberCounter++;  // account for the blank line terminating the sentence

      SemanticGraph sg = new SemanticGraph(deps);
      comments.forEach(sg::addComment);
      return sg;
    }
  }

  /** Parses a single CoNLL-U line (comment, multiword token, copy node, or word). */
  private static class WordProcessor implements Function<String, IndexedWord> {

    @Override
    public IndexedWord apply(String line) {
      IndexedWord word = new IndexedWord();
      if (line.startsWith("#")) {
        /* Comment line: stash the raw text and tag it with the sentinel POS. */
        word.setWord(line);
        word.setTag(COMMENT_POS);
        return word;
      }

      /* NOTE(review): assumes a well-formed 10-column line; a short line throws
       * ArrayIndexOutOfBoundsException — behavior kept as in the original. */
      String[] bits = line.split("\\s+");
      word.set(CoreAnnotations.TextAnnotation.class, bits[1]);

      /* Check if it is a multiword token. */
      if (bits[0].contains("-")) {
        String[] span = bits[0].split("-");
        int start = Integer.parseInt(span[0]);
        int end = Integer.parseInt(span[1]);
        word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new IntPair(start, end));
        word.set(CoreAnnotations.IndexAnnotation.class, start);
      } else if (bits[0].contains(".")) {
        /* Empty (copy) node, e.g. "8.1". */
        String[] indexParts = bits[0].split("\\.");
        int index = Integer.parseInt(indexParts[0]);
        int copyCount = Integer.parseInt(indexParts[1]);
        word.set(CoreAnnotations.IndexAnnotation.class, index);
        word.setIndex(index);
        word.setCopyCount(copyCount);
        word.setValue(bits[1]);

        /* Parse features. */
        HashMap<String,String> features = CoNLLUUtils.parseFeatures(bits[5]);
        word.set(CoreAnnotations.CoNLLUFeats.class, features);

        /* Parse extra dependencies. */
        HashMap<String,String> extraDeps = CoNLLUUtils.parseExtraDeps(bits[8]);
        word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);
      } else {
        /* Regular word line. */
        word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
        word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
        word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
        word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);
        word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
        word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
        word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);
        word.setIndex(Integer.parseInt(bits[0]));
        word.setValue(bits[1]);

        /* Parse features. */
        HashMap<String,String> features = CoNLLUUtils.parseFeatures(bits[5]);
        word.set(CoreAnnotations.CoNLLUFeats.class, features);

        /* Parse extra dependencies. */
        HashMap<String,String> extraDeps = CoNLLUUtils.parseExtraDeps(bits[8]);
        word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);
      }
      return word;
    }
  } // end static class WordProcessor
}