// edu.stanford.nlp.trees.ud.CoNLLUDocumentReader Maven / Gradle / Ivy
package edu.stanford.nlp.trees.ud;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;
import java.util.function.Function;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;
/**
* Reader for ConLL-U formatted dependency treebanks.
*
* @author Sebastian Schuster
*/
public class CoNLLUDocumentReader implements
IteratorFromReaderFactory {
private static final String COMMENT_POS = "";
private IteratorFromReaderFactory ifrf;
public CoNLLUDocumentReader() {
this.ifrf = DelimitRegExIterator.getFactory("\n(\\s*\n)+", new SentenceProcessor());
}
@Override
public Iterator getIterator(Reader r) {
return ifrf.getIterator(r);
}
private static final Comparator byIndex = (i1, i2) -> i1.compareTo(i2);
/* Comparator for putting multiword tokens before regular tokens. */
private static final Comparator byType = (i1, i2) ->
i1.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? -1 :
i2.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? 1 : 0;
private static class SentenceProcessor implements Function {
private int lineNumberCounter = 0;
public SemanticGraph apply(String line) {
if (line == null) return null;
Function func = new WordProcessor();
ObjectBank words = ObjectBank.getLineIterator(new StringReader(line), func);
List wordList = new ArrayList<>(words);
List sorted = new ArrayList<>(wordList.size());
List comments = new LinkedList<>();
/* Increase the line number in case there are comments before the actual sentence
* and add them to the list of comments. */
wordList.stream().filter(w -> w.tag() != null && w.tag().equals(COMMENT_POS))
.forEach(w -> {
lineNumberCounter++;
comments.add(w.word());
});
wordList.stream().filter(w -> w.tag() == null || ! w.tag().equals(COMMENT_POS))
.sorted(byIndex.thenComparing(byType))
.forEach(w -> sorted.add(w));
List sortedTokens = new ArrayList<>(wordList.size());
sorted.stream()
.filter(w -> !w.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class))
.forEach(w -> sortedTokens.add(w));
/* Construct a semantic graph. */
List deps = new ArrayList<>(sorted.size());
IntPair tokenSpan = null;
String originalToken = null;
for (IndexedWord word : sorted) {
lineNumberCounter++;
if (word.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
tokenSpan = word.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
originalToken = word.word();
} else {
/* Deal with multiword tokens. */
if (tokenSpan != null && tokenSpan.getTarget() >= word.index()) {
word.setOriginalText(originalToken);
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, tokenSpan);
} else {
tokenSpan = null;
originalToken = null;
}
GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish,
word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
IndexedWord gov;
if (govIdx == 0) {
gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
gov.setValue("ROOT");
if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
reln = GrammaticalRelation.ROOT;
}
} else {
gov = sortedTokens.get(govIdx - 1);
}
TypedDependency dep = new TypedDependency(reln, gov, word);
word.set(CoreAnnotations.LineNumberAnnotation.class, lineNumberCounter);
deps.add(dep);
HashMap extraDeps = word.get(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class);
for (Integer extraGovIdx : extraDeps.keySet()) {
GrammaticalRelation extraReln =
GrammaticalRelation.valueOf(Language.UniversalEnglish, extraDeps.get(extraGovIdx));
IndexedWord extraGov = sortedTokens.get(extraGovIdx - 1);
TypedDependency extraDep = new TypedDependency(extraReln, extraGov, word);
extraDep.setExtra();
deps.add(extraDep);
}
}
}
lineNumberCounter++;
SemanticGraph sg = new SemanticGraph(deps);
comments.forEach(c -> sg.addComment(c));
return sg;
}
}
private static class WordProcessor implements Function {
public IndexedWord apply(String line) {
IndexedWord word = new IndexedWord();
if (line.startsWith("#")) {
word.setWord(line);
word.setTag(COMMENT_POS);
return word;
}
String[] bits = line.split("\\s+");
word.set(CoreAnnotations.TextAnnotation.class, bits[1]);
/* Check if it is a multiword token. */
if (bits[0].contains("-")) {
String[] span = bits[0].split("-");
Integer start = Integer.parseInt(span[0]);
Integer end = Integer.parseInt(span[1]);
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new IntPair(start, end));
word.set(CoreAnnotations.IndexAnnotation.class, start);
} else {
word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);
word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);
word.setIndex(Integer.parseInt(bits[0]));
word.setValue(bits[1]);
/* Parse features. */
HashMap features = CoNLLUUtils.parseFeatures(bits[5]);
word.set(CoreAnnotations.CoNLLUFeats.class, features);
/* Parse extra dependencies. */
HashMap extraDeps = CoNLLUUtils.parseExtraDeps(bits[8]);
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);
}
return word;
}
}
}