/*
 * eu.fbk.fcw.udpipe.api.CoNLLUDocumentReader
 * Source listing from the fcw-udpipe-api artifact (Maven / Gradle / Ivy), newest version.
 */
package eu.fbk.fcw.udpipe.api;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.ud.CoNLLUUtils;
import edu.stanford.nlp.util.IntPair;

import java.io.Reader;
import java.io.StringReader;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reader for ConLL-U formatted dependency treebanks.
 *
 * @author Sebastian Schuster
 */
public class CoNLLUDocumentReader implements
        IteratorFromReaderFactory {

    private static final String COMMENT_POS = "";
    private static final Pattern numPattern = Pattern.compile("^([0-9]+)");

    private IteratorFromReaderFactory ifrf;

    public CoNLLUDocumentReader() {
        this.ifrf = DelimitRegExIterator.getFactory("\n(\\s*\n)+", new SentenceProcessor());
    }

    @Override
    public Iterator getIterator(Reader r) {
        return ifrf.getIterator(r);
    }

    private static final Comparator byIndex = (i1, i2) -> i1.compareTo(i2);

    /* Comparator for putting multiword tokens before regular tokens.  */
    private static final Comparator byType = (i1, i2) ->
            i1.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? -1 :
                    i2.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? 1 : 0;

    private static class SentenceProcessor implements Function {

        private int lineNumberCounter = 0;

        public SemanticGraph apply(String line) {
            if (line == null) {
                return null;
            }

            int offset = 0;
            String tmpLine = line.trim();
            Matcher matcher = numPattern.matcher(tmpLine);
            if (matcher.find()) {
                offset = Integer.parseInt(matcher.group()) - 1;
            }

//            System.out.println(offset);
//            System.out.println(line);
//            System.out.println("PIPPO");

            Function func = new WordProcessor();
            ObjectBank words = ObjectBank.getLineIterator(new StringReader(line), func);

            List wordList = new ArrayList<>(words);

            List sorted = new ArrayList<>(wordList.size());

            List comments = new LinkedList<>();

            /* Increase the line number in case there are comments before the actual sentence
             * and add them to the list of comments. */
            wordList.stream().filter(w -> w.tag() != null && w.tag().equals(COMMENT_POS))
                    .forEach(w -> {
                        lineNumberCounter++;
                        comments.add(w.word());
                    });

            wordList.stream().filter(w -> w.tag() == null || !w.tag().equals(COMMENT_POS))
                    .sorted(byIndex.thenComparing(byType))
                    .forEach(w -> sorted.add(w));

            List sortedTokens = new ArrayList<>(wordList.size());
            sorted.stream()
                    .filter(w -> !w.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class))
                    .forEach(w -> sortedTokens.add(w));

            /* Construct a semantic graph. */
            List deps = new ArrayList<>(sorted.size());

            IntPair tokenSpan = null;
            String originalToken = null;
            for (IndexedWord word : sorted) {
                lineNumberCounter++;

                if (word.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
                    tokenSpan = word.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
                    originalToken = word.word();
                } else {
                    /* Deal with multiword tokens. */
                    if (tokenSpan != null && tokenSpan.getTarget() >= word.index()) {
                        word.setOriginalText(originalToken);
                        word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, tokenSpan);
                    } else {
                        tokenSpan = null;
                        originalToken = null;
                    }
                    GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish,
                            word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
                    int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
                    IndexedWord gov;
                    if (govIdx == 0) {
                        gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
                        gov.setValue("ROOT");
                        if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
                            reln = GrammaticalRelation.ROOT;
                        }
                    } else {
                        gov = sortedTokens.get(govIdx - 1 - offset);
                    }
                    TypedDependency dep = new TypedDependency(reln, gov, word);
                    word.set(CoreAnnotations.LineNumberAnnotation.class, lineNumberCounter);
                    deps.add(dep);

                    //todo: test with CoreNLP 3.9.1
//                    HashMap extraDeps = word.get(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class);
//                    for (Integer extraGovIdx : extraDeps.keySet()) {
//                        GrammaticalRelation extraReln =
//                                GrammaticalRelation.valueOf(Language.UniversalEnglish, extraDeps.get(extraGovIdx));
//                        IndexedWord extraGov = sortedTokens.get(extraGovIdx - 1);
//                        TypedDependency extraDep = new TypedDependency(extraReln, extraGov, word);
//                        extraDep.setExtra();
//                        deps.add(extraDep);
//                    }
                }
            }
            lineNumberCounter++;

            SemanticGraph sg = new SemanticGraph(deps);

            comments.forEach(c -> sg.addComment(c));

            return sg;
        }
    }

    private static class WordProcessor implements Function {

        public IndexedWord apply(String line) {

            IndexedWord word = new IndexedWord();
            if (line.startsWith("#")) {
                word.setWord(line);
                word.setTag(COMMENT_POS);
                return word;
            }

            String[] bits = line.split("\\s+");

            word.set(CoreAnnotations.TextAnnotation.class, bits[1]);

            /* Check if it is a multiword token. */
            if (bits[0].contains("-")) {
                String[] span = bits[0].split("-");
                Integer start = Integer.parseInt(span[0]);
                Integer end = Integer.parseInt(span[1]);
                word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new IntPair(start, end));
                word.set(CoreAnnotations.IndexAnnotation.class, start);
            } else {
                word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
                word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
                word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
                word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);

                word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
                word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
                word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);

                word.setIndex(Integer.parseInt(bits[0]));
                word.setValue(bits[1]);

                /* Parse features. */
                HashMap features = CoNLLUUtils.parseFeatures(bits[5]);
                word.set(CoreAnnotations.CoNLLUFeats.class, features);

                /* Parse extra dependencies. */
                //todo: test with CoreNLP 3.9.1
//                HashMap extraDeps = CoNLLUUtils.parseExtraDeps(bits[8]);
//                word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);
            }

            return word;
        }
    }
}