opennlp.tools.formats.conllu.ConlluWordLine Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.conllu;

import opennlp.tools.util.InvalidFormatException;

/**
 * A single word line of a CoNLL-U file. The ten tab-separated fields of the line are
 * kept as unparsed strings; all fields are final and assigned once at construction.
 */
public class ConlluWordLine {

  /** Number of tab-separated fields a word line must consist of. */
  private static final int FIELD_COUNT = 10;

  private final String id;
  private final String form;
  private final String lemma;
  private final String uPosTag;
  private final String xPosTag;
  private final String feats;
  private final String head;
  private final String deprel;
  private final String deps;
  private final String misc;

  /**
   * Creates a word line from already separated field values. The values are stored
   * as given; no validation is performed.
   */
  ConlluWordLine(String id, String form, String lemma, String uPosTag, String xPosTag,
                 String feats, String head, String deprel, String deps, String misc) {
    this.id = id;
    this.form = form;
    this.lemma = lemma;
    this.uPosTag = uPosTag;
    this.xPosTag = xPosTag;
    this.feats = feats;
    this.head = head;
    this.deprel = deprel;
    this.deps = deps;
    this.misc = misc;
  }

  /**
   * Creates a word line by splitting the given line at tab characters.
   *
   * @param line the raw line to split into fields
   * @throws InvalidFormatException if the line does not split into exactly ten fields
   */
  ConlluWordLine(String line) throws InvalidFormatException {

    // String.split(String) discards trailing empty strings, so trailing tabs at the
    // end of the line do not count as additional (empty) fields.
    String[] fields = line.split("\t");

    if (fields.length != FIELD_COUNT) {
      // Report the actual count so that malformed input can be diagnosed.
      throw new InvalidFormatException("Line must have exactly " + FIELD_COUNT
          + " fields, but found " + fields.length);
    }

    id = fields[0];
    form = fields[1];
    lemma = fields[2];
    uPosTag = fields[3];
    xPosTag = fields[4];
    feats = fields[5];
    head = fields[6];
    deprel = fields[7];
    deps = fields[8];
    misc = fields[9];
  }

  /**
   * Retrieves the word index. An Integer starting at 1 for each new sentence;
   * may be a range for multiword tokens; may be a decimal number for empty nodes.
   *
   * @return the ID field exactly as it was supplied
   */
  public String getId() {
    return id;
  }

  /**
   * Retrieves the word form or punctuation symbol.
   *
   * @return the FORM field exactly as it was supplied
   */
  public String getForm() {
    return form;
  }

  /**
   * Retrieves the lemma or stem of the word form.
   *
   * @return the LEMMA field exactly as it was supplied
   */
  public String getLemma() {
    return lemma;
  }

  /**
   * Retrieves the Universal part-of-speech tag or the language-specific part-of-speech tag;
   * underscore if not available.
   *
   * @param tagset the type of tag to retrieve, either universal (u) or language specific (x)
   * @return the tag stored for the requested tagset
   * @throws IllegalStateException if the tagset is neither {@code U} nor {@code X}
   */
  public String getPosTag(ConlluTagset tagset) {
    switch (tagset) {
      case U:
        return uPosTag;
      case X:
        return xPosTag;
      default:
        throw new IllegalStateException("Unexpected tagset value: " + tagset);
    }
  }

  /**
   * Retrieves the list of morphological features from the universal feature inventory or
   * from a defined language-specific extension; underscore if not available.
   *
   * @return the FEATS field exactly as it was supplied
   */
  public String getFeats() {
    return feats;
  }

  /**
   * Retrieves the head of the current word, which is either a value of ID or zero (0).
   *
   * @return the HEAD field exactly as it was supplied
   */
  public String getHead() {
    return head;
  }

  /**
   * Retrieves the universal dependency relation to the HEAD (root iff HEAD = 0) or a
   * defined language-specific subtype of one.
   *
   * @return the DEPREL field exactly as it was supplied
   */
  public String getDeprel() {
    return deprel;
  }

  /**
   * Retrieves the enhanced dependency graph in the form of a list of head-deprel pairs.
   *
   * @return the DEPS field exactly as it was supplied
   */
  public String getDeps() {
    return deps;
  }

  /**
   * Retrieves any other annotation.
   *
   * @return the MISC field exactly as it was supplied
   */
  public String getMisc() {
    return misc;
  }
}