All downloads are free. The search and download functionality uses the official Maven repository.

opennlp.tools.formats.nkjp.NKJPSegmentationDocument Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.nkjp;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import opennlp.tools.util.Span;
import opennlp.tools.util.XmlUtil;

public class NKJPSegmentationDocument {

  public static class Pointer {
    final String doc;
    final String id;
    final int offset;
    final int length;
    final boolean space_after;

    public Pointer(String doc, String id, int offset, int length, boolean space_after) {
      this.doc = doc;
      this.id = id;
      this.offset = offset;
      this.length = length;
      this.space_after = space_after;
    }

    public Span toSpan() {
      return new Span(this.offset, this.offset + this.length);
    }

    @Override
    public String toString() {
      return doc + "#string-range(" + id + "," + offset
          + "," + length + ")";
    }
  }

  public Map> getSegments() {
    return segments;
  }

  Map> segments;

  NKJPSegmentationDocument() {
    this.segments = new LinkedHashMap<>();
  }

  NKJPSegmentationDocument(Map> segments) {
    this();
    this.segments = segments;
  }

  public static NKJPSegmentationDocument parse(InputStream is) throws IOException {

    Map> sentences = new LinkedHashMap<>();

    try {
      DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
      Document doc = docBuilder.parse(is);

      XPathFactory xPathfactory = XPathFactory.newInstance();
      XPath xpath = xPathfactory.newXPath();

      final XPathExpression SENT_NODES = xpath.compile("/teiCorpus/TEI/text/body/p/s");
      final XPathExpression SEG_NODES = xpath.compile("./seg|./choice");
      final XPathExpression SEG_NODES_ONLY = xpath.compile("./seg");

      NodeList nl = (NodeList) SENT_NODES.evaluate(doc, XPathConstants.NODESET);

      for (int i = 0; i < nl.getLength(); i++) {
        Node sentnode = nl.item(i);

        String sentid = null;
        if (sentnode.getAttributes().getNamedItem("xml:id") != null) {
          sentid = sentnode.getAttributes().getNamedItem("xml:id").getTextContent();
        }

        Map segments = new LinkedHashMap<>();
        NodeList segnl = (NodeList) SEG_NODES.evaluate(sentnode, XPathConstants.NODESET);

        for (int j = 0; j < segnl.getLength(); j++) {
          Node n = segnl.item(j);
          if (n.getNodeName().equals("seg")) {
            String segid = xmlID(n);
            Pointer pointer = fromSeg(n);
            segments.put(segid, pointer);
          } else if (n.getNodeName().equals("choice")) {

            NodeList choices = n.getChildNodes();

            for (int k = 0; k < choices.getLength(); k++) {
              if (choices.item(k).getNodeName().equals("nkjp:paren")) {
                if (!checkRejectedParen(choices.item(k))) {
                  NodeList paren_segs = (NodeList) SEG_NODES_ONLY.evaluate(choices.item(k),
                      XPathConstants.NODESET);

                  for (int l = 0; l < paren_segs.getLength(); l++) {
                    String segid = xmlID(paren_segs.item(l));
                    Pointer pointer = fromSeg(paren_segs.item(l));
                    segments.put(segid, pointer);
                  }
                }
              } else if (choices.item(k).getNodeName().equals("seg")) {
                if (!checkRejected(choices.item(k))) {
                  String segid = xmlID(choices.item(k));
                  Pointer pointer = fromSeg(choices.item(k));
                  segments.put(segid, pointer);
                }
              }
            }
          }
        }

        sentences.put(sentid, segments);
      }

    } catch (SAXException | XPathExpressionException | IOException e) {
      throw new IOException("Failed to parse NKJP document", e);
    }

    return new NKJPSegmentationDocument(sentences);
  }

  static boolean checkRejected(Node n) {
    if (n.getAttributes() == null) {
      return false;
    }
    if (n.getAttributes().getNamedItem("nkjp:rejected") == null) {
      return false;
    }
    String rejected = n.getAttributes().getNamedItem("nkjp:rejected").getTextContent();
    return rejected.equals("true");
  }

  static boolean checkRejectedParen(Node n) {
    if (n.getChildNodes().getLength() == 0) {
      return false;
    } else {
      for (int i = 0; i < n.getChildNodes().getLength(); i++) {
        Node m = n.getChildNodes().item(i);
        if (m.getNodeName().equals("seg")) {
          if (!checkRejected(m)) {
            return false;
          }
        }
      }
      return true;
    }
  }

  static String xmlID(Node n) throws IOException {
    if (n.getAttributes() == null || n.getAttributes().getLength() < 1) {
      throw new IOException("Missing required attributes");
    }

    String id = null;
    if (n.getAttributes().getNamedItem("xml:id") != null) {
      id = n.getAttributes().getNamedItem("xml:id").getTextContent();
    }

    if (id == null) {
      throw new IOException("Missing xml:id attribute");
    }

    return id;
  }

  static Pointer fromSeg(Node n) throws IOException {
    if (n.getAttributes() == null || n.getAttributes().getLength() < 2) {
      throw new IOException("Missing required attributes");
    }

    String ptr = null;
    if (n.getAttributes().getNamedItem("corresp") != null) {
      ptr = n.getAttributes().getNamedItem("corresp").getTextContent();
    }
    String spacing = "";
    if (n.getAttributes().getNamedItem("nkjp:nps") != null) {
      spacing = n.getAttributes().getNamedItem("nkjp:nps").getTextContent();
    }

    if (ptr == null) {
      throw new IOException("Missing required attribute");
    }

    boolean space_after = (ptr.equals("yes"));

    if (!ptr.contains("#") || !ptr.contains("(") || ptr.charAt(ptr.length() - 1) != ')') {
      throw new IOException("String " + ptr + " does not appear to be a valid NKJP corresp attribute");
    }

    int docend = ptr.indexOf('#');
    String document = ptr.substring(0, docend);

    int pointer_start = ptr.indexOf('(') + 1;
    String[] pieces = ptr.substring(pointer_start, ptr.length() - 1).split(",");

    if (pieces.length < 3 || pieces.length > 4) {
      throw new IOException("String " + ptr + " does not appear to be a valid NKJP corresp attribute");
    }

    String docid = pieces[0];
    int offset;
    int length;
    if (pieces.length == 3) {
      offset = Integer.parseInt(pieces[1]);
      length = Integer.parseInt(pieces[2]);
    } else {
      int os1 = Integer.parseInt(pieces[1]);
      int os2 = Integer.parseInt(pieces[2]);
      offset = (os1 * 1000) + os2;
      length = Integer.parseInt(pieces[3]);
    }

    return new Pointer(document, docid, offset, length, space_after);
  }

  static NKJPSegmentationDocument parse(File file) throws IOException {
    try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
      return parse(in);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy